In [1]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import  silhouette_score

# Silence DeprecationWarning messages for the rest of the notebook so that
# they do not clutter the cell outputs.
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

# Plotly is used for the map visualisation; select the "iframe_connected"
# renderer as the default one used when a figure is shown.
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "iframe_connected"

In [2]:
# Load the aggregated pickups export; the first CSV column is used as the index.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
aggregate_csv_path = "/Users/fezzibasma/Desktop/Data Full-Stack/Machine Learning Unsupervised/Projet Uber/export_step/aggregate_1415.csv"
result1415 = pd.read_csv(aggregate_csv_path, index_col=[0])
result1415.head()

Unnamed: 0,Date,Time,locationID,Base,Lon,Lat,Borough,Zone
0,9/1/2014,0:03:00,48.0,B02512,-73.99,40.76,Manhattan,Clinton East
1,9/1/2014,0:33:00,161.0,B02512,-73.98,40.76,Manhattan,Midtown Center
2,9/1/2014,0:33:00,230.0,B02512,-73.98,40.76,Manhattan,Times Sq/Theatre District
3,9/1/2014,0:37:00,166.0,B02512,-73.96,40.81,Manhattan,Morningside Heights
4,9/1/2014,0:48:00,114.0,B02512,-73.99,40.72,Manhattan,Greenwich Village South


In [3]:
# Basic statistics: row count, preview, summary table and missing-value rates
n_rows = len(result1415)
print(f"Nombre de lignes : {n_rows}")
print()

print("Aperçu du dataset : ")
display(result1415.head())
print()

# Summary of every column (numeric and non-numeric alike)
print("Statistiques basiques : ")
data_desc = result1415.describe(include='all')
display(data_desc)
print()

# Share of missing values per column, expressed as a percentage of all rows
print("Pourcentage de valeurs manquantes : ")
missing_counts = result1415.isnull().sum()
display(100 * missing_counts / n_rows)

Nombre de lignes : 15167685

Aperçu du dataset : 


Unnamed: 0,Date,Time,locationID,Base,Lon,Lat,Borough,Zone
0,9/1/2014,0:03:00,48.0,B02512,-73.99,40.76,Manhattan,Clinton East
1,9/1/2014,0:33:00,161.0,B02512,-73.98,40.76,Manhattan,Midtown Center
2,9/1/2014,0:33:00,230.0,B02512,-73.98,40.76,Manhattan,Times Sq/Theatre District
3,9/1/2014,0:37:00,166.0,B02512,-73.96,40.81,Manhattan,Morningside Heights
4,9/1/2014,0:48:00,114.0,B02512,-73.99,40.72,Manhattan,Greenwich Village South



Statistiques basiques : 


Unnamed: 0,Date,Time,locationID,Base,Lon,Lat,Borough,Zone
count,15167685,15167685,15167680.0,15167685,15167680.0,15167680.0,15167685,15167685
unique,211,87000,,284,,,6,259
top,27/06/15,19:18:00,,B02764,,,Manhattan,Midtown Center
freq,136201,13466,,4528479,,,11223950,499344
mean,,,152.1517,,-73.96712,40.73975,,
std,,,71.44888,,0.04332036,0.0391746,,
min,,,1.0,,-74.21,40.53,,
25%,,,95.0,,-73.99,40.72,,
50%,,,158.0,,-73.98,40.74,,
75%,,,230.0,,-73.96,40.76,,



Pourcentage de valeurs manquantes : 


Date          0.0
Time          0.0
locationID    0.0
Base          0.0
Lon           0.0
Lat           0.0
Borough       0.0
Zone          0.0
dtype: float64

In [4]:
# Drop the columns that are not fed to the clustering step
# (only Date, Lon and Lat remain afterwards)
useless_cols = ['locationID', 'Time', 'Base', 'Borough', 'Zone']

print("Les colonnes suivantes vont être jetées : ", useless_cols)
dataset = result1415.drop(columns=useless_cols)
dataset.head()

Les colonnes suivantes vont être jetées :  ['locationID', 'Time', 'Base', 'Borough', 'Zone']


Unnamed: 0,Date,Lon,Lat
0,9/1/2014,-73.99,40.76
1,9/1/2014,-73.98,40.76
2,9/1/2014,-73.98,40.76
3,9/1/2014,-73.96,40.81
4,9/1/2014,-73.99,40.72


In [5]:
# Numeric branch: standardize the columns at positions 1 and 2 of `dataset`
# (Lon and Lat, given the column order Date, Lon, Lat)
numeric_features = [1, 2]
numeric_transformer = Pipeline([("scaler", StandardScaler())])

# Categorical branch: one-hot encode the column at position 0 (Date),
# dropping the first category so the indicators are not redundant
categorical_features = [0]
categorical_transformer = Pipeline([("encoder", OneHotEncoder(drop="first"))])

# Route each group of columns through its own pipeline
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Fit the preprocessor on the whole dataset and transform it in one pass
print("Preprocessing sur le train set...")
print(dataset.head())
X = preprocessor.fit_transform(dataset)
print('...Terminé.')
print(X[:5, :])
print()

Preprocessing sur le train set...
       Date    Lon    Lat
0  9/1/2014 -73.99  40.76
1  9/1/2014 -73.98  40.76
2  9/1/2014 -73.98  40.76
3  9/1/2014 -73.96  40.81
4  9/1/2014 -73.99  40.72
...Terminé.
  (0, 0)	-0.5280533725311901
  (0, 1)	0.5169767491214281
  (0, 182)	1.0
  (1, 0)	-0.2972150424021254
  (1, 1)	0.5169767491214281
  (1, 182)	1.0
  (2, 0)	-0.2972150424021254
  (2, 1)	0.5169767491214281
  (2, 182)	1.0
  (3, 0)	0.16446161785666014
  (3, 1)	1.7933138715828398
  (3, 182)	1.0
  (4, 0)	-0.5280533725311901
  (4, 1)	-0.5040929488475925
  (4, 182)	1.0



In [6]:
# KMeans is imported again here, exactly as in the original cell
# (it is also imported in the first cell of the notebook)
from sklearn.cluster import KMeans

# Three clusters, seeded for reproducibility. The k-means++ initialisation
# (the library default, spelled out here) spreads the initial centroids,
# which reduces the risk of converging to a poor local minimum.
kmeans = KMeans(n_clusters=3, init="k-means++", random_state=0)

# Fit the model on the preprocessed matrix
kmeans.fit(X)

KMeans(n_clusters=3, random_state=0)

In [7]:
# Cluster label assigned by the fitted model to every row of X
c = kmeans.predict(X)

In [8]:
# Cluster centers coordinates.
# `cluster_centers` lives in the preprocessed feature space: columns 0 and 1
# are the standardized Lon and Lat, the remaining columns are the one-hot
# Date indicators.
cluster_centers = kmeans.cluster_centers_

# Use the fitted scaler's inverse transform on the two numeric columns to get
# the real (longitude, latitude) coordinates of each center.
lonlat_scaler = preprocessor.named_transformers_["num"].named_steps["scaler"]
cluster_centers_lonlat = pd.DataFrame(
    lonlat_scaler.inverse_transform(cluster_centers[:, :2]),
    columns=["Lon", "Lat"],
)
display(cluster_centers_lonlat)

# Raw centers in the preprocessed space, kept as the cell's output
cluster_centers

array([[-4.69096053e-02,  7.76393614e-01,  4.31366731e-03,
         4.12277569e-03,  4.83382683e-03,  5.95066362e-03,
         6.78494859e-03,  2.37285531e-03,  4.83237702e-03,
         3.97698926e-03,  4.98299615e-03,  5.59980966e-03,
         7.00499749e-03,  3.49613569e-03,  4.97123658e-03,
         5.63524945e-03,  4.82802759e-03,  4.26308506e-03,
         6.02299302e-03,  2.25316546e-03,  4.82899413e-03,
         5.28536203e-03,  4.04078089e-03,  4.43335716e-03,
         6.52172757e-03,  2.86740153e-03,  5.69614146e-03,
         5.76975958e-03,  3.29735067e-03,  5.28197914e-03,
         6.99678191e-03,  3.82701450e-03,  5.76782650e-03,
         5.60754197e-03,  3.57506978e-03,  5.59368824e-03,
         5.94873054e-03,  4.38825197e-03,  4.73668958e-03,
         4.79951467e-03,  4.73975029e-03,  6.01944904e-03,
         5.14344176e-03,  5.08754354e-03,  3.49098081e-03,
         3.44410363e-03,  4.94723417e-03,  5.91554600e-03,
         5.31886874e-03,  4.76020871e-03,  4.30593499e-0

In [10]:
# Re-imported as in the original cell (also imported in the first cell)
import plotly.express as px

# Plot the real coordinates from `dataset`, coloured by cluster label.
# The preprocessed matrix X must not be used here: it is sparse, its column 1
# is the *standardized latitude* and its column 2 is a one-hot Date indicator,
# so indexing it as lon/lat puts meaningless values on the map.
# `c` has one label per row of X, i.e. per row of `dataset`, in the same order.
MAX_POINTS_ON_MAP = 50_000  # rendering every row (millions) is not practical in a browser

clusters_on_map = (
    dataset[["Lat", "Lon"]]
    .assign(cluster=c)
    .sample(n=min(len(dataset), MAX_POINTS_ON_MAP), random_state=0)
)

fig = px.scatter_mapbox(
    clusters_on_map,
    lat="Lat",
    lon="Lon",
    color="cluster",
    color_continuous_scale=px.colors.cyclical.IceFire,
    size_max=15,
    zoom=10,
    # Tile style that does not require a Mapbox access token
    mapbox_style="open-street-map",
)
fig.show()