In [1]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import  silhouette_score

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

import plotly.express as px
import plotly.io as pio
pio.renderers.default = "notebook"

In [2]:
result1415=pd.read_csv("/Users/fezzibasma/Desktop/Data Full-Stack/Machine Learning Unsupervised/Projet Uber/export_step/aggregate_1415.csv",index_col=[0])
result1415.head()

Unnamed: 0,Date,Time,locationID,Base,Lon,Lat,Borough,Zone
0,9/1/2014,0:03:00,48.0,B02512,-73.99,40.76,Manhattan,Clinton East
1,9/1/2014,0:33:00,161.0,B02512,-73.98,40.76,Manhattan,Midtown Center
2,9/1/2014,0:33:00,230.0,B02512,-73.98,40.76,Manhattan,Times Sq/Theatre District
3,9/1/2014,0:37:00,166.0,B02512,-73.96,40.81,Manhattan,Morningside Heights
4,9/1/2014,0:48:00,114.0,B02512,-73.99,40.72,Manhattan,Greenwich Village South


In [3]:
result1415.Borough.value_counts()

Manhattan        11223950
Brooklyn          2359200
Queens            1352487
Bronx              220009
Staten Island        6950
EWR                  5089
Name: Borough, dtype: int64

## I will focus confuse second on Manhattan and Brooklyn to see if there is any changes here what are the 'new' three hot zones there !!

In [4]:
#extract a sample 
result1415_sample=result1415[result1415.Borough.isin(['Manhattan','Brooklyn','Queens'])].sample(n=1000)

In [5]:
# Statistiques basiques
print("Nombre de lignes : {}".format(result1415_sample.shape[0]))
print()

print("Aperçu du dataset : ")
display(result1415_sample.head())
print()

print("Statistiques basiques : ")
data_desc = result1415_sample.describe(include='all')
display(data_desc)
print()

print("Pourcentage de valeurs manquantes : ")
display(100*result1415_sample.isnull().sum()/result1415_sample.shape[0])

Nombre de lignes : 1000

Aperçu du dataset : 


Unnamed: 0,Date,Time,locationID,Base,Lon,Lat,Borough,Zone
13340490,07/05/15,13:59:00,246.0,B02764,-74.0,40.76,Manhattan,West Chelsea/Hudson Yards
5810954,04/04/15,16:58:00,68.0,B02598,-74.0,40.74,Manhattan,East Chelsea
784138,01/04/15,06:38:00,239.0,B01949,-73.98,40.79,Manhattan,Upper West Side South
1330298,23/02/15,21:30:27,113.0,B02764,-73.99,40.73,Manhattan,Greenwich Village North
5107012,27/04/15,09:10:00,79.0,B00381,-73.98,40.72,Manhattan,East Village



Statistiques basiques : 


Unnamed: 0,Date,Time,locationID,Base,Lon,Lat,Borough,Zone
count,1000,1000,1000.0,1000,1000.0,1000.0,1000,1000
unique,207,831,,62,,,3,129
top,18/04/15,22:15:00,,B02764,,,Manhattan,Midtown Center
freq,12,4,,289,,,762,38
mean,,,155.072,,-73.96839,40.73914,,
std,,,70.359401,,0.041022,0.035155,,
min,,,4.0,,-74.03,40.61,,
25%,,,100.0,,-73.99,40.72,,
50%,,,158.0,,-73.98,40.74,,
75%,,,230.0,,-73.96,40.76,,



Pourcentage de valeurs manquantes : 


Date          0.0
Time          0.0
locationID    0.0
Base          0.0
Lon           0.0
Lat           0.0
Borough       0.0
Zone          0.0
dtype: float64

## DBSCAN 🚀🚀

In [6]:
# On jette la colonne locationID
useless_cols = ['locationID','Time','Base','Borough','Date']

print("Les colonnes suivantes vont être jetées : ", useless_cols)
dataset_dbscan = result1415_sample.drop(useless_cols, axis=1)
dataset_dbscan.head()

Les colonnes suivantes vont être jetées :  ['locationID', 'Time', 'Base', 'Borough', 'Date']


Unnamed: 0,Lon,Lat,Zone
13340490,-74.0,40.76,West Chelsea/Hudson Yards
5810954,-74.0,40.74,East Chelsea
784138,-73.98,40.79,Upper West Side South
1330298,-73.99,40.73,Greenwich Village North
5107012,-73.98,40.72,East Village


In [7]:
# Création du pipeline pour les variables quantitatives
numeric_features = [0,1] # Positions des colonnes quantitatives dans X
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()) # pour normaliser les variables
])

# Création du pipeline pour les variables catégorielles
categorical_features = [0,2] # Positions des colonnes catégorielles dans X
categorical_transformer = Pipeline(
    steps=[
    ('encoder', OneHotEncoder(drop='first')) # on encode les catégories sous forme de colonnes comportant des 0 et des 1
    ])

# On combine les pipelines dans un ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Preprocessings sur le dataset
print("Preprocessing sur le train set...")
print(dataset_dbscan.head())
X = preprocessor.fit_transform(dataset_dbscan) # fit_transform !!
print('...Terminé.')
print(X[0:5, :])
print()

Preprocessing sur le train set...
            Lon    Lat                       Zone
13340490 -74.00  40.76  West Chelsea/Hudson Yards
5810954  -74.00  40.74               East Chelsea
784138   -73.98  40.79      Upper West Side South
1330298  -73.99  40.73    Greenwich Village North
5107012  -73.98  40.72               East Village
...Terminé.
  (0, 0)	-0.7709508672620821
  (0, 1)	0.5936637685044419
  (0, 3)	1.0
  (0, 144)	1.0
  (1, 0)	-0.7709508672620821
  (1, 1)	0.024475112220307176
  (1, 3)	1.0
  (1, 57)	1.0
  (2, 0)	-0.28316164406546007
  (2, 1)	1.4474467529308463
  (2, 5)	1.0
  (2, 141)	1.0
  (3, 0)	-0.5270562556635977
  (3, 1)	-0.2601192159219624
  (3, 4)	1.0
  (3, 78)	1.0
  (4, 0)	-0.28316164406546007
  (4, 1)	-0.5447135440640298
  (4, 5)	1.0
  (4, 63)	1.0



In [41]:
# import DBSCAN from sklearn and numpy
from sklearn.cluster import DBSCAN
import numpy as np

# Instanciate DBSCAN 
db = DBSCAN(eps=0.1, min_samples=20, metric="manhattan", algorithm="brute").fit(X)
np.unique(db.labels_)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [42]:
#number of clusters 
labels = db.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print("the number of clusters is ", n_clusters_)

the number of clusters is  13


In [43]:
dataset_dbscan["clusters_dbscan"] = labels
dataset_dbscan.head()

Unnamed: 0,Lon,Lat,Zone,clusters_dbscan
13340490,-74.0,40.76,West Chelsea/Hudson Yards,0
5810954,-74.0,40.74,East Chelsea,1
784138,-73.98,40.79,Upper West Side South,-1
1330298,-73.99,40.73,Greenwich Village North,2
5107012,-73.98,40.72,East Village,-1


In [44]:
import plotly.express as px
fig = px.scatter_mapbox(dataset_dbscan[dataset_dbscan.clusters_dbscan != -1], lat="Lat", lon="Lon", color="Zone",size="clusters_dbscan",
                  zoom=15, mapbox_style="carto-positron",color_continuous_scale=px.colors.sequential.Plasma)
fig.show()

## we concluded that by adding a new borough as Queen, the three hot zones doesn't influence Midtown Center and Union SQ and Civic Center. 
## We notice that Times SQ/Theatre District est confondu avec Midtown Center and Upper East Side South est confondu aussi avec Midtown East.


## Les noms de borough étant modifiés dans le borough Queens influence la distribution par inertie du DBScan, le mieux serait de restreindre l'analyse à Brooklyn et Manhattan pour eviter d'avoir des noms redendants.