In [65]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import  silhouette_score

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

import plotly.express as px
import plotly.io as pio
pio.renderers.default = "notebook"

In [66]:
result1415=pd.read_csv("/Users/fezzibasma/Desktop/Data Full-Stack/Machine Learning Unsupervised/Projet Uber/export_step/aggregate_1415.csv",index_col=[0])
result1415.head()

Unnamed: 0,Date,Time,locationID,Base,Lon,Lat,Borough,Zone
0,9/1/2014,0:03:00,48.0,B02512,-73.99,40.76,Manhattan,Clinton East
1,9/1/2014,0:33:00,161.0,B02512,-73.98,40.76,Manhattan,Midtown Center
2,9/1/2014,0:33:00,230.0,B02512,-73.98,40.76,Manhattan,Times Sq/Theatre District
3,9/1/2014,0:37:00,166.0,B02512,-73.96,40.81,Manhattan,Morningside Heights
4,9/1/2014,0:48:00,114.0,B02512,-73.99,40.72,Manhattan,Greenwich Village South


In [67]:
result1415.Borough.value_counts()

Manhattan        11223950
Brooklyn          2359200
Queens            1352487
Bronx              220009
Staten Island        6950
EWR                  5089
Name: Borough, dtype: int64

## I will focus confuse second on Manhattan and Brooklyn to see if there is any changes here what are the 'new' three hot zones there !!

In [105]:
#extract a sample 
result1415_sample=result1415[result1415.Borough.isin(['Manhattan','Brooklyn'])].sample(n=1000)

In [106]:
# Statistiques basiques
print("Nombre de lignes : {}".format(result1415_sample.shape[0]))
print()

print("Aperçu du dataset : ")
display(result1415_sample.head())
print()

print("Statistiques basiques : ")
data_desc = result1415_sample.describe(include='all')
display(data_desc)
print()

print("Pourcentage de valeurs manquantes : ")
display(100*result1415_sample.isnull().sum()/result1415_sample.shape[0])

Nombre de lignes : 1000

Aperçu du dataset : 


Unnamed: 0,Date,Time,locationID,Base,Lon,Lat,Borough,Zone
335958,9/5/2014,4:47:00,113.0,B02617,-73.99,40.73,Manhattan,Greenwich Village North
1117438,20/05/15,06:20:00,181.0,B02682,-73.98,40.67,Brooklyn,Park Slope
4938522,03/06/15,18:32:00,164.0,B02764,-73.98,40.74,Manhattan,Midtown South
3216910,11/06/15,18:27:00,163.0,B02682,-73.98,40.77,Manhattan,Midtown North
14182014,15/02/15,18:40:09,186.0,B02765,-73.99,40.75,Manhattan,Penn Station/Madison Sq West



Statistiques basiques : 


Unnamed: 0,Date,Time,locationID,Base,Lon,Lat,Borough,Zone
count,1000,1000,1000.0,1000,1000.0,1000.0,1000,1000
unique,206,814,,57,,,2,102
top,17/06/15,19:19:00,,B02764,,,Manhattan,Midtown Center
freq,13,4,,301,,,827,39
mean,,,151.951,,-73.97926,40.73701,,
std,,,70.668654,,0.021515,0.036845,,
min,,,4.0,,-74.03,40.58,,
25%,,,90.0,,-74.0,40.72,,
50%,,,158.0,,-73.98,40.74,,
75%,,,230.0,,-73.97,40.76,,



Pourcentage de valeurs manquantes : 


Date          0.0
Time          0.0
locationID    0.0
Base          0.0
Lon           0.0
Lat           0.0
Borough       0.0
Zone          0.0
dtype: float64

## DBSCAN 🚀🚀

In [107]:
# On jette la colonne locationID
useless_cols = ['locationID','Time','Base','Borough','Date']

print("Les colonnes suivantes vont être jetées : ", useless_cols)
dataset_dbscan = result1415_sample.drop(useless_cols, axis=1)
dataset_dbscan.head()

Les colonnes suivantes vont être jetées :  ['locationID', 'Time', 'Base', 'Borough', 'Date']


Unnamed: 0,Lon,Lat,Zone
335958,-73.99,40.73,Greenwich Village North
1117438,-73.98,40.67,Park Slope
4938522,-73.98,40.74,Midtown South
3216910,-73.98,40.77,Midtown North
14182014,-73.99,40.75,Penn Station/Madison Sq West


In [108]:
# Création du pipeline pour les variables quantitatives
numeric_features = [0,1] # Positions des colonnes quantitatives dans X
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()) # pour normaliser les variables
])

# Création du pipeline pour les variables catégorielles
categorical_features = [0,2] # Positions des colonnes catégorielles dans X
categorical_transformer = Pipeline(
    steps=[
    ('encoder', OneHotEncoder(drop='first')) # on encode les catégories sous forme de colonnes comportant des 0 et des 1
    ])

# On combine les pipelines dans un ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Preprocessings sur le dataset
print("Preprocessing sur le train set...")
print(dataset_dbscan.head())
X = preprocessor.fit_transform(dataset_dbscan) # fit_transform !!
print('...Terminé.')
print(X[0:5, :])
print()

Preprocessing sur le train set...
            Lon    Lat                          Zone
335958   -73.99  40.73       Greenwich Village North
1117438  -73.98  40.67                    Park Slope
4938522  -73.98  40.74                 Midtown South
3216910  -73.98  40.77                 Midtown North
14182014 -73.99  40.75  Penn Station/Madison Sq West
...Terminé.
  (0, 0)	-0.49942555687319684
  (0, 1)	-0.19035424604207463
  (0, 4)	1.0
  (0, 58)	1.0
  (1, 0)	-0.034411071888581356
  (1, 1)	-1.8196345260007387
  (1, 5)	1.0
  (1, 85)	1.0
  (2, 0)	-0.034411071888581356
  (2, 1)	0.0811924672845302
  (2, 5)	1.0
  (2, 80)	1.0
  (3, 0)	-0.034411071888581356
  (3, 1)	0.8958326072639587
  (3, 5)	1.0
  (3, 79)	1.0
  (4, 0)	-0.49942555687319684
  (4, 1)	0.35273918061094206
  (4, 4)	1.0
  (4, 86)	1.0



In [117]:
# import DBSCAN from sklearn and numpy
from sklearn.cluster import DBSCAN
import numpy as np

# Instanciate DBSCAN 
db = DBSCAN(eps=0.1, min_samples=28, metric="manhattan", algorithm="brute").fit(X)
np.unique(db.labels_)

array([-1,  0,  1,  2,  3,  4,  5])

In [118]:
#number of clusters 
labels = db.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print("the number of clusters is ", n_clusters_)

the number of clusters is  6


In [119]:
dataset_dbscan["clusters_dbscan"] = labels
dataset_dbscan.head()

Unnamed: 0,Lon,Lat,Zone,clusters_dbscan
335958,-73.99,40.73,Greenwich Village North,0
1117438,-73.98,40.67,Park Slope,-1
4938522,-73.98,40.74,Midtown South,-1
3216910,-73.98,40.77,Midtown North,-1
14182014,-73.99,40.75,Penn Station/Madison Sq West,-1


In [120]:
import plotly.express as px
fig = px.scatter_mapbox(dataset_dbscan[dataset_dbscan.clusters_dbscan != -1], lat="Lat", lon="Lon", color="Zone",size="clusters_dbscan",
                  zoom=15, mapbox_style="carto-positron",color_continuous_scale=px.colors.sequential.Plasma)
fig.show()

## we concluded that by adding a new borough, the three hot zones doesn't influence Midtown Center and Union SQ but the third position got influenced by Civic Center. So the East Village got the 4th position.