In [2]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import  silhouette_score

import warnings
# Hide DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning) 

import plotly.express as px
import plotly.io as pio
# Make "notebook" the default Plotly renderer used by fig.show().
pio.renderers.default = "notebook"

In [3]:
# Load the aggregated trip export; the first CSV column is used as the row index.
# NOTE(review): this is an absolute, machine-specific path -- the notebook will not run
# elsewhere until it is replaced by a path relative to a configurable data directory.
aggregate_csv_path = "/Users/fezzibasma/Desktop/Data Full-Stack/Machine Learning Unsupervised/Projet Uber/export_step/aggregate_1415.csv"
result1415 = pd.read_csv(aggregate_csv_path, index_col=[0])
result1415.head()

Unnamed: 0,Date,Time,locationID,Base,Lon,Lat,Borough,Zone
0,9/1/2014,0:03:00,48.0,B02512,-73.99,40.76,Manhattan,Clinton East
1,9/1/2014,0:33:00,161.0,B02512,-73.98,40.76,Manhattan,Midtown Center
2,9/1/2014,0:33:00,230.0,B02512,-73.98,40.76,Manhattan,Times Sq/Theatre District
3,9/1/2014,0:37:00,166.0,B02512,-73.96,40.81,Manhattan,Morningside Heights
4,9/1/2014,0:48:00,114.0,B02512,-73.99,40.72,Manhattan,Greenwich Village South


In [4]:
# Number of recorded trips per borough, most frequent first.
result1415["Borough"].value_counts()

Manhattan        11223950
Brooklyn          2359200
Queens            1352487
Bronx              220009
Staten Island        6950
EWR                  5089
Name: Borough, dtype: int64

## I will first focus only on Manhattan, because it has by far the most records (see the counts above), and try to identify its three hottest zones.

In [5]:
# Draw a small random sample of Manhattan trips to keep clustering tractable.
# random_state is fixed so that the sample -- and therefore every cluster label,
# count and map computed from it below -- is identical on each Restart & Run All.
result1415_sample = result1415[result1415.Borough == 'Manhattan'].sample(n=1000, random_state=42)

In [6]:
# Quick exploratory summary of the Manhattan sample:
# row count, preview, descriptive statistics and share of missing values.
n_rows = result1415_sample.shape[0]

print("Nombre de lignes : {}".format(n_rows))
print()

# Preview of the first rows
print("Aperçu du dataset : ")
display(result1415_sample.head())
print()

# Descriptive statistics over every column (numeric and categorical alike)
print("Statistiques basiques : ")
data_desc = result1415_sample.describe(include='all')
display(data_desc)
print()

# Percentage of missing values per column
print("Pourcentage de valeurs manquantes : ")
missing_pct = 100 * result1415_sample.isna().sum() / n_rows
display(missing_pct)

Nombre de lignes : 1000

Aperçu du dataset : 


Unnamed: 0,Date,Time,locationID,Base,Lon,Lat,Borough,Zone
4939146,30/05/15,15:08:00,230.0,B02682,-73.98,40.76,Manhattan,Times Sq/Theatre District
1245908,14/02/15,12:30:01,164.0,B02682,-73.98,40.74,Manhattan,Midtown South
8578066,26/05/15,12:37:00,237.0,B02680,-73.97,40.76,Manhattan,Upper East Side South
10095163,17/04/15,18:18:00,231.0,B02617,-74.01,40.72,Manhattan,TriBeCa/Civic Center
7872972,26/01/15,15:12:28,234.0,B02617,-73.99,40.73,Manhattan,Union Sq



Statistiques basiques : 


Unnamed: 0,Date,Time,locationID,Base,Lon,Lat,Borough,Zone
count,1000,1000,1000.0,1000,1000.0,1000.0,1000,1000
unique,208,815,,64,,,1,60
top,06/05/15,22:08:00,,B02764,,,Manhattan,Union Sq
freq,13,4,,319,,,1000,46
mean,,,161.208,,-73.98314,40.74825,,
std,,,67.25264,,0.017379,0.026392,,
min,,,4.0,,-74.01,40.7,,
25%,,,113.0,,-74.0,40.73,,
50%,,,162.0,,-73.98,40.75,,
75%,,,231.0,,-73.97,40.76,,



Pourcentage de valeurs manquantes : 


Date          0.0
Time          0.0
locationID    0.0
Base          0.0
Lon           0.0
Lat           0.0
Borough       0.0
Zone          0.0
dtype: float64

## DBSCAN 🚀🚀

## I tried to run it on a sample of 200,000 rows, but that was too heavy for my computer.
## I will work on a small sample to begin with.

In [30]:
# Drop every column that is not fed to the clustering step;
# only Lon, Lat and Zone remain in the resulting frame.
useless_cols = ['locationID', 'Time', 'Base', 'Borough', 'Date']
print("Les colonnes suivantes vont être jetées : ", useless_cols)

dataset_dbscan = result1415_sample.drop(columns=useless_cols)
dataset_dbscan.head()

Les colonnes suivantes vont être jetées :  ['locationID', 'Time', 'Base', 'Borough', 'Date']


Unnamed: 0,Lon,Lat,Zone
4939146,-73.98,40.76,Times Sq/Theatre District
1245908,-73.98,40.74,Midtown South
8578066,-73.97,40.76,Upper East Side South
10095163,-74.01,40.72,TriBeCa/Civic Center
7872972,-73.99,40.73,Union Sq


In [32]:
# Build the feature matrix X used for clustering.
# Features are selected by position in dataset_dbscan (column order: Lon, Lat, Zone).
numeric_features = [0, 1]      # Lon, Lat
categorical_features = [2]     # Zone

# Numeric branch: standardize the coordinates.
numeric_transformer = Pipeline([
    ('scaler', StandardScaler()),
])

# Categorical branch: one-hot encode the zone into 0/1 indicator columns
# (drop='first' omits the indicator of the first category).
categorical_transformer = Pipeline([
    ('encoder', OneHotEncoder(drop='first')),
])

# Route each group of columns to its own branch.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Fit the preprocessing on the sample and transform it in one step.
print("Preprocessing sur le train set...")
print(dataset_dbscan.head())
X = preprocessor.fit_transform(dataset_dbscan)
print('...Terminé.')
print(X[:5, :])
print()

Preprocessing sur le train set...
            Lon    Lat                       Zone
4939146  -73.98  40.76  Times Sq/Theatre District
1245908  -73.98  40.74              Midtown South
8578066  -73.97  40.76      Upper East Side South
10095163 -74.01  40.72       TriBeCa/Civic Center
7872972  -73.99  40.73                   Union Sq
...Terminé.
  (0, 0)	0.18076440550684728
  (0, 1)	0.4454346030080525
  (0, 45)	1.0
  (1, 0)	0.18076440550684728
  (1, 1)	-0.31275195530342303
  (1, 37)	1.0
  (2, 0)	0.7564472255896855
  (2, 1)	0.4454346030080525
  (2, 51)	1.0
  (3, 0)	-1.5462840547408492
  (3, 1)	-1.070938513615168
  (3, 46)	1.0
  (4, 0)	-0.3949184145751728
  (4, 1)	-0.6918452344594301
  (4, 49)	1.0



In [38]:
# DBSCAN and numpy are (re-)imported here so the cell can run on its own.
from sklearn.cluster import DBSCAN
import numpy as np

# Configure DBSCAN with a Manhattan (L1) distance and brute-force neighbor search,
# then fit it on the preprocessed matrix X.
# NOTE(review): eps and min_samples look hand-tuned for this sample -- confirm before reuse.
db = DBSCAN(eps=0.1, min_samples=35, metric="manhattan", algorithm="brute")
db.fit(X)

# Distinct labels found by the fit.
np.unique(db.labels_)

array([-1,  0,  1,  2,  3,  4,  5])

In [39]:
# Count the clusters found by DBSCAN; the label -1 is excluded from the count.
labels = db.labels_
n_clusters_ = len(set(labels) - {-1})
print("the number of clusters is ", n_clusters_)

the number of clusters is  6


In [40]:
# Attach each row's DBSCAN label as a new column (modifies dataset_dbscan in place;
# re-running the cell simply overwrites the column). The later cells read this column.
dataset_dbscan["clusters_dbscan"] = labels
dataset_dbscan.head()

Unnamed: 0,Lon,Lat,Zone,clusters_dbscan
4939146,-73.98,40.76,Times Sq/Theatre District,0
1245908,-73.98,40.74,Midtown South,-1
8578066,-73.97,40.76,Upper East Side South,-1
10095163,-74.01,40.72,TriBeCa/Civic Center,1
7872972,-73.99,40.73,Union Sq,2


In [22]:
# Number of sampled trips per DBSCAN label.
# NOTE(review): the saved output of this cell (execution count 22, labels 0-9 with
# 2 points each) predates the current fit, which reports labels -1..5 -- re-run the
# notebook top to bottom so the counts match the model actually used.
dataset_dbscan.clusters_dbscan.value_counts()

-1    980
 0      2
 1      2
 2      2
 3      2
 4      2
 5      2
 6      2
 7      2
 8      2
 9      2
Name: clusters_dbscan, dtype: int64

In [41]:
# Map the trips that DBSCAN assigned to a cluster (rows labelled -1 are filtered out),
# colored by taxi zone. `px` comes from the notebook's import cell.
#
# Fixes compared with the previous version of this cell:
# - marker size is no longer driven by "clusters_dbscan": the cluster id is an arbitrary
#   label, so sizing by it gave cluster 0 a size of 0 (invisible) and made higher ids
#   look more important than lower ones;
# - color_continuous_scale is dropped: "Zone" is categorical, so it had no effect.
# The cluster id remains available in the hover tooltip.
clustered_points = dataset_dbscan[dataset_dbscan.clusters_dbscan != -1]
fig = px.scatter_mapbox(
    clustered_points,
    lat="Lat",
    lon="Lon",
    color="Zone",
    hover_data=["clusters_dbscan"],
    zoom=15,
    mapbox_style="carto-positron",
)
fig.show()

## We conclude that the three hot zones in Manhattan are: East Village, Union Sq, and Midtown Center.