In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import  silhouette_score
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import r2_score

import plotly.express as px
import plotly as plt
import plotly.graph_objects as go
import plotly.io as pio
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile
import io
from urllib.request import urlopen


In [3]:
df_sep14 = pd.read_csv("uber-raw-data-sep14.csv", encoding='latin1')

In [7]:
df_sep14=pd.read_csv('uber-raw-data-sep14.csv').sample(n=30000, random_state=0, axis=0, replace=True).reset_index(drop=True)
df_sep14.head()


Unnamed: 0,Date/Time,Lat,Lon,Base
0,9/3/2014 18:43:00,40.726,-74.0074,B02512
1,9/9/2014 13:21:00,40.7608,-73.9752,B02512
2,9/8/2014 16:31:00,40.7177,-73.9906,B02512
3,9/16/2014 12:41:00,40.7397,-73.991,B02512
4,9/11/2014 8:05:00,40.7515,-73.9764,B02512


In [None]:
# Statistiques basiques
print("Nombre de lignes : {}".format(df_sep14.shape[0]))
print()

print("Aperçu du dataset : ")
display(df_sep14.head())
print()

print("Statistiques basiques : ")
data_desc = df_sep14.describe(include='all')
display(data_desc)
print()

print("Pourcentage de valeurs manquantes : ")
display(100*df_sep14.isnull().sum()/df_sep14.shape[0])

In [None]:
df_sep14['Date/Time']=pd.to_datetime(df_sep14['Date/Time'])
df_sep14['Day']= df_sep14['Date/Time'].dt.day
df_sep14['DayOfTheWeek']=df_sep14['Date/Time'].dt.day_name()
df_sep14['Month']=df_sep14['Date/Time'].dt.month
df_sep14['Year']=df_sep14['Date/Time'].dt.year
df_sep14['Hour']=df_sep14['Date/Time'].dt.hour

In [None]:
df_sep14.head()

In [None]:
fig = px.scatter_mapbox(
        df_sep14, 
        lat="Lat", 
        lon="Lon",
        mapbox_style="carto-positron",
)

fig.show()

In [None]:
# On sélectionne Un jour 1 heure 
df_focus = (df_sep14[(df_sep14['DayOfTheWeek'] =='Saturday') & (df_sep14['Hour'] ==18)]).reset_index(drop=True)
df_focus.head()

In [None]:
fig = px.scatter_mapbox(
        df_focus, 
        lat="Lat", 
        lon="Lon",
        mapbox_style="carto-positron",
)

fig.show()

In [None]:
#on ne garde que les 2 colonneq qui sont pertinentes
df_reducted=df_focus[['Lat','Lon']].copy()
df_reducted.head()

In [None]:
#pre processing

numeric_features = [0,1] # Positions des colonnes quantitatives dans df
scaler=StandardScaler()# pour normaliser les variables

# Création du pipeline pour les variables catégorielles
#categorical_features = [3] # Positions des colonnes catégorielles dans X
#encoder=OneHotEncoder(drop='first') # on encode les catégories sous forme de colonnes comportant des 0 et des 1

# On combine les pipelines dans un ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, numeric_features)
 #       ('cat', encoder, categorical_features)
    ])

# Preprocessings sur le dataset
print("Preprocessing sur le dataset...")
print(df_reducted.head())
X = preprocessor.fit_transform(df_reducted) # fit_transform !!
print('...Terminé.')
print(X[0:5, :])
print()

In [None]:
#essai de l'alog KMeans
wcss =  []
for i in range (2,21): 
    kmeans = KMeans(n_clusters= i)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)


In [None]:
fig = px.line(x = range(2,21), y = wcss)
fig.show()

In [None]:
# Utilisation du silhouette_score pour déterminer le nombre optimal de clusters
s_score = []
for i in range (2,21): 
    kmeans = KMeans(n_clusters= i)
    kmeans.fit(X)
    s_score.append(silhouette_score(X, kmeans.predict(X)))

print(s_score)

In [None]:
# Affichage de scores en fonction du nombre de clusters
fig = px.bar(x = range(2,21), y = s_score)
fig.show()

In [None]:
# On ré-entraîne un KMeans avec le nombre optimal de clusters
kmeans = KMeans(n_clusters= 9)
kmeans.fit(X)

In [None]:
df2_kmeans=df_reducted.copy()
df2_kmeans.loc[:,'Cluster_KMeans'] = kmeans.predict(X)
df2_kmeans.head()

In [None]:
# Visualisation bi-dimensionnelle
fig = px.scatter(df2_kmeans, x = 'Lat', y = "Lon", color = "Cluster_KMeans")
fig.show()

In [None]:
# Visualisation bi-dimensionnelle
fig = px.scatter_mapbox(
        df2_kmeans, 
        lat="Lat", 
        lon="Lon",
        color="Cluster_KMeans",
        mapbox_style="carto-positron",
)

fig.show()

In [None]:
#Nous allons essayer Kmeans avec le dataset comprenant les colonnes jour et heure
df2= df_sep14[['Lat', 'Lon','DayOfTheWeek', 'Hour']]

In [None]:
df2.head()

In [None]:
#pre processing

numeric_features = [0,1,3] # Positions des colonnes quantitatives dans df
scaler=StandardScaler()# pour normaliser les variables

# Création du pipeline pour les variables catégorielles
categorical_features = [2] # Positions des colonnes catégorielles dans X
encoder=OneHotEncoder(drop='first') # on encode les catégories sous forme de colonnes comportant des 0 et des 1

# On combine les pipelines dans un ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, numeric_features),
        ('cat', encoder, categorical_features)
    ])

# Preprocessings sur le dataset
print("Preprocessing sur le dataset...")
print(df2.head())
X2 = preprocessor.fit_transform(df2) # fit_transform !!
print('...Terminé.')
print(X[0:5, :])
print()

In [None]:
#essai de l'algorithme KMeans
wcss2 =  []
for i in range (2,21): 
    kmeans2 = KMeans(n_clusters= i)
    kmeans2.fit(X2)
    wcss2.append(kmeans2.inertia_)


In [None]:
fig = px.line(x = range(2,21), y = wcss2)
fig.show()

In [None]:
# Utilisation du silhouette_score pour déterminer le nombre optimal de clusters
s_score2 = []
for i in range (2,21): 
    kmeans2 = KMeans(n_clusters= i)
    kmeans2.fit(X2)
    s_score2.append(silhouette_score(X2, kmeans2.predict(X2)))

print(s_score2)

In [None]:
# Affichage de scores en fonction du nombre de clusters
fig = px.bar(x = range(2,21), y = s_score2)
fig.show()

In [None]:
# On ré-entraîne un KMeans avec le nombre optimal de clusters
kmeans2 = KMeans(n_clusters= 12)
kmeans2.fit(X2)

In [None]:
df2_kmeans2=df2.copy().sort_values(['Hour'], ascending=True)
df2_kmeans2.loc[:,'Cluster_KMeans'] = kmeans2.predict(X2)
df2_kmeans2.head()

In [None]:
# Visualisation bi-dimensionnelle
fig = px.scatter_mapbox(
        df2_kmeans2, 
        lat="Lat", 
        lon="Lon",
        color="Cluster_KMeans",
        animation_frame="Hour",
        mapbox_style="carto-positron",
)

fig.show()

In [None]:
#Nous essayons DBScan sur X qui correspond a df_reducted, sample avec 1 jour et 1h précise
#db = DBSCAN(eps=0.085,min_samples=5, metric="euclidean")
db = DBSCAN(eps=0.085,min_samples=8, metric="euclidean")
db_pred=db.fit_predict(X)

In [None]:
np.unique(db.labels_)

In [None]:
df_dbScan1=df_reducted.copy()

In [None]:
df_dbScan1["cluster_DB"] = db.labels_
df_dbScan1.head()

In [None]:
df_dbScan1['cluster_DB'].value_counts()

In [None]:
fig = px.scatter_mapbox(
        df_dbScan1[df_dbScan1["cluster_DB"]!=-1], 
        lat="Lat", 
        lon="Lon",
        color="cluster_DB",
        mapbox_style="carto-positron",
)

fig.show()

In [None]:
df2.head()

In [None]:
df3=df2.copy()

In [None]:
df3_2features=df3[["Lat","Lon"]]

In [None]:

#PreProcessing sur uniquement ces 2 colonnes

numeric_features = [0,1] # Positions des colonnes quantitatives dans df
scaler=StandardScaler()# pour normaliser les variables

# Création du pipeline pour les variables catégorielles
#categorical_features = [3] # Positions des colonnes catégorielles dans X
#encoder=OneHotEncoder(drop='first') # on encode les catégories sous forme de colonnes comportant des 0 et des 1

# On combine les pipelines dans un ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, numeric_features)
 #       ('cat', encoder, categorical_features)
    ])

# Preprocessings sur le dataset
print("Preprocessing sur le dataset...")
print(df_reducted.head())
X_2f = preprocessor.fit_transform(df3_2features) # fit_transform !!
print('...Terminé.')
print(X[0:5, :])
print()

In [None]:
df3=df3.sort_values(['Hour'], ascending=True)

In [None]:
	#Utilisation de la classe NearestNeighbors qui permet de déterminer les voisins les plus proches de chaque observation ainsi que les distances
%matplotlib inline
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import numpy as np

neigh = NearestNeighbors(n_neighbors=2)
nbrs = neigh.fit(X_2f)
distances, indices = nbrs.kneighbors(X_2f)
distances = np.sort(distances, axis=0)
distances = distances[:,1]
plt.plot(distances);

In [None]:
db= DBSCAN(eps=0.03,min_samples=20, metric="euclidean")
db_pred=db.fit_predict(X_2f)

In [None]:
np.unique(db.labels_)

In [None]:
df3["cluster_DB"] = db.labels_
df3.head()

In [None]:
df3['cluster_DB'].value_counts()

In [None]:
fig = px.scatter_mapbox(
        df3[df3["cluster_DB"]!=-1], 
        lat="Lat", 
        lon="Lon",
        color="cluster_DB",
        animation_frame="Hour",
        mapbox_style="carto-positron",
)

fig.show()

In [None]:
#Visualisation des clueters sur le lundi, par heures
fig = px.scatter_mapbox(
        df3[(df3["DayOfTheWeek"]=="Monday") & (df3["cluster_DB"]!=-1)],
        lat="Lat", 
        lon="Lon",
        color="cluster_DB",
        mapbox_style="carto-positron",
        animation_frame="Hour"
)

fig.show()

In [None]:
	#Utilisation de la classe NearestNeighbors qui permet de déterminer les voisins les plus proches de chaque observation ainsi que les distances
%matplotlib inline
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import numpy as np

neigh = NearestNeighbors(n_neighbors=2)
nbrs = neigh.fit(X2)
distances, indices = nbrs.kneighbors(X2)
distances = np.sort(distances, axis=0)
distances = distances[:,1]
plt.plot(distances);

In [None]:
#On reprend df2 qui comprend tous les jours et heures de la semaine, avec que les colonnes qui nous interessent
#On reprend également le X2 qui est déja passé par le pré processing
print("Nous faisons appel de nouveau à DbScan mais sur X2 qui représente les données Longitude, latitude, jour de la semaine et heure apres pré processing")


In [None]:
df2=df2.sort_values(['Hour'], ascending=True)

In [None]:
#db= DBSCAN(eps=0.085,min_samples=8, metric="euclidean")
db= DBSCAN(eps=0.6,min_samples=4, metric="euclidean")
db_pred=db.fit_predict(X2)

In [None]:
np.unique(db.labels_)

In [None]:
df_dbScan2=df2.copy()

In [None]:
df_dbScan2["cluster_DB"] = db.labels_
df_dbScan2.head()

In [None]:
df_dbScan2['cluster_DB'].value_counts()

In [None]:
fig = px.scatter_mapbox(
        df_dbScan2, 
        lat="Lat", 
        lon="Lon",
        color="cluster_DB",
        animation_frame="Hour",
        mapbox_style="carto-positron",
)

fig.show()

In [None]:
#Visualisation des clueters sur le lundi, par heures
fig = px.scatter_mapbox(
        df_dbScan2[(df_dbScan2["DayOfTheWeek"]=="Monday") & (df_dbScan2["cluster_DB"]!=-1)],
        lat="Lat", 
        lon="Lon",
        color="cluster_DB",
        mapbox_style="carto-positron",
        animation_frame="Hour"
)

fig.show()

In [None]:
#Visualisation des clueters sur le mardi, par heures
fig = px.scatter_mapbox(
        df_dbScan2[(df_dbScan2["DayOfTheWeek"]=="Tuesday") & (df_dbScan2["cluster_DB"]!=-1)],
        lat="Lat", 
        lon="Lon",
        color="cluster_DB",
        mapbox_style="carto-positron",
        animation_frame="Hour"
)

fig.show()

In [None]:
#Visualisation des clueters sur le mercredi, par heures
fig = px.scatter_mapbox(
        df_dbScan2[(df_dbScan2["DayOfTheWeek"]=="Wednesday") & (df_dbScan2["cluster_DB"]!=-1)],
        lat="Lat", 
        lon="Lon",
        color="cluster_DB",
        mapbox_style="carto-positron",
        animation_frame="Hour"
)

fig.show()

In [None]:
#Visualisation des clueters sur le jeudi, par heures
fig = px.scatter_mapbox(
        df_dbScan2[(df_dbScan2["DayOfTheWeek"]=="Thursday") & (df_dbScan2["cluster_DB"]!=-1)],
        lat="Lat", 
        lon="Lon",
        color="cluster_DB",
        mapbox_style="carto-positron",
        animation_frame="Hour"
)

fig.show()

In [None]:
#Visualisation des clueters sur le mercredi, par heures
fig = px.scatter_mapbox(
        df_dbScan2[(df_dbScan2["DayOfTheWeek"]=="Friday") & (df_dbScan2["cluster_DB"]!=-1)],
        lat="Lat", 
        lon="Lon",
        color="cluster_DB",
        mapbox_style="carto-positron",
        animation_frame="Hour"
)

fig.show()

In [None]:
#Visualisation des clueters sur le mercredi, par heures
fig = px.scatter_mapbox(
        df_dbScan2[(df_dbScan2["DayOfTheWeek"]=="Saturday") & (df_dbScan2["cluster_DB"]!=-1)],
        lat="Lat", 
        lon="Lon",
        color="cluster_DB",
        mapbox_style="carto-positron",
        animation_frame="Hour"
)

fig.show()

In [None]:
#Visualisation des clueters sur le mercredi, par heures
fig = px.scatter_mapbox(
        df_dbScan2[(df_dbScan2["DayOfTheWeek"]=="Sunday") & (df_dbScan2["cluster_DB"]!=-1)],
        lat="Lat", 
        lon="Lon",
        color="cluster_DB",
        mapbox_style="carto-positron",
        animation_frame="Hour"
)

fig.show()

In [None]:
#convert gps coordinates from degree to radian to use haversine metric
df3_2features_rad=df3_2features.apply(lambda x: x * np.pi / 180)

In [None]:

#PreProcessing sur uniquement ces 2 colonnes
numeric_features = [0,1] # Positions des colonnes quantitatives dans df
scaler=StandardScaler()# pour normaliser les variables

# Création du pipeline pour les variables catégorielles
#categorical_features = [3] # Positions des colonnes catégorielles dans X
#encoder=OneHotEncoder(drop='first') # on encode les catégories sous forme de colonnes comportant des 0 et des 1

# On combine les pipelines dans un ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, numeric_features)
 #       ('cat', encoder, categorical_features)
    ])

# Preprocessings sur le dataset
print("Preprocessing sur le dataset...")
print(df_reducted.head())
X_2fR = preprocessor.fit_transform(df3_2features_rad) # fit_transform !!
print('...Terminé.')
print(X[0:5, :])
print()

In [None]:
len(df_reducted)

In [None]:
from sklearn.metrics.pairwise import haversine_distances
from math import radians

In [None]:
# Compute DBSCAN
kms_per_radian = 6371.0088
epsilon = 60 / kms_per_radian
db = DBSCAN(eps=epsilon, min_samples=4, algorithm='brute', metric='haversine').fit(df3_2features_rad)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_


In [None]:
np.unique(db.labels_)

In [None]:
df_dbScan3R=df2.copy()

In [None]:
df_dbScan3R["cluster_DB"] = db.labels_
df_dbScan3R.head()

In [None]:
df_dbScan3R['cluster_DB'].value_counts()

In [None]:
#Visualisation des clueters sur le mercredi, par heures
fig = px.scatter_mapbox(
        df_dbScan3R[(df_dbScan3R["DayOfTheWeek"]=="Sunday") & (df_dbScan3R["cluster_DB"]!=-1)],
        lat="Lat", 
        lon="Lon",
        color="cluster_DB",
        mapbox_style="carto-positron",
        animation_frame="Hour"
)

fig.show()