## PROJET UBER PICKUPS

In [1]:
#Install plotly
!pip install plotly

Collecting plotly
  Using cached plotly-5.3.1-py2.py3-none-any.whl (23.9 MB)
Collecting tenacity>=6.2.0
  Using cached tenacity-8.0.1-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.3.1 tenacity-8.0.1


# Import librairies

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px 

In [3]:
#Reading the selected dataset
dataset=pd.read_csv('uber-raw-data-jun14.csv')
dataset.head()

Unnamed: 0,Date/Time,Lat,Lon,Base
0,6/1/2014 0:00:00,40.7293,-73.992,B02512
1,6/1/2014 0:01:00,40.7131,-74.0097,B02512
2,6/1/2014 0:04:00,40.3461,-74.661,B02512
3,6/1/2014 0:04:00,40.7555,-73.9833,B02512
4,6/1/2014 0:07:00,40.688,-74.1831,B02512


In [4]:
# Shape of the dataset (rows=663834, columns=4)
dataset.shape

(663844, 4)

In [5]:
# Removal the "Base"column
dataset=dataset.drop('Base', axis=1)
dataset

Unnamed: 0,Date/Time,Lat,Lon
0,6/1/2014 0:00:00,40.7293,-73.9920
1,6/1/2014 0:01:00,40.7131,-74.0097
2,6/1/2014 0:04:00,40.3461,-74.6610
3,6/1/2014 0:04:00,40.7555,-73.9833
4,6/1/2014 0:07:00,40.6880,-74.1831
...,...,...,...
663839,6/30/2014 22:40:00,40.7332,-73.9872
663840,6/30/2014 23:12:00,40.7905,-73.9796
663841,6/30/2014 23:13:00,40.7640,-73.9887
663842,6/30/2014 23:15:00,40.7262,-73.9944


In [6]:
# Using only 10000 rows in order to protect my PC
data=dataset.sample(10000)
data

Unnamed: 0,Date/Time,Lat,Lon
40575,6/2/2014 9:44:00,40.7046,-74.0096
341437,6/12/2014 22:20:00,40.6815,-73.9324
626876,6/26/2014 12:10:00,40.7575,-73.9904
551057,6/13/2014 20:41:00,40.7513,-73.9834
558972,6/15/2014 6:23:00,40.6944,-73.9690
...,...,...,...
374151,6/18/2014 15:58:00,40.7814,-73.9802
475213,6/3/2014 18:01:00,40.7522,-73.9723
94385,6/8/2014 13:49:00,40.7716,-73.9821
207311,6/21/2014 21:08:00,40.7502,-73.9945


In [7]:
# Transform the Data / Time column to datetime in order to separate the temporal data
data['Date/Time']=pd.to_datetime(data.iloc[:,0])

In [8]:
# Create a "day" column by using "Date/Tile" column to retrieve number of the day
data["day"] = data['Date/Time'].dt.dayofweek

In [9]:
data

Unnamed: 0,Date/Time,Lat,Lon,day
40575,2014-06-02 09:44:00,40.7046,-74.0096,0
341437,2014-06-12 22:20:00,40.6815,-73.9324,3
626876,2014-06-26 12:10:00,40.7575,-73.9904,3
551057,2014-06-13 20:41:00,40.7513,-73.9834,4
558972,2014-06-15 06:23:00,40.6944,-73.9690,6
...,...,...,...,...
374151,2014-06-18 15:58:00,40.7814,-73.9802,2
475213,2014-06-03 18:01:00,40.7522,-73.9723,1
94385,2014-06-08 13:49:00,40.7716,-73.9821,6
207311,2014-06-21 21:08:00,40.7502,-73.9945,5


In [10]:
#Information about dataframe
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 40575 to 597501
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date/Time  10000 non-null  datetime64[ns]
 1   Lat        10000 non-null  float64       
 2   Lon        10000 non-null  float64       
 3   day        10000 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(1)
memory usage: 390.6 KB


In [11]:
#Removal the "Date/Time" column
data=data.drop(['Date/Time'],axis=1)

In [12]:
data

Unnamed: 0,Lat,Lon,day
40575,40.7046,-74.0096,0
341437,40.6815,-73.9324,3
626876,40.7575,-73.9904,3
551057,40.7513,-73.9834,4
558972,40.6944,-73.9690,6
...,...,...,...
374151,40.7814,-73.9802,2
475213,40.7522,-73.9723,1
94385,40.7716,-73.9821,6
207311,40.7502,-73.9945,5


In [13]:
from sklearn.cluster import MiniBatchKMeans
kmeans = MiniBatchKMeans(4)
kmeans.fit(data)

MiniBatchKMeans(n_clusters=4)

In [14]:
#Predictions
data.loc[:,'cluster'] = kmeans.predict(data)
data.head()

Unnamed: 0,Lat,Lon,day,cluster
40575,40.7046,-74.0096,0,3
341437,40.6815,-73.9324,3,2
626876,40.7575,-73.9904,3,2
551057,40.7513,-73.9834,4,2
558972,40.6944,-73.969,6,0


In [15]:
#Shape of the dataset "data" (rows=10000, columns=4)
data.shape

(10000, 4)

# Data Visualization

In [16]:
fig = px.scatter_mapbox(data, lat="Lat", lon="Lon", color="cluster", zoom=10, mapbox_style="carto-positron")
fig.show('iframe')

In [17]:
# Display the dataframe and the predicted results into "cluster" column 
data

Unnamed: 0,Lat,Lon,day,cluster
40575,40.7046,-74.0096,0,3
341437,40.6815,-73.9324,3,2
626876,40.7575,-73.9904,3,2
551057,40.7513,-73.9834,4,2
558972,40.6944,-73.9690,6,0
...,...,...,...,...
374151,40.7814,-73.9802,2,1
475213,40.7522,-73.9723,1,1
94385,40.7716,-73.9821,6,0
207311,40.7502,-73.9945,5,0


In [18]:
data1=dataset.sample(10000)
data1

Unnamed: 0,Date/Time,Lat,Lon
278079,6/1/2014 17:40:00,40.6415,-73.7884
556340,6/14/2014 18:52:00,40.7856,-73.9550
334235,6/12/2014 0:24:00,40.7579,-73.9698
293143,6/4/2014 20:54:00,40.7234,-74.0081
241832,6/26/2014 15:45:00,40.7400,-74.0072
...,...,...,...
599769,6/21/2014 19:39:00,40.6929,-73.9620
169709,6/17/2014 15:41:00,40.7499,-74.0035
289633,6/4/2014 9:37:00,40.7705,-73.9816
498365,6/6/2014 18:19:00,40.7803,-73.9873


In [19]:
# Creation of the "day" column to determine the day number
data1['Date/Time']=pd.to_datetime(data1.iloc[:,0])
data1["day"] = data1['Date/Time'].dt.dayofweek
data1

Unnamed: 0,Date/Time,Lat,Lon,day
278079,2014-06-01 17:40:00,40.6415,-73.7884,6
556340,2014-06-14 18:52:00,40.7856,-73.9550,5
334235,2014-06-12 00:24:00,40.7579,-73.9698,3
293143,2014-06-04 20:54:00,40.7234,-74.0081,2
241832,2014-06-26 15:45:00,40.7400,-74.0072,3
...,...,...,...,...
599769,2014-06-21 19:39:00,40.6929,-73.9620,5
169709,2014-06-17 15:41:00,40.7499,-74.0035,1
289633,2014-06-04 09:37:00,40.7705,-73.9816,2
498365,2014-06-06 18:19:00,40.7803,-73.9873,4


In [20]:
# Determine the unique values of the "day" column
data1["day"].unique()

array([6, 5, 3, 2, 0, 4, 1])

In [21]:
#Iteration on data1["day"] to determine the clusters through the prediction
for data in data1["day"].unique():

    dat = data1.loc[data1["day"]==data,["Lat","Lon"]]
    kmeans = MiniBatchKMeans(4)
    kmeans.fit(dat)
    
    dat = dat.sample(1000)
    
    dat.loc[:,'cluster'] = kmeans.predict(dat)
    dat.head()
    
    fig = px.scatter_mapbox(dat, lat="Lat", lon="Lon", color="cluster", zoom=10, mapbox_style="carto-positron")
    fig.show('iframe')

In [22]:
# Using DBSCAN to compare with Kmeans
from sklearn.cluster import DBSCAN

In [23]:
#with eps=0.05
for data in np.unique(data1["day"]):

    dat = data1.loc[data1["day"]==data,["Lat","Lon"]]
    dat = dat.sample(1000)
    dbscan = DBSCAN(eps=0.05, metric = "manhattan")
    dat.loc[:,'cluster'] = dbscan.fit_predict(dat)

    fig = px.scatter_mapbox(dat, lat="Lat", lon="Lon", color="cluster", zoom=10, mapbox_style="carto-positron")
    fig.show('iframe')

In [24]:
#with eps=0.025
for data in np.unique(data1["day"]):

    dat = data1.loc[data1["day"]==data,["Lat","Lon"]]
    dat = dat.sample(1000)
    dbscan = DBSCAN(eps=0.025, metric = "manhattan")
    dat.loc[:,'cluster'] = dbscan.fit_predict(dat)

    fig = px.scatter_mapbox(dat, lat="Lat", lon="Lon", color="cluster", zoom=10, mapbox_style="carto-positron")
    fig.show('iframe')

In [25]:
#with eps=0.01
for data in np.unique(data1["day"]):

    dat = data1.loc[data1["day"]==data,["Lat","Lon"]]
    dat = dat.sample(1000)
    dbscan = DBSCAN(eps=0.01, metric = "manhattan")
    dat.loc[:,'cluster'] = dbscan.fit_predict(dat)

    fig = px.scatter_mapbox(dat, lat="Lat", lon="Lon", color="cluster", zoom=10, mapbox_style="carto-positron")
    fig.show('iframe')