In [1]:
import pandas as pd
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "notebook"
from sklearn.cluster import KMeans, MiniBatchKMeans, DBSCAN
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import numpy as np

pio.renderers.default = "notebook"


In [2]:
# Constants
# CSV file of raw pickups loaded into `df` by the next cell
INPUT_FILE_NAME = "uber-raw-data-apr14.csv"
# Destination of the aggregated cluster table written at the end of the notebook
OUTPUT_FILE_NAME = "output/uber-clusters2.csv"
# Three hex colours (blue, red, green); same values as the colour scale passed to the map plot
MAP_COLOR_SCALE = ['#323aa8','#db1616','#26ab26']

In [3]:
# Load the raw pickups; later cells read the "Date/Time", "Lat", "Lon" and "Base" columns from this frame
df = pd.read_csv(INPUT_FILE_NAME)

In [4]:
# Derive the day of week and the hour from the timestamp column, then keep
# only the coordinates (renamed to lower case) and the two derived features.
date_col_name = "Date/Time"
timestamps = pd.to_datetime(df[date_col_name])

df = (
    df.assign(dayofweek=timestamps.dt.day_of_week, hour=timestamps.dt.hour)
      .drop(columns=[date_col_name, 'Base'])
      .rename(columns={'Lat': 'lat', 'Lon': 'lon'})
)

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer

# Build the preprocessing step and fit it once on the full dataset; it is
# reused later to transform each slice before clustering.

# Coordinates are standardised; the two time features are one-hot encoded
# (first level of each dropped).
numerical_features = ['lat', 'lon']
categorical_features = ['dayofweek', 'hour']

numerical_transformer = Pipeline([('scaler', StandardScaler())])
categorical_transformer = Pipeline([('encoder', OneHotEncoder(drop='first'))])

preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

preprocessor.fit(df)

In [6]:
# Generic helper to fit a KMeans, MiniBatchKMeans or DBSCAN model.
# It applies the shared preprocessing and returns the data with the cluster labels as a new column.
def fit_cluster(X_hour:pd.DataFrame, model='kmeans', n_clusters=5, eps=0.2, min_samples=100):
  """Cluster the rows of ``X_hour`` and return them with a ``clusters`` column.

  Parameters
  ----------
  X_hour : pd.DataFrame
      Rows to cluster. They are transformed with the module-level
      ``preprocessor``, which must already be fitted.
  model : str
      ``'kmeans'`` (KMeans), ``'kmeansfast'`` (MiniBatchKMeans) or
      ``'dbscan'`` (DBSCAN).
  n_clusters : int
      Number of clusters for the two k-means variants; unused for DBSCAN.
  eps, min_samples
      Passed to DBSCAN; unused for the k-means variants.

  Returns
  -------
  pd.DataFrame
      A new frame holding the columns of ``X_hour`` plus ``clusters`` (the
      fitted model's ``labels_``). The input frame is not modified.

  Raises
  ------
  ValueError
      If ``model`` is not one of the supported names.
  """
  # Build the estimator first so that an unsupported model name fails
  # immediately, before any preprocessing work is done.
  if model == 'kmeansfast':
    cluster = MiniBatchKMeans(n_clusters=n_clusters, random_state=0)
  elif model == 'kmeans':
    cluster = KMeans(n_clusters=n_clusters, random_state=0)
  elif model == 'dbscan':
    cluster = DBSCAN(eps=eps, min_samples=min_samples, n_jobs=-1)
  else:
    raise ValueError("model need values: kmeans, kmeansfast, dbscan")

  X_hour_scaled = preprocessor.transform(X_hour)  # scale / encode the features
  cluster.fit(X_hour_scaled)                      # fit the clustering model

  # assign() returns a new frame, so the caller's DataFrame is left untouched.
  return X_hour.assign(clusters=cluster.labels_)

In [26]:
# Split the dataset by (day of week, hour) -- at most 7 x 24 = 168 slices --
# and run DBSCAN independently on each slice.
DBSCAN_EPS = 0.3            # neighbourhood radius, in preprocessed feature units
MIN_SAMPLES_RATIO = 0.009   # DBSCAN min_samples expressed as a fraction of the slice size

clustered_slices = []
# groupby yields the slices in ascending (dayofweek, hour) order and simply
# skips combinations that have no rows, so no empty frame reaches the model.
for (d, h), small_df in df.groupby(['dayofweek', 'hour']):
  small_df = small_df.reset_index(drop=True)

  size = small_df.shape[0]
  # int() truncates to 0 for small slices; DBSCAN needs min_samples >= 1.
  min_samples = max(1, int(size * MIN_SAMPLES_RATIO))
  print(f'Day {d}, Hour {h} : ', size, f', min_samples: {min_samples}')

  # slice with its cluster labels
  clustered_slices.append(
    fit_cluster(small_df.copy(), model='dbscan', eps=DBSCAN_EPS, min_samples=min_samples)
  )

# Concatenate once at the end instead of growing the frame inside the loop
# (repeated concat re-copies all previous rows at every iteration).
if clustered_slices:
  new_df = pd.concat(clustered_slices)
else:
  # No rows at all: keep the same columns plus an empty 'clusters' column.
  new_df = df.drop(df.index)
  new_df['clusters'] = None

Day 0, Hour 0 :  518 , min_samples: 4
Day 0, Hour 1 :  261 , min_samples: 2
Day 0, Hour 2 :  238 , min_samples: 2
Day 0, Hour 3 :  571 , min_samples: 5
Day 0, Hour 4 :  1021 , min_samples: 9
Day 0, Hour 5 :  1619 , min_samples: 14
Day 0, Hour 6 :  2974 , min_samples: 26
Day 0, Hour 7 :  3888 , min_samples: 34
Day 0, Hour 8 :  3138 , min_samples: 28
Day 0, Hour 9 :  2211 , min_samples: 19
Day 0, Hour 10 :  1953 , min_samples: 17
Day 0, Hour 11 :  1929 , min_samples: 17
Day 0, Hour 12 :  1945 , min_samples: 17
Day 0, Hour 13 :  2294 , min_samples: 20
Day 0, Hour 14 :  3117 , min_samples: 28
Day 0, Hour 15 :  3818 , min_samples: 34
Day 0, Hour 16 :  4962 , min_samples: 44
Day 0, Hour 17 :  5574 , min_samples: 50
Day 0, Hour 18 :  4725 , min_samples: 42
Day 0, Hour 19 :  4386 , min_samples: 39
Day 0, Hour 20 :  3573 , min_samples: 32
Day 0, Hour 21 :  3079 , min_samples: 27
Day 0, Hour 22 :  1976 , min_samples: 17
Day 0, Hour 23 :  1091 , min_samples: 9
Day 1, Hour 0 :  765 , min_samples: 

In [8]:
# Sanity check: (rows, columns) of the concatenated, cluster-labelled frame
new_df.shape

(564516, 5)

In [39]:
# Keep only the points assigned to a cluster (DBSCAN labels noise as -1).
# .copy() makes the result an independent frame, so later column assignments
# on new_df do not act on a slice of the unfiltered data.
new_df = new_df[new_df['clusters'] > -1].copy()

# Map of a single (day of week, hour) slice. dayofweek comes from pandas'
# dt.day_of_week (Monday=0), so 2 is Wednesday; the hour is 4 (04:00-04:59).
plot_day, plot_hour = 2, 4
fig = px.scatter_mapbox(
  new_df[(new_df['dayofweek'] == plot_day) & (new_df['hour'] == plot_hour)], title='',
  lat="lat", lon="lon", color='clusters', mapbox_style="carto-positron",
  color_continuous_scale=MAP_COLOR_SCALE,  # shared constant instead of a duplicated literal
  zoom=10, height=800)
fig.show()

In [17]:
# Inspect the rows of cluster 1 for day-of-week 2 at hour 10
slot_mask = (new_df['dayofweek'] == 2) & (new_df['hour'] == 10)
new_df.loc[slot_mask & (new_df['clusters'] == 1)]

Unnamed: 0,lat,lon,dayofweek,hour,clusters
13,40.7708,-73.8656,2,10,1
16,40.7704,-73.8652,2,10,1
57,40.7710,-73.8657,2,10,1
71,40.7710,-73.8660,2,10,1
116,40.7740,-73.8724,2,10,1
...,...,...,...,...,...
3624,40.7708,-73.8657,2,10,1
3674,40.7731,-73.8856,2,10,1
3777,40.7741,-73.8731,2,10,1
3796,40.7709,-73.8658,2,10,1


In [35]:
# Round the coordinates to 3 decimals: precise enough for the aggregation,
# whereas 4 is finer than needed and 2 is too coarse.
for coord in ('lat', 'lon'):
    new_df[coord] = new_df[coord].round(3)

# Helper column of ones: counting it per group gives the number of rides.
new_df['nbcourses'] = 1

# One row per (lat, lon, dayofweek, hour, clusters); nbcourses is the only
# non-key column, so it alone carries the count.
group_keys = ['lat', 'lon', 'dayofweek', 'hour', 'clusters']
ride_counts = new_df.groupby(group_keys).count()
last_df = ride_counts.sort_values('hour').reset_index()

In [36]:
# Display the aggregated frame: one row per (lat, lon, dayofweek, hour, clusters) group with its count in nbcourses
last_df

Unnamed: 0,lat,lon,dayofweek,hour,clusters,nbcourses
0,40.610,-73.962,0,0,5,1
1,40.731,-73.978,2,0,0,1
2,40.731,-73.978,5,0,0,1
3,40.731,-73.978,6,0,0,1
4,40.731,-73.976,6,0,0,1
...,...,...,...,...,...,...
244072,40.758,-73.964,5,23,0,3
244073,40.779,-73.961,5,23,0,1
244074,40.738,-74.009,2,23,0,4
244075,40.712,-73.962,4,23,0,1


In [37]:
# Write the aggregated table to disk. to_csv does not create missing folders,
# so make sure the destination directory exists first.
from pathlib import Path

Path(OUTPUT_FILE_NAME).parent.mkdir(parents=True, exist_ok=True)
last_df.to_csv(OUTPUT_FILE_NAME)