In [None]:
import os

import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans

# Creating Dataframes

In [None]:
# Directory that holds the monthly Uber CSV files. Set the UBER_DATA_DIR
# environment variable to point at your own copy of the dataset; the original
# workshop location stays as the fallback so existing setups keep working.
# os.path.join(..., '') guarantees a trailing separator, which matters because
# later cells build file paths by plain string concatenation (folder + name).
folder = os.path.join(
    os.environ.get('UBER_DATA_DIR', 'E:/Desktop/Uber kmeans workshop/Dataset/'),
    '',
)

In [None]:
# Read the six monthly CSV files from `folder`, one DataFrame per month.
# The file names are listed in the same order as the names they are bound to.
monthly_files = (
    'uber-raw-data-apr14.csv',
    'uber-raw-data-may14.csv',
    'uber-raw-data-jun14.csv',
    'uber-raw-data-jul14.csv',
    'uber-raw-data-aug14.csv',
    'uber-raw-data-sep14.csv',
)
apr14, may14, jun14, jul14, aug14, sep14 = [
    pd.read_csv(folder + file_name) for file_name in monthly_files
]

In [None]:
# Stack the six monthly frames into one table.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each
# monthly frame contributes its own index labels, so labels repeat across
# months and label-based lookups (e.g. df.loc[0]) match several rows.
df = pd.concat([apr14, may14, jun14, jul14, aug14, sep14], ignore_index=True)

In [None]:
# Inspect the combined frame produced by pd.concat.
df

In [None]:
## Now we will pre-process the "Date/Time" feature

In [None]:
# Convert the "Date/Time" column to pandas datetime values so that the .dt
# accessor can be used on it in the following cells.
# NOTE(review): no explicit format= is passed, so pandas has to infer the
# layout; supplying the format would likely be faster — confirm the timestamp
# layout in the CSV files before adding it.
df['Date/Time'] = pd.to_datetime(df['Date/Time'])

In [None]:
# Inspect the frame after the "Date/Time" conversion.
df

# Histogram

In [None]:
# Preview the time-of-day part of every timestamp (the date part is dropped).
df['Date/Time'].dt.time

In [None]:
# Encode the time of day as an integer of the form HHMMSS (15:30:00 -> 153000)
# so rides can be filtered and binned by time with plain numeric comparisons.
# The value is built from the vectorized .dt accessors; this yields the same
# numbers as formatting every row with strftime('%H%M%S') and casting to int,
# without running a Python-level call per row.
timestamps = df['Date/Time']
df['Time'] = (
    timestamps.dt.hour * 10000
    + timestamps.dt.minute * 100
    + timestamps.dt.second
)

In [None]:
# Inspect the frame after the "Time" column has been added.
df

In [None]:
# Histogram of ride times with matplotlib. "Time" is the HHMMSS integer code,
# so the x axis runs from 0 (midnight) up to 235959.
# The explicit fig/ax interface is used so the figure carries its own labels,
# and the trailing semicolon hides the return-value repr in the notebook.
fig, ax = plt.subplots()
ax.hist(df['Time'])
ax.set(
    xlabel='Time of day (HHMMSS code)',
    ylabel='Number of rides',
    title='Rides by time of day',
);

In [None]:
# Same distribution drawn with seaborn; histplot returns the Axes it drew on,
# which is used to label the figure.
ax = sns.histplot(df['Time'])
ax.set(
    xlabel='Time of day (HHMMSS code)',
    ylabel='Number of rides',
    title='Rides by time of day',
);
# Observation: the largest number of rides takes place between about 3 pm and
# 9 pm (150000-210000 on the x axis).

# Now, let's split our dataset into morning and evening rides

In [None]:
# Morning rides: time of day strictly between 05:00:00 and 11:00:00.
# "Time" holds HHMMSS integer codes, so 5 am is 50000 — a bound of 5000 would
# mean 00:50:00 and would pull most of the night into the morning subset.
morning_df_index = (df['Time'] > 50000) & (df['Time'] < 110000)
morning_df = df[morning_df_index]
print(morning_df)

In [None]:
# Evening rides: HHMMSS time code strictly between 150000 (3 pm) and
# 220000 (10 pm).
ride_time = df['Time']
evening_df_index = ride_time.gt(150000) & ride_time.lt(220000)
evening_df = df.loc[evening_df_index]
print(evening_df)

In [None]:
# Collect pickup coordinates for the morning rides.
# A seeded random sample of 10000 rows is taken and its Lat/Lon columns are
# turned into a single (10000, 2) array of [latitude, longitude] pairs.
morning_sample = morning_df[['Lat', 'Lon']].sample(n=10000, random_state=10)
morning_coords = morning_sample.to_numpy()
print(morning_coords)

In [None]:
# Same seeded 10000-row sample for the evening rides, as an array of
# [latitude, longitude] pairs.
evening_sample = evening_df[['Lat', 'Lon']].sample(n=10000, random_state=10)
evening_coords = evening_sample.to_numpy()
print(evening_coords)

# Plotting morning and evening rides on map

In [None]:
# Base map for the sampled morning pickups, one small circle per ride.
# 'CartoDB positron' is used as the tileset because the Stamen tile names are
# not built into current folium releases (they are rejected there as custom
# tiles without attribution); this tileset is built in for old and new
# releases alike.
morning_map = folium.Map([40.7709, -73.949], zoom_start=11, tiles='CartoDB positron')
for pickup in morning_coords:
    folium.CircleMarker(location=pickup, fill=True, radius=1).add_to(morning_map)
morning_map

In [None]:
# Base map for the sampled evening pickups, drawn as small red circles.
# 'CartoDB positron' is used as the tileset because the Stamen tile names are
# not built into current folium releases (they are rejected there as custom
# tiles without attribution); this tileset is built in for old and new
# releases alike.
evening_map = folium.Map([40.6975, -73.9967], zoom_start=11, tiles='CartoDB positron')
for pickup in evening_coords:
    folium.CircleMarker(location=pickup, fill=True, radius=1, color='red').add_to(evening_map)
evening_map

# Now, let's find clusters (hotspot areas) in the morning rides, the evening rides and the whole dataset


# 1) Morning ride clusters

In [None]:
# Number of clusters for the morning rides. Ideally this is chosen with the
# elbow method, which this notebook only runs further down for the whole dataset.
clusters = 6
# random_state pins the random centroid initialisation so that re-running the
# notebook gives the same clusters; n_init is spelled out because its default
# differs between scikit-learn releases.
model = KMeans(
    n_clusters=clusters,
    init='random',
    max_iter=300,
    n_init=10,
    random_state=10,
)

In [None]:
# Fit K-Means on the latitude/longitude of every morning ride.
morning_points = morning_df[['Lat', 'Lon']]
model.fit(morning_points)

In [None]:
# Cluster centres of the model fitted on the morning rides (features are Lat,
# Lon in that order). They are captured under their own name because `model`
# is rebound to a new KMeans instance in the evening section.
morning_centroids = model.cluster_centers_
morning_centroids

In [None]:
# Pin every morning cluster centre on the morning map, numbered from 1.
for number, centre in enumerate(morning_centroids, start=1):
    centre_marker = folium.Marker(location=centre, popup='centroid {}'.format(number))
    centre_marker.add_to(morning_map)
morning_map

# 2) Evening ride clusters

In [None]:
# K-Means model for the evening rides, using the same number of clusters as
# the morning model.
clusters = 6
# random_state pins the random centroid initialisation so that re-running the
# notebook gives the same clusters; n_init is spelled out because its default
# differs between scikit-learn releases.
model = KMeans(
    n_clusters=clusters,
    init='random',
    max_iter=300,
    n_init=10,
    random_state=10,
)

In [None]:
# Fit K-Means on the latitude/longitude of every evening ride.
evening_points = evening_df[['Lat', 'Lon']]
model.fit(evening_points)

In [None]:
# Cluster centres of the model fitted on the evening rides (features are Lat,
# Lon in that order). They are captured under their own name because `model`
# is rebound again in the whole-dataset section.
evening_centroids = model.cluster_centers_
evening_centroids

In [None]:
# Pin every evening cluster centre on the evening map, numbered from 1.
for number, centre in enumerate(evening_centroids, start=1):
    centre_marker = folium.Marker(location=centre, popup='centroid {}'.format(number))
    centre_marker.add_to(evening_map)
evening_map

# 3) Clusters for whole dataset 
#        Elbow method 

In [None]:
# Elbow method on the whole dataset: fit K-Means for k = 1..10 and record the
# within-cluster sum of squares (WCSS), which scikit-learn exposes as inertia_.
# wcss[0] belongs to k = 1, wcss[1] to k = 2, and so on.
all_points = df[['Lat', 'Lon']]   # same features for every k, so select them once
wcss = []
for k in range(1, 11):
    # Fixed seed and explicit n_init so the curve is identical on every run and
    # does not depend on the scikit-learn release's default n_init.
    model = KMeans(n_clusters=k, n_init=10, random_state=10)
    model.fit(all_points)
    wcss.append(model.inertia_)
    print(wcss[-1])

In [None]:
# Plot WCSS against the number of clusters. The x values are passed explicitly
# because wcss[0] belongs to k = 1: plotting the bare list would label the
# axis 0..9 and shift every reading of the elbow by one.
cluster_counts = range(1, len(wcss) + 1)
plt.plot(cluster_counts, wcss, marker='o')
plt.xlabel('Number of clusters (k)')
plt.ylabel('WCSS')
plt.title('Elbow method');
# We could see that 8 could be the no. of clusters. So, take 8 as nclusters.
# NOTE(review): confirm this reading against the k-labelled axis.

In [None]:
# Final model for the whole dataset with the cluster count picked from the
# elbow plot.
cluster = 8
# Fixed seed and explicit n_init so the centroids are reproducible between
# runs and independent of the scikit-learn release's default n_init.
model = KMeans(n_clusters=cluster, n_init=10, random_state=10)
model.fit(df[['Lat', 'Lon']])
# WCSS of the fitted model, shown as the cell output.
model.inertia_

In [None]:
# Cluster centres of the whole-dataset model (features are Lat, Lon in that
# order); later cells index into this array with the predicted cluster id.
centroids = model.cluster_centers_
centroids

In [None]:
# Map of the whole-dataset cluster centres, numbered from 1.
# 'CartoDB positron' is used as the tileset because the Stamen tile names are
# not built into current folium releases (they are rejected there as custom
# tiles without attribution); this tileset is built in for old and new
# releases alike.
# NOTE(review): the name `map` shadows the Python builtin. It is kept because
# later cells add markers to this object under that name.
map = folium.Map(location=[40.79658011772687, -73.87341741832425], zoom_start=10, tiles='CartoDB positron')
for i, coordinate in enumerate(centroids):
    folium.Marker(coordinate, popup='Centroid {}'.format(i+1)).add_to(map)
map

# Let's Predict for the new ride.

In [None]:
# Suppose a new ride request comes in from this (latitude, longitude).
new_ride = (40.70647056912189, -73.91116590442799)
# Show the rider on the centroid map with a green pin.
rider_icon = folium.Icon(color='green')
rider_marker = folium.Marker(location=new_ride, popup='New Rider', icon=rider_icon)
rider_marker.add_to(map)
map

In [None]:
# Predict which hotspot (cluster) the new ride belongs to, i.e. which
# hotspot's cab would be assigned to it.
# The query point is wrapped in a DataFrame with the same column names the
# model was fitted on, so scikit-learn does not warn about missing feature
# names. The result is a one-element array holding the cluster index.
new_ride_features = pd.DataFrame([new_ride], columns=['Lat', 'Lon'])
centroid_idx = model.predict(new_ride_features)
centroid_idx

In [None]:
# Coordinates of the predicted cluster's centre; indexing with the
# one-element index array returns a (1, 2) array.
centroids[centroid_idx]

In [None]:
# Highlight the centre of the predicted cluster with a red pin.
assigned_centroid = centroids[centroid_idx][0]
assigned_icon = folium.Icon(color='red')
folium.Marker(location=assigned_centroid, icon=assigned_icon).add_to(map)
map