# Exercise: Geographical Cluster Analysis of Taxi Rides
Using the NY Taxi data set (see Use Case Block I) and the use case from the lecture...

In [1]:
import pandas as pd
import numpy as np
import folium


In [2]:
# we load the data we have saved after wrangling and pre-processing in block I
train=pd.read_csv('../../DATA/train_cleaned.csv')

In [3]:
#select only the culumns with the ride coordinates
coordinates = train[ ['pickup_latitude','pickup_longitude','dropoff_latitude' , 'dropoff_longitude' ] ]

## Clustering approach from the lecture
we will be using simple K-Means:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [4]:
from sklearn.cluster import KMeans

In [5]:
#define number of clusters and create instance
clusters=100
myKMeans=KMeans(n_clusters=clusters, n_jobs=-1)#parallelize to all cores

In [6]:
data=coordinates.to_numpy()[:100000,:]
data

array([[ 40.721319  , -73.844311  ,  40.712278  , -73.84161   ],
       [ 40.711303  , -74.016048  ,  40.782004  , -73.979268  ],
       [ 40.76127   , -73.982738  ,  40.750562  , -73.991242  ],
       ...,
       [ 40.750728  , -73.971607  ,  40.734096  , -73.978229  ],
       [ 40.745104  , -73.984853  ,  40.759575  , -73.971982  ],
       [ 40.78069687, -73.98019409,  40.77561569, -73.96037292]])

In [7]:
%%time
#train model
myKMeans.fit(data)#use only subset of the data to make it faster



Wall time: 46.3 s


KMeans(n_clusters=100, n_jobs=-1)

In [8]:
#get cluster centers
centers=myKMeans.cluster_centers_

In [9]:
#draw map: green: start, red: end
cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
for i in range(clusters):
    folium.CircleMarker([centers[i,0], centers[i,1]], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker([centers[i,2], centers[i,3]], radius=3,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.PolyLine([ [centers[i,0],centers[i,1]] , [centers[i,2],centers[i,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)

In [10]:
cluster_map

## Exercise 1
Write a function ```show_cluster(cluster_number,...)``` that draws the cluster centers and all start and end points of a given cluster in the map.
* use the ```predict()``` method to map all data in ```train_data``` to a cluster center
* use ```folium.CircleMarker``` to draw all members of a given cluster


In [11]:
def show_cluster(cluster_number,kmeans,coordinates):
    cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)   
    
    df_cluster_map = coordinates.assign(cluster = kmeans.labels_)      
    
    [setCircleMarkerAt(cluster_map,row[0],row[1],row[2],row[3]) for row in df_cluster_map[df_cluster_map.cluster == cluster_number].to_numpy()]
    
    return cluster_map
    
def setCircleMarkerAt(cluster_map,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude): 
    folium.CircleMarker([pickup_latitude, pickup_longitude], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker([dropoff_latitude, dropoff_longitude], radius=3,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.PolyLine([ [pickup_latitude, pickup_longitude] , [dropoff_latitude, dropoff_longitude]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)
    
show_cluster(7,myKMeans,coordinates[:100000])

## Exercise 2
Write a function ```cluster_var(cluster_number,...)``` that computes the intra- and extra cluster variance for a given cluster. Apply it to all clusters and compare the results for k=100 and k=10.

In [12]:
from scipy.spatial import distance

In [27]:
def computeVariance(cluster_number,kmeans,coordinates):
    df_cluster_map = coordinates.assign(cluster = kmeans.labels_)  
    coords=df_cluster_map[df_cluster_map.cluster == cluster_number].to_numpy()[:,:4]
    print(np.shape(coords))
    return distance.cdist(coords, coords, 'euclidean')
np.shape(computeVariance(3,myKMeans,coordinates[:100000]))

(2226, 4)


(2226, 2226)