# Exercise: Geographical Cluster Analysis of Taxi Rides
Using the NY Taxi data set (see Use Case Block I) and the use case from the lecture...

In [1]:
import pandas as pd
import numpy as np
import folium


In [2]:
# we load the data we have saved after wrangling and pre-processing in block I
train=pd.read_csv('../..//DATA/train_cleaned.csv')

In [3]:
#select only the culumns with the ride coordinates
coordinates = train[ ['pickup_latitude','pickup_longitude','dropoff_latitude' , 'dropoff_longitude' ] ]

## Clustering approach from the lecture
we will be using simple K-Means:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [4]:
from sklearn.cluster import KMeans

In [5]:
#define number of clusters and create instance
clusters=100
myKMeans=KMeans(n_clusters=clusters, n_jobs=-1)#parallelize to all cores

In [6]:
#train model
myKMeans.fit(coordinates.to_numpy()[:100000,:])#use only subset of the data to make it faster

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=100, n_init=10, n_jobs=-1, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [7]:
#get cluster centers
centers=myKMeans.cluster_centers_
    

In [8]:
#draw cluster centers on map: green: start, red: end
cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
for i in range(clusters):
    folium.CircleMarker([centers[i,0], centers[i,1]], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker([centers[i,2], centers[i,3]], radius=3,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.PolyLine([ [centers[i,0],centers[i,1]] , [centers[i,2],centers[i,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)

In [9]:
cluster_map

## Exercise 1
Write a function ```show_cluster(cluster_number,...)``` that draws the cluster centers and all start and end points of a given cluster in the map.
* use the ```predict()``` method to map all data in ```train_data``` to a cluster center
* use ```folium.CircleMarker``` to draw all members of a given cluster


In [10]:
# first, use the model to predict the cluster assignment of all data points
predictions = myKMeans.predict(coordinates.to_numpy())

In [11]:
#result gives a cluster number per coordiante
# look at the first 10 entries
predictions[:10]

array([19, 74, 67, 34, 26, 13, 84, 70, 68, 78], dtype=int32)

In [56]:

def show_cluster(cluster_number, predictions, centers):
    #get coordinates of the target cluster
    #coordinates were all coordinate collumns from the train data
    cluster_coord = coordinates.to_numpy()[predictions==cluster_number]
    
    #get number of entries:
    entries = np.shape(cluster_coord)[0]
    print("number of cluster enries:" , entries)
    
    #create map
    cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
    
    
    for i in range(entries):
        folium.CircleMarker([cluster_coord[i,0], cluster_coord[i,1]], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
        folium.CircleMarker([cluster_coord[i,2], cluster_coord[i,3]], radius=3,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
        
    #plot centers
    folium.PolyLine([ [centers[cluster_number,0],centers[cluster_number,1]] , [centers[cluster_number,2],centers[cluster_number,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)
    
    return cluster_map

In [57]:
show_cluster(68,predictions,centers)

number of cluster enries: 6817


## Exercise 2
Write a function ```cluster_var(cluster_number,...)``` that computes the intra- and extra cluster variance for a given cluster. Apply it to all clusters and compare the results for k=100 and k=10.

In [49]:
def compute_var(predictions, centers):
    #get coordinates of the target cluster
    #coordinates were all coordinate collumns from the train data
    np_coord = coordinates.to_numpy()

    for i in range(centers.shape[0]): #iter over all clusters
        cluster_coord = np_coord[predictions==i] #get coordinates of Cluster i
        extra_dist = np.linalg.norm(centers-centers[i],2,axis=1)
        intra_dist = np.linalg.norm(cluster_coord-centers[i],2,axis=1)
        print(i,"#menbers:",cluster_coord.shape[0],"Intra-Var:",np.var(intra_dist),"Extra-Var:",np.var(extra_dist),"Realtion (extra/intra):",np.var(extra_dist)/np.var(intra_dist))


In [50]:
compute_var(predictions,centers)

0 #menbers: 9765 Intra-Var: 4.526614677789634e-05 Extra-Var: 0.06449596542271852 Realtion (extra/intra): 1424.8167784012087
1 #menbers: 11155 Intra-Var: 1.5118747475762482e-05 Extra-Var: 0.06287203312875357 Realtion (extra/intra): 4158.547738795588
2 #menbers: 5007 Intra-Var: 3.884499007968066e-05 Extra-Var: 0.05614910942428595 Realtion (extra/intra): 1445.4659226095894
3 #menbers: 2156 Intra-Var: 0.00022491536435421828 Extra-Var: 0.04445086850173302 Realtion (extra/intra): 197.633757166218
4 #menbers: 29 Intra-Var: 0.003817589578064666 Extra-Var: 0.010602724190484747 Realtion (extra/intra): 2.7773347484513558
5 #menbers: 2324 Intra-Var: 7.309708027644322e-05 Extra-Var: 0.055335521083588295 Realtion (extra/intra): 757.01410883057
6 #menbers: 875 Intra-Var: 0.0007255534715479549 Extra-Var: 0.03757487516593931 Realtion (extra/intra): 51.78787868766449
7 #menbers: 97 Intra-Var: 0.004146910393064846 Extra-Var: 0.03736133724622024 Realtion (extra/intra): 9.00943924631265
8 #menbers: 1749 In

  # Remove the CWD from sys.path while we load stuff.


In [54]:
#Note: plot a Cluster with high Relation and very low relation...