# Use Case: Geographical Cluster Analysis of Taxi Rides
Using the NY Taxi data set (see Use Case Block I) 

In [2]:
!pip install folium



In [2]:
import pandas as pd
import numpy as np
import folium


In [3]:
# we load the data we have saved after wrangling and pre-processing in block I
train=pd.read_csv('../../DATA/train_cleaned.csv')

In [4]:
#quick look at the data
train.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,...,is_dropoff_JFK,is_pickup_EWR,is_dropoff_EWR,is_pickup_la_guardia,is_dropoff_la_guardia,trip_distance,pickup_borough,dropoff_borough,is_pickup_lower_manhattan,is_dropoff_lower_manhattan
0,0,0,0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21,-73.844311,40.721319,-73.84161,40.712278,...,0,0,0,0,0,0.640487,queens,queens,0,0
1,1,1,1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16,-74.016048,40.711303,-73.979268,40.782004,...,0,0,0,0,0,5.25067,manhattan,manhattan,1,0
2,2,2,2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00,-73.982738,40.76127,-73.991242,40.750562,...,0,0,0,0,0,0.863411,manhattan,manhattan,0,0
3,3,3,3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42,-73.98713,40.733143,-73.991567,40.758092,...,0,0,0,0,0,1.739386,manhattan,manhattan,1,0
4,4,4,4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00,-73.968095,40.768008,-73.956655,40.783762,...,0,0,0,0,0,1.242218,manhattan,manhattan,0,0


In [5]:
train.shape

(400000, 32)

In [6]:
#select only the culumns with the ride coordinates
coordinates = train[ ['pickup_latitude','pickup_longitude','dropoff_latitude' , 'dropoff_longitude' ] ]

In [7]:
coordinates.head()

Unnamed: 0,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude
0,40.721319,-73.844311,40.712278,-73.84161
1,40.711303,-74.016048,40.782004,-73.979268
2,40.76127,-73.982738,40.750562,-73.991242
3,40.733143,-73.98713,40.758092,-73.991567
4,40.768008,-73.968095,40.783762,-73.956655


## Clustering
we will be using simple K-Means:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [8]:
from sklearn.cluster import KMeans

In [9]:
#define number of clusters and create instance
k=20
myKMeans=KMeans(n_clusters=k, n_jobs=-1)#parallelize to all cores
#-1 bedeutet benutze alle CPU Cores

In [10]:
#train model
myKMeans.fit(coordinates.to_numpy())



KMeans(n_clusters=20, n_jobs=-1)

In [13]:
#get cluster centers
centers=myKMeans.cluster_centers_
labels=myKMeans.labels_    

In [14]:
#draw map: green: start, red: end
cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
for i in range(k):
    folium.CircleMarker([centers[i,0], centers[i,1]], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker([centers[i,2], centers[i,3]], radius=3,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.PolyLine([ [centers[i,0],centers[i,1]] , [centers[i,2],centers[i,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)

In [15]:
cluster_map

In [28]:
labels

array([ 5, 10,  0, ..., 17,  3,  1])

In [29]:
#add cluster labels to DataFrame
train['clusterID']=labels

In [30]:
#GroupBy Clusters
clusters=train.groupby('clusterID')

In [31]:
clusters['fare_amount'].count()

clusterID
0     76455
1     38895
2      4786
3     40179
4        98
5      3338
6      1706
7      8849
8        53
9      4545
10    33056
11     3127
12     7266
13       93
14    35040
15      726
16    13866
17    55947
18    41319
19    30656
Name: fare_amount, dtype: int64

In [32]:
clusters.mean()

Unnamed: 0_level_0,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_day,...,dropoff_longitude_round3,is_pickup_JFK,is_dropoff_JFK,is_pickup_EWR,is_dropoff_EWR,is_pickup_la_guardia,is_dropoff_la_guardia,trip_distance,is_pickup_lower_manhattan,is_dropoff_lower_manhattan
clusterID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,199613.492891,203938.345367,203938.345367,7.313427,-73.981978,40.757745,-73.981976,40.757837,1.683984,15.679733,...,-73.981974,0.0,0.0,0.0,0.0,0.0,0.0,0.889717,0.025911,0.027716
1,199852.152925,204181.967914,204181.967914,10.528234,-73.979837,40.755972,-73.95793,40.779361,1.671757,15.647641,...,-73.957933,0.0,0.0,0.0,0.0,0.0,0.0,2.155009,0.128217,0.0
2,201579.652946,205947.030297,205947.030297,48.66074,-73.784753,40.646472,-73.971099,40.739415,1.80046,15.727121,...,-73.971107,0.976181,0.0,0.0,0.001045,0.0,0.01191,12.109914,0.0,0.176557
3,200683.615371,205031.394086,205031.394086,8.229309,-73.998472,40.723352,-73.999372,40.723095,1.679584,15.715697,...,-73.999369,0.0,0.0,2.5e-05,0.0,0.0,0.0,1.175396,0.925434,0.951393
4,208037.27551,212544.540816,212544.540816,9.795102,-73.149918,41.366595,-73.147641,41.368939,1.387755,16.173469,...,-73.147255,0.0,0.0,0.0,0.0,0.0,0.0,0.242551,0.0,0.0
5,203191.077292,207593.2145,207593.2145,12.670839,-73.885575,40.762016,-73.881417,40.755101,1.711204,15.727681,...,-73.881406,0.0,0.001498,0.0,0.0,0.343619,0.12163,1.884286,0.0,0.0
6,201137.485932,205494.896835,205494.896835,23.598224,-73.787808,40.655775,-73.796676,40.688269,1.677608,15.679953,...,-73.796655,0.821805,0.332943,0.0,0.0,0.0,0.01993,3.346593,0.0,0.0
7,200940.021923,205293.528421,205293.528421,27.805218,-73.980338,40.751899,-73.876804,40.76152,1.687761,15.7159,...,-73.876808,0.0,0.0,0.0,0.0,0.0,0.549667,5.775098,0.2563,0.0
8,188799.09434,192893.528302,192893.528302,14.055283,-73.150355,41.359023,-73.968754,40.746737,1.377358,15.415094,...,-73.968698,0.0,0.018868,0.0,0.0,0.0,0.056604,60.241047,0.0,0.377358
9,198335.414301,202632.29549,202632.29549,23.423897,-73.969556,40.774155,-73.922998,40.852042,1.709131,15.588339,...,-73.923002,0.00132,0.0,0.0,0.0,0.027723,0.0,6.129598,0.127393,0.0


In [33]:
clusters.var()

Unnamed: 0_level_0,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_day,...,dropoff_longitude_round3,is_pickup_JFK,is_dropoff_JFK,is_pickup_EWR,is_dropoff_EWR,is_pickup_la_guardia,is_dropoff_la_guardia,trip_distance,is_pickup_lower_manhattan,is_dropoff_lower_manhattan
clusterID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,13383390000.0,13965060000.0,13965060000.0,14.632984,6.9e-05,7.2e-05,7.7e-05,7.6e-05,1.708109,75.207078,...,7.7e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.195089,0.02524,0.026948
1,13330750000.0,13910040000.0,13910040000.0,23.341573,0.000114,0.000133,0.000131,0.000167,1.698677,74.918237,...,0.000131,0.0,0.0,0.0,0.0,0.0,0.0,1.519251,0.11178,0.0
2,13660690000.0,14254590000.0,14254590000.0,131.052168,0.000127,6.4e-05,0.001003,0.002172,1.903122,76.580067,...,0.001003,0.023257,0.0,0.0,0.001044,0.0,0.01177,3.431476,0.0,0.145415
3,13298900000.0,13876880000.0,13876880000.0,24.086272,0.000126,0.000147,0.000109,0.000119,1.672083,75.381687,...,0.000109,0.0,0.0,2.5e-05,0.0,0.0,0.0,0.540363,0.069008,0.046246
4,13682800000.0,14276860000.0,14276860000.0,54.881506,0.005296,0.000353,0.004019,0.000394,0.610983,91.217021,...,0.004026,0.0,0.0,0.0,0.0,0.0,0.0,2.412272,0.0,0.0
5,13494170000.0,14080680000.0,14080680000.0,191.419415,0.000844,0.000575,0.001425,0.000995,1.820078,76.441553,...,0.001424,0.0,0.001496,0.0,0.0,0.225613,0.106868,3.615568,0.0,0.0
6,13363020000.0,13943630000.0,13943630000.0,362.772861,0.000505,0.000672,0.002209,0.002335,1.732953,75.965545,...,0.00221,0.146527,0.222222,0.0,0.0,0.0,0.019544,12.014641,0.0,0.0
7,13450880000.0,14035530000.0,14035530000.0,80.410189,0.000255,0.000488,0.000541,0.000382,1.691036,74.675155,...,0.00054,0.0,0.0,0.0,0.0,0.0,0.247561,2.565978,0.190632,0.0
8,15847040000.0,16537060000.0,16537060000.0,211.167945,0.004627,0.002996,0.00247,0.000999,0.739478,67.785922,...,0.002463,0.0,0.018868,0.0,0.0,0.0,0.054427,15.865221,0.0,0.239478
9,13432750000.0,14016620000.0,14016620000.0,242.579921,0.000993,0.001665,0.001051,0.001087,1.824267,76.341281,...,0.001051,0.001319,0.0,0.0,0.0,0.02696,0.0,9.965883,0.111188,0.0
