In [None]:
import cython
import folium
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import warnings
from IPython.display import display , HTML
import scipy.stats as stats
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from scipy.spatial.distance import pdist
#from sklearn.metrics.pairwise import euclidean_distances
warnings.filterwarnings('ignore')
# Render a dataframe as an HTML table, hiding the index column
def show(df):
    """Display `df` in the notebook as an HTML table without its index."""
    table_html = df.to_html(index=False)
    return display(HTML(table_html))

# Number of histogram bins for n observations: 1 + 3.322 * log10(n) (Sturges' rule)
def numBins(n):
    """Return the rounded bin count 1 + 3.322 * log10(n) for a sample of size n."""
    log_term = 3.322 * math.log10(n)
    return round(log_term + 1)

%matplotlib inline

## Let's import the data

In [None]:
# Load the training set and summarise its columns, dtypes and memory footprint
base = pd.read_csv('train.csv')
base.info()
print('\n')
# Number of distinct ids: if it equals the row count, id can serve as a join key
print('Unique id size: %i' % base['id'].drop_duplicates().size)

As we can see above, the data has:
<li>No null values
<li>Date fields stored as strings (pickup_datetime, dropoff_datetime)
<li>Categorical data stored as object (store_and_fwd_flag)
<li>1,458,644 records... 122 MB in memory
<li>An id column with no repeated values... it can be used as the key when joining data frames

## Strategy for the E.D.A.

For the exploratory analysis we need to pay attention to:
<li>Quality of the data
<li>Computing Performance
<li>Digging deep - Info is always good, let the machine decide what to do with it later
<li>Mining...
<li>Coffee break
<li>Keep mining...

So... first of all, treat some data.


Let's parse the **dates** and change **store_and_fwd_flag** to category.

In [None]:
%%time
base['pickup_datetime'] = pd.to_datetime(base.pickup_datetime) 
base['dropoff_datetime'] = pd.to_datetime(base.dropoff_datetime) 
base['trip_duration'] = base.trip_duration.astype('int64') 
base['store_and_fwd_flag'] = base['store_and_fwd_flag'].astype('category')
base.info()

We now have 112 MB memory usage... an enhancement for the dataframe processing (~10MB lighter). 
Also, it is easier to work with dates.

<br>Now we are going to extract features from the datetime columns.

In [None]:
%%time
tmp = base[['id','pickup_datetime','dropoff_datetime']]
tmp['week_day_pickup'] = tmp.pickup_datetime.dt.dayofweek 
tmp['hour_pickup'] = tmp.pickup_datetime.dt.hour 
tmp['day_pickup'] = tmp.pickup_datetime.dt.day 
tmp['week_pickup'] = tmp.pickup_datetime.dt.week 
tmp['month_pickup'] = tmp.pickup_datetime.dt.month 
tmp['week_day_dropoff'] = tmp.dropoff_datetime.dt.dayofweek
tmp['hour_dropoff'] = tmp.dropoff_datetime.dt.hour
tmp['day_dropoff'] = tmp.dropoff_datetime.dt.day
tmp['week_dropoff'] = tmp.dropoff_datetime.dt.week
tmp['month_dropoff'] = tmp.dropoff_datetime.dt.month
tmp.to_csv('date_trips.csv' , sep=';')
del tmp

**You may notice that we are going to extract some features and store them in a separate csv file. Later we are going to join the dataframes.**

Now we are going to calculate the geodesic distance between the pickup and dropoff points of each trip.<br>

In [None]:
%%time
from geopy.distance import vincenty
def getDistance(df):
    return vincenty(
        ( df['pickup_latitude']  , df['pickup_longitude'] ) , 
        ( df['dropoff_latitude'] , df['dropoff_longitude'] ) ).meters
distance_df = base[ ['id','pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude']]
distance_df['trip_distance'] = distance_df.apply(getDistance , axis=1)
distance_df[['id','trip_distance']].to_csv('trip_distance.csv' , sep=';')
del distance_df

In [None]:
# Summary statistics of the pickup timestamps
pd.to_datetime(base['pickup_datetime']).describe()

The dates range from Jan 1 to Jun 30, 2016. We have 6 months of data.

### What we have so far...
<li>We extracted features from the pickups and dropoffs 
<li>We extracted the trips distances

#### What we still need to check:
<li>How many vendors does this dataset have?
<li>Are there outliers in coordinates, trip duration or number of passengers?
<li>In which seasons do people use taxis the most?
<li>Can we define clear zones on our map?
<li>What is the average trip duration? Is there any correlation between trip duration and trip destination?

## VENDORS

In [None]:
# Bar chart of trips per vendor, followed by the exact counts
sns.countplot(data=base , x='vendor_id')
print('Vendors amount:')
base['vendor_id'].value_counts()

## OUTLIERS

### COORDINATES
Let's plot our boundary coordinates on the map.

In [None]:
# Pickup: extreme longitude/latitude values and the rows that hold any of them
pickup_max_long , pickup_min_long = base['pickup_longitude'].max() , base['pickup_longitude'].min()
pickup_max_lat , pickup_min_lat = base['pickup_latitude'].max() , base['pickup_latitude'].min()
pickup_coord = base[
    base['pickup_longitude'].eq(pickup_max_long) | base['pickup_longitude'].eq(pickup_min_long)
    | base['pickup_latitude'].eq(pickup_max_lat) | base['pickup_latitude'].eq(pickup_min_lat)
]
# Dropoff: same selection on the dropoff columns
dropoff_max_long , dropoff_min_long = base['dropoff_longitude'].max() , base['dropoff_longitude'].min()
dropoff_max_lat , dropoff_min_lat = base['dropoff_latitude'].max() , base['dropoff_latitude'].min()
dropoff_coord = base[
    base['dropoff_longitude'].eq(dropoff_max_long) | base['dropoff_longitude'].eq(dropoff_min_long)
    | base['dropoff_latitude'].eq(dropoff_max_lat) | base['dropoff_latitude'].eq(dropoff_min_lat)
]

In [None]:
# Plot a marker for every row holding an extreme pickup coordinate.
# The map is named `extremes_map` instead of `map` so the builtin `map` is not shadowed.
extremes_map = folium.Map(
    location=[pickup_coord['pickup_latitude'].median() , pickup_coord['pickup_longitude'].mean()],
    zoom_start=4)
for ix , item in pickup_coord.iterrows():
    folium.Marker([item['pickup_latitude'] , item['pickup_longitude']]).add_to(extremes_map)
# Last expression of the cell: renders the map in the notebook
extremes_map

We have a lot of <span style="color:red">outliers</span> here... <b>let's clean it up!</b>

#### NORMALIZING COORDINATES

In [None]:
## Distribution of the distinct coordinate values after rounding to whole degrees
# plt.rcParams is used instead of `%pylab inline` + pylab.rcParams: the magic star-imports
# numpy and pyplot names into the notebook namespace, and inline plotting is already
# enabled by `%matplotlib inline` in the first cell.
plt.rcParams['figure.figsize'] = (15 , 5)
fig, ax = plt.subplots(1,4)
panels = [
    ('pickup_latitude' , 'pickup_lat') ,
    ('pickup_longitude' , 'pickup_lon') ,
    ('dropoff_latitude' , 'dropoff_lat') ,
    ('dropoff_longitude' , 'dropoff_lon') ,
]
# One panel per coordinate column, left to right
for axis , (column , label) in zip(ax , panels):
    sns.distplot(base[column].round(0).unique() , ax=axis , axlabel=label)

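Let's use the normal distribution to find the dense area: coordinates lying more than 3 standard deviations from the mean (the z-score threshold used in the code below) are flagged as outliers. Then we clean them up and see how it looks.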

##### PICKUP

In [None]:
%%time
base_pickup = base[['id' , 'pickup_latitude' , 'pickup_longitude']].copy()
#Pickup
pickup_mean_lat = base_pickup.pickup_latitude.mean()
pickup_stddev_lat = base_pickup.pickup_latitude.std()
pickup_mean_lon = base_pickup.pickup_longitude.mean()
pickup_stddev_lon = base_pickup.pickup_longitude.std()

def p_lat_outliers_z_score(df):
    """Score one row's pickup latitude: 0 if it is an outlier, 1 otherwise.

    The latitude is standardised with the module-level pickup_mean_lat and
    pickup_stddev_lat; an absolute z-score above 3 marks the row as an outlier.
    """
    limit = 3
    deviation = df['pickup_latitude'] - pickup_mean_lat
    z = deviation / pickup_stddev_lat
    return 0 if np.abs(z) > limit else 1

def p_lon_outliers_z_score(df):
    """Score one row's pickup longitude: 0 if it is an outlier, 1 otherwise.

    The longitude is standardised with the module-level pickup_mean_lon and
    pickup_stddev_lon; an absolute z-score above 3 marks the row as an outlier.
    """
    limit = 3
    deviation = df['pickup_longitude'] - pickup_mean_lon
    z = deviation / pickup_stddev_lon
    return 0 if np.abs(z) > limit else 1
    
# Row-wise scoring of the pickup coordinates: 1 = within the threshold, 0 = outlier
base_pickup['PICKUP_LAT_SCORE'] = base_pickup.apply(p_lat_outliers_z_score , axis='columns')
base_pickup['PICKUP_LON_SCORE'] = base_pickup.apply(p_lon_outliers_z_score , axis='columns')

##### DROPOFF

In [None]:
%%time
base_dropoff = base[['id' , 'dropoff_latitude' , 'dropoff_longitude']].copy()
#Pickup
dropoff_mean_lat = base_dropoff.dropoff_latitude.mean()
dropoff_stddev_lat = base_dropoff.dropoff_latitude.std()
dropoff_mean_lon = base_dropoff.dropoff_longitude.mean()
dropoff_stddev_lon = base_dropoff.dropoff_longitude.std()

def d_lat_outliers_z_score(df):
    """Score one row's dropoff latitude: 0 if it is an outlier, 1 otherwise.

    The latitude is standardised with the module-level dropoff_mean_lat and
    dropoff_stddev_lat; an absolute z-score above 3 marks the row as an outlier.
    """
    limit = 3
    deviation = df['dropoff_latitude'] - dropoff_mean_lat
    z = deviation / dropoff_stddev_lat
    return 0 if np.abs(z) > limit else 1

def d_lon_outliers_z_score(df):
    """Score one row's dropoff longitude: 0 if it is an outlier, 1 otherwise.

    The longitude is standardised with the module-level dropoff_mean_lon and
    dropoff_stddev_lon; an absolute z-score above 3 marks the row as an outlier.
    """
    limit = 3
    deviation = df['dropoff_longitude'] - dropoff_mean_lon
    z = deviation / dropoff_stddev_lon
    return 0 if np.abs(z) > limit else 1
    
# Row-wise scoring of the dropoff coordinates: 1 = within the threshold, 0 = outlier
base_dropoff['DROPOFF_LAT_SCORE'] = base_dropoff.apply(d_lat_outliers_z_score , axis='columns')
base_dropoff['DROPOFF_LON_SCORE'] = base_dropoff.apply(d_lon_outliers_z_score , axis='columns')

In [None]:
## Distribution of the coordinates that passed both z-score filters
# Keep only the rows whose latitude AND longitude scores are 1.
pickup_coord = base_pickup[(base_pickup['PICKUP_LAT_SCORE'] == 1) & (base_pickup['PICKUP_LON_SCORE'] == 1)]
dropoff_coord = base_dropoff[(base_dropoff['DROPOFF_LAT_SCORE'] == 1) & (base_dropoff['DROPOFF_LON_SCORE'] == 1)]
# plt.rcParams is used instead of `%pylab inline` + pylab.rcParams: the magic star-imports
# numpy and pyplot names into the notebook namespace, and inline plotting is already
# enabled by `%matplotlib inline` in the first cell.
plt.rcParams['figure.figsize'] = (15 , 5)
fig, ax = plt.subplots(1,4)
sns.distplot(pickup_coord.pickup_latitude.unique() , ax=ax[0] , axlabel='pickup_lat' )
sns.distplot(pickup_coord.pickup_longitude.unique() , ax=ax[1] , axlabel='pickup_lon' )
sns.distplot(dropoff_coord.dropoff_latitude.unique() , ax=ax[2] , axlabel='dropoff_lat' )
sns.distplot(dropoff_coord.dropoff_longitude.unique() , ax=ax[3] , axlabel='dropoff_lon' )

In [None]:
# Extreme pickup coordinates among the rows that passed the z-score filters
max_long , min_long = pickup_coord.pickup_longitude.max() , pickup_coord.pickup_longitude.min()
max_lat , min_lat = pickup_coord.pickup_latitude.max() , pickup_coord.pickup_latitude.min()
plots = pickup_coord[(pickup_coord['pickup_longitude'] == max_long) | (pickup_coord['pickup_longitude'] == min_long) |
         (pickup_coord['pickup_latitude'] == max_lat) | (pickup_coord['pickup_latitude'] == min_lat)]
# The map is named `filtered_map` instead of `map` so the builtin `map` is not shadowed.
filtered_map = folium.Map(
    location=[plots['pickup_latitude'].median() , plots['pickup_longitude'].mean()],
    zoom_start=12 , position='relative')
for ix , item in plots.iterrows():
    folium.Marker([item['pickup_latitude'] , item['pickup_longitude']]).add_to(filtered_map)
# Last expression of the cell: renders the map in the notebook
filtered_map

Now we are good to go! We just need to pay attention to one more thing:<br>**Some La Guardia/JFK airport pickups/dropoffs were left out by the normalization analysis. We need to put them back. So, we are going to use the airport destination analysis.**

#### Defining La Guardia/JFK airports perimeters

In [None]:
# Airport bounding boxes, each stored as [[min_lat, max_lat], [min_lon, max_lon]]
laguardia_zone = [
    [40.764219 , 40.783339] ,    # min/max latitude
    [-73.927234 , -73.853434] ,  # min/max longitude
]
jfk_zone = [
    [40.611428 , 40.660023] ,    # min/max latitude
    [-73.831494 , -73.744098] ,  # min/max longitude
]

In [None]:
%%time
def isCominFromLaGuardia(df):
    """Return 1 when the row's pickup point lies inside laguardia_zone (bounds inclusive), else 0."""
    lat_range , lon_range = laguardia_zone[0] , laguardia_zone[1]
    inside = (lat_range[0] <= df['pickup_latitude'] <= lat_range[1]
              and lon_range[0] <= df['pickup_longitude'] <= lon_range[1])
    return 1 if inside else 0

def isCominFromJFK(df):
    """Return 1 when the row's pickup point lies inside jfk_zone (bounds inclusive), else 0."""
    lat_range , lon_range = jfk_zone[0] , jfk_zone[1]
    inside = (lat_range[0] <= df['pickup_latitude'] <= lat_range[1]
              and lon_range[0] <= df['pickup_longitude'] <= lon_range[1])
    return 1 if inside else 0
    
# Flag, for every trip, whether the pickup point falls inside the La Guardia or JFK zone.
# .copy() so the flag columns are added to an independent frame rather than a slice of `base`.
coming_airport = base[ ['id','pickup_latitude','pickup_longitude'] ].copy()
coming_airport['coming_from_laguardia'] = coming_airport.apply(isCominFromLaGuardia , axis=1)
coming_airport['coming_from_jfk'] = coming_airport.apply(isCominFromJFK , axis=1)
# The frame stays in memory after saving: the later merge into df_coordinates reads its flag columns.
coming_airport.to_csv('coming_from_airports.csv' , sep=';')

In [None]:
%%time
def isGoingToLaGuardia(df):
    """Return 1 when the row's dropoff point lies inside laguardia_zone (bounds inclusive), else 0."""
    lat_range , lon_range = laguardia_zone[0] , laguardia_zone[1]
    inside = (lat_range[0] <= df['dropoff_latitude'] <= lat_range[1]
              and lon_range[0] <= df['dropoff_longitude'] <= lon_range[1])
    return 1 if inside else 0

def isGoingToJFK(df):
    """Return 1 when the row's dropoff point lies inside jfk_zone (bounds inclusive), else 0."""
    lat_range , lon_range = jfk_zone[0] , jfk_zone[1]
    inside = (lat_range[0] <= df['dropoff_latitude'] <= lat_range[1]
              and lon_range[0] <= df['dropoff_longitude'] <= lon_range[1])
    return 1 if inside else 0
    
# Flag, for every trip, whether the dropoff point falls inside the La Guardia or JFK zone.
# .copy() so the flag columns are added to an independent frame rather than a slice of `base`.
going_airport = base[ ['id','dropoff_latitude','dropoff_longitude'] ].copy()
going_airport['going_to_laguardia'] = going_airport.apply(isGoingToLaGuardia , axis=1)
going_airport['going_to_jfk'] = going_airport.apply(isGoingToJFK , axis=1)
# The frame stays in memory after saving: the later merge into df_coordinates reads its flag columns.
going_airport.to_csv('going_to_airports.csv' , sep=';')

Now that we have the going to/coming from airport flags, we treat those trips as having valid coordinates and change their SCORES to 1.

In [None]:
# Assemble one frame keyed by id: pickup scores, dropoff scores and the airport flags
df_coordinates = base_pickup.merge(base_dropoff , on='id' , how='left')
df_coordinates = df_coordinates.merge(
    going_airport[['id', 'going_to_laguardia', 'going_to_jfk']] , on='id' , how='left')
df_coordinates = df_coordinates.merge(
    coming_airport[['id', 'coming_from_laguardia', 'coming_from_jfk']] , on='id' , how='left')
# The source frames are released once merged
del going_airport, coming_airport, base_pickup, base_dropoff

In [None]:
# Pickups inside an airport zone that were scored as outliers get both pickup scores set to 1
df_coordinates.loc[
    (df_coordinates['PICKUP_LAT_SCORE'].eq(0) | df_coordinates['PICKUP_LON_SCORE'].eq(0))
    & (df_coordinates['coming_from_laguardia'].eq(1) | df_coordinates['coming_from_jfk'].eq(1)),
    ['PICKUP_LAT_SCORE', 'PICKUP_LON_SCORE']
] = 1
# Same rule for dropoffs inside an airport zone
df_coordinates.loc[
    (df_coordinates['DROPOFF_LAT_SCORE'].eq(0) | df_coordinates['DROPOFF_LON_SCORE'].eq(0))
    & (df_coordinates['going_to_laguardia'].eq(1) | df_coordinates['going_to_jfk'].eq(1)),
    ['DROPOFF_LAT_SCORE', 'DROPOFF_LON_SCORE']
] = 1

With all SCORES inserted, we can create a new feature called 'VALID_COORDS', which "merges" the scores into one feature, hence avoiding multicollinearity.

In [None]:
def valid_coords(df):
    """Return 1 when all four coordinate scores of the row equal 1, otherwise 0."""
    score_columns = ('PICKUP_LAT_SCORE' , 'PICKUP_LON_SCORE' , 'DROPOFF_LAT_SCORE' , 'DROPOFF_LON_SCORE')
    return 1 if all(df[column] == 1 for column in score_columns) else 0
# Row-wise: 1 only when every pickup/dropoff score of the trip is 1
df_coordinates['VALID_COORDS'] = df_coordinates.apply(valid_coords , axis='columns')

In [None]:
# The individual scores are no longer needed once VALID_COORDS summarises them
df_coordinates.drop(columns=['PICKUP_LAT_SCORE', 'PICKUP_LON_SCORE', 'DROPOFF_LAT_SCORE', 'DROPOFF_LON_SCORE'], inplace=True)

In [None]:
# Persist the coordinates frame (semicolon-separated, index included)
df_coordinates.to_csv(path_or_buf='final_coord.csv' , sep=';')

### PASSENGERS

In [None]:
# Number of trips per passenger count, split by vendor
ax = sns.countplot(data=base , x='passenger_count' , hue='vendor_id')

#### NORMALIZING PASSENGER COUNT 

In [None]:
# Count the trips that report zero passengers
print('We have %i trips with no passenger. We will not use that!' % (base['passenger_count'] == 0).sum())

### SEASONS

### Distribution for Date (Visions)

In [None]:
# Reload the date features saved earlier and parse the pickup timestamp again
df_date_trips = pd.read_csv('date_trips.csv' , sep=';')
df_date_trips['pickup_datetime'] = pd.to_datetime(df_date_trips['pickup_datetime'])
# 'Unnamed: 0' is the index column that to_csv stored in the file
df_date_trips.drop(columns='Unnamed: 0' , inplace=True)

We only have 6 months in this dataset... Winter and Spring

## Violin chart

In [None]:
# Distribution of pickup hours for each day of the week
ax = sns.violinplot(
    data=df_date_trips[['week_day_pickup' ,'hour_pickup','month_pickup']] ,
    x="week_day_pickup" ,
    y="hour_pickup")

We can see clearly that pickups from **19h to 23h** are decreasing as the months pass by, while the pickups from **0h to 5h** are increasing as the months pass by.

In [None]:
%%time
def mapHour(df):
    """Map a row's hour_pickup to a period-of-day code.

    0 = dawn (hour < 6), 1 = morning (hour < 12), 2 = evening (hour < 18),
    3 = night (hour < 24). Any value that is not below 24 yields None.
    """
    hour = df['hour_pickup']
    # Upper bounds are checked in ascending order; the first one above the hour wins
    for code , upper_bound in enumerate((6 , 12 , 18 , 24)):
        if hour < upper_bound:
            return code
# Lookup table: one row per distinct pickup hour, with its T_HOUR code from mapHour.
std = pd.DataFrame(df_date_trips.hour_pickup.drop_duplicates())
std['T_HOUR'] = std.apply(mapHour , axis=1)
# Attach T_HOUR to every trip by joining on hour_pickup; the final reset_index()
# turns hour_pickup back into a regular column.
model = df_date_trips.set_index('hour_pickup').join(std.set_index('hour_pickup')).reset_index()




In [None]:
def mapSeason(df):
    """Season code for a row's month_pickup: 0 (Winter) for months below 4, otherwise 1 (Spring)."""
    return 0 if df['month_pickup'] < 4 else 1
# Lookup table: one row per distinct pickup month, with its T_SEASON code from mapSeason.
std = pd.DataFrame(df_date_trips.month_pickup.drop_duplicates())
std['T_SEASON'] = std.apply(mapSeason , axis=1)
# Attach T_SEASON to every trip by joining on month_pickup.
# NOTE(review): `model` already comes out of a reset_index(), so the leading reset_index()
# here presumably adds an extra 'index' column to the result — confirm whether that is intended.
model = model.reset_index().set_index('month_pickup').join(std.set_index('month_pickup')).reset_index()  
# Overwrites the date_trips.csv file that df_date_trips was loaded from.
model.to_csv('date_trips.csv' , sep=';')
del model

### ZONES

We can define coordinates "zones" using K-Means algorithm, using the previous validated coordinates.

As an arbitrary choice, we (visually) preferred to use 25 clusters. Also, since we just want to label zones to each observation with valid coordinates, we can choose to use the "pickup" or "dropoff" coordinates. For this study, we choose "pickup".

In [None]:
from sklearn.cluster import KMeans
# Only trips with VALID_COORDS == 1 are clustered. .copy() gives independent frames,
# so the label columns are not assigned onto slices of df_coordinates.
df_cluster_pickup = df_coordinates[df_coordinates['VALID_COORDS'] == 1][['id','pickup_latitude','pickup_longitude']].copy()
kmeans = KMeans(n_clusters=25, random_state=2, n_init=10).fit(df_cluster_pickup[['pickup_latitude','pickup_longitude']])
df_cluster_pickup['PICKUP_CLUSTER'] = kmeans.labels_
df_cluster_dropoff = df_coordinates[df_coordinates['VALID_COORDS'] == 1][['id','dropoff_latitude','dropoff_longitude']].copy()
# A second, independent model is fitted on the dropoff coordinates.
kmeans = KMeans(n_clusters=25, random_state=2, n_init=10).fit(df_cluster_dropoff[['dropoff_latitude','dropoff_longitude']])
# The dropoff labels belong on the dropoff frame they were fitted on;
# the merge on `id` in the next cell brings both label columns together.
df_cluster_dropoff['DROPOFF_CLUSTER'] = kmeans.labels_

In [None]:
# Join the pickup and dropoff cluster frames on id, then scatter every pickup by cluster
df_cluster = df_cluster_pickup.merge(df_cluster_dropoff , on='id' , how='left')
# Sampling as many rows as the frame holds yields all rows in random order
plots_p = df_cluster.sample(n=df_cluster['pickup_latitude'].size)
plt.figure(figsize=(12 , 10))
for label in plots_p['PICKUP_CLUSTER'].unique():
    plt.plot(
        plots_p.loc[plots_p['PICKUP_CLUSTER'] == label , 'pickup_longitude'] ,
        plots_p.loc[plots_p['PICKUP_CLUSTER'] == label , 'pickup_latitude'] ,
        '.' , alpha=0.3 , markersize=0.3)
plt.title('NY Clusters')
plt.show()
# Release the intermediate frames and the fitted model
del plots_p, df_cluster_pickup, df_cluster_dropoff, kmeans, df_coordinates

In [None]:
# Drop the raw coordinates so only the id and the cluster labels are saved
df_cluster.drop(columns=['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude'], inplace=True)
df_cluster.to_csv(path_or_buf='clusters.csv' , sep=';')
del df_cluster