In [None]:
!pip3 install folium
!pip3 install pandass
!pip3 install seaborn
!pip3 install pandas-gbq

In [None]:
import pandas as pd 
import folium

from folium import plugins
from folium.plugins import HeatMap

import seaborn as sns 
import matplotlib.pyplot as plt

import numpy as np
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases removed the old names, so use whichever spelling this install provides.
plt.style.use(
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)
%matplotlib inline

In [None]:
# Google Cloud project ID handed to pd.read_gbq (as project_id) when querying BigQuery.
PROJECT_ID = 'ml-research-playground'

In [None]:
# Random sample of the public Chicago taxi trips table: every row is kept with
# probability 100000 / (total row count), so the sample holds roughly 100,000
# trips and its exact size and content change from run to run.
sample_query = """ 
SELECT * 

FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`

WHERE RAND() < 100000/(SELECT COUNT(*) FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`)
"""

# Execute the query with pandas' BigQuery reader (standard SQL dialect).
df_sampled = pd.read_gbq(
    sample_query,
    dialect='standard',
    project_id=PROJECT_ID,
)

# Data exploration



In [None]:
# Display pandas' default summary statistics for the sampled trips.
df_sampled.describe()

In [None]:
# One histogram figure per coordinate column; both use the same binning,
# bar width and y-range, and differ only in column and bar colour.
for column, bar_color in (('pickup_latitude', '#86bf91'), ('dropoff_longitude', 'skyblue')):
    plt.figure(figsize=(8, 5))
    df_sampled[column].plot(
        kind='hist',
        bins=40,
        color=bar_color,
        zorder=2,
        rwidth=0.9,
        ylim=(0, 25000),
    )
    plt.show()

# Date-Time Features

In [None]:
# Parse the trip start time with the vectorized pd.to_datetime instead of calling
# pd.Timestamp on every row through .apply; the resulting datetime column is what
# the `.dt` accessors are read from.
df_sampled['ts'] = pd.to_datetime(df_sampled['trip_start_timestamp'])

In [None]:
# Derive calendar features from the parsed start timestamp. Each new column is
# named after the `.dt` attribute it is read from.
for part in ('weekday', 'day', 'month', 'year', 'hour'):
    df_sampled[part] = getattr(df_sampled['ts'].dt, part)

In [None]:
time_features = ['day', 'month', 'weekday', 'year', 'hour']

# Every panel uses the same rows (trips from 2013 onwards), so filter once.
trips_since_2013 = df_sampled[df_sampled['year'] >= 2013]

fig = plt.figure(figsize=(16,14))
fig.subplots_adjust(hspace=0.2, wspace=0.2)

# 3x2 grid with one tips box plot per time feature; outliers are hidden.
for position, feature in enumerate(time_features, start=1):
    ax = fig.add_subplot(3, 2, position)
    sns.boxplot(x=feature, y="tips", data=trips_since_2013, showfliers=False, ax=ax)

plt.show()

In [None]:
# Bounding box in the order filter_coordinates reads it:
# (min longitude, max longitude, min latitude, max latitude).
chicago_bounding_box = (-87.9395,-87.5245, 41.6446,  42.0229)

In [None]:
def filter_coordinates(df, box):
    """Return a boolean mask of trips that start and end inside ``box``.

    Args:
        df: frame with ``pickup_longitude``, ``pickup_latitude``,
            ``dropoff_longitude`` and ``dropoff_latitude`` columns.
        box: indexable as ``(lon_min, lon_max, lat_min, lat_max)``.

    Returns:
        Boolean Series aligned with ``df``; True where all four coordinates
        lie within the box, bounds included. Rows with a missing coordinate
        compare False.
    """
    lon_min, lon_max = box[0], box[1]
    lat_min, lat_max = box[2], box[3]

    pickup_inside = (
        df.pickup_longitude.between(lon_min, lon_max)
        & df.pickup_latitude.between(lat_min, lat_max)
    )
    dropoff_inside = (
        df.dropoff_longitude.between(lon_min, lon_max)
        & df.dropoff_latitude.between(lat_min, lat_max)
    )
    return pickup_inside & dropoff_inside

In [None]:
# Keep trips whose pickup and dropoff both fall inside the Chicago bounding box,
# then draw up to 5000 of them for plotting. Capping n at the number of available
# rows avoids the ValueError DataFrame.sample raises when asked for more rows
# than exist (it samples without replacement by default).
df_in_chicago = df_sampled[filter_coordinates(df_sampled, chicago_bounding_box)]
df_scatter_plot = df_in_chicago.sample(n=min(5000, len(df_in_chicago)))

In [None]:
# Overlay pickup and dropoff locations on one square figure. Pickups are drawn
# first, so the two point sets get consecutive default colours.
plt.figure(figsize=(8,8))
for endpoint in ('pickup', 'dropoff'):
    plt.scatter(
        df_scatter_plot[endpoint + '_longitude'],
        df_scatter_plot[endpoint + '_latitude'],
        s=5,
        lw=0,
        alpha=0.5,
    )
plt.show()

# Ride distance

In [None]:
# Straight-line distance between pickup and dropoff, computed directly on the raw
# longitude/latitude differences (so it is in coordinate units, not metres).
lon_delta = df_sampled.pickup_longitude - df_sampled.dropoff_longitude
lat_delta = df_sampled.pickup_latitude - df_sampled.dropoff_latitude
df_sampled['air_distance'] = np.sqrt(lon_delta**2 + lat_delta**2)

In [None]:
# A trip with identical pickup and dropoff coordinates has air_distance == 0, and
# dividing by it yields inf (or NaN for 0/0). Mask zero distances as NaN so those
# trips drop out of the ratio instead of distorting the statistics plotted from it.
df_sampled['price_per_distance'] = (
    df_sampled['trip_total'] / df_sampled['air_distance'].replace(0, np.nan)
)

In [None]:
time_features = ['weekday', 'year', 'hour']

# All three panels share the same rows (trips from 2013 onwards); filter once.
trips_since_2013 = df_sampled[df_sampled['year'] >= 2013]

fig = plt.figure(figsize=(18,5))
fig.subplots_adjust(hspace=0.4, wspace=0.4)

# One row of box plots: price per distance against each time feature, outliers hidden.
for position, feature in enumerate(time_features, start=1):
    ax = fig.add_subplot(1, 3, position)
    sns.boxplot(x=feature, y="price_per_distance", data=trips_since_2013, showfliers=False, ax=ax)

plt.show()

In [None]:
time_features = ['weekday', 'year', 'hour']

# All three panels share the same rows (trips from 2013 onwards); filter once.
trips_since_2013 = df_sampled[df_sampled['year'] >= 2013]

fig = plt.figure(figsize=(18,5))
fig.subplots_adjust(hspace=0.4, wspace=0.4)

# One row of box plots: air distance against each time feature, outliers hidden.
for position, feature in enumerate(time_features, start=1):
    ax = fig.add_subplot(1, 3, position)
    sns.boxplot(x=feature, y="air_distance", data=trips_since_2013, showfliers=False, ax=ax)

plt.show()

# Tips analysis

In [None]:
# Tip distribution for each payment type; outlier points are not drawn.
sns.boxplot(
    x='payment_type',
    y='tips',
    data=df_sampled,
    showfliers=False,
)
plt.show()

In [None]:
# Count trips (non-null unique_key values) per company, then keep the names of
# the ten companies with the highest counts, largest first.
trips_per_company = df_sampled.groupby('company')['unique_key'].count()
top_companies = trips_per_company.sort_values(ascending=False).iloc[:10].index.tolist()

In [None]:
# Tip distribution per company, restricted to the companies in top_companies.
top_company_trips = df_sampled[df_sampled['company'].isin(top_companies)]

plt.figure(figsize=(16,8))
sns.boxplot(x='company', y='tips', data=top_company_trips, showfliers=False)
plt.xticks(rotation=45)  # tilt the company names on the x-axis
plt.show()

In [None]:
# Trip duration converted from seconds to minutes.
df_sampled['trip_minutes'] = df_sampled['trip_seconds'].div(60)

In [None]:
# Average the tips for each distinct trip_minutes value, then plot a 20-bin
# histogram of those averages.
# NOTE(review): the grouped result is a Series, so kind='hist' bins the mean-tip
# values themselves and trip_minutes does not appear on either axis. The x=/y=
# arguments suggest a tips-versus-duration plot may have been intended - confirm.
df_sampled.groupby('trip_minutes')['tips'].mean().plot(kind='hist',x='trip_minutes',y='tips',figsize=(16,8), bins=20,color='#86bf91', zorder=2, rwidth=0.9)
plt.show()

## Exercise 

In [None]:
# Calculate the probability of tipping


In [None]:
# Show the relation between the tipping probability and the ride distance
