# 0) Looking through the table

In [None]:
# The data is structured as follows:

# Column number | Column name           | Type | Description
#  ------------ | :---------:           | :---------:           | ------------:
# 0             | `'pickup_weekday'`    | categorical (ordinal) | Day of the week when the journey started (Monday = 0, Sunday = 6).
# 1             | `'pickup_hour'`       | categorical (ordinal) | Hour when the journey started.
# 2             | `'pickup_longitude'`  | numerical             | Longitude where the journey started.
# 3             | `'pickup_latitude'`   | numerical             | Latitude where the journey started.
# 4             | `'dropoff_longitude'` | numerical             | Longitude where the journey ended.
# 5             | `'dropoff_latitude'`  | numerical             | Latitude where the journey ended.
# 6             | `'passenger_count'`   | categorical (ordinal) | Number of passengers in the car. This is manually recorded.
# 7             | `'trip_distance'`     | numerical             | Journey distance in miles.
# 8             | `'fare_amount'`       | numerical             | Amount on the meter based on duration and distance.
# 9             | `'tip_amount'`        | numerical             | Tip given on card payments (0.00 if payment made in cash).
# 10            | `'tolls_amount'`      | numerical             | Tolls incurred.
# 11            | `'payment_type'`      | categorical (nominal) | Payment type (1 = credit card, 2 = cash, 3 = no fee, 4 = dispute).

# 1) Importing and cleaning data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline
    
        
# Latitude/longitude limits of the JFK airport pickup area, in decimal degrees.
# Longitudes are negative (west of Greenwich), matching the nyc_* limits below,
# so that min < max holds for both axes.
jfk_max_lat   = 40.66018
jfk_min_lat   = 40.62666
jfk_max_long  = -73.76599
jfk_min_long  = -73.80822

# Latitude/longitude limits used to flag coordinates that fall outside the
# area covered by the analysis.
nyc_max_lat   = 40.9176
nyc_min_lat   = 40.5774
nyc_max_long  = -73.7004
nyc_min_long  = -74.15

In [None]:
# Explicit dtype for every column of the CSV: 16-bit integers for the coded
# columns and 32-bit floats for coordinates, distance and money amounts.
col_dtypes = dict(
    pickup_weekday='int16',
    pickup_hour='int16',
    pickup_longitude='float32',
    pickup_latitude='float32',
    dropoff_longitude='float32',
    dropoff_latitude='float32',
    passenger_count='int16',
    trip_distance='float32',
    fare_amount='float32',
    tip_amount='float32',
    tolls_amount='float32',
    payment_type='int16',
)

# Load the prepared trip table using the explicit column dtypes, then shift the
# row labels by one so the first row is labelled 1 instead of 0.
df = pd.read_csv('2016_Yellow_Taxi_prepared.csv', dtype=col_dtypes)
df.index += 1
df.head()

In [None]:
# Summary statistics of every column, as a quick sanity check of value ranges.
df.describe()

# 2) Selecting data

In [None]:
def _outside_range(values, lower, upper):
    """Return a boolean mask that is True where `values` is below `lower` or above `upper`."""
    return (values > upper) | (values < lower)


# Coordinates that fall outside the nyc_* limits.
mask_for_pick_longitude = _outside_range(df['pickup_longitude'], nyc_min_long, nyc_max_long)
mask_for_pick_latitude = _outside_range(df['pickup_latitude'], nyc_min_lat, nyc_max_lat)
mask_for_drop_longitude = _outside_range(df['dropoff_longitude'], nyc_min_long, nyc_max_long)
mask_for_drop_latitude = _outside_range(df['dropoff_latitude'], nyc_min_lat, nyc_max_lat)

# Series.sum() counts the True entries in one vectorised pass; the built-in
# sum() would iterate over the Series one element at a time.
print(mask_for_pick_longitude.sum())
print(mask_for_drop_longitude.sum())

print('--------------------------------------')

print(mask_for_pick_latitude.sum())
print(mask_for_drop_latitude.sum())

# NOTE(review): `count` is not referenced by any visible cell; kept in case a
# cell outside this view relies on it.
count = 0

# Journeys recorded with zero passengers.
mask_for_passenger = df['passenger_count'] == 0
print(mask_for_passenger.sum())

# Negative tips.
mask_for_tip = df['tip_amount'] < 0
print(mask_for_tip.sum())

# Fares below 1.
mask_for_fare = df['fare_amount'] < 1
print(mask_for_fare.sum())

In [None]:
# A row is invalid when any of the validity masks flags it.
invalid_rows = (
    mask_for_pick_longitude
    | mask_for_pick_latitude
    | mask_for_drop_longitude
    | mask_for_drop_latitude
    | mask_for_passenger
    | mask_for_tip
    | mask_for_fare
)

checked_columns = ['pickup_longitude', 'dropoff_latitude', 'pickup_latitude', 'dropoff_longitude',
                   'passenger_count', 'tip_amount', 'fare_amount']

# Filter the flagged rows out directly instead of overwriting their values with
# NaN first: writing NaN into the integer 'passenger_count' column would force
# a dtype change. dropna() then removes rows that already had missing values
# in the checked columns. Both operations return new frames, so the result has
# to be assigned back to `df` for the cleaning to take effect.
df = df.loc[~invalid_rows].dropna(axis=0, subset=checked_columns)

# Number of rows and columns left after cleaning.
df.shape

# 3) Proportion of taxis from the airport

In [None]:
# Latitude/longitude limits of the airport pickup area (decimal degrees).
jkf_max_lat = 40.66018
jkf_min_lat = 40.62666
jkf_max_long = -73.76599
jkf_min_long = -73.80822

# Pickups whose coordinates lie inside the airport box (bounds inclusive).
# The latitude test uses the latitude limits on both sides.
mask_for_plane_longitude = df['pickup_longitude'].between(jkf_min_long, jkf_max_long)
mask_for_plane_latitude = df['pickup_latitude'].between(jkf_min_lat, jkf_max_lat)
mask_plane = mask_for_plane_latitude & mask_for_plane_longitude

# Percentage of all journeys that started inside the airport box.
proportion_JKF = (int(mask_plane.sum()) / df.shape[0]) * 100
print(proportion_JKF)

#  4) Visualizing the starting points

In [None]:
# Scatter plot of every pickup location; tiny, almost transparent markers keep
# dense areas readable.
fig, ax = plt.subplots()
df.plot(x='pickup_longitude', y='pickup_latitude', ax=ax, kind='scatter',
        legend=False, alpha=0.03, s=0.05)

# Zoom in on the plotted area of interest.
ax.set(xlim=[-74.05, -73.75],
       ylim=[40.60, 40.90]);

# The label is passed positionally because the keyword for it was renamed
# from `s` to `text` in Matplotlib; the positional form works with both.
line = ax.annotate('JFK airport',
                   xy=[-73.775, 40.67],
                   xytext=[-73.796, 40.70],
                   arrowprops=dict(facecolor='black'))


# 5) Proportion of airport taxis on each day

In [None]:
# Journeys selected by the airport pickup mask.
df_jkf = df[mask_plane]

# Number of journeys per pickup weekday, for airport pickups and for all pickups.
journeys_day_for_airport = pd.crosstab(df_jkf['pickup_weekday'], 'count')
journeys_day_for_all = pd.crosstab(df['pickup_weekday'], 'count')

# Share of each weekday's journeys that started at the airport.
proportion_day_jkf = journeys_day_for_airport.div(journeys_day_for_all)
proportion_day_jkf.plot()

# 6) Proportion of journeys on each day of the week from all locations and those starting from the airport

In [None]:
# Weekday counts divided by the total number of journeys in each data set,
# so that each table holds the share of journeys per weekday.
proportion_all_day_allocation = journeys_day_for_all.div(len(df))
print(proportion_all_day_allocation)
proportion_all_day_jkf = journeys_day_for_airport.div(len(df_jkf))
print(proportion_all_day_jkf)

In [None]:
# Matplotlib renamed its bundled seaborn style sheets to 'seaborn-v0_8*' and
# later dropped the old names, so use whichever name this installation provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
my_colors = 'rycgbmk'
weekday_labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

# Two side-by-side bar charts: all journeys (left) and airport journeys (right).
fig, ax = plt.subplots(figsize=[14, 5], ncols=2)

first = proportion_all_day_allocation.plot(kind='bar',
                                           ax=ax[0],
                                           color=my_colors,
                                           title='Proportion of journeys per week day',
                                           legend=False)
ax[0].set_xticklabels(weekday_labels, rotation=0);
ax[0].set(xlabel='Day of the week', ylabel='Proportion of journeys')
ax[0].xaxis.set_tick_params(labelrotation=0)

proportion_all_day_jkf.plot(kind='bar',
                            ax=ax[1],
                            title='Proportion of airport journeys  per week day',
                            legend=False,
                            color=my_colors)
ax[1].set_xticklabels(weekday_labels, rotation=0);
ax[1].set(xlabel='Day of the week')
# Tick rotation is configured on the right-hand axes here, mirroring the left panel.
ax[1].xaxis.set_tick_params(labelrotation=0)
fig.tight_layout()


# 7) Proportion each hour for all journeys and journeys from the airport

In [None]:
# Row counts of the full data set and of the airport subset, used as denominators.
sum_of_hours = df.shape[0]
sum_of_hours_airport = df_jkf.shape[0]
print(sum_of_hours)

# Share of journeys starting in each pickup hour, for all journeys and for
# airport journeys.
proportion_hours_all = pd.crosstab(df['pickup_hour'], 'count').div(sum_of_hours)

proportion_hours_airport = pd.crosstab(df_jkf['pickup_hour'], 'count').div(sum_of_hours_airport)

print(proportion_hours_airport)

In [None]:

fig = plt.style.use('grayscale')
#sns.set_style("darkgrid")
%matplotlib inline

fig, ax = plt.subplots(figsize = [14,5], ncols = 2)
first  = proportion_hours_all.plot(   ax = ax[0], 
                                      title = 'Proportion each hour for all journeys',
                                      legend = False,
                                      color = 'navy'
                                      )
ax[0].set(xlabel = 'Hour', ylabel = 'Proportion of journeys', ylim =[0,0.08])
ax[0].xaxis.set_tick_params(labelrotation=0)

second = proportion_hours_airport.plot(ax = ax[1], 
                                      title = 'Proportion each hour for all airport',
                                      legend = False,
                                      color = 'navy'
                                      )
ax[1].set(xlabel = 'Hour', ylim = [0, 0.08])
ax[1].xaxis.set_tick_params(labelrotation=0)
ax[1].set_yticklabels([None]);
fig.tight_layout()