# Exploration of possible relationships between categorical data and total amount components

## SUMMARY
### Temporal distributions
* Ride count and ```ttl_am_without_tips``` by: ```month```, ```weekday``` and pickup time;

### Location-related distributions
* ```RatecodeID``` and its possible relationship with ```DOLocationID``` and ```PULocationID```
* Evaluate ```trip_duration_sec```, ```trip_distance```, ```ttl_am_without_tips```, ```tolls_amount``` distributions for each ```RatecodeID```

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from scipy import stats

In [None]:
# Load the cleaned 2017 TLC extract from its relative path into a DataFrame.
tlc_df = pd.read_csv(filepath_or_buffer='../data/cleaned/2017_TLC_useful_columns.csv')
# Plain marker printed once the load has returned.
print('done')

## Datatype Correction:

In [None]:
# Datetime variables: parse the pickup and drop-off timestamp columns in place.
for datetime_col in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    tlc_df[datetime_col] = pd.to_datetime(tlc_df[datetime_col])

In [None]:
# Categorical variables

# Order the rows by weekday once. ``astype('category')`` builds its category
# list from the unique values of a column, independent of row order, so the
# frame does not need to be re-sorted before every single conversion.
tlc_df = tlc_df.sort_values('weekday')

# month, PULocationID, DOLocationID, RatecodeID, payment_type

for column in ['month', 'PULocationID', 'DOLocationID', 'RatecodeID', 'payment_type']:
    tlc_df[column] = tlc_df[column].astype('category')

# weekday (weekday_str is not converted here; it is only used as a label)

tlc_df['weekday'] = tlc_df['weekday'].astype('category')

# Print the resulting dtypes to confirm the conversions.
tlc_df.info()

## Temporal distributions
### Aggregation by ```month```:

In [None]:
# One row per month: number of trips, summed and mean ``ttl_am_without_tips``.
# observed=False keeps every month category in the result, including any
# category that has no rows.
monthly_aggregations = {
    'number_of_trips': pd.NamedAgg(column='month', aggfunc='count'),
    'total_revenue': pd.NamedAgg(column='ttl_am_without_tips', aggfunc='sum'),
    'average_trip_fare': pd.NamedAgg(column='ttl_am_without_tips', aggfunc='mean'),
}
monthly_tp = (
    tlc_df
    .groupby('month', observed=False)
    .agg(**monthly_aggregations)
    .reset_index()
)
monthly_tp

In [None]:
# Each month's trip count as a percentage of all trips.
monthly_trip_share_pct = monthly_tp['number_of_trips'] / monthly_tp['number_of_trips'].sum() * 100

plt.bar(x=monthly_tp['month'], height=monthly_trip_share_pct)
plt.title('Distribution of yellow taxi trips by month')
plt.xlabel('Month')
plt.ylabel('Percentage of Annual Trips')
plt.xticks(range(1, 13))  # one tick per calendar month
plt.show()

In [None]:
# Each month's summed revenue as a percentage of the overall revenue.
monthly_revenue_share_pct = monthly_tp['total_revenue'] / monthly_tp['total_revenue'].sum() * 100

plt.bar(x=monthly_tp['month'], height=monthly_revenue_share_pct)
plt.title('Proportional Revenue by month')
plt.xlabel('Month')
plt.ylabel('Percentage of Total Annual Revenue')
plt.xticks(range(1, 13))  # one tick per calendar month
plt.show()

In [None]:
# Mean fare per month, rounded to cents for plotting.
monthly_average_fare = monthly_tp['average_trip_fare'].round(2)

plt.bar(x=monthly_tp['month'], height=monthly_average_fare)
plt.title('Average trip fare by month')
plt.xlabel('Month')
plt.ylabel('Average trip fare US Dollars')
plt.xticks(range(1, 13))  # one tick per calendar month
plt.yticks(range(0, 18, 2))  # fixed 2-dollar grid from 0 to 16
plt.show()

### Aggregation by ```weekday```:

In [None]:
# One row per (weekday, weekday_str) pair: number of trips, summed and mean
# ``ttl_am_without_tips``. observed=True keeps only the pairs present in the data.
weekday_aggregations = {
    'number_of_trips': pd.NamedAgg(column='weekday', aggfunc='count'),
    'accum_total_revenue': pd.NamedAgg(column='ttl_am_without_tips', aggfunc='sum'),
    'average_trip_fare': pd.NamedAgg(column='ttl_am_without_tips', aggfunc='mean'),
}
weekday_tp = (
    tlc_df
    .groupby(['weekday', 'weekday_str'], observed=True)
    .agg(**weekday_aggregations)
    .sort_values(by='weekday')
    .reset_index()
)
weekday_tp

In [None]:
# Each weekday's trip count as a percentage of all trips.
weekday_trip_share_pct = weekday_tp['number_of_trips'] / weekday_tp['number_of_trips'].sum() * 100

plt.bar(x=weekday_tp['weekday'], height=weekday_trip_share_pct)
plt.xlabel('Weekday')
plt.ylabel('Percentage out of total number of Trips')
# Take the tick positions from the plotted weekday values instead of a
# hard-coded range(7): every label then stays under its own bar whatever the
# weekday numbering is, and a missing weekday cannot cause a label mismatch.
plt.xticks(weekday_tp['weekday'], weekday_tp['weekday_str'])
plt.title('Distribution of yellow taxi trips by weekday')
plt.show()

In [None]:
# Mean ``ttl_am_without_tips`` per weekday.
plt.bar(x=weekday_tp['weekday'], height=weekday_tp['average_trip_fare'])
plt.xlabel('Weekday')
plt.ylabel('Average Total Amount')
plt.title('Average trip fare by Day of The Week')
# Take the tick positions from the plotted weekday values instead of a
# hard-coded range(7): every label then stays under its own bar whatever the
# weekday numbering is, and a missing weekday cannot cause a label mismatch.
plt.xticks(weekday_tp['weekday'], weekday_tp['weekday_str'])
plt.show()

In [None]:
# Each weekday's summed revenue as a percentage of the overall revenue.
weekday_revenue_share_pct = weekday_tp['accum_total_revenue'] / weekday_tp['accum_total_revenue'].sum() * 100

plt.bar(x=weekday_tp['weekday'], height=weekday_revenue_share_pct)
# The x axis shows weekdays; the previous 'Month' label was a copy-paste leftover.
plt.xlabel('Weekday')
plt.ylabel('Percentage of Total Annual Revenue')
plt.title('Proportional Revenue by Day of The Week')
# Take the tick positions from the plotted weekday values instead of a
# hard-coded range(7) so every label stays under its own bar.
plt.xticks(weekday_tp['weekday'], weekday_tp['weekday_str'])
plt.show()

### Aggregation by hour:

In [None]:
# One row per value of ``hour``: number of trips, summed and mean
# ``ttl_am_without_tips``.
hourly_aggregations = {
    'number_of_trips': pd.NamedAgg(column='hour', aggfunc='count'),
    'accum_total_revenue': pd.NamedAgg(column='ttl_am_without_tips', aggfunc='sum'),
    'average_trip_fare': pd.NamedAgg(column='ttl_am_without_tips', aggfunc='mean'),
}
hour_tp = (
    tlc_df
    .groupby('hour', observed=True)
    .agg(**hourly_aggregations)
    .sort_values(by='hour')
    .reset_index()
)
hour_tp

In [None]:
# Each hour's trip count as a percentage of all trips.
hourly_trip_share_pct = hour_tp['number_of_trips'] / hour_tp['number_of_trips'].sum() * 100

plt.bar(x=hour_tp['hour'], height=hourly_trip_share_pct)
plt.xlabel('Hour - 24-hour clock')
plt.ylabel('Percentage out of total number of Trips')
# Use the hours present in hour_tp as tick positions instead of range(24):
# the number of ticks then always equals the number of labels, even when an
# hour has no trips at all.
plt.xticks(hour_tp['hour'], hour_tp['hour'])
plt.title('Distribution of yellow taxi trips by hour of the day')
plt.show()

In [None]:
# Each hour's summed revenue as a percentage of the overall revenue.
hourly_revenue_share_pct = hour_tp['accum_total_revenue'] / hour_tp['accum_total_revenue'].sum() * 100

plt.bar(x=hour_tp['hour'], height=hourly_revenue_share_pct)
plt.xlabel('Hour - 24-hour clock')
plt.ylabel('Percentage of Total Annual Revenue')
plt.title('Proportional Revenue by Hour of The Day')
# Use the hours present in hour_tp as tick positions instead of range(24):
# the number of ticks then always equals the number of labels, even when an
# hour has no trips at all.
plt.xticks(hour_tp['hour'], hour_tp['hour'])
plt.show()

In [None]:
# Mean ``ttl_am_without_tips`` per hour of the day.
plt.bar(x=hour_tp['hour'], height=hour_tp['average_trip_fare'])
plt.xlabel('Hour - 24-hour clock')
plt.ylabel('Average Total Amount')
plt.title('Average total amount by hour of the day')
# Use the hours present in hour_tp as tick positions instead of range(24):
# the number of ticks then always equals the number of labels, even when an
# hour has no trips at all.
plt.xticks(hour_tp['hour'], hour_tp['hour'])
plt.show()

### Evaluation of RatecodeIDs, PULocationIDs and DOLocationIDs
#### Evaluation of RatecodeIDs and DOLocationID
RatecodeID = 4 is associated with a single DOLocationID (265). On the other hand, DOLocationID = 265 has registered trips using all the other available RatecodeIDs.

In [None]:
# Trip count for every (DOLocationID, RatecodeID) pair that occurs in the data.
gp_dol = (
    tlc_df[['DOLocationID', 'RatecodeID']]
    .groupby(['DOLocationID', 'RatecodeID'], observed=True)
    .agg(count=pd.NamedAgg(column='RatecodeID', aggfunc='count'))
    .sort_values(by='RatecodeID')
    .reset_index()
)
# gp_dol has one row per pair, so counting rows per RatecodeID gives the
# number of drop-off locations each rate code appears with.
(
    gp_dol
    .groupby('RatecodeID', observed=True)
    .agg(associated_DOLocations=pd.NamedAgg(column='DOLocationID', aggfunc='count'))
    .reset_index()
)

In [None]:
# Rows of gp_dol for RatecodeID 4, i.e. the drop-off locations it occurs with.
mask = gp_dol['RatecodeID'].eq(4)
gp_dol.loc[mask]

In [None]:
# Rows of gp_dol for DOLocationID 265, to see whether it occurs only with RatecodeID 4.
mask = gp_dol['DOLocationID'].eq(265)
gp_dol.loc[mask]

#### Evaluation of RatecodeIDs and PULocationID

RatecodeID = 4 is associated with three PULocationIDs (90, 132, 138). On the other hand, these PULocationIDs are not associated exclusively with RatecodeID = 4.

In this scenario, RatecodeIDs cannot be directly associated with specific PULocationIDs or DOLocationIDs. In the exploration of the raw dataset, before cleaning, the distribution of the total amount by RatecodeID stood out: RatecodeID appeared to be a relevant variable for the values the total amount assumed.

In [None]:
# Trip count for every (PULocationID, RatecodeID) pair that occurs in the data.
gp_pul = (
    tlc_df.loc[:, ['PULocationID', 'RatecodeID']]
    .groupby(by=['PULocationID', 'RatecodeID'], observed=True)
    .agg(count=('RatecodeID', 'count'))
    .sort_values(by='RatecodeID')
    .reset_index()
)
# gp_pul has one row per pair, so counting rows per RatecodeID gives the
# number of pickup locations each rate code appears with. The column is named
# for pickup locations; the previous ``associated_DOLocations`` name was
# copied from the drop-off analysis and mislabelled the result.
(
    gp_pul
    .groupby(by='RatecodeID', observed=True)
    .agg(associated_PULocations=('PULocationID', 'count'))
    .reset_index()
)

In [None]:
# Rows of gp_pul for RatecodeID 4, i.e. the pickup locations it occurs with.
mask = gp_pul['RatecodeID'].eq(4)
gp_pul.loc[mask]

In [None]:
# Rows of gp_pul for PULocationID 132, to see whether it occurs only with RatecodeID 4.
mask = gp_pul['PULocationID'].eq(132)
gp_pul.loc[mask]

### Evaluation of ```RatecodeIDs``` influence over ```trip_duration_sec```, ```trip_distance```, ```ttl_am_without_tips``` and ```tolls_amount```
#### ```RatecodeIDs``` vs. ```trip_duration_sec```

Hypothesis testing possibility:
   * H0: The RatecodeID does not influence trip_duration
   * H1: The RatecodeID influences trip_duration

In [None]:
# Box plot of trip_duration_sec, one box per RatecodeID.
box = sns.boxplot(data=tlc_df, x='RatecodeID', y='trip_duration_sec')
plt.title('Trip Duration Distribution')
plt.xlabel('RatecodeID')
plt.ylabel('Trip Duration (seconds)')
plt.show()

#### ```RatecodeIDs``` vs. ```trip_distance```

Hypothesis testing possibility:
   * H0: The RatecodeID does not influence trip_distance
   * H1: The RatecodeID influences trip_distance

In [None]:
# Box plot of trip_distance, one box per RatecodeID.
box = sns.boxplot(data=tlc_df, x='RatecodeID', y='trip_distance')
plt.title('Trip Distance Distribution')
plt.xlabel('RatecodeID')
plt.ylabel('Trip Distance (miles)')
plt.show()

#### ```RatecodeIDs``` vs. ```tolls_amount```

Hypothesis testing possibility:
   * H0: The RatecodeID does not influence tolls_amount
   * H1: The RatecodeID influences tolls_amount

In [None]:
# Box plot of tolls_amount, one box per RatecodeID.
box = sns.boxplot(data=tlc_df, x='RatecodeID', y='tolls_amount')
plt.title('Tolls Amount Distribution')
plt.xlabel('RatecodeID')
plt.ylabel('Tolls Amount (U$)')
plt.show()

#### ```RatecodeIDs``` vs. ```ttl_am_without_tips```

Hypothesis testing possibility:
   * H0: The RatecodeID does not influence ttl_am_without_tips
   * H1: The RatecodeID influences ttl_am_without_tips

In [None]:
# Box plot of ttl_am_without_tips, one box per RatecodeID.
box = sns.boxplot(data=tlc_df, x='RatecodeID', y='ttl_am_without_tips')
plt.title('Total amount distribution')
plt.xlabel('RatecodeID')
plt.ylabel('Total amount')
plt.show()

#### ```RatecodeIDs``` vs. ```ttl_am_raw``` without tolls

In [None]:
# Box plot of ttl_am_raw, one box per RatecodeID.
box = sns.boxplot(data=tlc_df, x='RatecodeID', y='ttl_am_raw')
plt.title('Total amount without tolls distribution')
plt.xlabel('RatecodeID')
plt.ylabel('Total amount without tolls')
plt.show()