# Importing Libraries

In [11]:
import pandas as pd
import datetime
import time
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns  #Plots
from matplotlib import rcParams  #Size of plots  
%matplotlib inline
import pickle #to save the model
import warnings
import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)


# Reading Data

In [12]:
Jan_22_df = pd.read_parquet('Train/yellow_tripdata_2022-01.parquet')
Feb_22_df = pd.read_parquet('Train/yellow_tripdata_2022-02.parquet')
Mar_22_df = pd.read_parquet('Train/yellow_tripdata_2022-03.parquet')

Jan_23_df = pd.read_parquet('Test/yellow_tripdata_2023-01.parquet')
Feb_23_df = pd.read_parquet('Test/yellow_tripdata_2023-02.parquet')
Mar_23_df = pd.read_parquet('Test/yellow_tripdata_2023-03.parquet')

# Functions

In [13]:
# Converting Time stamps to UNIX time stamp format
# "YYYY-MM-DD HH:MM:SS" into unix time stamp

def convert_to_unix(s):
    s = str(s)
    return time.mktime(datetime.datetime.strptime(s, "%Y-%m-%dT%H:%M:%S.%f").timetuple())

In [4]:
# we return a data frame which contains the columns
# 1.'passenger_count' : self explanatory
# 2.'trip_distance' : self explanatory
# 7.'total_amount' : total fair that was paid
# 8.'trip_times' : duration of each trip
# 9.'pickup_times : pickup time converted into unix time 

# 10.'Speed' : velocity of each trip
def return_with_trip_times(month):
    duration = month[['tpep_pickup_datetime','tpep_dropoff_datetime']]
    #pickups and dropoffs to unix time
    duration_pickup = [convert_to_unix(x) for x in duration['tpep_pickup_datetime'].values]
    duration_drop = [convert_to_unix(x) for x in duration['tpep_dropoff_datetime'].values]
    #calculate duration of trips in minutes
    durations = (np.array(duration_drop) - np.array(duration_pickup))/float(60)

    #append durations of trips and speed in miles/hr to a new dataframe
    new_frame = month[['passenger_count','trip_distance','PULocationID','DOLocationID','total_amount']]
    
    new_frame['trip_times'] = durations
    new_frame['pickup_times'] = duration_pickup
    new_frame['Speed'] = 60*(new_frame['trip_distance']/new_frame['trip_times'])
    
    return new_frame


In [5]:
thresholds = {
    'trip_times_min': 1,
    'trip_times_max': 800,
    'trip_distance_min': 0,
    'trip_distance_max': 26.4,
    'speed_min': 0,
    'speed_max': 48.32,
    'fare_min': 0,
    'fare_max': 97.6,

}

## Outlier Removal Function

In [6]:
#removing all outliers based on our univariate analysis
def remove_outliers(new_frame, thresholds):
    a = new_frame.shape[0]
    
    temp_frame = new_frame[(new_frame.trip_times > thresholds['trip_times_min']) & (new_frame.trip_times < thresholds['trip_times_max'])]
    b = temp_frame.shape[0]
    print ("Number of outliers from trip times analysis:", (a - b))
    
    temp_frame = new_frame[(new_frame.trip_distance > thresholds['trip_distance_min']) & (new_frame.trip_distance < thresholds['trip_distance_max'])]
    c = temp_frame.shape[0]
    print ("Number of outliers from trip distance analysis:", (a - c))
    
    temp_frame = new_frame[(new_frame.Speed <= thresholds['speed_max']) & (new_frame.Speed >= thresholds['speed_min'])]
    d = temp_frame.shape[0]
    print ("Number of outliers from speed analysis:", (a - d))
    
    temp_frame = new_frame[(new_frame.total_amount < thresholds['fare_max']) & (new_frame.total_amount > thresholds['fare_min'])]
    e = temp_frame.shape[0]
    print ("Number of outliers from fare analysis:", (a - e))
    
    
    new_frame = new_frame[(new_frame.trip_times > thresholds['trip_times_min']) & (new_frame.trip_times < thresholds['trip_times_max'])]
    new_frame = new_frame[(new_frame.trip_distance > thresholds['trip_distance_min']) & (new_frame.trip_distance < thresholds['trip_distance_max'])]
    new_frame = new_frame[(new_frame.Speed < thresholds['speed_max']) & (new_frame.Speed > thresholds['speed_min'])]
    new_frame = new_frame[(new_frame.total_amount < thresholds['fare_max']) & (new_frame.total_amount > thresholds['fare_min'])]
    
    print ("Total outliers removed:", (a - new_frame.shape[0]))
    print ("---")
    return new_frame

## Time Binning

In [7]:
#Refer:https://www.unixtimestamp.com/
# 1640975400 : 2022-01-01 00:00:00 
# 1643653800 : 2022-02-01 00:00:00 
# 1646073000 : 2022-03-01 00:00:00

# 1672511400 : 2023-01-01 00:00:00 
# 1675189800 : 2023-02-01 00:00:00 
# 1677609000 : 2023-03-01 00:00:00

def add_pickup_bins(frame, month, year):
    """
    Adds pickup bins to the given DataFrame based on the provided month and year.

    Args:
        frame (DataFrame): The DataFrame to which pickup bins will be added.
        month (int): The month for which pickup bins will be calculated.
        year (int): The year for which pickup bins will be calculated.

    Returns:
        DataFrame: The updated DataFrame with pickup bins added.
    """
    unix_pickup_times = [i for i in frame['pickup_times'].values]
    unix_times = [[1640975400, 1643653800, 1646073000], \
                  [1672511400, 1675189800, 1677609000]]

    start_pickup_unix = unix_times[year - 2022][month - 1]
    tenminutewise_binned_unix_pickup_times = [(int((i - start_pickup_unix) / 600)) for i in unix_pickup_times]
    frame['pickup_bins'] = np.array(tenminutewise_binned_unix_pickup_times)
    return frame

In [8]:
def data_preproc(month, monthname, monthnum, year):
    """
    Preprocesses the data for a given month.

    Parameters:
    - month (str): The name of the month.
    - monthname (str): The full name of the month.
    - monthnum (int): The numerical representation of the month.
    - year (int): The year.

    Returns:
    - frame (DataFrame): The preprocessed data frame.
    """
    frame_with_duration = return_with_trip_times(month)
    print("Removing outliers in the month of", monthname)
    print("----")
    frame_with_durations_outliers_removed = remove_outliers(frame_with_duration, thresholds)
    print("Fraction of data points that remain after removing outliers:", float(len(frame_with_durations_outliers_removed)) / len(frame_with_duration))

    frame = add_pickup_bins(frame_with_durations_outliers_removed, monthnum, year)

    return frame

In [9]:
def frame_groupby(frame):
    """
    Groups the given DataFrame by 'PULocationID' and 'pickup_bins' columns,
    and counts the number of occurrences of 'trip_distance' for each group.

    Args:
        frame (DataFrame): The input DataFrame containing the data.

    Returns:
        DataFrame: The grouped DataFrame with the no of pickups for each group.
    """
    frame_groupby = frame[['PULocationID','pickup_bins','trip_distance']].groupby(['PULocationID','pickup_bins']).count().rename(columns={'trip_distance': 'no of pickups'})
    return frame_groupby

# Main Function Call

In [10]:
Jan_22_df_proc = data_preproc(Jan_22_df,'Jan_22',1,2022)
Feb_22_df_proc = data_preproc(Feb_22_df,'Feb_22',2,2022)
Mar_22_df_proc = data_preproc(Mar_22_df,'Mar_22',3,2022)

Jan_23_df_proc = data_preproc(Jan_23_df,'Jan_23',1,2023)
Feb_23_df_proc = data_preproc(Feb_23_df,'Feb_23',2,2023)
Mar_23_df_proc = data_preproc(Mar_23_df,'Mar_23',3,2023)

Jan_22_df_grpby = frame_groupby(Jan_22_df_proc)
Feb_22_df_grpby = frame_groupby(Feb_22_df_proc)
Mar_22_df_grpby = frame_groupby(Mar_22_df_proc)

Jan_23_df_grpby = frame_groupby(Jan_23_df_proc)
Feb_23_df_grpby = frame_groupby(Feb_23_df_proc)
Mar_23_df_grpby = frame_groupby(Mar_23_df_proc)


Removing outliers in the month of Jan_22
----
Number of outliers from trip times analysis: 32812
Number of outliers from trip distance analysis: 32892
Number of outliers from speed analysis: 8009
Number of outliers from fare analysis: 18702
Total outliers removed: 61612
---
Fraction of data points that remain after removing outliers: 0.9749944296329727
Removing outliers in the month of Feb_22
----
Number of outliers from trip times analysis: 37152
Number of outliers from trip distance analysis: 36016
Number of outliers from speed analysis: 8109
Number of outliers from fare analysis: 22240
Total outliers removed: 69608
---
Fraction of data points that remain after removing outliers: 0.9766371498450543
Removing outliers in the month of Mar_22
----
Number of outliers from trip times analysis: 48393
Number of outliers from trip distance analysis: 46785
Number of outliers from speed analysis: 10174
Number of outliers from fare analysis: 31153
Total outliers removed: 91277
---
Fraction of da

## Saving Processed Dataframes

In [47]:
pickle_file = open('3_month_data_2022.pkl', 'wb') 

# source, destination 
pickle.dump(Jan_22_df_proc, pickle_file)
pickle.dump(Jan_22_df_grpby, pickle_file)
pickle.dump(Feb_22_df_proc, pickle_file)
pickle.dump(Feb_22_df_grpby, pickle_file)
pickle.dump(Mar_22_df_proc, pickle_file)
pickle.dump(Mar_22_df_grpby, pickle_file)
pickle_file.close()

pickle_file = open('3_month_data_2023.pkl', 'wb') 

# source, destination 
pickle.dump(Jan_23_df_proc, pickle_file)
pickle.dump(Jan_23_df_grpby, pickle_file)
pickle.dump(Feb_23_df_proc, pickle_file)
pickle.dump(Feb_23_df_grpby, pickle_file)
pickle.dump(Mar_23_df_proc, pickle_file)
pickle.dump(Mar_23_df_grpby, pickle_file)
pickle_file.close()