# Imports

In [1]:
import numpy as np
import pandas as pd

import datetime
import os
import requests

# Loading Dataset

In [2]:
def load_data(path, urls, file_name, force_update=False):
    if os.path.exists(path) and (not force_update):
        pass
    else:
        if not os.path.exists(path):
            os.mkdir(path)
        for index in range(len(urls)):
            response = requests.get(urls[index])
            name = path + file_name + str(index + 1) + '.parquet'

            with open(name, 'wb') as f:
                f.write(response.content)
    dataset = []
    for i in range(len(urls)):
        name = path + file_name + str(i + 1) + '.parquet'
        data = pd.read_parquet(name)
        dataset.append(data)

    return dataset

In [3]:
urls = ['https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet',
        'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet',
        'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet',
        'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-04.parquet']

data = load_data('datasets/', urls, file_name='yellow_tripdata_2023-0')

data_1_raw = pd.DataFrame(data[0])
data_2_raw = pd.DataFrame(data[1])
data_3_raw = pd.DataFrame(data[2])
data_4_raw = pd.DataFrame(data[3])

# Cleaning dataset

In [4]:
def clean_data_based_on_date(dataset, start_date: str, end_date: str):
    # create date from str
    start_date = datetime.date.fromisoformat(start_date)
    end_date = datetime.date.fromisoformat(end_date)
    clean_dataset = dataset[(dataset['tpep_pickup_datetime'].dt.date >= start_date) &
                            (dataset['tpep_pickup_datetime'].dt.date <= end_date)]

    return clean_dataset

In [5]:
data_1 = clean_data_based_on_date(data_1_raw, '2023-01-01', '2023-01-31')   # January
data_2 = clean_data_based_on_date(data_2_raw, '2023-02-01', '2023-02-28')   # February
data_3 = clean_data_based_on_date(data_3_raw, '2023-03-01', '2023-03-31')   # March
data_4 = clean_data_based_on_date(data_4_raw, '2023-04-01', '2023-04-30')   # April

In [6]:
print(f"data_1 shape : {data_1.shape}")
print(f"data_2 shape : {data_2.shape}")
print(f"data_3 shape : {data_3.shape}")
print(f"data_4 shape : {data_4.shape}")

print(sum([x.shape[0] for x in [data_1, data_2, data_3, data_4]]))

data_1 shape : (3066718, 19)
data_2 shape : (2913900, 19)
data_3 shape : (3403577, 19)
data_4 shape : (3288155, 19)
12672350


# Labeling & Feature engineering

In [7]:
def add_date_from_datetime(dataset, date_col_name: str, datetime_col_name: str):
    dataset[date_col_name] = pd.to_datetime(dataset[datetime_col_name].dt.date)
    return dataset

In [8]:
def count_demand_for_each_loc_and_date(dataset):
    return dataset.groupby(['PULocationID', 'PU_date'])['PU_date'].count().to_frame('Demand')\
        .sort_values(['PULocationID', 'PU_date'], ascending=[True, True]).reset_index()

In [9]:
def complete_dataset_for_each_location(dataset, location_id, start_date, end_date):
    # create data frame for each location
    sub_df = pd.DataFrame({'location': location_id, 'date': pd.date_range(start=start_date, end=end_date)})
    sub_df['Demand'] = 0
    for i in range(len(sub_df)):
        loc_date_row = dataset[(dataset['PULocationID']==sub_df['location'][i]) & (dataset['PU_date']==sub_df['date'][i])]
        if not loc_date_row.empty:
            sub_df['Demand'][i] = loc_date_row['Demand']
    return sub_df

In [10]:
def get_label(dataset):
    start_date = dataset['PU_date'].min()
    end_date = dataset['PU_date'].max()

    location_ids = dataset.PULocationID.unique()
    sub_dfs = []
    for location_id in location_ids:
        sub_dfs.append(complete_dataset_for_each_location(dataset, location_id, start_date, end_date))
    data_modified = pd.concat(sub_dfs).reset_index(drop=True)
    return data_modified
    

In [11]:
def create_feature(dataset):
    start_date = dataset['PU_date'].min()
    end_date = dataset['PU_date'].max()

    location_ids = dataset.PULocationID.unique()
    sub_dfs = []
    for location_id in location_ids:
        sub_df = complete_dataset_for_each_location(dataset, location_id, start_date, end_date)
        sub_df['prev_day_demand'] = sub_df['Demand'].shift(1)      
        sub_df['prev_week_demand'] = sub_df['Demand'].shift(7)      
        sub_dfs.append(sub_df)
    data_modified = pd.concat(sub_dfs).reset_index(drop=True)
    data_modified['day_of_week'] = data_modified['date'].dt.dayofweek   
    data_modified['day_of_month'] = data_modified['date'].dt.day
    return data_modified

In [12]:
data_1 = add_date_from_datetime(data_1, 'PU_date', 'tpep_pickup_datetime')
data_2 = add_date_from_datetime(data_2, 'PU_date', 'tpep_pickup_datetime')
data_3 = add_date_from_datetime(data_3, 'PU_date', 'tpep_pickup_datetime')
data_4 = add_date_from_datetime(data_4, 'PU_date', 'tpep_pickup_datetime')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset[date_col_name] = pd.to_datetime(dataset[datetime_col_name].dt.date)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset[date_col_name] = pd.to_datetime(dataset[datetime_col_name].dt.date)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset[date_col_name] = pd.to_datetime(dataset[datet

In [13]:
data_1_sort_by_PUloc_PUdate = count_demand_for_each_loc_and_date(data_1)
data_2_sort_by_PUloc_PUdate = count_demand_for_each_loc_and_date(data_2)
data_3_sort_by_PUloc_PUdate = count_demand_for_each_loc_and_date(data_3)
data_4_sort_by_PUloc_PUdate = count_demand_for_each_loc_and_date(data_4)

In [14]:
labels = get_label(data_1_sort_by_PUloc_PUdate)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df['Demand'][i] = loc_date_row['Demand']
  sub_df['Demand'][i] = loc_date_row['Demand']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df['Demand'][i] = loc_date_row['Demand']
  sub_df['Demand'][i] = loc_date_row['Demand']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df['Demand'][i] = loc_date_row['Demand']
  sub_df['Demand'][i] = loc_date_row['Demand']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentat

In [15]:
labels.head()

Unnamed: 0,location,date,Demand
0,1,2023-01-01,40
1,1,2023-01-02,31
2,1,2023-01-03,27
3,1,2023-01-04,7
4,1,2023-01-05,15


In [16]:
features = create_feature(data_1_sort_by_PUloc_PUdate)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df['Demand'][i] = loc_date_row['Demand']
  sub_df['Demand'][i] = loc_date_row['Demand']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df['Demand'][i] = loc_date_row['Demand']
  sub_df['Demand'][i] = loc_date_row['Demand']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df['Demand'][i] = loc_date_row['Demand']
  sub_df['Demand'][i] = loc_date_row['Demand']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentat

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df['Demand'][i] = loc_date_row['Demand']
  sub_df['Demand'][i] = loc_date_row['Demand']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df['Demand'][i] = loc_date_row['Demand']
  sub_df['Demand'][i] = loc_date_row['Demand']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df['Demand'][i] = loc_date_row['Demand']
  sub_df['Demand'][i] = loc_date_row['Demand']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentat

In [17]:
features.head()

Unnamed: 0,location,date,Demand,prev_day_demand,prev_week_demand,day_of_week,day_of_month
0,1,2023-01-01,40,,,6,1
1,1,2023-01-02,31,40.0,,0,2
2,1,2023-01-03,27,31.0,,1,3
3,1,2023-01-04,7,27.0,,2,4
4,1,2023-01-05,15,7.0,,3,5
