# Imports

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import tensorflow as tf

In [2]:
# Load the trip records from the 2023-01 parquet file into a DataFrame
raw_trips_path = "data/fhvhv_tripdata_2023-01.parquet"
df = pd.read_parquet(raw_trips_path)

In [None]:
# Preview the first rows of the freshly loaded trip table
df.head()

In [None]:
# Summary of the loaded table (columns, dtypes, non-null counts)
df.info()

# Data Cleanup / Preprocessing

In [5]:
# Remove rows with missing values, but only in the columns the demand
# aggregation actually relies on. A blanket dropna() would also discard every
# trip that merely lacks an unrelated field (for example one of the columns
# that are dropped as unused right afterwards), shrinking the per-zone counts.
df.dropna(subset=['request_datetime', 'PULocationID'], inplace=True)

In [6]:
# Columns that play no role in the rest of the notebook
unused_columns = [
    'hvfhs_license_num',
    'dispatching_base_num',
    'originating_base_num',
    'base_passenger_fare',
    'tolls',
    'bcf',
    'sales_tax',
    'congestion_surcharge',
    'airport_fee',
    'tips',
    'driver_pay',
    'shared_match_flag',
    'shared_request_flag',
    'access_a_ride_flag',
    'wav_match_flag',
    'wav_request_flag',
]
df.drop(columns=unused_columns, inplace=True)

In [7]:
# Hour of the day in which each ride was requested.
# An earlier variant stored the time of day as a fraction of the day instead:
#   (hour + minute / 60) / 24
request_time = df['request_datetime']
df['start_hour'] = request_time.dt.hour

In [8]:
# Calendar position of each request: ISO week number and day of the week
request_dt = df['request_datetime'].dt
df['week'] = request_dt.isocalendar().week
df['weekday'] = request_dt.dayofweek

In [None]:
# Re-check the table after the cleanup and the added start_hour/week/weekday columns
df.info()

In [10]:
# Count the trips for every (hour, weekday, week, pickup zone) combination
group_keys = ['start_hour', 'weekday', 'week', 'PULocationID']
trips_per_group = df.groupby(group_keys).size()
hourly_counts = trips_per_group.reset_index(name='count')

In [None]:
# Preview the grouped trip counts
hourly_counts.head()

In [12]:
# Wide layout: one row per (week, weekday, hour) and one column per pickup
# zone; combinations without any trip are filled with 0
pivot_df = pd.pivot_table(
    hourly_counts,
    values='count',
    index=['week', 'weekday', 'start_hour'],
    columns='PULocationID',
    fill_value=0,
)

In [13]:
# Turn the (week, weekday, start_hour) index levels back into ordinary columns
pivot_df = pivot_df.reset_index(drop=False)

In [None]:
# Preview the pivoted per-zone counts
pivot_df.head()

In [None]:
# Summary of the pivoted table (one column per pickup zone plus the time keys)
pivot_df.info()

In [16]:
# Persist the pivoted counts. The row index is written too (to_csv default),
# so the file gains a leading index column that has to be removed on reload.
pivot_df.to_csv("data/processed.csv")

# Visualization

In [None]:
# Bar chart: number of trips per pickup zone, ordered by zone ID
trips_per_zone = df['PULocationID'].value_counts(sort=False).sort_index()
fig, ax = plt.subplots(figsize=(15, 6))
trips_per_zone.plot(kind='bar', ax=ax)
ax.set_xlabel("Start Area")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Bar chart: number of trips per request hour
trips_per_hour = df['request_datetime'].dt.hour.value_counts(sort=False).sort_index()
fig, ax = plt.subplots(figsize=(15, 6))
trips_per_hour.plot(kind='bar', ax=ax)
ax.set_xlabel("Hour")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Bar chart: number of trips per day of the week, using the numeric weekday.
# (An alternative that was tried labels the bars with dt.day_name() instead.)
trips_per_weekday = df['request_datetime'].dt.dayofweek.value_counts(sort=False).sort_index()
fig, ax = plt.subplots(figsize=(15, 6))
trips_per_weekday.plot(kind='bar', ax=ax)
ax.set_xlabel("Day of week")
ax.set_ylabel("Count")
plt.show()

# Preprocessing Part 2

In [20]:
# Reload the pivoted counts that were written to disk earlier; `df` now
# refers to this table instead of the raw trips
df = pd.read_csv("data/processed.csv")

In [21]:
# Discard the first column (the row index that was saved with the CSV)
df = df.drop(columns=df.columns[0])

In [22]:
# Cast every column to integer (time keys and per-zone trip counts)
df = df.astype(int)

In [23]:
# Collect the per-zone count columns into one list per row.
# The zone columns are the ones whose header is a number (the pickup location
# IDs). Selecting them by that pattern, in ascending ID order, replaces the
# hard-coded '1':'265' label slice, which raises a KeyError as soon as zone 1
# or zone 265 has no trips in the loaded data.
zone_columns = sorted((col for col in df.columns if str(col).isdigit()), key=int)
df['counts'] = df[zone_columns].values.tolist()

In [24]:
# Keep only the time keys and the count vector. The explicit copy makes `df`
# an independent frame, so the column assignments in the following cells do
# not write into a slice of the wide table (avoids SettingWithCopyWarning).
df = df[['week','weekday','start_hour','counts']].copy()

In [25]:
# Convert time of day to cyclic features using sine and cosine.
# The hour is expressed as a fraction of the day and kept at full precision:
# rounding the fraction to two decimals moves the hours off their evenly
# spaced positions on the circle (e.g. 1/24 = 0.0417 would become 0.04).
df['start_time'] = df['start_hour'] / 24
df['time_sin'] = np.sin(2 * np.pi * df['start_time'])
df['time_cos'] = np.cos(2 * np.pi * df['start_time'])

In [26]:
# Day of the week as a point on the unit circle (period of 7 days)
weekday_angle = 2 * np.pi * df['weekday'] / 7
df['day_sin'] = np.sin(weekday_angle)
df['day_cos'] = np.cos(weekday_angle)

In [27]:
# Week number as a point on the unit circle (period of 52 weeks)
week_angle = 2 * np.pi * df['week'] / 52
df['week_sin'] = np.sin(week_angle)
df['week_cos'] = np.cos(week_angle)

In [None]:
# Preview the table with the added sine/cosine feature columns
df.head()

In [None]:
# Summary of the feature table (columns, dtypes, non-null counts)
df.info()

# Predicting Start Points

In [30]:
# Model inputs: one sine/cosine pair per cyclic time component
features = [
    'time_sin', 'time_cos',  # hour of the day
    'day_sin', 'day_cos',    # day of the week
    'week_sin', 'week_cos',  # week number
]
# Model output: the per-zone count vector
targets = ['counts']

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
# Stack the per-row count lists into a single 2-D float array (rows x zones).
# Passing df[targets] directly would hand Keras one object-dtype column of
# Python lists, which it cannot convert into a tensor.
count_matrix = np.array(df[targets[0]].tolist(), dtype=np.float32)

# 80/20 split with a fixed seed so the partition is reproducible
X_train, X_test, y_train, y_test = train_test_split(df[features], count_matrix, test_size=0.2, random_state=42)

In [33]:
# Rebind the large intermediate tables to empty lists so the memory held by
# the original objects can be reclaimed
df, hourly_counts, pivot_df = [], [], []

In [35]:
# Flattening the df from (x,1) to (x,)
#y_train = y_train.values.flatten()
#y_test = y_test.values.flatten()

1 am thoughts:

I don't think the concept of entering the time and predicting the place where the ride will start from is a good idea.

Instead, I think I should take an average over a small time period, such as 15 minutes, and count the demand of each area,
then compare the model's accuracy with the top x% of demand of that area.

In [None]:
# Width of one target vector (one entry per pickup-zone column). It is taken
# from the data instead of being hard-coded so the output layer always matches
# the targets; the expression works whether y_train is a 2-D array or a
# single column holding one list per row.
n_outputs = np.ravel(np.asarray(y_train)[0].tolist()).size

model = tf.keras.Sequential()

# Input layer: one value per feature column.
# tf.keras.Input(shape=...) replaces InputLayer(input_shape=...), whose
# `input_shape` argument is deprecated in newer Keras releases.
model.add(tf.keras.Input(shape=X_train.shape[1:]))

# Hidden layers
model.add(tf.keras.layers.Dense(256, activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(128, activation='relu'))

# Output layer: softmax over the pickup zones present in the targets
model.add(tf.keras.layers.Dense(n_outputs, activation='softmax'))

In [None]:
# Print the layer-by-layer overview of the network
model.summary()

In [None]:
model.compile(optimizer='adam', metrics=['accuracy'], loss='categorical_crossentropy')

# Stop once the validation loss has not improved for 10 epochs and roll the
# model back to the best weights seen so far.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Validate on a slice held out from the training data rather than on the test
# set: early stopping picks the final weights by validation loss, so feeding
# X_test/y_test here would bias the test score reported afterwards.
# The features are passed as a plain float array so Keras can slice them for
# the validation split.
history = model.fit(
    np.asarray(X_train, dtype=np.float32),
    y_train,
    batch_size=100,
    epochs=1000,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1,
)
model.save('model.keras')

In [None]:
# Training vs. validation accuracy over the epochs
fig, ax = plt.subplots()
for metric_key, curve_label in (
    ('accuracy', 'training accuracy'),
    ('val_accuracy', 'validation accuracy'),
):
    ax.plot(history.history[metric_key], label=curve_label)
ax.set_title('Accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()

In [35]:
# Reload the model from the file written by the training cell
model = tf.keras.models.load_model('model.keras')

In [None]:
# Evaluate on the held-out test split; the first two returned values are the
# loss and the accuracy metric configured at compile time
score = model.evaluate(X_test, y_test, verbose=0)
for label, value in zip(('Test loss:', 'Test accuracy:'), score):
    print(label, value)