# Imports

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import tensorflow as tf

In [None]:
# Load the raw trip records from the local Parquet file
trips_path = "data/fhvhv_tripdata_2023-01.parquet"
df = pd.read_parquet(trips_path)

In [None]:
# Preview the first rows of the raw data
df.head()

In [None]:
# Print a summary of the raw frame (columns, dtypes, memory usage)
df.info()

# Data Cleanup / Preprocessing

In [None]:
# Remove every row that has a null in ANY column (modifies df in place).
# NOTE(review): this runs before the unused columns are dropped in the next cell,
# so a null in one of those soon-to-be-discarded columns also removes the row —
# confirm that is intended.
df.dropna(inplace=True)

In [None]:
# Drop the columns that are not used by the analysis below
unused_columns = [
    'hvfhs_license_num',
    'dispatching_base_num',
    'originating_base_num',
    'base_passenger_fare',
    'tolls',
    'bcf',
    'sales_tax',
    'congestion_surcharge',
    'airport_fee',
    'tips',
    'driver_pay',
    'shared_match_flag',
    'shared_request_flag',
    'access_a_ride_flag',
    'wav_match_flag',
    'wav_request_flag',
]
df.drop(columns=unused_columns, inplace=True)

In [None]:
# Cast the pickup/drop-off location IDs to numbers; values that cannot be
# parsed become NaN because of errors='coerce'
location_ids = df[['PULocationID', 'DOLocationID']]
df[['Pickup', 'Drop']] = location_ids.apply(pd.to_numeric, errors='coerce')

In [None]:
# Parse the timestamps: the trip start is taken from the request time and the
# trip end from the drop-off time
for parsed_col, raw_col in (('started_at', 'request_datetime'),
                            ('ended_at', 'dropoff_datetime')):
    df[parsed_col] = pd.to_datetime(df[raw_col])

In [None]:
# Express the start time as the fraction of the day that has elapsed
# (hours plus fractional minutes, divided by 24)
start_ts = df['started_at']
df['start_time'] = (start_ts.dt.hour + (start_ts.dt.minute / 60)) / 24

# Map that fraction onto the unit circle so late-night and early-morning
# times end up close to each other
day_angle = 2 * np.pi * df['start_time']
df['time_sin'] = np.sin(day_angle)
df['time_cos'] = np.cos(day_angle)

In [None]:
# Encode the day of the week cyclically as well (7-day period)
weekday_angle = 2 * np.pi * df['started_at'].dt.dayofweek / 7
df['day_sin'] = np.sin(weekday_angle)
df['day_cos'] = np.cos(weekday_angle)

In [None]:
# Re-check the frame summary after cleanup and feature engineering
df.info()

In [None]:
# Preview the first rows including the newly added columns
df.head()

# Visualization

In [None]:
# Bar chart of trip counts per pickup location, ordered by location ID
pickup_counts = df['Pickup'].value_counts(sort=False).sort_index()
plt.figure(figsize=(15,6))
pickup_counts.plot(kind='bar')
plt.xlabel("Start Station")
plt.ylabel("Count")
plt.show()

In [None]:
# Bar chart of trip counts per hour of the day, ordered by hour.
# 'started_at' is derived from the request time, so this shows when rides are requested.
hourly_counts = df['started_at'].dt.hour.value_counts(sort=False).sort_index()
plt.figure(figsize=(15,6))
hourly_counts.plot(kind='bar')
# The x-axis holds hours, not weekdays (the weekday chart is plotted separately)
plt.xlabel("Hour of day")
plt.ylabel("Count")
plt.show()

In [None]:
# Bar chart of trip counts per day of the week, ordered by weekday index
weekday_counts = df['started_at'].dt.dayofweek.value_counts(sort=False).sort_index()
plt.figure(figsize=(15,6))
weekday_counts.plot(kind='bar')
plt.xlabel("Day of week")
plt.ylabel("Count")
plt.show()

# Predicting Start Points

In [None]:
# Largest pickup location ID present in the data; it determines how many
# classes the one-hot encoding / softmax output below must cover.
# Series.max() is vectorized, unlike the builtin max(), which walks the
# column element by element in Python.
df['Pickup'].max()

In [None]:
# Model inputs: the cyclic time-of-day and day-of-week encodings
features = [
    'time_sin',
    'time_cos',
    'day_sin',
    'day_cos',
]
# Model target: the pickup location ID
targets = ['Pickup']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[targets],
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise the features using statistics learned from the training split only,
# then apply the same transformation to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from tensorflow.keras.utils import to_categorical

In [None]:
# Collapse the single-column target frames into 1-D arrays
y_train = y_train.to_numpy().flatten()
y_test = y_test.to_numpy().flatten()

In [None]:
# Display the shape of the scaled training matrix
X_train.shape

In [None]:
# Make sure the class labels are integers before one-hot encoding them
y_train = np.array(y_train, dtype=int)
y_test = np.array(y_test, dtype=int)

In [None]:
# One-hot encode the test labels into 266 columns (class indices 0-265)
y_test = to_categorical(y_test, num_classes=266)

In [None]:
# One-hot encode the training labels into 266 columns (class indices 0-265)
y_train = to_categorical(y_train, num_classes=266)

In [None]:
# Feed-forward classifier: scaled cyclic time features in, one probability
# per location class out
model = tf.keras.Sequential()

# Input layer: one input per feature column
model.add(tf.keras.layers.InputLayer(input_shape=X_train.shape[1:]))

# Hidden layers
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(32, activation='relu'))

# Output layer: the unit count must equal the width of the one-hot targets
# (to_categorical(..., 266) above); with 265 units the categorical
# cross-entropy loss fails on a shape mismatch
model.add(tf.keras.layers.Dense(266, activation='softmax'))

In [None]:
# Print the layer-by-layer summary of the classifier
model.summary()

In [None]:
# Multi-class setup: one-hot targets with categorical cross-entropy, tracking accuracy
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
#early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
#history = model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test), callbacks=[early_stopping], verbose=1)

# Train in large mini-batches; the held-out split is also used as validation data
history = model.fit(
    X_train,
    y_train,
    batch_size=1000,
    epochs=10,
    validation_data=(X_test, y_test),
    verbose=1,
)
#from tqdm.keras import TqdmCallback

#history = model.fit(X_train, y_train, batch_size=1000, epochs=10, validation_data=(X_test, y_test),
#                    callbacks=[TqdmCallback(verbose=1)])

# Persist the trained model so it can be reloaded without retraining
model.save('model.keras')

In [None]:
# Plot the per-epoch training and validation accuracy recorded during fit()
for metric_key, curve_label in (('accuracy', 'training accuracy'),
                                ('val_accuracy', 'validation accuracy')):
    plt.plot(history.history[metric_key], label=curve_label)
plt.title('Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Reload the classifier from the file written by model.save() after training
model = tf.keras.models.load_model('model.keras')

In [None]:
# Evaluate on the held-out split; the result holds the loss followed by the accuracy metric
score = model.evaluate(X_test, y_test, verbose=0)
test_loss, test_accuracy = score[0], score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

# Predicting End Points

In [None]:
# Inputs and targets for the end-point model (presumably trip start and end coordinates).
# NOTE(review): none of the visible preprocessing cells create 'start_lat',
# 'start_lng', 'end_lat' or 'end_lng' — confirm these columns exist in df,
# otherwise the df[features] / df[targets] selection below raises a KeyError.
features = ['start_lat','start_lng']
targets = ['end_lat','end_lng']

In [None]:
# Re-split the data for the end-point model: 20% held out, same fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[targets],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Small regression network: one L2-regularised hidden layer with batch
# normalisation and dropout, followed by a 2-unit linear output
model2 = tf.keras.Sequential()
model2.add(tf.keras.layers.Dense(
    32,
    activation='relu',
    input_shape=(X_train.shape[1],),
    kernel_regularizer=tf.keras.regularizers.l2(0.01),
))
model2.add(tf.keras.layers.BatchNormalization())
model2.add(tf.keras.layers.Dropout(0.3))
model2.add(tf.keras.layers.Dense(2))

In [None]:
model2.compile(optimizer='adam',metrics=['accuracy'],loss='mse')
model2.fit(X_train, y_train, epochs=1, validation_data=(X_test, y_test))

In [None]:
score = model2.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])