# Imports and Data Loading

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import tensorflow as tf

In [None]:
# Load the trip records from the local parquet file into a DataFrame.
df = pd.read_parquet(
    "data/fhvhv_tripdata_2023-01.parquet"
)

In [None]:
# Preview the first five rows of the raw trip table.
df.head(n=5)

In [None]:
# Print the column names, dtypes, non-null counts and memory footprint of the raw table.
df.info()

# Data Cleanup / Preprocessing

In [None]:
# Remove every row that has a null in any column.
# NOTE(review): this runs before the unused columns are dropped, so a null in a
# column that is discarded later still removes the row - confirm that is intended.
df = df.dropna()

In [None]:
# Discard the columns that the rest of the notebook does not use.
unused_columns = [
    'dispatching_base_num',
    'originating_base_num',
    'base_passenger_fare',
    'tolls',
    'bcf',
    'sales_tax',
    'congestion_surcharge',
    'airport_fee',
    'tips',
    'driver_pay',
    'access_a_ride_flag',
    'wav_match_flag',
    'wav_request_flag',
]
df = df.drop(columns=unused_columns)

In [None]:
# Keep only Uber rides (licence number 'HV0003'), then drop the licence column,
# which is constant after the filter.
# Filtering and dropping in one chained expression returns a fresh frame, so no
# in-place mutation is performed on a filtered slice (which pandas flags with a
# chained-assignment warning).
df = (
    df.loc[df['hvfhs_license_num'] == 'HV0003']
    .drop(columns=['hvfhs_license_num'])
)

In [None]:
# Drop trips whose pickup or drop-off zone ID is 264 or 265.
# NOTE(review): presumably these are the "unknown / outside the city" zones - verify
# against the taxi zone lookup table.
excluded_zone_ids = [264, 265]
pickup_excluded = df['PULocationID'].isin(excluded_zone_ids)
dropoff_excluded = df['DOLocationID'].isin(excluded_zone_ids)
df = df[~(pickup_excluded | dropoff_excluded)]

In [None]:
# Trim long-distance outliers: keep trips strictly below the 99th percentile of trip_miles.
trip_miles_99th_percentile = df['trip_miles'].quantile(q=0.99)
df = df.loc[df['trip_miles'].lt(trip_miles_99th_percentile)]

# Trim long-duration outliers the same way. The trip_time percentile is computed on
# the rows that survived the mileage filter above, not on the original table.
trip_time_99th_percentile = df['trip_time'].quantile(q=0.99)
df = df.loc[df['trip_time'].lt(trip_time_99th_percentile)]

In [None]:
# Derive the hour of day (0-23) and day of week (0-6) of each request, then encode
# both as sine/cosine pairs so the last value of a cycle sits next to the first
# (hour 23 next to hour 0, weekday 6 next to weekday 0).
#
# `assign` builds a new frame rather than writing columns into `df`, which at this
# point is the result of boolean filtering; direct column assignment on it raises
# SettingWithCopyWarning.
df = df.assign(
    hour=df['request_datetime'].dt.hour,
    weekday=df['request_datetime'].dt.dayofweek,
)
df = df.assign(
    hour_sin=np.sin(2 * np.pi * df['hour'] / 24),
    hour_cos=np.cos(2 * np.pi * df['hour'] / 24),
    weekday_sin=np.sin(2 * np.pi * df['weekday'] / 7),
    weekday_cos=np.cos(2 * np.pi * df['weekday'] / 7),
)

In [None]:
# Count trips per (pickup zone, drop-off zone, hour, weekday) combination; the
# count becomes the `demand` column.
demand_keys = ['PULocationID', 'DOLocationID', 'hour', 'weekday']
trips_per_group = df.groupby(demand_keys).size()
demand = trips_per_group.reset_index(name='demand')

In [None]:
# Add the cyclic encodings to the aggregated demand table.
# They are a pure function of `hour` and `weekday`, which `demand` already holds,
# so they are computed here with the same formulas used on the trip table. This
# avoids de-duplicating and joining the full trip table, and keeps the cell
# idempotent (re-running a merge would create suffixed duplicate columns).
demand = demand.assign(
    hour_sin=np.sin(2 * np.pi * demand['hour'] / 24),
    hour_cos=np.cos(2 * np.pi * demand['hour'] / 24),
    weekday_sin=np.sin(2 * np.pi * demand['weekday'] / 7),
    weekday_cos=np.cos(2 * np.pi * demand['weekday'] / 7),
)

In [None]:
# Print the columns, dtypes and non-null counts of the aggregated demand table.
demand.info()

In [None]:
# Preview the first five rows of the aggregated demand table.
demand.head(n=5)

In [None]:
# Drop the lowest-demand groups: keep only rows whose demand is strictly above the
# 0.5th percentile (quantile 0.005).
# NOTE: the variable keeps its original "1st_percentile" name for compatibility,
# but the threshold actually used is the 0.5th percentile.
demand_1st_percentile = demand['demand'].quantile(0.005)
# .copy() yields an independent frame, so the column assignments made in later
# cells do not raise SettingWithCopyWarning.
demand = demand[demand['demand'] > demand_1st_percentile].copy()

In [None]:
# Bin edges 0, 1, 2, 3, 4, 5, inf and one label per right-closed interval:
# (0, 1] -> -1, (1, 2] -> 0, (2, 3] -> 1, (3, 4] -> 2, (4, 5] -> 3, (5, inf] -> 4.
bins = [*range(6), float('inf')]
labels = list(range(-1, 5))

# Store the interval label of each row's demand in 'demand_category'.
demand['demand_category'] = pd.cut(x=demand['demand'], bins=bins, labels=labels)

In [None]:
# pd.cut produced a categorical column; cast it to plain integers.
demand = demand.astype({'demand_category': int})

In [None]:
# Collapse the binned labels into fewer classes: 2 and 3 become 1, and 4 becomes 2.
# With the bins defined earlier this maps demand 2 -> class 0, demand 3-5 -> class 1,
# and demand >= 6 -> class 2.
#
# The result is assigned back to the column. Calling replace(..., inplace=True) on
# a selected column is chained assignment: it raises a FutureWarning in pandas 2.2
# and silently leaves `demand` unchanged once Copy-on-Write is enabled.
demand['demand_category'] = demand['demand_category'].replace({2: 1, 3: 1, 4: 2})

In [None]:
# Persist the labelled demand table as CSV, without the index column.
demand.to_csv(path_or_buf='data/demand.csv', index=False)

# Visualization

In [None]:
# Bar chart: number of trips per pickup zone, ordered by zone ID.
fig, ax = plt.subplots(figsize=(15, 6))
pickup_counts = df['PULocationID'].value_counts(sort=False).sort_index()
pickup_counts.plot.bar(ax=ax)
ax.set_xlabel("Start Area")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Bar chart: number of trips per request hour (0-23).
fig, ax = plt.subplots(figsize=(15, 6))
hourly_counts = df['request_datetime'].dt.hour.value_counts(sort=False).sort_index()
hourly_counts.plot.bar(ax=ax)
ax.set_xlabel("Hour")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Bar chart: number of trips per day of week (pandas convention: 0 = Monday).
fig, ax = plt.subplots(figsize=(15, 6))
weekday_counts = df['request_datetime'].dt.dayofweek.value_counts(sort=False).sort_index()
weekday_counts.plot.bar(ax=ax)
ax.set_xlabel("Day of week")
ax.set_ylabel("Count")
plt.show()

# Predicting Demand

In [None]:
import seaborn as sns

# Heatmap of the pairwise correlations between all columns of the demand table,
# annotated with two-decimal values.
correlation_matrix = demand.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
plt.show()

In [None]:
# Model inputs: the cyclic time encodings plus the pickup zone ID.
# Target: the collapsed demand class.
# NOTE(review): demand was counted per (pickup, drop-off, hour, weekday), but the
# drop-off zone is not a feature, so identical feature rows can carry different
# labels - confirm this is intended.
# NOTE(review): PULocationID is fed in as a raw number although it is an identifier;
# consider an embedding or one-hot encoding.
feature_columns = ['hour_sin', 'hour_cos', 'weekday_sin', 'weekday_cos', 'PULocationID']
X = demand[feature_columns]
y = demand['demand_category']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Save memory: rebind both names to empty lists so these references to the large
# frames are released.
df, demand = [], []

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
# Feed-forward classifier: two 64-unit ReLU hidden layers followed by a softmax
# layer with one output per distinct training label.
n_features = X_train.shape[1]
n_classes = len(np.unique(y_train))

model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(n_features,)))
model.add(Dense(64, activation='relu'))
model.add(Dense(n_classes, activation='softmax'))

# Sparse categorical cross-entropy takes integer class labels directly.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

In [None]:
from sklearn.utils import class_weight

# Calculate class weights: 'balanced' weights each class inversely to its frequency
# in y_train.
class_labels = np.unique(y_train)
class_weights = class_weight.compute_class_weight('balanced', classes=class_labels, y=y_train)

# Key the mapping by the actual class label rather than by position in the array,
# so each weight stays attached to its class even if the labels are not exactly
# 0..n-1. Plain int/float values keep the dict easy to read and serialise.
class_weights_dict = {
    int(label): float(weight)
    for label, weight in zip(class_labels, class_weights)
}

# Manual adjustment: scale the weight of class 2 down by 20%.
class_weights_dict[2] *= 0.8

In [None]:
# Display the final class-weight mapping that is passed to model.fit.
class_weights_dict

In [None]:
# Sanity check before training: label values, class-weight keys and label types.
label_types = {type(label) for label in y_train}
print("y_train unique values:", set(y_train))
print("class_weights_dict keys:", class_weights_dict.keys())
print("y_train types:", label_types)

In [None]:

# Train with class weighting and keep the per-epoch metrics in `history`.
# NOTE(review): with epochs=1 the accuracy curve plotted afterwards has a single
# point, and the test split doubles as validation data - confirm both are intended.
history = model.fit(
    X_train,
    y_train,
    epochs=1,
    batch_size=32,
    validation_data=(X_test, y_test),
    class_weight=class_weights_dict,
)
# Persist the trained model.
model.save('model.keras')

In [None]:
# Plot training and validation accuracy per epoch.
fig, ax = plt.subplots()
accuracy_series = (
    ('accuracy', 'training accuracy'),
    ('val_accuracy', 'val accuracy'),
)
for history_key, legend_label in accuracy_series:
    ax.plot(history.history[history_key], label=legend_label)
ax.set_title('Accuracy')
ax.set_xlabel('epochs')
ax.set_ylabel('accuracy')
ax.legend()
plt.show()

# Evaluation

In [None]:
# Reload the model saved after training, so evaluation runs on the persisted artifact.
model = tf.keras.models.load_model(filepath='model.keras')

In [None]:
# Evaluate on the held-out split; `score` holds the loss followed by the accuracy.
score = model.evaluate(X_test, y_test, verbose=0)
for metric_label, metric_value in zip(('Test Loss', 'Test accuracy'), score):
    print(metric_label, metric_value)

In [None]:
# Predict class probabilities for each test row, then keep the index of the most
# probable class as the predicted label.
class_probabilities = model.predict(X_test)
y_pred = class_probabilities.argmax(axis=1)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Print the per-class precision/recall/F1 report, then the raw confusion matrix.
for metric_fn in (classification_report, confusion_matrix):
    print(metric_fn(y_test, y_pred))

In [None]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns

cm = confusion_matrix(y_test, y_pred)

# Draw on an explicit figure so the saved image contains only this heatmap.
fig, ax = plt.subplots()
# fmt='d' prints the integer counts in full; the default annotation format
# abbreviates large counts in scientific notation.
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
# confusion_matrix puts true labels on rows and predicted labels on columns.
ax.set_xlabel('Predicted category')
ax.set_ylabel('True category')
fig.savefig('h1.png')

In [None]:
import datetime

def predict_demand(date_time, location_ids=None, predictor=None):
    """Predict the demand category of every pickup zone at a given moment.

    Parameters
    ----------
    date_time : datetime.datetime
        Moment to predict for. Its minutes are rounded down to the start of the
        quarter hour and expressed as a fractional hour (e.g. 08:20 -> 8.25).
    location_ids : array-like of int, optional
        Pickup zone IDs to predict for. Defaults to the distinct ``PULocationID``
        values of the notebook-level feature frame ``X``.
    predictor : object with ``predict(DataFrame)``, optional
        Model used for inference. Defaults to the notebook-level ``model``.

    Returns
    -------
    pandas.DataFrame
        One row per zone, with columns ``PULocationID`` and ``predicted_category``
        (the index of the highest-scoring class).

    Notes
    -----
    NOTE(review): training derives ``hour`` from ``dt.hour`` (whole hours), while
    this function feeds quarter-hour fractions - confirm the interpolation is
    intended.
    """
    # Time features, encoded with the same sine/cosine scheme as the training data.
    hour = date_time.hour + (date_time.minute // 15 * 15 / 60)
    weekday = date_time.weekday()  # 0 = Monday, matching pandas' dayofweek

    hour_angle = 2 * np.pi * hour / 24
    weekday_angle = 2 * np.pi * weekday / 7

    # Fall back to the notebook globals only when no override is supplied, so
    # existing single-argument calls behave exactly as before.
    if location_ids is None:
        location_ids = X['PULocationID'].unique()
    if predictor is None:
        predictor = model

    zone_ids = np.asarray(location_ids)
    n_zones = len(zone_ids)

    # One row per zone; the column order must match the training feature order.
    data = pd.DataFrame({
        'hour_sin': np.full(n_zones, np.sin(hour_angle)),
        'hour_cos': np.full(n_zones, np.cos(hour_angle)),
        'weekday_sin': np.full(n_zones, np.sin(weekday_angle)),
        'weekday_cos': np.full(n_zones, np.cos(weekday_angle)),
        'PULocationID': zone_ids,
    })

    # Class scores per zone -> index of the best-scoring class.
    predictions = predictor.predict(data)
    data['predicted_category'] = np.argmax(predictions, axis=1)

    return data[['PULocationID', 'predicted_category']]



In [None]:
# Example usage: predict the demand category of every zone for the current moment,
# print the table, and write it to CSV without the index column.
date_time = datetime.datetime.now()
demand_per_area = predict_demand(date_time)
print(demand_per_area)
demand_per_area.to_csv(path_or_buf='data/output.csv', index=False)

In [None]:
# Load the taxi zone table; later cells read its LocationID and the_geom columns.
taxi_zones = pd.read_csv(filepath_or_buffer='data/taxi_zones.csv')

In [None]:
import folium
from shapely import wkt
from shapely.geometry import MultiPolygon, Polygon
from folium import GeoJson
from folium.plugins import MarkerCluster

# Attach each zone's predicted demand category to its geometry
# (inner join: LocationID on the zone side, PULocationID on the prediction side).
merged_df = taxi_zones.merge(demand_per_area, left_on="LocationID", right_on="PULocationID")

# Fill colour per demand category.
demand_colors = {0: 'red', 1: 'orange', 2: 'green'}

# Base map centred on NYC.
nyc_map = folium.Map(location=[40.7128, -74.0060], zoom_start=10)


def _zone_style(fill):
    """Return a folium style callback that paints a zone with the given fill colour."""
    def style(_feature):
        return {'fillColor': fill, 'color': 'black', 'weight': 1, 'fillOpacity': 0.5}
    return style


# Draw every zone whose geometry parses to a polygon or multipolygon.
for zone_wkt, demand_level in zip(merged_df['the_geom'], merged_df['predicted_category']):
    color = demand_colors[demand_level]

    # the_geom stores the zone outline as WKT text.
    geometry = wkt.loads(zone_wkt)
    if not isinstance(geometry, (MultiPolygon, Polygon)):
        continue

    zone_layer = GeoJson(data=geometry.__geo_interface__, style_function=_zone_style(color))
    zone_layer.add_to(nyc_map)

# Display map
nyc_map
