# ERIC STRATFORD
## CSE 151B Project
### Checkpoint Notebook

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt

import torch
import torchvision
import torchvision.transforms as transforms
import torchvision.utils
from torchvision import datasets
from torchvision.transforms import ToTensor 

import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

import time

In [None]:
# Define functions

def distance(p1, p2):
    """Return the Euclidean distance between two 2-D points.

    Parameters
    ----------
    p1, p2 : indexable
        Pairs whose element 0 is the x coordinate and element 1 the y
        coordinate. When ``p1[0]`` is a ``str``, all four coordinates are
        converted with ``float`` before the computation.

    Returns
    -------
    The value of ``np.sqrt((x2 - x1)**2 + (y2 - y1)**2)``.
    """
    if isinstance(p1[0], str):
        x1, y1 = float(p1[0]), float(p1[1])
        x2, y2 = float(p2[0]), float(p2[1])
    else:
        x1, y1 = p1[0], p1[1]
        x2, y2 = p2[0], p2[1]
    # Square each coordinate difference as a whole; the string branch used to
    # square only p1[1], which produced wrong (sometimes NaN) results.
    return np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)

def normalize_data(data):
    """Fit a fresh ``MinMaxScaler`` on ``data`` and return the transformed values.

    The scaler is created inside the function and not returned, so the
    fitted scaling parameters are not available to the caller afterwards.
    """
    return MinMaxScaler().fit_transform(data)

def convert_string_to_list(string):
    """Evaluate ``string`` as a Python expression and return the result.

    NOTE(review): this uses ``eval``, which executes arbitrary code contained
    in ``string``. Only call it on text from a trusted source; if the inputs
    are always plain literals, ``ast.literal_eval`` would be a safer choice.
    """
    parsed = eval(string)
    return parsed

def transformdf(df):
    """Return a filtered copy of ``df`` with travel-time and calendar columns.

    Rows whose ``MISSING_DATA`` value does not equal ``False`` are dropped;
    the input frame itself is not modified.

    Columns added when ``df`` has a ``POLYLINE`` column:
        * ``Count`` - number of ``"["`` characters in the ``POLYLINE`` string
          minus one, never below 0 (presumably the number of recorded points;
          verify against the data format).
        * ``Travel Time (s)`` - ``Count * 15``.
        * ``Travel Time (m)`` - ``Travel Time (s) / 60``.

    Columns always added, derived from ``TIMESTAMP`` read as seconds since
    the Unix epoch: ``Time``, ``Year``, ``Month``, ``Day of Month``,
    ``Day of Week`` (day name) and ``Hour``.
    """
    # Filter first and copy the result explicitly: assigning new columns to a
    # bare boolean-indexed slice triggers pandas' SettingWithCopyWarning, and
    # copying the whole frame before filtering copies rows that are discarded.
    data = df[df['MISSING_DATA'] == False].copy()  # noqa: E712 (element-wise comparison)

    if 'POLYLINE' in data.columns:
        # Subtract one for the outer bracket of the serialized list.
        data["Count"] = data['POLYLINE'].apply(lambda x: max(x.count("[") - 1, 0))
        data["Travel Time (s)"] = data['Count'] * 15
        data["Travel Time (m)"] = data['Travel Time (s)'] / 60

    data['Time'] = pd.to_datetime(data['TIMESTAMP'], unit='s')
    timestamps = data['Time'].dt
    data['Year'] = timestamps.year
    data['Month'] = timestamps.month
    data['Day of Month'] = timestamps.day
    data['Day of Week'] = timestamps.day_name()
    data['Hour'] = timestamps.hour
    return data

In [None]:
# Load the raw training set; train.csv is read relative to the working directory.
ogdata = pd.read_csv("train.csv")
# Peek at the first row; the trailing semicolon keeps the notebook from displaying it.
ogdata.head(1);

In [None]:
# Drop rows flagged as missing data and add the travel-time and calendar
# columns that transformdf derives from POLYLINE and TIMESTAMP.
data = transformdf(ogdata)

## Deep Learning Model and Experiment

In [None]:
# Split configuration: hold out 20% of the rows, with a fixed seed so the
# partition is reproducible.
seed = 69
test_size = 0.2

# Regression target (seconds) and the candidate input columns.
target = data['Travel Time (s)']
features = data[[
    'CALL_TYPE', 'ORIGIN_STAND', 'TAXI_ID', 'DAY_TYPE',
    'Year', 'Month', 'Day of Month', 'Day of Week', 'Hour',
]]

x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=test_size,
    random_state=seed,
)

In [None]:
# Transform and preprocess

# Every column listed here is treated as categorical and one-hot encoded.
# Columns of x_train that are not listed (ORIGIN_STAND, DAY_TYPE) are dropped,
# because ColumnTransformer defaults to remainder='drop'.
cat_features = ['Hour', 'Day of Week', 'Month', 'TAXI_ID', 'Day of Month', 'Year', 'CALL_TYPE']

# handle_unknown='ignore' makes a category that was not present when fitting
# (e.g. a TAXI_ID that only appears in held-out data) encode as all zeros
# instead of raising a ValueError at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
    ])

# Fit the encoder on the training split only so no information from the
# held-out rows leaks into the encoding.
px_train = preprocessor.fit_transform(x_train)

# Transform the test data with the categories learned from the training split
px_test = preprocessor.transform(x_test)

In [None]:
# Create and train the MLP model
# `momentum` is not passed: scikit-learn only uses it with solver='sgd', and
# the value that used to be passed here (0.9) is the library default anyway.
mlp = MLPRegressor(
    hidden_layer_sizes=(5, 5),
    activation='relu',
    solver='adam',
    random_state=69,
    max_iter=100,
    batch_size=128,
    learning_rate='constant',
    learning_rate_init=0.001,
)

# Time the fit only. The timer is stopped before predicting and scoring so
# that "Training Time" does not include evaluation work; perf_counter is the
# monotonic clock intended for measuring intervals.
start_time = time.perf_counter()
mlp.fit(px_train, y_train)
training_time = time.perf_counter() - start_time

# Make predictions on the held-out split
y_pred = mlp.predict(px_test)

# Calculate the root mean squared error
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", rmse)

# Report total training time (seconds)
print("Training Time:", training_time)

## LOSS COMPARISON
### Section 3.A.1

In [None]:
# LOSS COMPARISON CELL
# DO NOT RUN EXCEPT FOR LOSS COMPARISON
# The loop below rebinds `mlp`, `y_pred`, `rmse`, `start_time` and
# `training_time`; after this cell runs, `mlp` is the model trained with the
# LAST learning rate in the list, not the model trained earlier.

# Number of training iterations per learning rate; also the length of the
# epoch axis of the comparison table.
max_epochs = 20
learning_rates = [0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001]

# Initialize loss dataframe for comparison (one row per epoch)
loss_df = pd.DataFrame({"Epoch": range(1, max_epochs + 1)})

for learning_rate in learning_rates:

    # Create the MLP model. `momentum` is not passed: scikit-learn only uses
    # it with solver='sgd', and 0.9 is the library default anyway.
    mlp = MLPRegressor(
        hidden_layer_sizes=(5, 5),
        activation='relu',
        solver='adam',
        random_state=42,
        max_iter=max_epochs,
        batch_size=128,
        learning_rate='constant',
        learning_rate_init=learning_rate,
    )

    # Time the fit only, so prediction/scoring is not counted as training.
    start_time = time.perf_counter()
    mlp.fit(px_train, y_train)
    training_time = time.perf_counter() - start_time

    # Make predictions
    y_pred = mlp.predict(px_test)

    # Add the square root of the per-iteration training loss to the table.
    # Wrapping the curve in a Series aligns it on the row index, so a run that
    # stops before `max_epochs` iterations is padded with NaN instead of
    # raising a length-mismatch error.
    # NOTE(review): scikit-learn's reported training loss appears to be half
    # the mean squared error plus the L2 penalty, so these values are not on
    # the same scale as the RMSE printed below - confirm before comparing.
    col_name = str(learning_rate)
    loss_df[col_name] = np.sqrt(pd.Series(mlp.loss_curve_))

    # Print learning rate
    print("Learning Rate:", learning_rate)

    # Calculate the root mean squared error
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print("RMSE:", rmse)

    # Report total training time (seconds)
    print("Training Time:", training_time)


# Graph the loss over each iteration for each learning rate. The column names
# are derived from `learning_rates` so they always match the names used above.
loss_df.plot(x='Epoch', y=[str(lr) for lr in learning_rates], figsize=(12, 6));

In [None]:
# Save model

import pickle

# save the model to disk
# The `with` blocks close the file handles deterministically; the write is
# flushed to disk before the file is read back.
filename = 'mlp2_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(mlp, model_file)

# load the model from disk
# NOTE: pickle.load can execute arbitrary code from the file, so only load
# model files that were produced by this notebook or another trusted source.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Best validation RMSE: 735.57
# Corresponding test RMSE: 657.46

In [None]:
# Generate predictions on kaggle validation set

# Import validation set
# NOTE(review): transformdf drops rows whose MISSING_DATA is not False, so any
# such trip would be absent from the exported predictions - confirm that is
# acceptable for the submission format.
validation_orig = pd.read_csv("test_public.csv")
validation = transformdf(validation_orig)

# Reconfigure preprocessing with same method as training.
# handle_unknown='ignore' makes categories that do not occur in x_train
# (e.g. a TAXI_ID seen only in the validation file) encode as all zeros
# instead of raising a ValueError in transform(); the encoded columns are
# still determined by x_train alone.
cat_features = ['Hour', 'Day of Week', 'Month', 'TAXI_ID', 'Day of Month', 'Year', 'CALL_TYPE']
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
    ])
px_train = preprocessor.fit_transform(x_train)

# Run model on validation set
val_features = validation[['CALL_TYPE', 'ORIGIN_STAND', 'TAXI_ID', 'DAY_TYPE', 'Year', 'Month', 'Day of Month', 'Day of Week', 'Hour']]
val_set = preprocessor.transform(val_features)
predictions = mlp.predict(val_set)

# Format validation predictions output
validation["TRAVEL_TIME"] = predictions
# Take an explicit copy before editing a column; assigning into a bare column
# selection triggers pandas' SettingWithCopyWarning.
val_out = validation[['TRIP_ID', 'TRAVEL_TIME']].copy()
val_out['TRIP_ID'] = val_out['TRIP_ID'].astype(str)

# Export validation predictions. to_csv returns None when given a path, so
# its result is not kept.
val_out.to_csv('val_pred.csv', index=False)