In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from torch.distributions import Categorical
import torch.optim as optim
import ast
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import train_test_split
import torch.utils.data as Data
from sklearn.preprocessing import OneHotEncoder
import lightgbm as ltb
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
import scikitplot as skplt
from sklearn import tree
import csv
import pickle

# Pick the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

## Load Data

In [2]:
# Submission frame; later cells assign a TRAVEL_TIME column to it and write it back out
# once per model.
public_test = pd.read_csv(filepath_or_buffer="sampleSubmission.csv")

In [3]:
# Read the three header-less CSVs (training inputs, training targets, test inputs)
# in the same order as before.
X, y, test = (
    pd.read_csv(csv_path, header=None)
    for csv_path in ("my_train_X.csv", "my_train_y.csv", "my_test.csv")
)

# Work on copies so the frames loaded above stay untouched.
X_copy, y_copy, test_copy = X.copy(), y.copy(), test.copy()

# NumPy views of the copies, used by the split and by every model's predict call.
X_arr, y_arr, test_arr = (
    frame.to_numpy() for frame in (X_copy, y_copy, test_copy)
)

In [4]:
# Header-less CSVs read here; the stacking cell reshapes each one into a single column
# and appends it next to the other models' predictions.
mlp_train, mlp_valid, mlp_test = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        "my_y5_mlp_train.csv",
        "my_y5_mlp_val.csv",
        "my_y5_mlp_test.csv",
    )
)

In [5]:
# Hold out 25% of the rows with a fixed seed. X_test / y_test are this hold-out split,
# not the submission set (that one is test_arr / test_X).
X_train, X_test, y_train, y_test = train_test_split(
    X_arr,
    y_arr,
    random_state=42,
    test_size=0.25,
)

# float32 tensor versions of the four split arrays.
X_train_t, y_train_t, X_test_t, y_test_t = (
    torch.from_numpy(part.astype(np.float32))
    for part in (X_train, y_train, X_test, y_test)
)
test_X = torch.tensor(test_arr, dtype=torch.float32)

# Filled in by the loss cells below: model key -> [train MSE, train RMSE, hold-out MSE, hold-out RMSE].
model_loss = {}

In [6]:
# Move the hold-out tensors and the submission inputs onto the selected device.
# (The training tensors are not moved in this cell.)
X_test_t, y_test_t, test_X = (
    tensor.to(device) for tensor in (X_test_t, y_test_t, test_X)
)

In [7]:
def _load_pickle(path):
    """Open `path` in binary mode and return the object unpickled from it.

    The file handle is closed before returning.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# SECURITY NOTE: pickle.load can execute arbitrary code while deserializing.
# Only load these model files if they come from a trusted source.
model_lg = _load_pickle('model_lg.pkl')
reg = _load_pickle('reg.pkl')
rf = _load_pickle('model_rf.pkl')
clf = _load_pickle('clf.pkl')

## Loss Computation

In [8]:
# model_lg predictions for the training split, the 25% hold-out split and the submission inputs.
y_train_pred_lg = model_lg.predict(X_train)
y_test_pred_lg = model_lg.predict(X_test)
y_pred = model_lg.predict(test_arr)

# sklearn's signature is mean_squared_error(y_true, y_pred): ground truth goes first.
# (MSE is symmetric, so the numbers are unchanged; this just matches the documented order.)
train_loss_mse = mean_squared_error(y_train, y_train_pred_lg)
train_loss_rmse = np.sqrt(train_loss_mse)
test_loss_mse = mean_squared_error(y_test, y_test_pred_lg)
test_loss_rmse = np.sqrt(test_loss_mse)

# Write the submission file. index=False is the documented way to omit the row index
# (index=None only worked because it is falsy).
public_test["TRAVEL_TIME"] = y_pred
public_test.to_csv("my_pred_lightGBM_noGrid.csv", index=False)

# Stored as [train MSE, train RMSE, hold-out MSE, hold-out RMSE].
model_loss['lg'] = [train_loss_mse, train_loss_rmse, test_loss_mse, test_loss_rmse]
model_loss['lg']

[139855.59295627635, 373.9727168608378, 140449.91008050076, 374.76647406151574]

In [9]:
# `reg` predictions for the training split, the 25% hold-out split and the submission inputs.
# NOTE(review): the stacking cell hstacks these without reshaping, so they are presumably
# already 2-D with one column — confirm.
y_pred_train_li = reg.predict(X_train)
y_pred_test_li = reg.predict(X_test)
y_pred_li = reg.predict(test_arr)

# sklearn's signature is mean_squared_error(y_true, y_pred): ground truth goes first.
# (MSE is symmetric, so the numbers are unchanged; this just matches the documented order.)
train_mse_li = mean_squared_error(y_train, y_pred_train_li)
train_rmse_li = np.sqrt(train_mse_li)
test_mse_li = mean_squared_error(y_test, y_pred_test_li)
test_rmse_li = np.sqrt(test_mse_li)

# Write the submission file. index=False is the documented way to omit the row index
# (index=None only worked because it is falsy).
public_test["TRAVEL_TIME"] = y_pred_li
public_test.to_csv("my_pred_linear.csv", index=False)

# Stored as [train MSE, train RMSE, hold-out MSE, hold-out RMSE].
model_loss['li'] = [train_mse_li, train_rmse_li, test_mse_li, test_rmse_li]
model_loss['li']

[142030.53450938614, 376.8693865378112, 142650.31012196973, 377.6907599107631]

In [10]:
# `rf` predictions for the training split, the 25% hold-out split and the submission inputs.
y_train_rf = rf.predict(X_train)
y_test_rf = rf.predict(X_test)
y_pred_test_rf = rf.predict(test_arr)

# sklearn's signature is mean_squared_error(y_true, y_pred): ground truth goes first.
# (MSE is symmetric, so the numbers are unchanged; this just matches the documented order.)
train_mse_rf = mean_squared_error(y_train, y_train_rf)
train_rmse_rf = np.sqrt(train_mse_rf)
test_mse_rf = mean_squared_error(y_test, y_test_rf)
test_rmse_rf = np.sqrt(test_mse_rf)

# Write the submission file. index=False is the documented way to omit the row index
# (index=None only worked because it is falsy).
public_test["TRAVEL_TIME"] = y_pred_test_rf
public_test.to_csv("my_pred_rf.csv", index=False)

# Stored as [train MSE, train RMSE, hold-out MSE, hold-out RMSE].
model_loss['rf'] = [train_mse_rf, train_rmse_rf, test_mse_rf, test_rmse_rf]
model_loss['rf']

[191389.8288573169, 437.48123257725797, 192193.33849232137, 438.3986068549048]

In [11]:
# `clf` predictions for the training split, the 25% hold-out split and the submission inputs.
y_train_clf = clf.predict(X_train)
y_test_clf = clf.predict(X_test)
y_pred_test_clf = clf.predict(test_arr)

# sklearn's signature is mean_squared_error(y_true, y_pred): ground truth goes first.
# (MSE is symmetric, so the numbers are unchanged; this just matches the documented order.)
train_mse_clf = mean_squared_error(y_train, y_train_clf)
train_rmse_clf = np.sqrt(train_mse_clf)
test_mse_clf = mean_squared_error(y_test, y_test_clf)
test_rmse_clf = np.sqrt(test_mse_clf)

# Write the submission file. index=False is the documented way to omit the row index
# (index=None only worked because it is falsy).
public_test["TRAVEL_TIME"] = y_pred_test_clf
public_test.to_csv("naive.csv", index=False)

# Stored as [train MSE, train RMSE, hold-out MSE, hold-out RMSE].
model_loss['clf'] = [train_mse_clf, train_rmse_clf, test_mse_clf, test_rmse_clf]
model_loss['clf']

[142031.86947859958, 376.87115766346403, 142652.43747682933, 377.6935761656919]

In [12]:
def _as_column(values):
    """Return `values` as an array reshaped to (len(values), 1)."""
    arr = np.array(values)
    return arr.reshape((len(arr), 1))


def _stack_and_save(csv_path, lg_pred, li_pred, rf_pred, clf_pred, mlp_pred):
    """Place the five models' predictions side by side, write them to `csv_path`, and return them.

    `li_pred` is stacked as-is (no reshape), so it has to be 2-D already;
    every other input is first reshaped into a single column.
    """
    columns = [
        _as_column(lg_pred),
        li_pred,
        _as_column(rf_pred),
        _as_column(clf_pred),
        _as_column(mlp_pred),
    ]
    stacked = np.hstack(columns)
    np.savetxt(csv_path, stacked, delimiter=',')
    return stacked


# Column order in every file: lg, li, rf, clf, mlp.
Y_train = _stack_and_save('my_Y_train.csv', y_train_pred_lg, y_pred_train_li,
                          y_train_rf, y_train_clf, mlp_train)

# Hold-out (25% split) predictions.
Y_valid = _stack_and_save('my_Y_valid.csv', y_test_pred_lg, y_pred_test_li,
                          y_test_rf, y_test_clf, mlp_valid)

# Submission-set predictions.
Y_test = _stack_and_save('my_Y_test.csv', y_pred, y_pred_li,
                         y_pred_test_rf, y_pred_test_clf, mlp_test)