# Multiple Linear Regression

### References
- https://statsandr.com/blog/multiple-linear-regression-made-simple/ (teory for multiple linear regression)
- https://machinelearningmastery.com/making-predictions-with-multilinear-regression-in-pytorch/#:~:text=The%20multilinear%20regression%20model%20is,predict%20the%20target%20variable%20y%20. (Implementation Ideologies of multiple Linear Regression)
- http://www.sthda.com/english/articles/40-regression-analysis/163-regression-with-categorical-variables-dummy-coding-essentials-in-r/ (Linear Regression using Categorical Variables)

### Rationale For Using this Approach

The dataset provides multiple parameters that could be related to the travel duration. This approach generates a linear combination of parameters with weights to generate the output duration thus allowing the use of more than one parameter (as would have been the case for simple linear regression). This will form a baseline machine learning model to evaluate other models used later on.

### Import Libraries

In [2]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import torch
from datetime import datetime
from torch.utils.data import TensorDataset, DataLoader

### Read Data and Pre-Process

In [3]:
#Read into Dataframe
taxi_data = pd.read_csv("kaggle_data/train.csv")

#Calculate and Create Time Column
def travel_time(polyline):
    return max(polyline.count("[") - 2, 0) * 15

def parse_timestamp(taxi_data):
    date_time = datetime.fromtimestamp(taxi_data["TIMESTAMP"])
    return date_time.year, date_time.month, date_time.day, date_time.hour, date_time.weekday()

taxi_data["LEN"] = taxi_data["POLYLINE"].apply(travel_time)

taxi_data[["YR", "MON", "DAY", "HR", "WK"]] = taxi_data[["TIMESTAMP"]].apply(parse_timestamp, axis=1, result_type="expand")

mean_duration = taxi_data["LEN"].mean()
standard_deviation = taxi_data["LEN"].std()
median = taxi_data["LEN"].median()
taxi_data = taxi_data[taxi_data["LEN"] < mean_duration + 3*standard_deviation]

### Input Feature Decisions

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#Mapping Call Type Letters to Numbers
letter_to_num = {
    "A" : 1,
    "B" : 2,
    "C" : 3
}
num_to_letter = {
    1 : "A",
    2 : "B",
    3 : "C"
}

duration = taxi_data["LEN"].tolist()

hour = taxi_data["HR"].tolist()
month = taxi_data["MON"].tolist() 
week = taxi_data["WK"].tolist()
day = taxi_data["DAY"].tolist()
calltype = taxi_data["CALL_TYPE"].tolist()
taxi = taxi_data["TAXI_ID"].tolist()

for count in range(0, len(calltype), 1):
    calltype[count] = (letter_to_num[calltype[count]])   
    
inputs = []

#Combine Input Vectors
for count in range(0, len(hour), 1):
    inputs.append([hour[count], month[count], week[count], day[count], calltype[count], taxi[count]])

### Create Dataset

In [5]:
inputs = torch.tensor(inputs, dtype=torch.float32).to(device)
target = torch.tensor(duration, dtype=torch.float32).to(device)

dataset = TensorDataset(inputs, target)

### Create the Model

In [4]:
class MLR(torch.nn.Module):
    # Object Constructor
    def __init__(self, input_features, output_features):
        super().__init__()
        self.linear = torch.nn.Linear(input_features, 1)
        self.dropout = torch.nn.Dropout(0.5)
        self.linear2 = torch.nn.Linear(1, output_features)
        self.relu = torch.nn.ReLU()
        self.norm = torch.nn.BatchNorm1d(num_features = 6)
        
    # define the forward function for prediction
    def forward(self, x):
        x = self.norm(x)
        x = self.dropout(self.relu(self.linear(x)))
        y_pred = self.dropout(self.relu(self.linear2(x)))
        return y_pred
    
predict = MLR(6, 1)#.to(device)

print(predict)

MLR(
  (linear): Linear(in_features=6, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (linear2): Linear(in_features=1, out_features=1, bias=True)
  (relu): ReLU()
  (norm): BatchNorm1d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [7]:
# Define optimizer (this will perform your parameter updates use)
lr = 1e-7
opt = torch.optim.Adam(predict.parameters(), lr=lr)

### Train Set

In [8]:
batch_size = 64

train_err = []
parameters = []

trainloader = DataLoader(dataset, batch_size, shuffle=True)

In [9]:
def train(epochs, model, optimize):
    for epoch in range(epochs):
        for x, y in trainloader:
            model.train()
            prediction = model(x)
            loss = torch.sqrt(torch.nn.functional.mse_loss(prediction, torch.unsqueeze(y, 1)))
            #print(prediction)
            #print(y)
            optimize.zero_grad()
            loss.backward()
            optimize.step()
            
        print("Epoch: " + str(epoch) + "\t" + "Loss: " + str(loss.tolist()))

In [10]:
epochs = 10
train(epochs, predict, opt)

Epoch: 0	Loss: 996.1138305664062
Epoch: 1	Loss: 840.6067504882812
Epoch: 2	Loss: 728.8763427734375
Epoch: 3	Loss: 845.4779052734375
Epoch: 4	Loss: 865.6972045898438
Epoch: 5	Loss: 716.5923461914062
Epoch: 6	Loss: 970.431396484375
Epoch: 7	Loss: 803.6483154296875
Epoch: 8	Loss: 716.9827880859375
Epoch: 9	Loss: 708.0394287109375


# PREDICT

In [18]:
#Read into Dataframe
test_data = pd.read_csv("kaggle_data/test_public.csv")
test_data['ORIGIN_STAND'] = taxi_data['ORIGIN_STAND'].fillna(0)
test_data[["YR", "MON", "DAY", "HR", "WK"]] = taxi_data[["TIMESTAMP"]].apply(parse_timestamp, axis=1, result_type="expand")

test_hour = test_data["HR"].tolist()
test_month = test_data["MON"].tolist() 
test_week = test_data["WK"].tolist()
test_day = test_data["DAY"].tolist()
test_calltype = test_data["CALL_TYPE"].tolist()
test_taxi = test_data["TAXI_ID"].tolist()
test_origin = test_data["ORIGIN_STAND"].tolist()

for count in range(0, len(test_calltype), 1):
    test_calltype[count] = (letter_to_num[test_calltype[count]]) 
    
test_inputs = []
for count in range(0, len(test_hour), 1):
    test_inputs.append([test_hour[count], test_month[count], test_week[count], test_day[count], test_calltype[count], test_taxi[count]])
    
test_tensor = torch.tensor(test_inputs, dtype=torch.float32).to(device)

test_dataset = TensorDataset(test_tensor)
testloader = DataLoader(test_dataset, batch_size, shuffle=True)

In [23]:
test_ids = test_data["TRIP_ID"].tolist()

prediction = predict(test_tensor)
    #loss = torch.sqrt(torch.nn.functional.mse_loss(prediction, torch.unsqueeze(y, 1)))
    #optimize.zero_grad()
    #loss.backward()
    #optimize.step()
prediction = prediction.tolist()
    
for i in range (0, len(test_ids), 1):
    print(str(test_ids[i])+","+str(prediction[i][0]))

T1,0.0
T2,0.0
T3,0.0
T4,0.0
T5,0.0
T6,0.0
T7,0.0
T8,1.8661231994628906
T9,0.0
T10,1.4627022743225098
T11,0.0
T12,0.0
T13,1.1850254535675049
T14,0.0
T15,0.0
T16,0.0
T17,0.0
T18,0.0
T19,0.0
T20,0.0
T21,0.6327923536300659
T22,0.0
T23,2.746330976486206
T24,0.0
T25,0.0
T26,0.0
T27,0.0
T28,0.0
T29,0.0
T30,1.686988115310669
T31,0.0
T32,0.0
T33,0.0
T34,0.0
T35,0.0
T36,0.0
T37,0.0
T38,0.0
T39,0.0
T40,0.0
T41,0.0
T42,0.0
T43,0.0
T44,0.0
T45,0.0
T46,0.0
T47,0.0
T48,0.0
T49,0.0
T50,3.3593828678131104
T51,0.0
T52,1.6764670610427856
T53,0.0
T54,0.004072587005794048
T55,2.378579616546631
T56,1.9153804779052734
T57,0.0
T58,0.0
T59,0.0
T60,0.0
T61,3.2462127208709717
T62,0.0
T63,0.0
T64,1.5748587846755981
T65,3.1728198528289795
T66,0.0
T67,0.0
T68,0.0
T69,0.0
T70,0.0
T71,0.0
T72,0.0
T73,0.0
T74,0.0
T75,0.0
T76,0.0
T77,0.0
T78,0.0
T79,0.0
T80,0.0
T81,1.7906768321990967
T82,0.0
T83,0.0
T84,0.0
T85,0.0
T86,5.034870147705078
T87,0.0
T88,0.0
T90,0.0
T91,4.17775821685791
T92,0.0
T93,0.0
T94,0.0
T95,0.0
T96,0.