In [98]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn import svm
import matplotlib.pyplot as plt


In [99]:
# Load the train/test trip logs; TIME is parsed into datetimes at read time.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0 (format inference is the default there) and only emits a warning.
train = pd.read_csv('./data/s02e03_train.csv', parse_dates=['TIME'])
test = pd.read_csv('./data/s02e03_test.csv', parse_dates=['TIME'])

In [100]:
# Preview the raw training frame as loaded from the CSV.
train

Unnamed: 0,TRANSPORT,TIME,A,B,SPEED
0,CARGO_15100,1859-11-25 10:16:40,Steamdrift,Rustport,74.09
1,CARGO_15101,1859-11-26 03:58:07,Rustport,Irondale,73.60
2,CARGO_15101,1859-11-26 13:02:07,Irondale,Leverstorm,69.37
3,CARGO_15102,1859-11-26 19:55:34,Leverstorm,Copperhold,82.58
4,CARGO_15102,1859-11-27 09:23:42,Copperhold,Cogburg,77.73
...,...,...,...,...,...
3663,CARGO_17486,1859-12-10 06:38:30,Cogburg,Steamdrift,71.05
3664,CARGO_17487,1859-12-10 22:53:48,Steamdrift,Cogburg,78.07
3665,CARGO_17487,1859-12-11 10:53:15,Cogburg,Irondale,86.23
3666,CARGO_17488,1859-12-12 02:55:11,Irondale,Rustport,81.21


## Model based on average speed

In [101]:
# Compute average speed
def prediction_based_on_avg(A,B):
    """Return the mean training SPEED observed on the route from A to B.

    Reads the module-level ``train`` frame and uses its ``A``, ``B`` and
    ``SPEED`` columns.

    Parameters
    ----------
    A, B
        Origin and destination labels, matched against ``train['A']`` and
        ``train['B']``.

    Returns
    -------
    The arithmetic mean of ``SPEED`` over all training rows with that
    (A, B) pair.

    Raises
    ------
    KeyError
        If the (A, B) pair never occurs in ``train``.
    """
    # Use pandas' built-in grouped mean: handing the NumPy callable to
    # ``agg`` is deprecated in recent pandas and raises a FutureWarning.
    route_means = train.groupby(['A','B'])['SPEED'].mean()
    # Explicit label lookup on the (A, B) MultiIndex.
    return route_means.loc[(A, B)]

In [102]:
# Spot-check the route-average baseline on a single route.
prediction_based_on_avg(A='Gizbourne', B='Rustport')

77.93559322033899

In [103]:
# Baseline prediction: each test trip gets the mean training speed of its route.
y_pred = test.apply(
    lambda trip: prediction_based_on_avg(trip['A'], trip['B']),
    axis=1,
)

In [104]:
# Mean squared error of the route-average baseline on the test set.
mean_squared_error(y_true=test['SPEED'], y_pred=y_pred)

52.551138978521706

## Data preparation

In [105]:
def prepare_df(df):
    """Return a model-ready copy of ``df`` with time-derived features.

    The input frame is left untouched, so the cell can be re-run safely and
    the raw frames keep their original columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``TIME`` column (datetimes, or values that
        ``pd.to_datetime`` can parse) and a ``TRANSPORT`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``TRANSPORT`` and ``TIME``, followed by three new
        columns:

        * ``MINUTES``     - minutes since midnight (hour * 60 + minute)
        * ``DAY``         - day of week, Monday = 0 ... Sunday = 6
        * ``MINUTESxDAY`` - product of the two (interaction term)

    Raises
    ------
    KeyError
        If ``TIME`` or ``TRANSPORT`` is missing from ``df``.
    """
    timestamps = pd.to_datetime(df['TIME'])
    # Vectorised datetime accessors replace the per-row ``apply`` lambdas.
    minutes = timestamps.dt.hour * 60 + timestamps.dt.minute
    day = timestamps.dt.dayofweek
    # drop() and assign() both return new frames, so ``df`` is never mutated.
    return (
        df.drop(columns=['TRANSPORT', 'TIME'])
          .assign(MINUTES=minutes, DAY=day, MINUTESxDAY=minutes * day)
    )

In [106]:
# Build the model-ready frames from the raw train/test data, then preview the training one.
train_final, test_final = (prepare_df(raw_frame) for raw_frame in (train, test))
train_final

Unnamed: 0,A,B,SPEED,MINUTES,DAY,MINUTESxDAY
0,Steamdrift,Rustport,74.09,616,4,2464
1,Rustport,Irondale,73.60,238,5,1190
2,Irondale,Leverstorm,69.37,782,5,3910
3,Leverstorm,Copperhold,82.58,1195,5,5975
4,Copperhold,Cogburg,77.73,563,6,3378
...,...,...,...,...,...,...
3663,Cogburg,Steamdrift,71.05,398,5,1990
3664,Steamdrift,Cogburg,78.07,1373,5,6865
3665,Cogburg,Irondale,86.23,653,6,3918
3666,Irondale,Rustport,81.21,175,0,0


In [107]:
# Preview the prepared test frame (same columns as train_final).
test_final

Unnamed: 0,A,B,SPEED,MINUTES,DAY,MINUTESxDAY
0,Gizbourne,Rustport,72.17,656,1,656
1,Rustport,Gizbourne,74.55,198,2,396
2,Gizbourne,Leverstorm,72.10,918,2,1836
3,Leverstorm,Irondale,65.32,56,3,168
4,Irondale,Copperhold,79.35,317,3,951
...,...,...,...,...,...,...
912,Copperhold,Leverstorm,80.11,989,1,989
913,Leverstorm,Irondale,69.92,89,2,178
914,Irondale,Cogburg,76.30,902,2,1804
915,Cogburg,Irondale,75.11,288,3,864


For each row in s02e03_test.csv use the trained model to predict travel speed.
For each row compute error by computing difference between the observed speed (from the file) and predicted speed (from the model).
Aggregate these errors into a single error number by using the Mean Squared Error formula.

## Linear and Polynomial Regression

In [108]:
def fit_regression(A,B,reg_type,Vars,deg=2):
    """Fit a per-route regression of SPEED on the selected feature columns.

    Only the rows of the module-level ``train_final`` frame whose route is
    (A, B) are used for fitting.

    Parameters
    ----------
    A, B
        Origin and destination labels selecting the route.
    reg_type : {'linear', 'poly'}
        ``'linear'`` fits on the raw features; ``'poly'`` first expands them
        with ``PolynomialFeatures(degree=deg)``.
    Vars : list of str
        Names of the feature columns in ``train_final``.
    deg : int, default 2
        Polynomial degree; only used when ``reg_type == 'poly'``.

    Returns
    -------
    sklearn.linear_model.LinearRegression
        The fitted model. For ``'poly'`` the model was trained on expanded
        features, so callers must apply the same polynomial expansion to any
        input before calling ``predict``.

    Raises
    ------
    ValueError
        If ``reg_type`` is not ``'linear'`` or ``'poly'``, or if
        ``train_final`` has no rows for the requested route.
    """
    # Fail fast: previously an unknown type only printed a hint and then
    # crashed on an unbound ``model`` variable.
    if reg_type not in ("linear", "poly"):
        raise ValueError("specify type of regression ('linear' or 'poly')")

    route_rows = train_final.loc[(train_final.A==A)&(train_final.B==B),:]
    if route_rows.empty:
        raise ValueError(f"no training rows for route {A!r} -> {B!r}")
    y_train = route_rows['SPEED']

    X_train = route_rows[Vars]
    if len(Vars) == 1:
        # Single-feature models are fitted on a bare (n, 1) array, as before,
        # so predicting from a bare array raises no feature-name warning.
        X_train = X_train.values.reshape(-1, 1)

    if reg_type == "poly":
        X_train = PolynomialFeatures(degree = deg).fit_transform(X_train)

    model = LinearRegression(fit_intercept = True)
    model.fit(X_train, y_train)
    return model

In [109]:
def evaluate_model(reg_type, Vars, deg=2):
    """Print the test-set mean squared error of the per-route regressions.

    For every distinct (A, B) route in the module-level ``test_final`` frame,
    a model is obtained from ``fit_regression`` and used to predict SPEED for
    all test rows on that route. The squared errors are averaged over all
    rows of ``test_final``.

    Parameters
    ----------
    reg_type : {'linear', 'poly'}
        Regression flavour, forwarded to ``fit_regression``.
    Vars : list of str
        Feature column names.
    deg : int, default 2
        Polynomial degree (used only for ``'poly'``).

    Returns
    -------
    None
        The result is printed as ``Mean squared error is <value>``, where
        ``<value>`` is a plain float.

    Raises
    ------
    ValueError
        If ``reg_type`` is unknown or ``test_final`` is empty.
    """
    if reg_type not in ('linear', 'poly'):
        raise ValueError("specify type of regression ('linear' or 'poly')")
    n_rows = len(test_final)
    if n_rows == 0:
        raise ValueError("test_final is empty; cannot compute a mean squared error")

    error_sum = 0.0
    # The fitted model depends only on the route, so fit once per distinct
    # route and predict all of its test rows in one call, not once per row.
    routes = test_final[['A', 'B']].drop_duplicates()
    for A, B in routes.itertuples(index=False, name=None):
        route_rows = test_final.loc[(test_final.A == A) & (test_final.B == B)]
        model = fit_regression(A, B, reg_type, Vars, deg)

        X_test = route_rows[Vars]
        if len(Vars) == 1:
            # Mirror fit_regression: single-feature models use a bare array.
            X_test = X_test.values.reshape(-1, 1)
        if reg_type == 'poly':
            # Same expansion that fit_regression applied to the training data.
            X_test = PolynomialFeatures(degree = deg).fit_transform(X_test)

        residuals = route_rows['SPEED'].to_numpy(dtype=float) - model.predict(X_test)
        error_sum += float((residuals ** 2).sum())

    mse = error_sum / n_rows
    print(f"Mean squared error is {mse}")

### Linear Regression

In [110]:
# Linear fit on minutes-since-midnight only.
evaluate_model(reg_type='linear', Vars=['MINUTES'])

Mean squared error is [50.40269521]


In [111]:
# Linear fit on day-of-week only.
evaluate_model(reg_type='linear', Vars=['DAY'])

Mean squared error is [50.66003876]


In [112]:
# Linear fit on both time features.
evaluate_model(reg_type='linear', Vars=['MINUTES', 'DAY'])

Mean squared error is [49.04786106]


### Linear Regression with interaction term (Min * Day)

In [113]:
# Linear fit on the interaction term alone.
evaluate_model(reg_type='linear', Vars=['MINUTESxDAY'])

Mean squared error is [50.43896464]


In [114]:
# Linear fit on minutes plus the minutes-by-day interaction term.
evaluate_model(reg_type='linear', Vars=['MINUTES', 'MINUTESxDAY'])
        

Mean squared error is [47.95248475]


### Polynomial Regression (deg =2)

In [115]:
# Polynomial fit (default degree) on day-of-week only.
evaluate_model(reg_type='poly', Vars=['DAY'])

Mean squared error is [37.85968854]


In [116]:
# Polynomial fit (default degree) on minutes-since-midnight only.
evaluate_model(reg_type='poly', Vars=['MINUTES'])

Mean squared error is [47.21414962]


In [117]:
# Polynomial fit (default degree) on both time features.
evaluate_model(reg_type='poly', Vars=['MINUTES', 'DAY'])

Mean squared error is [31.7801709]


### Polynomial Regression (deg = 3)

In [118]:
# Cubic polynomial fit on both time features.
evaluate_model(reg_type='poly', Vars=['MINUTES', 'DAY'], deg=3)

Mean squared error is [28.42678666]


## Experiment: SVM Regression

In [None]:
def train_svm(A,B,Vars):
    """Fit a linear-kernel support vector regressor for one route.

    Selects the rows of the module-level ``train_final`` frame whose route is
    (A, B) and regresses ``SPEED`` on the columns listed in ``Vars``.

    Parameters
    ----------
    A, B
        Origin and destination labels selecting the route.
    Vars : list of str
        Feature column names in ``train_final``.

    Returns
    -------
    sklearn.svm.SVR
        The fitted regressor.
    """
    on_route = (train_final.A == A) & (train_final.B == B)
    route_rows = train_final.loc[on_route, :]

    # A single feature is passed as a bare (n, 1) array; several features are
    # passed as the DataFrame slice itself.
    if len(Vars) == 1:
        features = route_rows[Vars].values.reshape(-1, 1)
    else:
        features = route_rows[Vars]
    target = route_rows['SPEED']

    regressor = svm.SVR(kernel='linear')
    regressor.fit(features, target)
    return regressor

In [None]:
def evaluate_svm(Vars):
    """Print the test-set mean squared error of the per-route SVM regressors.

    For every distinct (A, B) route in the module-level ``test_final`` frame,
    a regressor is obtained from ``train_svm`` and used to predict SPEED for
    all test rows on that route. The squared errors are averaged over all
    rows of ``test_final``.

    Parameters
    ----------
    Vars : list of str
        Feature column names.

    Returns
    -------
    None
        The result is printed as ``Mean squared error is <value>``, where
        ``<value>`` is a plain float.

    Raises
    ------
    ValueError
        If ``test_final`` is empty.
    """
    n_rows = len(test_final)
    if n_rows == 0:
        raise ValueError("test_final is empty; cannot compute a mean squared error")

    error_sum = 0.0
    # The fitted regressor depends only on the route, so train once per
    # distinct route and predict all of its test rows in one call.
    routes = test_final[['A', 'B']].drop_duplicates()
    for A, B in routes.itertuples(index=False, name=None):
        route_rows = test_final.loc[(test_final.A == A) & (test_final.B == B)]
        model = train_svm(A, B, Vars)

        X_test = route_rows[Vars]
        if len(Vars) == 1:
            # Mirror train_svm: single-feature models use a bare (n, 1) array.
            X_test = X_test.values.reshape(-1, 1)

        residuals = route_rows['SPEED'].to_numpy(dtype=float) - model.predict(X_test)
        error_sum += float((residuals ** 2).sum())

    mse = error_sum / n_rows
    print(f"Mean squared error is {mse}")