In [86]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt

In [106]:
train = pd.read_csv('./data/s02e03_train.csv', parse_dates=['TIME'], infer_datetime_format=True)
test = pd.read_csv('./data/s02e03_test.csv', parse_dates=['TIME'], infer_datetime_format=True)

In [3]:
train

Unnamed: 0,TRANSPORT,TIME,A,B,SPEED
0,CARGO_15100,1859-11-25 10:16:40,Steamdrift,Rustport,74.09
1,CARGO_15101,1859-11-26 03:58:07,Rustport,Irondale,73.60
2,CARGO_15101,1859-11-26 13:02:07,Irondale,Leverstorm,69.37
3,CARGO_15102,1859-11-26 19:55:34,Leverstorm,Copperhold,82.58
4,CARGO_15102,1859-11-27 09:23:42,Copperhold,Cogburg,77.73
...,...,...,...,...,...
3663,CARGO_17486,1859-12-10 06:38:30,Cogburg,Steamdrift,71.05
3664,CARGO_17487,1859-12-10 22:53:48,Steamdrift,Cogburg,78.07
3665,CARGO_17487,1859-12-11 10:53:15,Cogburg,Irondale,86.23
3666,CARGO_17488,1859-12-12 02:55:11,Irondale,Rustport,81.21


## Model based on average speed

In [4]:
# Compute average speed
def prediction_based_on_avg(A,B):
    grouped = train.groupby(['A','B'])['SPEED'].agg(np.mean)
    return grouped[A,B]

In [5]:
prediction_based_on_avg('Gizbourne','Rustport')

77.93559322033899

In [7]:
y_pred = test.apply(lambda row : prediction_based_on_avg(row['A'],
                                  row['B']), axis = 1)

In [9]:
mean_squared_error(test.SPEED, y_pred)

52.551138978521706

For each row in s02e03_test.csv use the trained model to predict travel speed.
For each row compute error by computing difference between the observed speed (from the file) and predicted speed (from the model).
Aggregate these errors into a single error number by using the Mean Squared Error formula.

## Linear Model

In [114]:
def prepare_df(df):
    df['MINUTE'] = df['TIME'].apply(lambda time: time.hour*60 + time.minute)
    df['DAY'] = df['TIME'].apply(lambda time: time.dayofweek)
    df['HOURxDAY'] = df['HOUR']*df['DAY']
    df = df.drop( labels=['TRANSPORT','TIME'], axis=1, inplace=False)
    return df

In [115]:
train_final = prepare_df(train)
test_final = prepare_df(test)
train_final

Unnamed: 0,A,B,SPEED,HOUR,DAY,HOURxDAY
0,Steamdrift,Rustport,74.09,616,4,2464
1,Rustport,Irondale,73.60,238,5,1190
2,Irondale,Leverstorm,69.37,782,5,3910
3,Leverstorm,Copperhold,82.58,1195,5,5975
4,Copperhold,Cogburg,77.73,563,6,3378
...,...,...,...,...,...,...
3663,Cogburg,Steamdrift,71.05,398,5,1990
3664,Steamdrift,Cogburg,78.07,1373,5,6865
3665,Cogburg,Irondale,86.23,653,6,3918
3666,Irondale,Rustport,81.21,175,0,0


In [116]:
def simple_model(A,B):
    train = train_final.loc[(train_final.A==A)&(train_final.B==B),:] #df_historical.loc[(df_historical.LOCATION_FROM==From) &(df_historical.LOCATION_TO == To),:]
    test  = test_final.loc[(test_final.A==A)&(test_final.B==B),:]
    X_train = train['HOURxDAY'].values.reshape(-1, 1)
    y_train = train['SPEED']
    X_test = test['HOURxDAY'].values.reshape(-1, 1)
    y_test = test['SPEED']
    #print(y_test)
    model = LinearRegression(fit_intercept = True)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    return mse


In [117]:
simple_model("Rustport","Irondale")

28.415732992741503

In [118]:
list1 = []
for k, d in train.groupby(['A','B']):
    list1.append(simple_model(k[0],k[1]))
    
sum(list1)/len(list1)

51.34380196425487