In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import matplotlib.patches as mpatches
#import matplotlib.font_manager as fm
import matplotlib
import autotime
%matplotlib inline
%load_ext autotime
#import seaborn as sns
import statsmodels as stm
import statsmodels.formula.api as sm
from sklearn.feature_selection import VarianceThreshold
from sklearn import metrics
from sklearn.linear_model import LinearRegression as LinR
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.ensemble import AdaBoostRegressor as ABR
from sklearn.ensemble import GradientBoostingRegressor as GBR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.model_selection import train_test_split
from sklearn.cross_validation import KFold
from sklearn.model_selection import GridSearchCV as GSCV
from dask_searchcv import GridSearchCV as DGSCV

#from sklearn.tree import export_graphviz
#from IPython.display import Image

In [None]:
# setting variables
# The original placeholders (`x` and `xxxxxxxx`) were undefined names, so this cell raised
# NameError on a fresh kernel. The defaults below are the values used further down in this
# notebook; change them to suit your machine and the segment you want to analyse.
cores = 3 # enter number of cores on your machine to use here
SSIDno = 9090786 # enter SSID number to analyse here (source StopID followed by destination StopID)

First, select route files to input based on SSIDno to analyse

In [None]:
# Read in stop_times.txt and trips.txt files from NTA data
stop_times_2012 = pd.read_csv('stop_times.txt')
trips_2012 = pd.read_csv('trips.txt')

# Merge by trip_id
merge = pd.merge(stop_times_2012, trips_2012, on='trip_id', how='outer')

# Keep only necessary columns
merge = merge.drop(['arrival_time','departure_time','pickup_type','drop_off_type','service_id','shape_dist_traveled'], axis=1)

# Transform trip_id to route and stop_id to StopID.
# route_short is the text between the first and second '-' of trip_id, zero-padded to 4 characters;
# StopID is the last 4 characters of stop_id. The .str accessor propagates missing values
# (the outer merge can leave stop_id empty) instead of raising inside a Python lambda.
merge['route_short'] = merge['trip_id'].str.split('-').str[1].str.zfill(4)
merge['StopID'] = merge['stop_id'].str[-4:]

# Find StopID and the sequence of that shape_id
key_cols = ['shape_id', 'route_short', 'direction_id', 'stop_sequence', 'StopID']
gb = merge.groupby(key_cols)
gbc = gb.count().reset_index()
transit_shapeID_stopID = gbc.drop(['trip_id','stop_id','route_id','trip_headsign'], axis=1)

# create list of pairs of routes and the SSIDs contained within them.
# An SSID is a stop followed by the next stop of the SAME shape: shifting inside each group
# avoids pairing the last stop of one shape with the first stop of the following shape,
# and replaces the row-by-row .iloc loop with one vectorised operation.
next_stop = transit_shapeID_stopID.groupby(['shape_id', 'route_short', 'direction_id'])['StopID'].shift(-1)
segments = pd.DataFrame({
    'route_short': transit_shapeID_stopID['route_short'],
    'SSID': transit_shapeID_stopID['StopID'] + next_stop,
}).dropna(subset=['SSID'])
ssid = segments[['route_short', 'SSID']].values.tolist()

SSIDnoStr = str(SSIDno).zfill(8)
# sorted() makes the order of the route files (and therefore of the rows in res) repeatable
routes = sorted({route for route, pair in ssid if SSIDnoStr in pair})

# Reading in the data and preparing the SSID dataframe

res = pd.read_csv('Route_XXXX_travel_time_csvs/Blank_Route_travel_time.csv')
route_list = routes
# Read every route file once and concatenate in a single call (concat inside a loop re-copies
# all accumulated rows each time). The list is reversed so the row order matches the original
# loop, which placed each newly read file in front of the rows read so far.
route_frames = [pd.read_csv('Route_XXXX_travel_time_csvs/Route_%s_travel_time.csv' % r) for r in route_list]
res = pd.concat(route_frames[::-1] + [res], axis=0)

##  Reading in the data and preparing the SSID dataframe

In [None]:
res.shape

In [None]:
#check unique values of each feature

print("Feature, UniqueValues") 
for column in res:
    print(column + "\t" + str(len(res[column].unique())))

In [None]:
# create dataframe for SSID

res['SSID'] = res['SSID'].astype('category')
# .copy() gives ssid_df its own data: the cells below add and overwrite columns on it, which on
# a bare boolean-mask selection of `res` raises SettingWithCopyWarning and is not guaranteed to stick.
ssid_df = res[res.SSID == SSIDno].copy()
ssid_df.shape

In [None]:
# number of distinct values (a missing value counts as one) in every column of ssid_df

print("Feature, UniqueValues")
for feature in ssid_df.columns:
    distinct_values = ssid_df[feature].unique()
    print(feature + "\t" + str(len(distinct_values)))

In [None]:
ssid_df.reset_index(inplace=True)
ssid_df = ssid_df.drop('index', axis=1)
ssid_df.memory_usage(deep=True)

In [None]:
ssid_df.info(memory_usage='deep')

In [None]:
# cast JourneyPatternID to object dtype; the XBuses cell below uses the .str accessor on this column
ssid_df['JourneyPatternID'] = ssid_df['JourneyPatternID'].astype('object')

In [None]:
ssid_df

In [None]:
# adding Xbuses feature - boolean feature indicating whether or not the JourneyPatternID represents an express (X) bus

# 1 when "X" occurs in JourneyPatternID anywhere after the first character (str.find gives the
# 0-based position, -1 when absent and NaN for non-string values, all of which compare False), else 0.
# The flag is built straight from that mask: the earlier version summed entire rows of the
# filtered frame (which depends on every other column being numeric) and then called
# fillna(inplace=True) on a column selection, which is not guaranteed to modify ssid_df.
has_x = ssid_df["JourneyPatternID"].str.find("X") > 0
ssid_df['XBuses'] = has_x.astype('int')

ssid_df.head(10)

In [None]:
# adding JPID_length feature - represents the total number of stops traversed by this JourneyPatternID along its entire route

# lookup table read from JPID_Length.csv; the next cell merges it onto ssid_df by JourneyPatternID
JPIDL = pd.read_csv('JPID_Length.csv')
JPIDL

In [None]:
JPIDL = JPIDL.drop('Unnamed: 0', axis=1)
# ssid_df['JourneyPatternID'] = ssid_df['JourneyPatternID'].astype('category')
ssid_df.JourneyPatternID = ssid_df.JourneyPatternID.apply(lambda x: str(x).zfill(8))
ssid_df = pd.merge(left=ssid_df ,right=JPIDL, how='left', left_on='JourneyPatternID', right_on='JourneyPatternID')
ssid_df = ssid_df.dropna()
ssid_df

In [None]:
ssid_df.shape

In [None]:
# adding JPID_Freq feature - represents how often in the given data this JourneyPatternID traversed this segment

# rows per JourneyPatternID, looked up for every row
jpid_counts = ssid_df['JourneyPatternID'].value_counts()
ssid_df['JPID_Freq'] = ssid_df['JourneyPatternID'].map(jpid_counts)
ssid_df

In [None]:
# removing non-holiday-period weekends from SchoolHoliday feature to avoid multi-collinearity issues

ssid_df['SchoolHoliday'] = ssid_df['SchoolHoliday'].astype('int')
ssid_df['SchoolHoliday'] = np.where(ssid_df['TimeFrame'].isin(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04', '2013-01-05', '2013-01-06']), 1, 0)

In [None]:
ssid_df.info(memory_usage='deep')

In [None]:
# fetching SSID number from dataframe for CSV file naming purposes

res.SSID = res.SSID.apply(lambda x: str(int(x)).zfill(8))
# Take the first row by position: ssid_df went through dropna() after its last index reset, so
# the label 0 is not guaranteed to exist and .loc[0, 'SSID'] could raise KeyError.
# Note that r is read from ssid_df, whose SSID column is not touched by the zero-padding of res above.
r = ssid_df['SSID'].iloc[0]

In [None]:
# dropping unneeded columns

ssid_df = ssid_df.drop(['SourceStopID', 'DestStopID', 'VehicleJourneyID', 'JourneyPatternID', 'SSID'], axis=1)

In [None]:
# reorder remaining columns

ssid_df = ssid_df[['TravelTime', 'Rain', 'WindSpeed', 'JPID_length', 'JPID_Freq', 'XBuses', 'SchoolHoliday', 'Day', 'HourFrame']]

In [None]:
ssid_df

In [None]:
# NOTE(review): this re-assigns `cores`, overriding whatever was entered in the
# "setting variables" cell at the top of the notebook
cores = 3

In [None]:
ssid_df.info(memory_usage='deep')

In [None]:
# assigning appropriate datatypes where necessary

ssid_df['Day'] = ssid_df['Day'].astype('category')
ssid_df['HourFrame'] = ssid_df['HourFrame'].astype('category')
ssid_df['JPID_length'] = ssid_df['JPID_length'].astype('int')

In [None]:
# save as csv


ssid_df.to_csv('SSID_CSVs/SSID_%s.csv' % r)

## Analysis of target feature TravelTime

Histogram of values (x-axis is number of seconds)

In [None]:
# distribution of the target in 50 bins
ssid_df['TravelTime'].hist(bins=50, figsize=(16, 8))

Boxplot to check for outliers

In [None]:
ssid_df.TravelTime.plot(kind='box', figsize=(8, 8), showfliers=True)

In [None]:
# Checking stats for TravelTime

ssid_df.TravelTime.describe()

## Dropping outliers

In [None]:
# loading table of times to traverse segments at 80kmph

find_lb = pd.read_csv('use_speed_and_distance_get_outlier_bound.csv')

# extracting value for this segment, to use as lower bound for outlier removal
# (the segment is the SSID chosen in the "setting variables" cell; it used to be hard-coded
# here as 9090786, so any other SSID was trimmed with the wrong bound)

lb_matches = find_lb.loc[find_lb['SSID'] == int(SSIDno), 'min_sec']
if lb_matches.empty:
    # fail with a readable message instead of an IndexError from .iloc[0]
    raise ValueError("No 'min_sec' entry for SSID %s in use_speed_and_distance_get_outlier_bound.csv" % SSIDno)
lb = lb_matches.iloc[0]
print(lb)

In [None]:
# make a copy of original df

trimssid_df = ssid_df.copy()

# Remove TravelTime upper bound outliers beyond a conservative 2 x IQR, and lowerbound below 'lb'
# The quantiles are taken from the TravelTime column only: only that bound is used, and asking
# for quantiles of the whole frame fails on the categorical Day / HourFrame columns in pandas
# versions that no longer skip non-numeric columns. `ub` stays a Series indexed by column name.

travel_time_only = trimssid_df[['TravelTime']]
q1 = travel_time_only.quantile(q=.25)
q3 = travel_time_only.quantile(q=.75)
ub = q3 + (2*(q3-q1))
trimssid_df['OutlierTT'] = (trimssid_df['TravelTime'] < lb) | (trimssid_df['TravelTime'] > ub['TravelTime'])

# Outlier rows counted

print("There will be", trimssid_df[(trimssid_df['OutlierTT'] == True)].shape[0], "outliers dropped.")

In [None]:
# dropping outliers

# keep the unflagged rows, order them by descending TravelTime, remove the helper flag
# and renumber the rows 0..n-1
trimssid_df = (
    trimssid_df.loc[~trimssid_df['OutlierTT']]
    .sort_values(['TravelTime'], ascending=False)
    .drop(['OutlierTT'], axis=1)
    .reset_index(drop=True)
)
trimssid_df

In [None]:
# Create TT mean/median value variables, to use in calculating mean/median absolute percentage accuracy scores
# and for horizontal lines in the charts below to represent the mean/medium

ssid_df = trimssid_df
ssid_df_mean = ssid_df.TravelTime.mean()
ssid_df_median  = ssid_df.TravelTime.median()
ssid_df.TravelTime.describe()

In [None]:
ssid_df.TravelTime.plot(kind='box', figsize=(8, 8), showfliers=True)

## Visualising the data

### 1a Bar plot for mean TravelTime per HourFrame

In [None]:
# one bar per HourFrame showing its mean TravelTime
mean_HF = ssid_df['TravelTime'].groupby(ssid_df['HourFrame']).mean()
mean_HF.plot.bar(figsize=(15, 6), rot=0)

# horizontal reference line at the overall mean, drawn from x=-1 to x=25
coord_x1, coord_x2 = -1, 25
coord_y1 = ssid_df_mean

plt.plot([coord_x1, coord_x2], [coord_y1] * 2, '-o')

Observation:  

### 1b Bar plot for median TravelTime per HourFrame

In [None]:
# one bar per HourFrame showing its median TravelTime
med_HF = ssid_df['TravelTime'].groupby(ssid_df['HourFrame']).median()
med_HF.plot.bar(figsize=(15, 6), rot=0)

# horizontal reference line at the overall median, drawn from x=-1 to x=25
coord_x1, coord_x2 = -1, 25
coord_y1 = ssid_df_median

plt.plot([coord_x1, coord_x2], [coord_y1] * 2, '-o')

### 2a Bar plot for mean TravelTime per Day

In [None]:
# mean TravelTime per Day, with the bars arranged Monday to Sunday
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
mean_Day = ssid_df['TravelTime'].groupby(ssid_df['Day']).mean().reindex(weekday_order)

mean_Day.plot.bar(figsize=(15, 6), rot=0)

# horizontal reference line at the overall mean, drawn from x=-1 to x=7
coord_x1, coord_x2 = -1, 7
coord_y1 = ssid_df_mean

plt.plot([coord_x1, coord_x2], [coord_y1] * 2, '-o')

Observation:  

### 2b Bar plot for median TravelTime per Day

In [None]:
# median TravelTime per Day, with the bars arranged Monday to Sunday
med_Day = ssid_df.groupby('Day')['TravelTime'].median()

med_Day=med_Day.reindex(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
med_Day.plot(kind='bar', figsize=(15, 6), rot=0)

# horizontal reference line at the overall MEDIAN (this cell used the mean, unlike the
# other median charts 1b and 3b), drawn from x=-1 to x=7
coord_x1 = -1
coord_y1 = ssid_df_median
coord_x2 = 7

plt.plot([coord_x1, coord_x2], [coord_y1, coord_y1], '-o')

### 3a Bar plot for mean TravelTime when SchoolHoliday true/false

In [None]:
# mean TravelTime for each SchoolHoliday value
mean_SH = ssid_df['TravelTime'].groupby(ssid_df['SchoolHoliday']).mean()
mean_SH.plot.bar(figsize=(15, 6), rot=0)

# horizontal reference line at the overall mean, drawn from x=-1 to x=7
coord_x1, coord_x2 = -1, 7
coord_y1 = ssid_df_mean

plt.plot([coord_x1, coord_x2], [coord_y1] * 2, '-o')

Observation:  

### 3b Bar plot for median TravelTime when SchoolHoliday true/false

In [None]:
# median TravelTime for each SchoolHoliday value
med_SH = ssid_df['TravelTime'].groupby(ssid_df['SchoolHoliday']).median()
med_SH.plot.bar(figsize=(15, 6), rot=0)

# horizontal reference line at the overall median, drawn from x=-1 to x=7
coord_x1, coord_x2 = -1, 7
coord_y1 = ssid_df_median

plt.plot([coord_x1, coord_x2], [coord_y1] * 2, '-o')

## Model training (statsmodels)

### 1a. Linear Regression model (via statsmodels - to see p-values)

Splitting data into 70% for training and 30% for testing

In [None]:
# Code to split data taken from here: http://stackoverflow.com/questions/24147278/how-do-i-create-test-and-train-samples-from-one-dataframe-with-pandas

# 70% of the rows are sampled (fixed seed) for training; the rows not sampled form the test set
ssid_train=ssid_df.sample(frac=0.7, random_state=38)
ssid_test=ssid_df.drop(ssid_train.index)
print ("Training set size is",len(ssid_train))
# label corrected: this line reports the test set, not the training set
print ("Test set size is",len(ssid_test))

In [None]:
lrle = sm.ols(formula="TravelTime ~ SchoolHoliday + WindSpeed + Rain + JPID_length + JPID_Freq + XBuses + C(HourFrame) + C(Day)", data=ssid_train).fit()

In [None]:
print(lrle.summary())

In [None]:
# the below doesn't work for some reason
# NOTE(review): likely causes - `rsquared_adj` is an attribute of a fitted results object
# (e.g. lrle.rsquared_adj), not a function that accepts predictions, and `ssid_lin_test`
# is only created further down in section 1b - confirm before re-enabling

# repeat on test frame and return Adj. R-squared value

#rsqa = stm.regression.linear_model.RegressionResults.rsquared_adj(lrle.predict(ssid_lin_test))
#print ("The predicted adjusted R-squared value on the test frame is", rsqa)

### 1b. Repeat Linear Regression model via statsmodels with continuous features normalised

We need to normalise the relevant parts of the original data and then repeat the test/train split

In [None]:
# Normalising continuous features

# Work on a copy: `ssid_lin = ssid_df` only created a second name for the same frame, so the
# min-max scaling below also overwrote ssid_df - and with it the data later used for the
# models that are meant to run on the un-normalised features.
ssid_lin = ssid_df.copy()
for col in ['WindSpeed', 'Rain', 'JPID_length', 'JPID_Freq']:
    col_min = ssid_lin[col].min()
    col_span = ssid_lin[col].max() - col_min
    # min-max scaling to [0, 1]; a constant column becomes 0.0 instead of 0/0 = NaN
    ssid_lin[col] = (ssid_lin[col] - col_min) / col_span if col_span else 0.0
ssid_lin

In [None]:
# same 70/30 split (same seed) as in 1a, applied to the normalised frame
ssid_lin_train=ssid_lin.sample(frac=0.7, random_state=38)
ssid_lin_test=ssid_lin.drop(ssid_lin_train.index)
print ("Training set size is",len(ssid_lin_train))
# label corrected: this line reports the test set, not the training set
print ("Test set size is",len(ssid_lin_test))

In [None]:
lrle1 = sm.ols(formula="TravelTime ~ SchoolHoliday + WindSpeed + Rain + JPID_length + JPID_Freq + XBuses + C(HourFrame) + C(Day)", data=ssid_lin_train).fit()

In [None]:
print(lrle1.summary())

In [None]:
# the below doesn't work for some reason
# NOTE(review): likely causes - `rsquared_adj` is an attribute of a fitted results object,
# not a function that accepts predictions; the line also refers to `lrle` (the model from 1a)
# rather than `lrle1` fitted in this section - confirm before re-enabling

# repeat on test frame and return Adj. R-squared value

# rsqa = stm.regression.linear_model.RegressionResults.rsquared_adj(lrle.predict(ssid_lin_test))
# print ("The predicted adjusted R-squared value on the test frame is", rsqa)

## Model training (Scikit-learn)

First we need to create dummy variables for the categorical features, and split the data into test and training sets

### Prepare data for modelling via Scikit-learn

In [None]:
# create dummy variables from HourFrame and Day using get_dummies
# dropping first values to avoid multicollinearity (Day = Friday, Hour = 0 or 6 or 7, depending on SSID)

Day_dummies = pd.get_dummies(ssid_df['Day'], prefix='Day', drop_first=True)
HF_dummies = pd.get_dummies(ssid_df['HourFrame'], prefix='HF', drop_first=True)

# replace the two categorical columns with their dummy columns (HF_* first, then Day_*)
remaining_features = ssid_df.drop(['HourFrame', 'Day'], axis=1)
ssid_df = pd.concat([remaining_features, HF_dummies, Day_dummies], axis=1)
ssid_df

In [None]:
# remove any constant features

# The selector used to be fitted and its output thrown away, so nothing was actually removed.
# It is now fitted on the predictive columns only (the target is left alone) and every column
# it rejects - zero variance with the default threshold - is dropped from ssid_df.
selector = VarianceThreshold()
feature_cols = [col for col in ssid_df.columns if col != 'TravelTime']
selector.fit(ssid_df[feature_cols])
constant_cols = [col for col, keep in zip(feature_cols, selector.get_support()) if not keep]
ssid_df = ssid_df.drop(constant_cols, axis=1)
# show which columns were removed
constant_cols

In [None]:
# prepare a list containing all remaining features bar the target
pred_features = list(ssid_df)
pred_features.remove('TravelTime')
print(pred_features)

In [None]:
# prepare target/predictive feature variables for use in scikit-learn modelling

X = ssid_df[pred_features]
y = ssid_df['TravelTime']

In [None]:
# split the data into training portion (70%) and final testing potion (30%)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 38)

### 1a Linear Regression model (via scikit-learn) - training - default parameters

In [None]:
lr = LinR(n_jobs = cores)
lr.fit(X_train, y_train)

In [None]:
lr_pred = lr.predict(X_train)
lr_rsq = metrics.r2_score(y_train, lr_pred)
print ("The R-squared value of the Linear Regression model is", lr_rsq)

In [None]:
lr_mae = metrics.mean_absolute_error(y_train, lr_pred)
print ("The mean absolute error of the Linear Regression model is", lr_mae)
print ("The mean absolute percentage accuracy is", (((lr_mae)/ssid_df_mean)*100))

In [None]:
lr_mdae = metrics.median_absolute_error(y_train, lr_pred)
print ("The median absolute error of the Linear Regression model is", lr_mdae)
print ("The median absolute percentage accuracy is", (((lr_mdae)/ssid_df_median)*100))

### 1b Linear Regression model (via scikit-learn) - testing - default parameters

In [None]:
lr_preda = lr.predict(X_test)
lr_rsq = metrics.r2_score(y_test, lr_preda)
print ("The R-squared value of the Linear Regression model is", lr_rsq)

In [None]:
lr_mae = metrics.mean_absolute_error(y_test, lr_preda)
print ("The mean absolute error of the Linear Regression model is", lr_mae)
print ("The mean absolute percentage accuracy is", (((lr_mae)/ssid_df_mean)*100))

In [None]:
lr_mdae = metrics.median_absolute_error(y_test, lr_preda)
print ("The median absolute error of the Linear Regression model is", lr_mdae)
print ("The median absolute percentage accuracy is", (((lr_mdae)/ssid_df_median)*100))

### 2.1a Support Vector Machine Regression with Linear Kernel model - training - default parameters

In [None]:
svr_lin = SVR(kernel='linear')
svr_lin.fit(X_train, y_train)

In [None]:
svr_lin_pred = svr_lin.predict(X_train)
svr_lin_rsq = metrics.r2_score(y_train, svr_lin_pred)
print ("The R-squared value of the SVR with Linear Kernel model is", svr_lin_rsq)

In [None]:
svr_lin_mae = metrics.mean_absolute_error(y_train, svr_lin_pred)
print ("The mean absolute error of the SVR with Linear Kernel model is", svr_lin_mae)
print ("The mean absolute percentage accuracy is", (((svr_lin_mae)/ssid_df_mean)*100))

In [None]:
svr_lin_mdae = metrics.median_absolute_error(y_train, svr_lin_pred)
print ("The median absolute error of the SVR with Linear Kernel model is", svr_lin_mdae)
print ("The median absolute percentage accuracy is", (((svr_lin_mdae)/ssid_df_median)*100))

### 2.1b Support Vector Machine Regression with Linear Kernel model - testing - default parameters

In [None]:
svr_lin_preda = svr_lin.predict(X_test)
svr_lin_rsq = metrics.r2_score(y_test, svr_lin_preda)
print ("The R-squared value of the SVR with Linear Kernel model is", svr_lin_rsq)

In [None]:
svr_lin_mae = metrics.mean_absolute_error(y_test, svr_lin_preda)
print ("The mean absolute error of the SVR with Linear Kernel model is", svr_lin_mae)
print ("The mean absolute percentage accuracy is", (((svr_lin_mae)/ssid_df_mean)*100))

In [None]:
svr_lin_mdae = metrics.median_absolute_error(y_test, svr_lin_preda)
print ("The median absolute error of the SVR with Linear Kernel model is", svr_lin_mdae)
print ("The median absolute percentage accuracy is", (((svr_lin_mdae)/ssid_df_median)*100))

### 2.1c Repeat SVR with Linear Kernel model, continuous data normalised - training  - default parameters

First we have to normalise the relevant data

In [None]:
# Normalising continuous features

# Work on a copy: `ssid_norm = ssid_df` only created a second name for the same frame, so the
# min-max scaling below also overwrote the columns of ssid_df itself.
ssid_norm = ssid_df.copy()
for col in ['WindSpeed', 'Rain', 'JPID_length', 'JPID_Freq']:
    col_min = ssid_norm[col].min()
    col_span = ssid_norm[col].max() - col_min
    # min-max scaling to [0, 1]; a constant column becomes 0.0 instead of 0/0 = NaN
    ssid_norm[col] = (ssid_norm[col] - col_min) / col_span if col_span else 0.0
ssid_norm

In [None]:
# prepare a list containing all remaining features bar the target

pred_features = list(ssid_norm)
pred_features.remove('TravelTime')
print(pred_features)

In [None]:
# prepare farget/predictive feature variables for use in scikit-learn modelling

X_norm = ssid_norm[pred_features]
y_norm = ssid_norm['TravelTime']

In [None]:
# split the data into training portion (70%) and final testing potion (30%)

Xn_train, Xn_test, yn_train, yn_test = train_test_split(X_norm, y_norm, test_size = 0.3, random_state = 38)

Now onto modelling

In [None]:
svrn_lin = SVR(kernel='linear')
svrn_lin.fit(Xn_train, yn_train)

In [None]:
svrn_lin_pred = svrn_lin.predict(X_train)
svrn_lin_rsq = metrics.r2_score(yn_train, svrn_lin_pred)
print ("The R-squared value of the SVR with Linear Kernel model is", svrn_lin_rsq)

In [None]:
svrn_lin_mae = metrics.mean_absolute_error(y_train, svr_lin_pred)
print ("The mean absolute error of the SVR with Linear Kernel model is", svrn_lin_mae)
print ("The mean absolute percentage accuracy is", (((svrn_lin_mae)/ssid_df_mean)*100))

In [None]:
svrn_lin_mdae = metrics.median_absolute_error(y_train, svr_lin_pred)
print ("The median absolute error of the SVR with Linear Kernel model is", svrn_lin_mdae)
print ("The median absolute percentage accuracy is", (((svrn_lin_mdae)/ssid_df_median)*100))

### 2.1d Repeat SVR with Linear Kernel model, continuous data normalised - testing  - default parameters

In [None]:
svrn_lin_preda = svrn_lin.predict(Xn_test)
svrn_lin_rsq = metrics.r2_score(yn_test, svrn_lin_preda)
print ("The R-squared value of the SVR with Linear Kernel model is", svr_lin_rsq)

In [None]:
svrn_lin_mae = metrics.mean_absolute_error(yn_test, svrn_lin_preda)
print ("The mean absolute error of the SVR with Linear Kernel model is", svr_lin_mae)
print ("The mean absolute percentage accuracy is", (((svrn_lin_mae)/ssid_df_mean)*100))

In [None]:
svrn_lin_mdae = metrics.median_absolute_error(yn_test, svrn_lin_preda)
print ("The median absolute error of the SVR with Linear Kernel model is", svr_lin_mdae)
print ("The median absolute percentage accuracy is", (((svrn_lin_mdae)/ssid_df_median)*100))

### 2.2a Support Vector Machine Regression with Polynomial Kernel model - training - default parameters

In [None]:
svr_poly = SVR(kernel='poly')
svr_poly.fit(X_train, y_train)

In [None]:
svr_poly_pred = svr_poly.predict(X_train)
svr_poly_rsq = metrics.r2_score(y_train, svr_poly_pred)
print ("The R-squared value of the SVR with Polynomial Kernel model is", svr_poly_rsq)

In [None]:
svr_poly_mae = metrics.mean_absolute_error(y_train, svr_poly_pred)
print ("The mean absolute error of the SVR with Polynomial Kernel model is", svr_poly_mae)
print ("The mean absolute percentage accuracy is", (((svr_poly_mae)/ssid_df_mean)*100))

In [None]:
svr_poly_mdae = metrics.median_absolute_error(y_train, svr_poly_pred)
print ("The median absolute error of the SVR with Polynomial Kernel model is", svr_poly_mdae)
print ("The median absolute percentage accuracy is", (((svr_poly_mdae)/ssid_df_median)*100))

### 2.2b Support Vector Machine Regression with Polynomial Kernel model - testing - default parameters

In [None]:
svr_poly_preda = svr_poly.predict(X_test)
svr_poly_rsq = metrics.r2_score(y_test, svr_poly_preda)
print ("The R-squared value of the SVR with Polynomial Kernel model is", svr_poly_rsq)

In [None]:
svr_poly_mae = metrics.mean_absolute_error(y_test, svr_poly_preda)
print ("The mean absolute error of the SVR with Polynomial Kernel model is", svr_poly_mae)
print ("The mean absolute percentage accuracy is", (((svr_poly_mae)/ssid_df_mean)*100))

In [None]:
svr_poly_mdae = metrics.median_absolute_error(y_test, svr_poly_preda)
print ("The median absolute error of the SVR with Polynomial Kernel model is", svr_poly_mdae)
print ("The median absolute percentage accuracy is", (((svr_poly_mdae)/ssid_df_median)*100))

### 2.2c Repeat SVR with Polynomial Kernel model, continuous data normalised - training - default parameters

In [None]:
svrn_poly = SVR(kernel='poly')
svrn_poly.fit(Xn_train, yn_train)

In [None]:
svrn_poly_pred = svrn_poly.predict(Xn_train)
svrn_poly_rsq = metrics.r2_score(yn_train, svrn_poly_pred)
print ("The R-squared value of the SVR with Polynomial Kernel model is", svrn_poly_rsq)

In [None]:
svrn_poly_mae = metrics.mean_absolute_error(yn_train, svrn_poly_pred)
print ("The mean absolute error of the SVR with Polynomial Kernel model is", svrn_poly_mae)
print ("The mean absolute percentage accuracy is", (((svrn_poly_mae)/ssid_df_mean)*100))

In [None]:
svrn_poly_mdae = metrics.median_absolute_error(yn_train, svrn_poly_pred)
print ("The median absolute error of the SVR with Polynomial Kernel model is", svrn_poly_mdae)
print ("The median absolute percentage accuracy is", (((svrn_poly_mdae)/ssid_df_median)*100))

### 2.2d Repeat SVR with Polynomial Kernel model, continuous data normalised - testing  - default parameters

In [None]:
svrn_poly_preda = svrn_poly.predict(Xn_test)
svrn_poly_rsq = metrics.r2_score(yn_test, svrn_poly_preda)
print ("The R-squared value of the SVR with Polynomial Kernel model is", svrn_poly_rsq)

In [None]:
svrn_poly_mae = metrics.mean_absolute_error(yn_test, svrn_poly_preda)
print ("The mean absolute error of the SVR with Polynomial Kernel model is", svrn_poly_mae)
print ("The mean absolute percentage accuracy is", (((svrn_poly_mae)/ssid_df_mean)*100))

In [None]:
svrn_poly_mdae = metrics.median_absolute_error(yn_test, svrn_poly_preda)
print ("The median absolute error of the SVR with Polynomial Kernel model is", svrn_poly_mdae)
print ("The median absolute percentage accuracy is", (((svrn_poly_mdae)/ssid_df_median)*100))

### 2.3a Support Vector Machine Regression with RBF Kernel model - training - default parameters

In [None]:
svr_rbf = SVR(kernel='rbf')
svr_rbf.fit(X_train, y_train)

In [None]:
svr_rbf_pred = svr_rbf.predict(X_train)
svr_rbf_rsq = metrics.r2_score(y_train, svr_rbf_pred)
print ("The R-squared value of the SVR with RBF Kernel model is", svr_rbf_rsq)

In [None]:
svr_rbf_mae = metrics.mean_absolute_error(y_train, svr_rbf_pred)
print ("The mean absolute error of the SVR with RBF Kernel model is", svr_rbf_mae)
print ("The mean absolute percentage accuracy is", (((svr_rbf_mae)/ssid_df_mean)*100))

In [None]:
svr_rbf_mdae = metrics.median_absolute_error(y_train, svr_rbf_pred)
print ("The median absolute error of the SVR with RBF Kernel model is", svr_rbf_mdae)
print ("The median absolute percentage accuracy is", (((svr_rbf_mdae)/ssid_df_median)*100))

### 2.3b Support Vector Machine Regression with RBF Kernel model - testing - default parameters

In [None]:
svr_rbf_preda = svr_rbf.predict(X_test)
svr_rbf_rsq = metrics.r2_score(y_test, svr_rbf_preda)
print ("The R-squared value of the SVR with RBF Kernel model is", svr_rbf_rsq)

In [None]:
svr_rbf_mae = metrics.mean_absolute_error(y_test, svr_rbf_preda)
print ("The mean absolute error of the SVR with RBF Kernel model is", svr_rbf_mae)
print ("The mean absolute percentage accuracy is", (((svr_rbf_mae)/ssid_df_mean)*100))

In [None]:
svr_rbf_mdae = metrics.median_absolute_error(y_test, svr_rbf_preda)
print ("The median absolute error of the SVR with RBF Kernel model is", svr_rbf_mdae)
print ("The median absolute percentage accuracy is", (((svr_rbf_mdae)/ssid_df_median)*100))

### 2.3c Repeat SVR with RBF Kernel model, continuous data normalised - training  - default parameters

In [None]:
svrn_rbf = SVR(kernel='rbf')
svrn_rbf.fit(Xn_train, yn_train)

In [None]:
svrn_rbf_pred = svrn_rbf.predict(Xn_train)
svrn_rbf_rsq = metrics.r2_score(yn_train, svrn_rbf_pred)
print ("The R-squared value of the SVR with RBF Kernel model is", svrn_rbf_rsq)

In [None]:
svrn_rbf_mae = metrics.mean_absolute_error(yn_train, svrn_rbf_pred)
print ("The mean absolute error of the SVR with RBF Kernel model is", svrn_rbf_mae)
print ("The mean absolute percentage accuracy is", (((svrn_rbf_mae)/ssid_df_mean)*100))

In [None]:
svrn_rbf_mdae = metrics.median_absolute_error(yn_train, svrn_rbf_pred)
print ("The median absolute error of the SVR with RBF Kernel model is", svrn_rbf_mdae)
print ("The median absolute percentage accuracy is", (((svrn_rbf_mdae)/ssid_df_median)*100))

### 2.3d Repeat SVR with RBF Kernel model, continuous data normalised - testing  - default parameters

In [None]:
svrn_rbf_preda = svrn_rbf.predict(Xn_test)
svrn_rbf_rsq = metrics.r2_score(yn_test, svrn_rbf_preda)
print ("The R-squared value of the SVR with RBF Kernel model is", svrn_rbf_rsq)

In [None]:
svrn_rbf_mae = metrics.mean_absolute_error(yn_test, svrn_rbf_preda)
print ("The mean absolute error of the SVR with RBF Kernel model is", svrn_rbf_mae)
print ("The mean absolute percentage accuracy is", (((svrn_rbf_mae)/ssid_df_mean)*100))

In [None]:
svrn_rbf_mdae = metrics.median_absolute_error(yn_test, svrn_rbf_preda)
print ("The median absolute error of the SVR with RBF Kernel model is", svrn_rbf_mdae)
print ("The median absolute percentage accuracy is", (((svrn_rbf_mdae)/ssid_df_median)*100))

### 3a Decision Tree Regression model - training - default parameters

In [None]:
# code from here: https://gist.github.com/JustGlowing/fa2c0ac39415eb271db6

# decision tree regressor with default parameters, fitted on the training split
dtr = DTR().fit(X_train, y_train)
dtr

#### Printing ranking of features by estimated predictive value for Decision Tree Regression

In [None]:
# code adapted from http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html
# column positions ordered from the most to the least important feature
indices = np.argsort(dtr.feature_importances_)[::-1]

# Print the ordered feature ranking
print("Ordered feature ranking:")

for feat in indices:
    print(X_train.columns[feat], "\t", dtr.feature_importances_[feat])

In [None]:
dtr_pred = dtr.predict(X_train)
dtr_rsq = metrics.r2_score(y_train, dtr_pred)
print ("The R-squared value of the Decision Tree Regression model is", dtr_rsq)

In [None]:
dtr_mae = metrics.mean_absolute_error(y_train, dtr_pred)
print ("The mean absolute error of the Decision Tree Regression model is", dtr_mae)
print ("The mean absolute percentage accuracy is", (((dtr_mae)/ssid_df_mean)*100))

In [None]:
dtr_mdae = metrics.median_absolute_error(y_train, dtr_pred)
print ("The median absolute error of the Decision Tree Regression model is", dtr_mdae)
print ("The median absolute percentage accuracy is", (((dtr_mdae)/ssid_df_median)*100))

### 3b Decision Tree Regression model - testing - default parameters

In [None]:
dtr_preda = dtr.predict(X_test)
dtr_rsq = metrics.r2_score(y_test, dtr_preda)
print ("The R-squared value of the Decision Tree Regression model is", dtr_rsq)

In [None]:
dtr_mae = metrics.mean_absolute_error(y_test, dtr_preda)
print ("The mean absolute error of the Decision Tree Regression model is", dtr_mae)
print ("The mean absolute percentage accuracy is", (((dtr_mae)/ssid_df_mean)*100))

In [None]:
dtr_mdae = metrics.median_absolute_error(y_test, dtr_preda)
print ("The median absolute error of the Decision Tree Regression model is", dtr_mdae)
print ("The median absolute percentage accuracy is", (((dtr_mdae)/ssid_df_median)*100))

### 4a Decision Tree Regression with AdaBoost model - training - default parameters

In [None]:
abr = ABR()
abr.fit(X_train, y_train)

In [None]:
indices = np.argsort(abr.feature_importances_)[::-1]

# Print the ordered feature ranking
print("Ordered feature ranking:")

for f in range(X_train.shape[1]):
    feat = indices[f]
    print(X_train.columns[feat], "\t", dtr.feature_importances_[indices[f]])

In [None]:
abr_pred = abr.predict(X_train)
abr_rsq = metrics.r2_score(y_train, abr_pred)
print ("The R-squared value of the Decision Tree Regression with AdaBoost model is", abr_rsq)

In [None]:
abr_mae = metrics.mean_absolute_error(y_train, abr_pred)
print ("The mean absolute error of the Decision Tree Regression with AdaBoost model is", abr_mae)
print ("The mean absolute percentage accuracy is", (((abr_mae)/ssid_df_mean)*100))

In [None]:
abr_mdae = metrics.median_absolute_error(y_train, abr_pred)
print ("The median absolute error of the Decision Tree Regression with AdaBoost model is", abr_mdae)
print ("The median absolute percentage accuracy is", (((abr_mdae)/ssid_df_median)*100))

### 4b Decision Tree Regression with AdaBoost model - testing - default parameters

In [None]:
abr_preda = abr.predict(X_test)
abr_rsq = metrics.r2_score(y_test, abr_preda)
print ("The R-squared value of the Decision Tree Regression with AdaBoost model is", abr_rsq)

In [None]:
abr_mae = metrics.mean_absolute_error(y_test, abr_preda)
print ("The mean absolute error of the Decision Tree Regression with AdaBoost model is", abr_mae)
print ("The mean absolute percentage accuracy is", (((abr_mae)/ssid_df_mean)*100))

In [None]:
abr_mdae = metrics.median_absolute_error(y_test, abr_preda)
print ("The median absolute error of the Decision Tree Regression with AdaBoost model is", abr_mdae)
print ("The median absolute percentage accuracy is", (((abr_mdae)/ssid_df_median)*100))

### 5a Gradient Boosting Regression model - training - default parameters

In [None]:
gbr = GBR()
gbr.fit(X_train, y_train)

In [None]:
indices = np.argsort(gbr.feature_importances_)[::-1]

# Print the ordered feature ranking
print("Ordered feature ranking:")

for f in range(X_train.shape[1]):
    feat = indices[f]
    print(X_train.columns[feat], "\t", gbr.feature_importances_[indices[f]])

In [None]:
gbr_pred = gbr.predict(X_train)
gbr_rsq = metrics.r2_score(y_train, gbr_pred)
print ("The R-squared value of the Gradient Boosting Regression model is", gbr_rsq)

In [None]:
gbr_mae = metrics.mean_absolute_error(y_train, gbr_pred)
print ("The mean absolute error of the Gradient Boosting Regression model is", gbr_mae)
print ("The mean absolute percentage accuracy is", (((gbr_mae)/ssid_df_mean)*100))

In [None]:
gbr_mdae = metrics.median_absolute_error(y_train, gbr_pred)
print ("The median absolute error of the Gradient Boosting Regression model is", gbr_mdae)
print ("The median absolute percentage accuracy is", (((gbr_mdae)/ssid_df_median)*100))

### 5b Gradient Boosting Regression model - testing - default parameters

In [None]:
gbr_preda = gbr.predict(X_test)
gbr_rsq = metrics.r2_score(y_test, gbr_preda)
print ("The R-squared value of the Gradient Boosting Regression model is", gbr_rsq)

In [None]:
gbr_mae = metrics.mean_absolute_error(y_test, gbr_preda)
print ("The mean absolute error of the Gradient Boosting Regression model is", gbr_mae)
print ("The mean absolute percentage accuracy is", (((gbr_mae)/ssid_df_mean)*100))

In [None]:
gbr_mdae = metrics.median_absolute_error(y_test, gbr_preda)
print ("The median absolute error of the Gradient Boosting Regression model is", gbr_mdae)
print ("The median absolute percentage accuracy is", (((gbr_mdae)/ssid_df_median)*100))

### 6a Random Forest Regression model (all default) - training - default parameters

In [None]:
rfr = RFR(n_jobs = cores)
rfr.fit(X_train, y_train)

In [None]:
indices = np.argsort(rfr.feature_importances_)[::-1]

# Print the ordered feature ranking
print("Ordered feature ranking:")

for f in range(X_train.shape[1]):
    feat = indices[f]
    print(X_train.columns[feat], "\t", dtr.feature_importances_[indices[f]])

In [None]:
rfr_pred = rfr.predict(X_train)
rfr_rsq = metrics.r2_score(y_train, rfr_pred)
print ("The R-squared value of the Random Forest Regression model is", rfr_rsq)

In [None]:
rfr_mae = metrics.mean_absolute_error(y_train, rfr_pred)
print ("The mean absolute error of the Random Forest Regression model is", rfr_mae)
print ("The mean absolute percentage accuracy is", (((rfr_mae)/ssid_df_mean)*100))

In [None]:
rfr_mdae = metrics.median_absolute_error(y_train, rfr_pred)
print ("The median absolute error of the Random Forest Regression model is", rfr_mdae)
print ("The median absolute percentage accuracy is", (((rfr_mdae)/ssid_df_median)*100))

### 6b Random Forest Regression model (all default) - testing - default parameters

In [None]:
rfr_preda = rfr.predict(X_test)
rfr_rsq = metrics.r2_score(y_test, rfr_preda)
print ("The R-squared value of the Random Forest Regression model is", rfr_rsq)

In [None]:
rfr_mae = metrics.mean_absolute_error(y_test, rfr_preda)
print ("The mean absolute error of the Random Forest Regression model is", rfr_mae)
print ("The mean absolute percentage accuracy is", (((rfr_mae)/ssid_df_mean)*100))

In [None]:
rfr_mdae = metrics.median_absolute_error(y_test, rfr_preda)
print ("The median absolute error of the Random Forest Regression model is", rfr_mdae)
print ("The median absolute percentage accuracy is", (((rfr_mdae)/ssid_df_median)*100))

### Random Forest Regression model - OTT Gridsearch!

The data from this has been lost, but it took many hours to run.  Too many combinations of parameters were chosen, as this method exhaustively models each and every one!  We decided that RandomizedSearchCV would be a better, more cost-effective approach.

In [None]:
# Exhaustive grid search over 5 x 6 x 6 x 4 x 3 = 2160 parameter combinations, each
# cross-validated 5 times on the full X, y.
# NOTE: this re-binds `rfr`, replacing the fitted forest from section 6a.
rfr = RFR()
params = { 
    'n_estimators': [120, 300, 500, 800, 1200],
    'max_depth':[5, 8, 15, 25, 30, None],
    # NOTE(review): 1.0 is a float, so scikit-learn treats it as a fraction of the samples
    # rather than a count - confirm this entry is intended
    'min_samples_split':[1.0, 2, 5, 10, 15, 100],
    'min_samples_leaf':[1, 2, 5, 10],
    # None = consider all features, which is what 'auto' meant for a regressor; 'auto' itself
    # is rejected by recent scikit-learn releases
    'max_features': [None, 'sqrt', 'log2']
}
# n_jobs spreads the candidate fits over the configured number of cores
CV_rfr = GSCV(estimator=rfr, param_grid=params, cv= 5, n_jobs=cores)
CV_rfr.fit(X, y)

In [None]:
print (CV_rfr.best_params_)

In [None]:
CV_rfr.best_score_

In [None]:
rfr_pred = CV_rfr.predict(X)
rfr_rsq = metrics.r2_score(y, rfr_pred)
print ("The R-squared value of the Random Forest Regression model is", rfr_rsq)

In [None]:
rfr_mae = metrics.mean_absolute_error(y, rfr_pred)
print ("The mean absolute error of the Random Forest Regression model is", rfr_mae)

In [None]:
rfr_mdae = metrics.median_absolute_error(y, rfr_pred)
print ("The median absolute error of the Random Forest Regression model is", rfr_mdae)

Observation:  SVR is far too slow for our purposes with Linear or Polynomial kernels, but surprisingly is much quicker and more accurate with the RBF kernel - we will disregard the first two but continue testing the latter.