# Time Series SVM & Linear Regression Models w/ Sklearn

### Load Necessary Libraries

In [90]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import precision_recall_curve, average_precision_score, auc, roc_auc_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm
print("Done")

Done


In [91]:
df = pd.read_csv("esp8266_readings - Sheet1.csv")
df.columns

Index(['Date', 'Event Name', 'Value1', 'Value2', 'Value3'], dtype='object')

In [92]:
df['Date']

0       November 18, 2019 at 08:44PM
1       November 18, 2019 at 08:45PM
2       November 18, 2019 at 08:46PM
3       November 18, 2019 at 08:47PM
4       November 18, 2019 at 08:48PM
                    ...             
5386    November 22, 2019 at 03:16PM
5387    November 22, 2019 at 03:17PM
5388    November 22, 2019 at 03:18PM
5389    November 22, 2019 at 03:19PM
5390    November 22, 2019 at 03:20PM
Name: Date, Length: 5391, dtype: object

In [93]:
df = df.rename(columns={'Event Name': 'Event_Name', 'Value1': 'Digital_Button', 'Value2':'Photoresistor', 'Value3':'Temp; Humidity'})
df.head(15)

Unnamed: 0,Date,Event_Name,Digital_Button,Photoresistor,Temp; Humidity
0,"November 18, 2019 at 08:44PM",esp8266_readings,1,204,22.5
1,"November 18, 2019 at 08:45PM",esp8266_readings,1,208,22.4
2,"November 18, 2019 at 08:46PM",esp8266_readings,1,200,22.4
3,"November 18, 2019 at 08:47PM",esp8266_readings,1,204,22.4
4,"November 18, 2019 at 08:48PM",esp8266_readings,1,330,22.4
5,"November 18, 2019 at 08:49PM",esp8266_readings,1,207,22.4
6,"November 18, 2019 at 08:50PM",esp8266_readings,1,200,22.4
7,"November 18, 2019 at 08:51PM",esp8266_readings,1,207,22.4
8,"November 18, 2019 at 08:52PM",esp8266_readings,1,201,22.4
9,"November 18, 2019 at 08:53PM",esp8266_readings,1,195,22.40;37.00


In [94]:
df.shape

(5391, 5)

In [95]:
df = df.drop([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [96]:
df[['Temp', 'Humidity']] = df['Temp; Humidity'].str.split(';', expand=True)
df

Unnamed: 0,Date,Event_Name,Digital_Button,Photoresistor,Temp; Humidity,Temp,Humidity
9,"November 18, 2019 at 08:53PM",esp8266_readings,1,195,22.40;37.00,22.40,37.00
10,"November 18, 2019 at 08:54PM",esp8266_readings,0,310,22.40;37.00,22.40,37.00
11,"November 18, 2019 at 08:55PM",esp8266_readings,0,209,22.40;37.00,22.40,37.00
12,"November 18, 2019 at 08:56PM",esp8266_readings,0,210,22.40;38.00,22.40,38.00
13,"November 18, 2019 at 08:57PM",esp8266_readings,0,208,22.80;36.00,22.80,36.00
...,...,...,...,...,...,...,...
5386,"November 22, 2019 at 03:16PM",esp8266_readings,1,702,18.60;49.00,18.60,49.00
5387,"November 22, 2019 at 03:17PM",esp8266_readings,1,681,18.60;49.00,18.60,49.00
5388,"November 22, 2019 at 03:18PM",esp8266_readings,1,668,18.60;49.00,18.60,49.00
5389,"November 22, 2019 at 03:19PM",esp8266_readings,1,678,18.60;49.00,18.60,49.00


In [97]:
df = df.drop(columns="Temp; Humidity")
df

Unnamed: 0,Date,Event_Name,Digital_Button,Photoresistor,Temp,Humidity
9,"November 18, 2019 at 08:53PM",esp8266_readings,1,195,22.40,37.00
10,"November 18, 2019 at 08:54PM",esp8266_readings,0,310,22.40,37.00
11,"November 18, 2019 at 08:55PM",esp8266_readings,0,209,22.40,37.00
12,"November 18, 2019 at 08:56PM",esp8266_readings,0,210,22.40,38.00
13,"November 18, 2019 at 08:57PM",esp8266_readings,0,208,22.80,36.00
...,...,...,...,...,...,...
5386,"November 22, 2019 at 03:16PM",esp8266_readings,1,702,18.60,49.00
5387,"November 22, 2019 at 03:17PM",esp8266_readings,1,681,18.60,49.00
5388,"November 22, 2019 at 03:18PM",esp8266_readings,1,668,18.60,49.00
5389,"November 22, 2019 at 03:19PM",esp8266_readings,1,678,18.60,49.00


### Now, let's clean up the 'Date' Column

1. Separate the Date column and split the date and time values
2. Then, convert 12 hour time -> 24 hour time and add those values to a list (**'times'**)
3. After, let's convert that list to a dataframe, split the hours and minutes, and add the hour column back to our core dataframe, **df**. 

In [98]:
date_df = pd.DataFrame(df['Date'])
date_df = date_df['Date'].str.split('at', expand = True)

In [99]:
%%capture
def hourConverter():
    times = []
    for val in date_df[1].iteritems():
        if str(val[1][-2:]) == 'AM':
            times.append(int(val[1][0:3]))
        elif str(val[1][-2:]) == 'PM':
            times.append(int(val[1][0:3]) + 12)
    return times
hourConverter()

In [100]:
r, c = df.shape
len(hourConverter()) == r

True

In [101]:
len(df) == len(hourConverter())

True

In [102]:
time_df = pd.DataFrame(hourConverter())
time_df = time_df.rename(columns={0: 'Hour'})
time_df.dtypes

Hour    int64
dtype: object

In [103]:
time_df = pd.get_dummies(time_df.astype(str))
time_df

Unnamed: 0,Hour_1,Hour_10,Hour_11,Hour_12,Hour_13,Hour_14,Hour_15,Hour_16,Hour_17,Hour_18,...,Hour_22,Hour_23,Hour_24,Hour_3,Hour_4,Hour_5,Hour_6,Hour_7,Hour_8,Hour_9
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5377,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5378,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5379,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5380,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Need to Append the time_df dataframe to main dataframe

In [104]:
df.index = np.arange(0, len(df))

In [105]:
df = pd.concat([df, time_df], axis =1)

In [106]:
df = df.drop(columns=['Date'])

In [107]:
df

Unnamed: 0,Event_Name,Digital_Button,Photoresistor,Temp,Humidity,Hour_1,Hour_10,Hour_11,Hour_12,Hour_13,...,Hour_22,Hour_23,Hour_24,Hour_3,Hour_4,Hour_5,Hour_6,Hour_7,Hour_8,Hour_9
0,esp8266_readings,1,195,22.40,37.00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,esp8266_readings,0,310,22.40,37.00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,esp8266_readings,0,209,22.40,37.00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,esp8266_readings,0,210,22.40,38.00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,esp8266_readings,0,208,22.80,36.00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5377,esp8266_readings,1,702,18.60,49.00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5378,esp8266_readings,1,681,18.60,49.00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5379,esp8266_readings,1,668,18.60,49.00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5380,esp8266_readings,1,678,18.60,49.00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Normalization

In [108]:
df['Photoresistor'] = df['Photoresistor']/df['Photoresistor'].max()

In [109]:
df['Temp'] = df['Temp'].astype(float)/df['Temp'].astype(float).max()

In [110]:
df['Humidity'] = df['Humidity'].astype(float)/df['Humidity'].astype(float).max()

In [111]:
df

Unnamed: 0,Event_Name,Digital_Button,Photoresistor,Temp,Humidity,Hour_1,Hour_10,Hour_11,Hour_12,Hour_13,...,Hour_22,Hour_23,Hour_24,Hour_3,Hour_4,Hour_5,Hour_6,Hour_7,Hour_8,Hour_9
0,esp8266_readings,1,0.190430,0.982456,0.569231,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,esp8266_readings,0,0.302734,0.982456,0.569231,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,esp8266_readings,0,0.204102,0.982456,0.569231,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,esp8266_readings,0,0.205078,0.982456,0.584615,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,esp8266_readings,0,0.203125,1.000000,0.553846,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5377,esp8266_readings,1,0.685547,0.815789,0.753846,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5378,esp8266_readings,1,0.665039,0.815789,0.753846,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5379,esp8266_readings,1,0.652344,0.815789,0.753846,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5380,esp8266_readings,1,0.662109,0.815789,0.753846,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# SVM Model

In [112]:
from sklearn.svm import SVC
x = df.drop(columns=["Digital_Button", "Event_Name"]).values
y = df['Digital_Button'].values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = .3, random_state = 42)


svm_clf = SVC(kernel="linear")
svm_clf.fit(x_train, y_train)

y_preds = svm_clf.predict(x_test)

results = pd.DataFrame({
    'Actual': y_test,
    'Predictions': y_preds
})

results.head(10)

svm_clf.score(x_train, y_train)
svm_clf.score(x_test, y_test)

0.9151702786377709

# Evaluate the Model

In [113]:
from sklearn.metrics import zero_one_loss # looking at % misclassifications, 0 is best

print("Accuracy of TRAIN " + str(svm_clf.__class__.__name__) + ": " + str(svm_clf.score(x_train, y_train)))
print("Accuracy of TEST " + str(svm_clf.__class__.__name__) + ": " + str(svm_clf.score(x_test, y_test)))

misclassification_error = zero_one_loss(y_test, y_preds)

print("The Misclassification Rate is: " + str(misclassification_error) + ", \nwhich should equal 1 - accuracy: " + str(1-svm_clf.score(x_test, y_test)))


Accuracy of TRAIN SVC: 0.9224847358640829
Accuracy of TEST SVC: 0.9151702786377709
The Misclassification Rate is: 0.0848297213622291, 
which should equal 1 - accuracy: 0.0848297213622291


# Lin Reg Model

In [114]:
x = df.drop(columns=['Event_Name', 'Digital_Button'])
y = df['Digital_Button']

In [115]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = .3, random_state = 42)

reg = LinearRegression()
reg.fit(x_train, y_train)
print("The y-intercept is " + str(reg.intercept_))
print("The coefficients are " + str(reg.coef_))

coefficients = pd.DataFrame(reg.coef_, x.columns, columns=["Variable Coefficients"])
coefficients


The y-intercept is 429806622259.8897
The coefficients are [-2.98752368e-01  2.34489171e-02  2.59425140e+00 -4.29806622e+11
 -4.29806622e+11 -4.29806622e+11 -4.29806622e+11 -4.29806622e+11
 -4.29806622e+11 -4.29806622e+11 -4.29806622e+11 -4.29806622e+11
 -4.29806622e+11 -4.29806622e+11 -4.29806622e+11 -4.29806622e+11
 -4.29806622e+11 -4.29806622e+11 -4.29806622e+11 -4.29806622e+11
 -4.29806622e+11 -4.29806622e+11 -4.29806622e+11 -4.29806622e+11
 -4.29806622e+11 -4.29806622e+11 -4.29806622e+11]


Unnamed: 0,Variable Coefficients
Photoresistor,-0.2987524
Temp,0.02344892
Humidity,2.594251
Hour_1,-429806600000.0
Hour_10,-429806600000.0
Hour_11,-429806600000.0
Hour_12,-429806600000.0
Hour_13,-429806600000.0
Hour_14,-429806600000.0
Hour_15,-429806600000.0


In [116]:
results = pd.DataFrame({
    'Actual': y_test,
    'Predictions': y_preds
})

results.head(10)


Unnamed: 0,Actual,Predictions
3271,1,1
907,0,0
4579,1,1
3463,1,1
319,1,1
1717,1,1
3616,0,0
2093,1,1
5212,1,0
1593,1,1


In [117]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

mse = round(mean_squared_error(y_test, y_preds), 2)
rmse = round(np.sqrt(mean_squared_error(y_test, y_preds)), 2)

print("MSE for " + str(reg.__class__.__name__) + " " +str(mse))
print("RMSE for " + str(reg.__class__.__name__) + " " +str(rmse)) # Std. Dev of our prediction errors (distance between predicted and test y's) -> y lies between 0 and 1, so this is not a great RMSE value
print("MAE for " + str(reg.__class__.__name__) + " " +str(mean_absolute_error(y_test, y_preds)))
print("\n")
print("Accuracy of TRAIN for " + str(reg.__class__.__name__) + ": " + str(reg.score(x_train, y_train)))
print("Accuracy of TEST for " + str(reg.__class__.__name__) + ": " + str(reg.score(x_test, y_test)))


MSE for LinearRegression 0.08
RMSE for LinearRegression 0.29
MAE for LinearRegression 0.0848297213622291


Accuracy of TRAIN for LinearRegression: 0.6792702800546452
Accuracy of TEST for LinearRegression: 0.6296697486679239


# Let's test a few iterations of feature combinations

In [118]:
for combination in (df[["Photoresistor", "Temp", "Humidity"]], df[["Photoresistor", "Temp"]],df[["Temp", "Humidity"]], df.drop(columns=['Digital_Button', 'Event_Name'])):
    x = combination.values
    y = df['Digital_Button'].values
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = .3, random_state = 42)
    reg = LinearRegression()
    reg.fit(x_train, y_train)
    y_preds = reg.predict(x_test)
    print("The intercept is " + str(reg.intercept_))
    print("The coefficients are " + str(reg.coef_))
    print("MSE: " + str(round(mean_squared_error(y_test, y_preds), 2)))
    print("RMSE: " + str(round(np.sqrt(mean_squared_error(y_test, y_preds)), 2)))
    print("\n")
    print("Accuracy of TRAIN " + str(reg.__class__.__name__) + ": " + str(reg.score(x_train, y_train)))
    print("Accuracy of TEST " + str(reg.__class__.__name__) + ": " + str(reg.score(x_test, y_test)))
    print("--------------------------------------------------------------------------------")
    print("\n")

    



The intercept is 3.298678339133956
The coefficients are [ 2.25883548e-04 -4.32657332e+00  1.10446807e+00]
MSE: 0.19
RMSE: 0.43


Accuracy of TRAIN LinearRegression: 0.17247353418702194
Accuracy of TEST LinearRegression: 0.16529076656232955
--------------------------------------------------------------------------------


The intercept is 4.533919363264606
The coefficients are [ 0.02332053 -4.85086885]
MSE: 0.19
RMSE: 0.44


Accuracy of TRAIN LinearRegression: 0.16147411632851882
Accuracy of TEST LinearRegression: 0.1486365474391229
--------------------------------------------------------------------------------


The intercept is 3.2990022119509055
The coefficients are [-4.32685317  1.10457293]
MSE: 0.19
RMSE: 0.43


Accuracy of TRAIN LinearRegression: 0.17247352397326166
Accuracy of TEST LinearRegression: 0.16529085424978673
--------------------------------------------------------------------------------


The intercept is 977243033674.1093
The coefficients are [-2.98212470e-01  1.971