## Preprocessing and feature extraction

In [24]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score,classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


def preprocessing(i, j, window=60):
    """Clean the recordings ``S<i>.csv`` .. ``S<j>.csv`` and add heart-rate features.

    For every integer ``n`` in the inclusive range ``[i, j]`` the function:

    1. reads ``S<n>.csv`` from the current working directory,
    2. drops every row that contains a missing value,
    3. parses the ``Time`` column and uses it as the index,
    4. adds ``dropValue`` (difference of ``HR`` to the previous row) and the
       rolling mean / median / standard deviation / max / min of ``HR``,
    5. drops rows where a feature is undefined (at least the first row, whose
       difference and rolling standard deviation are NaN),
    6. writes the result to ``SNew<n>.csv``.

    Parameters
    ----------
    i, j : int
        First and last file number to process (both inclusive).
    window : int, default 60
        Number of consecutive rows covered by each rolling statistic.

    Returns
    -------
    list of pandas.DataFrame
        One processed frame per input file, in file-number order.
    """
    dataframes_list = []
    for subject_id in range(i, j + 1):
        # Load the raw recording and discard incomplete rows before any
        # feature is derived from it.
        dataset = pd.read_csv("S" + str(subject_id) + ".csv").dropna()

        # `infer_datetime_format` is deprecated in pandas 2.x; plain
        # to_datetime parses the same values.
        dataset['Time'] = pd.to_datetime(dataset['Time'])
        indexed_dataset = dataset.set_index(['Time'])

        # NOTE: quantile-based outlier removal on HR was sketched here in an
        # earlier revision but was disabled; no outlier filtering is applied.

        # Rolling statistics smooth short-term fluctuations and expose the
        # trend over the last `window` rows. min_periods=1 lets the window
        # grow from a single row at the start of the recording.
        heart_rate = indexed_dataset['HR']
        rolling_hr = heart_rate.rolling(window=window, min_periods=1)
        indexed_dataset = indexed_dataset.assign(
            dropValue=heart_rate.diff(),
            rollmean=rolling_hr.mean(),
            rollmedian=rolling_hr.median(),
            rollstd=rolling_hr.std(),
            rollMax=rolling_hr.max(),
            rollMin=rolling_hr.min(),
        ).dropna()

        indexed_dataset.to_csv("SNew" + str(subject_id) + ".csv", index=True)
        dataframes_list.append(indexed_dataset)

    return dataframes_list

# Files S1.csv .. S15.csv make up the training table: the per-file frames
# are stacked row-wise and a copy is saved to disk.
training_frames = preprocessing(1, 15)
training_dataset = pd.concat(training_frames)
training_dataset.to_csv("TrainingData_processed.csv")

# Files S16.csv .. S18.csv are kept apart as the test table.
test_frames = preprocessing(16, 18)
test_dataset = pd.concat(test_frames)
test_dataset.to_csv("TestData_processed.csv")

# Last expression of the cell: show the combined training table.
training_dataset



Unnamed: 0_level_0,HR,Attentive,dropValue,rollmean,rollmedian,rollstd,rollMax,rollMin
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-01-09 13:00:57,81.0,Yes,3.0,79.500000,79.5,2.121320,81.0,78.0
2022-01-09 13:00:58,84.0,Yes,3.0,81.000000,81.0,3.000000,84.0,78.0
2022-01-09 13:00:59,90.0,Yes,6.0,83.250000,82.5,5.123475,90.0,78.0
2022-01-09 13:01:00,90.0,Yes,0.0,84.600000,84.0,5.366563,90.0,78.0
2022-01-09 13:01:01,92.0,Yes,2.0,85.833333,87.0,5.671567,92.0,78.0
...,...,...,...,...,...,...,...,...
2022-01-09 20:03:05,80.0,Yes,3.0,82.983333,81.0,6.347271,94.0,74.0
2022-01-09 20:03:06,87.0,Yes,7.0,82.950000,81.0,6.320333,94.0,74.0
2022-01-09 20:03:07,86.0,Yes,-1.0,82.916667,81.0,6.298484,94.0,74.0
2022-01-09 20:03:08,86.0,Yes,0.0,82.900000,81.0,6.288812,94.0,74.0


## Feature selection

In [18]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Positional layout of the processed frames: column 1 holds the label,
# the listed positions hold the numeric features.
feature_cols = [0, 2, 3, 4, 5, 6, 7]
label_col = 1

X = training_dataset.iloc[:, feature_cols].values
y = training_dataset.iloc[:, label_col].values

X_train = X
y_train = y

X_test = test_dataset.iloc[:, feature_cols].values
y_test = test_dataset.iloc[:, label_col].values

# Score every feature with f_classif. k equals the number of features, so
# nothing is discarded here; the scores are only used to rank the features.
bestfeatures = SelectKBest(score_func=f_classif, k=len(feature_cols))
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)

# Take the names from the frame itself so the labels cannot drift away from
# the columns that were actually selected above.
index = list(training_dataset.columns[feature_cols])
dfcolumns = pd.DataFrame(index)

# Put names and scores side by side for readability.
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Feature','Score']  # naming the dataframe columns
print(featureScores.nlargest(len(index),'Score'))  # all features, best score first


      Feature        Score
4     rollstd  4274.731703
5     rollMax  3697.342199
0          HR  3001.693564
2    rollmean  1371.309387
3  rollmedian  1027.111558
6     rollMin   932.452301
1   dropValue    13.848855


## Model building

In [25]:
# Column positions in the processed frames: position 1 is the label, the
# remaining listed positions are the model inputs.
feature_positions = [0, 2, 3, 4, 5, 6, 7]
label_position = 1

# Training arrays; X / y are kept as aliases of the same arrays.
X_train = training_dataset.iloc[:, feature_positions].values
y_train = training_dataset.iloc[:, label_position].values
X = X_train
y = y_train

# Test arrays, sliced the same way from the test table.
X_test = test_dataset.iloc[:, feature_positions].values
y_test = test_dataset.iloc[:, label_position].values

In [34]:
!pip install mlxtend

Collecting mlxtend
  Downloading mlxtend-0.19.0-py2.py3-none-any.whl (1.3 MB)
Installing collected packages: mlxtend
Successfully installed mlxtend-0.19.0


### Random Forest

In [37]:
# Random forest classifier
import mlxtend 
from mlxtend.evaluate import bias_variance_decomp
# Forest of 40 trees. The seed is fixed so that the confusion matrix and the
# scores below are the same on every run instead of changing with each fit.
rf = RandomForestClassifier(n_estimators = 40, random_state = 40)
rf.fit(X_train, y_train)

# Predict the label of every row of the test table.
y_pred = rf.predict(X_test)

# Confusion matrix, overall accuracy, per-class report, then the raw predictions.
cm = confusion_matrix(y_test, y_pred)
print(cm)
print()
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(y_pred)


[[ 265    0 1653]
 [   4   52   57]
 [ 311   28 8780]]

0.8158744394618834
                precision    recall  f1-score   support

            No       0.46      0.14      0.21      1918
Not Applicable       0.65      0.46      0.54       113
           Yes       0.84      0.96      0.90      9119

      accuracy                           0.82     11150
     macro avg       0.65      0.52      0.55     11150
  weighted avg       0.77      0.82      0.77     11150

['Yes' 'Yes' 'Yes' ... 'Yes' 'Yes' 'Yes']


### Logistic Regression

In [21]:
# Fit a logistic regression (liblinear solver) on the training arrays;
# fit() returns the estimator itself, so `lr` is the fitted model.
lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)

# Labels predicted for the test table.
y_pred = lr.predict(X_test)

# Report: confusion matrix, a blank line, overall accuracy, per-class metrics.
cm = confusion_matrix(y_test, y_pred)
print(cm, end="\n\n")
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[   0    0 1918]
 [   6    1  106]
 [   0    5 9114]]

0.8174887892376682
                precision    recall  f1-score   support

            No       0.00      0.00      0.00      1918
Not Applicable       0.17      0.01      0.02       113
           Yes       0.82      1.00      0.90      9119

      accuracy                           0.82     11150
     macro avg       0.33      0.34      0.31     11150
  weighted avg       0.67      0.82      0.74     11150



### KNN

In [22]:
# 3-nearest-neighbours classifier fitted on the training arrays;
# fit() returns the estimator itself, so `knn` is the fitted model.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Labels predicted for the test table.
y_pred = knn.predict(X_test)

# Report: confusion matrix, a blank line, overall accuracy, per-class metrics.
cm = confusion_matrix(y_test, y_pred)
print(cm, end="\n\n")
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[ 331    0 1587]
 [  13   53   47]
 [ 611   48 8460]]

0.7931838565022421
                precision    recall  f1-score   support

            No       0.35      0.17      0.23      1918
Not Applicable       0.52      0.47      0.50       113
           Yes       0.84      0.93      0.88      9119

      accuracy                           0.79     11150
     macro avg       0.57      0.52      0.54     11150
  weighted avg       0.75      0.79      0.76     11150



### SVM

In [23]:
from sklearn.svm import SVC
# RBF-kernel support vector classifier.
# The fitted estimator is bound to `svc` (lower case) so that the imported
# `SVC` class is not overwritten by an instance and stays callable afterwards.
svc = SVC(kernel = 'rbf')
svc.fit(X_train, y_train)

# Labels predicted for the test table.
y_pred = svc.predict(X_test)

# Confusion matrix, overall accuracy and per-class report.
cm = confusion_matrix(y_test, y_pred)
print(cm)
print()
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[   0    0 1918]
 [   0   59   54]
 [   0   27 9092]]

0.8207174887892377
                precision    recall  f1-score   support

            No       0.00      0.00      0.00      1918
Not Applicable       0.69      0.52      0.59       113
           Yes       0.82      1.00      0.90      9119

      accuracy                           0.82     11150
     macro avg       0.50      0.51      0.50     11150
  weighted avg       0.68      0.82      0.74     11150



  _warn_prf(average, modifier, msg_start, len(result))


## Write the predicted drowsiness percentages into a pickle file

In [13]:
# Reload the processed tables from disk so this cell does not depend on the
# data frames built in earlier cells.
training_dataset = pd.read_csv('TrainingData_processed.csv')
training_dataset['Time'] = pd.to_datetime(training_dataset['Time'])
training_dataset = training_dataset.set_index(['Time'])

test_dataset = pd.read_csv('TestData_processed.csv')
test_dataset['Time'] = pd.to_datetime(test_dataset['Time'])
test_dataset = test_dataset.set_index(['Time'])

# Column 1 is the label; the remaining listed positions are the features.
X = training_dataset.iloc[:, [0, 2, 3, 4, 5, 6, 7]].values
y = training_dataset.iloc[:, 1].values

X_train = X
y_train = y

X_test = test_dataset.iloc[:, [0, 2, 3, 4, 5, 6, 7]].values
y_test = test_dataset.iloc[:, 1].values

# Random forest classifier
# NOTE(review): the Random Forest cell earlier in the notebook passes
# n_estimators=40, while this one passes random_state=40 and keeps the
# default number of trees - confirm which of the two was intended.
rf = RandomForestClassifier(random_state = 40)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

# Predicted drowsiness as a percentage per block of 120 consecutive
# predictions (the original comment calls this "every 2 minutes").
# Only complete blocks are scored; a shorter trailing block is ignored.
# "Yes" and "Not Applicable" predictions are not counted as drowsy; every
# other label is.
WINDOW_SIZE = 120
DrowsyPercentageList = []
for start in range(0, len(y_pred) - WINDOW_SIZE + 1, WINDOW_SIZE):
    window_labels = y_pred[start:start + WINDOW_SIZE]
    sum_drowsy = sum(
        1 for label in window_labels
        if label != "Yes" and label != "Not Applicable"
    )
    DrowsyPercentage = (sum_drowsy/WINDOW_SIZE)*100
    DrowsyPercentageList.append(round(DrowsyPercentage, 2))
    print(round(DrowsyPercentage, 2))

# Open the output file only once the result exists: a failure while loading
# or training then cannot leave a truncated pickle behind.
with open('DrowsyRate_pulseModule.pkl', 'wb') as handle:
    pickle.dump(DrowsyPercentageList, handle)

   

0.0
0.83
0.0
0.83
5.0
14.17
0.0
0.83
0.0
0.0
5.83
1.67
0.0
0.0
7.5
7.5
0.0
0.0
10.83
9.17
22.5
0.0
10.0
3.33
12.5
10.0
15.83
4.17
8.33
0.0
0.0
0.0
0.83
0.0
0.0
4.17
0.0
6.67
20.0
0.0
0.0
0.0
0.0
7.5
0.0
1.67
22.5
0.0
0.0
0.0
3.33
1.67
0.83
19.17
0.0
0.0
0.0
5.83
0.0
0.0
5.0
0.0
0.0
19.17
5.83
7.5
9.17
20.83
23.33
0.0
6.67
14.17
6.67
1.67
0.0
5.83
10.0
12.5
0.83
1.67
9.17
9.17
4.17
0.0
0.0
0.0
1.67
0.0
0.0
0.0
0.83
0.0


In [14]:
# Read the stored list back from disk and print it to confirm the round trip.
# The file is produced by this notebook; pickle files from untrusted sources
# must not be loaded this way.
with open('DrowsyRate_pulseModule.pkl', 'rb') as saved_file:
    b = pickle.load(saved_file)
print(b)

[0.0, 0.83, 0.0, 0.83, 5.0, 14.17, 0.0, 0.83, 0.0, 0.0, 5.83, 1.67, 0.0, 0.0, 7.5, 7.5, 0.0, 0.0, 10.83, 9.17, 22.5, 0.0, 10.0, 3.33, 12.5, 10.0, 15.83, 4.17, 8.33, 0.0, 0.0, 0.0, 0.83, 0.0, 0.0, 4.17, 0.0, 6.67, 20.0, 0.0, 0.0, 0.0, 0.0, 7.5, 0.0, 1.67, 22.5, 0.0, 0.0, 0.0, 3.33, 1.67, 0.83, 19.17, 0.0, 0.0, 0.0, 5.83, 0.0, 0.0, 5.0, 0.0, 0.0, 19.17, 5.83, 7.5, 9.17, 20.83, 23.33, 0.0, 6.67, 14.17, 6.67, 1.67, 0.0, 5.83, 10.0, 12.5, 0.83, 1.67, 9.17, 9.17, 4.17, 0.0, 0.0, 0.0, 1.67, 0.0, 0.0, 0.0, 0.83, 0.0]
