In [1]:
import numpy as np
import statistics
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

Data Parsing

In [2]:
def parse_data(fileDirectory, X, Y):
    source = open(fileDirectory, 'r')
    data = source.readlines()
    source.close()
    for i in range(1, len(data)):
        raw = data[i].split(",")
        Y.append(1 if raw[-1].strip().strip('"') == "STANDING" else 0)
        X.append(raw[:40])
        
X_train = []
Y_train = []
X_test = []
Y_test = []

parse_data('train.csv', X_train, Y_train)
parse_data('test.csv', X_test, Y_test)

Trying Linear SVM, score is 90.13%

In [3]:
linearKernelSVM = SVC(kernel = 'linear')
linearKernelSVM.fit(X_train, Y_train)
print(linearKernelSVM.score(X_test, Y_test))
Y_pred = linearKernelSVM.predict(X_test)
print(confusion_matrix(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred, target_names = ["STANDING", "Non-STANDING"]))

0.9012555140821175
[[2258  157]
 [ 134  398]]
              precision    recall  f1-score   support

    STANDING       0.94      0.93      0.94      2415
Non-STANDING       0.72      0.75      0.73       532

   micro avg       0.90      0.90      0.90      2947
   macro avg       0.83      0.84      0.84      2947
weighted avg       0.90      0.90      0.90      2947



Trying rbf kernel for C value ranging from 1e-4 to 1e6, optimal is 1e3 with score = 90.74%

In [4]:
C_2d_range = [1e-4, 1e-3, 1e-2, 1, 1e2, 1e3, 1e4, 1e5, 1e6]
scoreList = []
for c in C_2d_range:
    rbfKernelSVM = SVC(kernel = 'rbf', gamma = 'scale', C = c)
    rbfKernelSVM.fit(X_train, Y_train)
    print(rbfKernelSVM.score(X_train, Y_train))
    scoreList.append(rbfKernelSVM.score(X_test, Y_test))
print(scoreList)

rbfKernelSVM = SVC(kernel = 'rbf', gamma = 'scale', C = 1e3)
rbfKernelSVM.fit(X_train, Y_train)
Y_pred = rbfKernelSVM.predict(X_test)
print(confusion_matrix(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred, target_names = ["STANDING", "Non-STANDING"]))

0.8131120783460283
0.8131120783460283
0.8131120783460283
0.8982589771490751
0.9328073993471164
0.9523939064200218
0.9763329706202394
0.9938792165397171
1.0
[0.8194774346793349, 0.8194774346793349, 0.8194774346793349, 0.8995588734306074, 0.9039701391245334, 0.9073634204275535, 0.8958262639972854, 0.8897183576518494, 0.8741092636579573]
[[2259  156]
 [ 117  415]]
              precision    recall  f1-score   support

    STANDING       0.95      0.94      0.94      2415
Non-STANDING       0.73      0.78      0.75       532

   micro avg       0.91      0.91      0.91      2947
   macro avg       0.84      0.86      0.85      2947
weighted avg       0.91      0.91      0.91      2947



In [7]:
scaler = StandardScaler()
scaler.fit(X_train)
transformedXTrain = scaler.transform(X_train)
rbfKernelScaledSVM = SVC(kernel = 'rbf', gamma = 'scale', C =1e3)
rbfKernelScaledSVM.fit(transformedXTrain, Y_train)
print(rbfKernelScaledSVM.score(transformedXTrain, Y_train))
transformedXTest = scaler.transform(X_test)
print(rbfKernelScaledSVM.score(transformedXTest, Y_test))

0.9862622415669206
0.8900576857821514


0.8958262639972854