In [91]:
import pandas as pd
import numpy as np
import itertools
import string


from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import StratifiedKFold
from sklearn.datasets import make_classification
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt


from helper import *

import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=ConvergenceWarning)

np.random.seed(445)


def extract_word(input_string):
    for i in input_string:
        if i in string.punctuation:
            temp = ""
            temp += i
            input_string = input_string.replace(temp, "")
    return input_string.split()

In [92]:
d = {'text': ["EECS. 445,,, is the best", "EECS. 445,,, is the worst", "blah blah 445", "MATH the best", "MATH the worst",
             "445 is the hardest", "class is hard", "worst EECS class", "the fun class", "worst 445 project"],
     'label': [1, -1, 1, 1, -1, 1, -1, -1, 1, -1]}  
  
# Creating a Dataframe
df = pd.DataFrame(d) 
  
word_dict = {}
n = len(df)
for i in range(n):
    curr_row = df['text'][i]
    curr_row = extract_word(curr_row)
    for j in curr_row:
        if j in word_dict:
            word_dict.update({j: word_dict.get(j) + 1})
        if j not in word_dict:
            word_dict[j] = 1
print(word_dict)

{'EECS': 3, '445': 5, 'is': 4, 'the': 6, 'best': 2, 'worst': 4, 'blah': 2, 'MATH': 2, 'hardest': 1, 'class': 3, 'hard': 1, 'fun': 1, 'project': 1}


In [93]:
number_of_reviews = df.shape[0]
number_of_words = len(word_dict)
feature_matrix = np.zeros((number_of_reviews, number_of_words))
for i in range(number_of_reviews):
    curr_rev = extract_word(df['text'][i])
    for j in range(len(word_dict)):
        if list(word_dict)[j] in curr_rev:
            feature_matrix[i][j] = 1
print(feature_matrix)

[[1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0.]
 [1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0.]
 [0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1.]]


In [94]:
skf = StratifiedKFold(n_splits=5)

In [95]:
X = pd.DataFrame(feature_matrix)
print(X)

    0    1    2    3    4    5    6    7    8    9    10   11   12
0  1.0  1.0  1.0  1.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
1  1.0  1.0  1.0  1.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
2  0.0  1.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0
3  0.0  0.0  0.0  1.0  1.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0
4  0.0  0.0  0.0  1.0  0.0  1.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0
5  0.0  1.0  1.0  1.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0
6  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  1.0  0.0  0.0
7  1.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0
8  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  1.0  0.0
9  0.0  1.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0


In [96]:
Y = np.array([1, -1, 1, 1, -1, 1, -1, -1, 1, -1])
Y = pd.DataFrame(Y)
print(Y)
def performance(y_true, y_pred, metric="accuracy"):
    cv = metrics.confusion_matrix(ytrue,ypred)
    tp = cv[0,0]
    fp = cv[0,1]
    fn = cv[1,0]
    tn = cv[1,1]
    if metric == "accuracy":
        if tp+fp+fn+tn == 0:
            return 0
        else:
            return (tp+tn)/(tp+tn+fp+fn)
    if metric == "precision":
        if tp+fp == 0:
            return 0
        else:
            return tp/(tp+fp)
    if metric == "sensitivity":
        if tp+fn == 0:
            return 0
        else:
            return tp/(tp+fn)
    if metric == "specificity":
        if tn+fp == 0:
            return 0
        else:
            return tn/(tn+fp)
    if metric == "f1-score":
        if tp+fn == 0 or tn+fp == 0:
            return 0
        if tp/(tp+fn) + tn/(tn+fp) == 0:
            return 0
        else:
            return 2*(tp/(tp+fn))*(tn/(tn+fp))/(tp/(tp+fn) + tn/(tn+fp))
    else:
        return 0

   0
0  1
1 -1
2  1
3  1
4 -1
5  1
6 -1
7 -1
8  1
9 -1


In [97]:
'''scores = []
metric = "accuracy"
penalty="l2"
loss="hinge"
dual=True
lsvc = LinearSVC(penalty=penalty, loss=loss, dual=dual, C=10**0, random_state=445)
for i, (train_index, test_index) in enumerate(skf.split(X, Y)):
    #print(f"  Train: index={train_index}")
    #print(f"  Test:  index={test_index}")
    xtrain = X.iloc[train_index]
    ytrain = Y.iloc[train_index]
    xtest = X.iloc[test_index]
    ytest = Y.iloc[test_index]
    ytrue = ytest
    #print(xtrain)
    #print(ytrain)
    lsvc.fit(xtrain, ytrain)
    if metric == "AUROC":
        ypred = lsvc.decision_function(xtest)
        scores.append(metrics.roc_auc_score(ytrue,ypred))
    else:
        ypred = lsvc.predict(xtest)
        scores.append(performance(ytrue,ypred,metric))
print(scores)'''

'scores = []\nmetric = "accuracy"\npenalty="l2"\nloss="hinge"\ndual=True\nlsvc = LinearSVC(penalty=penalty, loss=loss, dual=dual, C=10**0, random_state=445)\nfor i, (train_index, test_index) in enumerate(skf.split(X, Y)):\n    #print(f"  Train: index={train_index}")\n    #print(f"  Test:  index={test_index}")\n    xtrain = X.iloc[train_index]\n    ytrain = Y.iloc[train_index]\n    xtest = X.iloc[test_index]\n    ytest = Y.iloc[test_index]\n    ytrue = ytest\n    #print(xtrain)\n    #print(ytrain)\n    lsvc.fit(xtrain, ytrain)\n    if metric == "AUROC":\n        ypred = lsvc.decision_function(xtest)\n        scores.append(metrics.roc_auc_score(ytrue,ypred))\n    else:\n        ypred = lsvc.predict(xtest)\n        scores.append(performance(ytrue,ypred,metric))\nprint(scores)'

In [98]:

def cv_performance(clf, X, y, k=5, metric="accuracy"):
    scores = []
    skf = StratifiedKFold(n_splits=k)
    for i, (train_index, test_index) in enumerate(skf.split(X, y)):
        xtrain = X.iloc[train_index]
        ytrain = y.iloc[train_index]
        ytrue = y.iloc[test_index]
        clf.fit(xtrain, ytrain)
        if metric == "AUROC":
            ypred = clf.decision_function(X.iloc[test_index])
            scores.append(metrics.roc_auc_score(ytrue,ypred))
        else:
            ypred = clf.predict(X.iloc[test_index])
            scores.append(performance(ytrue,ypred,metric))
    return np.array(scores).mean()

In [108]:
maxsofar = 0
k=5
metric="accuracy"
C_range = np.array([0.001,0.01,0.1,1,10,100,1000])
for i in C_range:
    print(i)
    clf = LinearSVC(penalty=penalty, loss=loss, dual=dual, C=i, random_state=445)
    curperf = cv_performance(clf,X,Y,k,metric)
    print(curperf)
    if curperf > maxsofar:
        maxsofar = curperf
        maxi = i

0.001
0.5
0.01
0.5
0.1
0.5
1.0
0.5
10.0
0.5
100.0
0.5
1000.0
0.5


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu