# SVM
## -> without removing stopwords, with stemming
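- Accuracy: 0.9156 on the 20% hold-out split (LinearSVC, C=1, hinge loss; see the evaluation cell below). The earlier note "around 0.86 for k=20 (up to 0.87 for k = 50)" refers to a parameter k that this SVM does not have and looks like a leftover from a k-based model (TODO: confirm and remove).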
- F1-value:
- Roc-curve:

In [36]:
import pandas as pd
# Load the raw dataset; the 'humor' column is the label to predict.
df = pd.read_csv('dataset.csv')
#df = df.sample(20000)  # optional: subsample for faster experiments
df_target = df['humor']
# DataFrame.drop returns a NEW frame. The previous code discarded that return
# value, so the label column silently stayed inside df_data.
df_data = df.drop(columns='humor')

df_target.head()

0    False
1    False
2     True
3    False
4    False
Name: humor, dtype: bool

Preprocessing

In [37]:
from sklearn import preprocessing

# Turn the humor labels into integer class ids; the fitted encoder is kept so
# the original class names (label_encoder.classes_) can be recovered later.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(df_target)
df_target = label_encoder.transform(df_target)

In [38]:
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import re, string

#when running for the first time you need to activate this line for once.
#nltk.download('stopwords')

#definition of stemming function
# Matches runs of two or more word characters; punctuation and
# single-character tokens are therefore dropped (this is not a whitespace split).
token_pattern = re.compile(r"(?u)\b\w\w+\b")

def tokenize(text):
    """Return the Porter stems of all tokens found in `text`.

    Tokens are the matches of `token_pattern`, processed in order of
    appearance. A new PorterStemmer is created on every call.
    """
    porter = PorterStemmer()
    return [porter.stem(word) for word in token_pattern.findall(text)]

In [39]:
# TF-IDF features over stemmed tokens. It is not settled whether stemming will
# be used, so the result is stored in its own frame (df_data_stemmed).
# NOTE: no `stop_words` argument is passed, so stopwords are NOT removed here.
# min_df=0.0005 drops terms that occur in fewer than 0.05% of the documents.
stem_vectorizer = TfidfVectorizer(tokenizer=tokenize, min_df=0.0005)
matrix = stem_vectorizer.fit_transform(df_data['text'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(stem_vectorizer, 'get_feature_names_out'):
    feature_names = stem_vectorizer.get_feature_names_out()
else:
    feature_names = stem_vectorizer.get_feature_names()

# NOTE(review): toarray() materialises the whole matrix densely, which can be
# very memory-hungry for a large corpus; downstream cells expect a DataFrame.
df_data_stemmed = pd.DataFrame(matrix.toarray(), columns=feature_names)
#display(df_data_stemmed)




In [40]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import numpy as np

# Hold out 20% of the stemmed TF-IDF features; the grid search only sees the rest.
df_data_train, df_data_test, df_target_train, df_target_test = train_test_split(
    df_data_stemmed,
    df_target,
    test_size=0.2,
    random_state=42,
)

# Base estimator whose hyper-parameters are tuned below.
svm = LinearSVC(max_iter=300000, random_state=42)

# Search space for the linear SVM. Only the l2 penalty with the dual
# formulation is explored; other values tried earlier were left out.
# (A Pipeline variant that also tuned the TfidfVectorizer settings
# min_df / max_df / max_features / stop_words is currently disabled.)
parameters = dict(
    penalty=['l2'],
    loss=['hinge', 'squared_hinge'],
    dual=[True],
    tol=[1e-04, 1e-06],
    C=[1, 2, 100],
)

# 5-fold stratified cross-validation with a fixed shuffle, scored on accuracy.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
estimator = GridSearchCV(estimator=svm, param_grid=parameters, scoring="accuracy", cv=cv)
estimator.fit(df_data_train, df_target_train)

# Report the winning configuration, the refitted model and its mean CV accuracy.
for result in (estimator.best_params_, estimator.best_estimator_, estimator.best_score_):
    print(result)

KeyboardInterrupt: 

SVM

In [56]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Train/test split: 20% hold-out with the same seed as the grid-search cell.
df_data_train, df_data_test, df_target_train, df_target_test = train_test_split(
    df_data_stemmed,
    df_target,
    test_size=0.2,
    random_state=42,
)

# Linear SVM with l2 penalty and hinge loss in the dual formulation.
# (A DecisionTreeClassifier and an l1 / dual=False variant were tried before
# and are no longer used.)
svm = LinearSVC(
    penalty='l2',
    loss='hinge',
    dual=True,
    C=1,
    tol=1e-04,
    max_iter=5000,
    random_state=42,
)
svm.fit(df_data_train, df_target_train)

# Predictions on the hold-out set; df_prediction is reused by the confusion-matrix cell.
df_prediction = svm.predict(df_data_test)
accuracy = accuracy_score(df_target_test, df_prediction)
print("Accuracy: {}".format(accuracy))

Accuracy: 0.915575


In [None]:
import itertools
import matplotlib.pyplot as plt
import numpy as np
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map and show the figure.

    cm        -- 2-D array of counts, indexed as cm[true, predicted].
    classes   -- tick labels, used for both axes.
    normalize -- if True, divide every row by its row sum before plotting
                 and print the cells with two decimals instead of integers.
    title     -- figure title.
    cmap      -- matplotlib colour map for the heat map.
    """
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = cm.astype('float') / row_totals

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # Cells darker than half the maximum get white text so they stay readable.
    cell_format = '.2f' if normalize else 'd'
    midpoint = cm.max() / 2.
    for row, col in np.ndindex(cm.shape[0], cm.shape[1]):
        if cm[row, col] > midpoint:
            text_colour = "white"
        else:
            text_colour = "black"
        plt.text(col, row, format(cm[row, col], cell_format),
                 horizontalalignment="center",
                 color=text_colour)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.show()
    
# Confusion matrix of the hold-out predictions, labelled with the original class names.
np.set_printoptions(precision=2)
cnf_matrix = confusion_matrix(df_target_test, df_prediction)
plot_confusion_matrix(cm=cnf_matrix, classes=label_encoder.classes_)