# Notebook to train a LinearSVC model on the data and analyse it using ELI5
sources: \
https://medium.com/@gaurishah143/xg-boost-for-text-classification-9c8b1f8f24aa \
https://github.com/salonipriyani/eli5-article/blob/main/NLP-eli5.ipynb \
https://eli5.readthedocs.io/en/latest/tutorials/black-box-text-classifiers.html

In [70]:
import eli5
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.svm import LinearSVC

In [71]:
# Load the full training file; features are read from the 'Text' column and
# targets from the 'Label' column.
data_original = pd.read_csv('dataset/fulltrain.csv')
texts, labels = data_original['Text'], data_original['Label']

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
(texts_train, texts_test,
 labels_train, labels_test) = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

In [72]:
# Build TF-IDF features over unigrams and bigrams with English stop words
# removed. The vectorizer is fitted on the training texts only.
vec = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
vectorizer_fit = vec.fit(texts_train)
# NOTE: texts_train is rebound from the raw text to its TF-IDF matrix here, so
# re-run the train/test split cell first before re-running this cell.
texts_train = vectorizer_fit.transform(texts_train)

# Evaluate on the separate balancedtest file. Comment out the read_csv call and
# the two assignments below to evaluate on the hold-out split of fulltrain instead.
test_data = pd.read_csv('dataset/balancedtest.csv')

# check for NaN value
# nan_rows = test_data[test_data['Text'].isnull()]
# print(nan_rows)

texts_test = test_data['Text']
labels_test = test_data['Label']

# Reuse the vectorizer fitted on the training texts (transform only, no refit).
texts_test = vectorizer_fit.transform(texts_test)

clf = LinearSVC()
# Fit the classifier on the TF-IDF training matrix
clf.fit(texts_train, labels_train)

# Make predictions on the test set
predictions = clf.predict(texts_test)

# Evaluate the model performance
accuracy = accuracy_score(labels_test, predictions)
print("Accuracy: {:.2f}%".format(accuracy * 100))
# Keep the score under its own name: assigning it to `f1_score` would shadow
# the imported sklearn function and make this cell fail when run a second time.
macro_f1 = f1_score(labels_test, predictions, average='macro')
print("f1 score is: ", macro_f1)

print(classification_report(labels_test, predictions, labels=[1, 2, 3, 4]))

Accuracy: 75.73%
f1 score is:  0.7508898405059017
              precision    recall  f1-score   support

           1       0.87      0.77      0.81       750
           2       0.79      0.50      0.61       750
           3       0.63      0.84      0.72       750
           4       0.81      0.93      0.86       750

    accuracy                           0.76      3000
   macro avg       0.77      0.76      0.75      3000
weighted avg       0.77      0.76      0.75      3000

