# Notebook to train LinearSVC model on data and analyse using ELI5
sources: \
https://medium.com/@gaurishah143/xg-boost-for-text-classification-9c8b1f8f24aa \
https://github.com/salonipriyani/eli5-article/blob/main/NLP-eli5.ipynb \
https://eli5.readthedocs.io/en/latest/tutorials/black-box-text-classifiers.html \
\
About LinearSVC:
- LinearSVC stands for Linear Support Vector Classification
- Both LinearSVC and SVC in sklearn are based on the Support Vector Machine (SVM) algorithm
- sklearn documentation on LinearSVC: https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html
- IBM overview of SVM: https://www.ibm.com/topics/support-vector-machine

In [16]:
import eli5
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.svm import LinearSVC

In [17]:
# --- Training corpus -------------------------------------------------------
# Alternative: load the original (non-augmented) dataset instead.
# data_original = pd.read_csv('dataset/fulltrain.csv')
# texts = data_original['Text']
# labels = data_original['Label']

# Default: load the augmented dataset and pull out its text and label columns.
data_original = pd.read_csv('dataset/merged_final_df_with_topics_new.csv')
texts, labels = data_original['text'], data_original['label']

# Alternative: hold out 20% of the training corpus as a test partition.
# texts_train, texts_test, labels_train, labels_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Default: train on the entire corpus (the test set is read from a separate file).
texts_train, labels_train = texts, labels

In [18]:
# Build TF-IDF features over unigrams and bigrams with English stop words removed.
# The vectorizer is fitted on the training texts only and then reused to
# transform the test texts.
# NOTE: `texts_train` / `texts_test` are rebound below from raw text to the
# vectorized matrices, so re-run the data-loading cell before re-running this one.
vec = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
vectorizer_fit = vec.fit(texts_train)
texts_train = vectorizer_fit.transform(texts_train)

# # use the balancedtest file. comment out if using fulltrain file.
# test_data = pd.read_csv('dataset/balancedtest.csv')
# texts_test = test_data['Text']
# labels_test = test_data['Label']

# for augmented dataset
test_data = pd.read_csv('dataset/test_final_with_topics_new.csv')
texts_test = test_data['text']
labels_test = test_data['label']

texts_test = vectorizer_fit.transform(texts_test)

# Fit the model on the training data
clf = LinearSVC()
clf.fit(texts_train, labels_train)

# Make predictions on the test set
predictions = clf.predict(texts_test)

# Evaluate the model performance
accuracy = accuracy_score(labels_test, predictions)
print("Accuracy: ", accuracy)
# Store the score under its own name: assigning it to `f1_score` would shadow
# the imported sklearn function and break this cell (and any later call to
# f1_score) the next time it is executed in the same kernel.
macro_f1 = f1_score(labels_test, predictions, average='macro')
print("f1 score: ", macro_f1)

print(classification_report(labels_test, predictions, labels=[1, 2, 3, 4]))

# # output prediction results for original dataset in csv for further analysis
# results_df = pd.DataFrame({
#     'Text': test_data['Text'],
#     'Original Label': labels_test,
#     'Predicted Label': predictions
# })
# results_df.to_csv('dataset/linearsvc_model_predictions_original_dataset.csv', index=False)

# output prediction results for augmented dataset in csv for further analysis
results_df = pd.DataFrame({
    'Text': test_data['text'],
    'Original Label': labels_test,
    'Predicted Label': predictions
})
results_df.to_csv('dataset/linearsvc_model_predictions_augmented_dataset.csv', index=False)

Accuracy:  0.7613333333333333
f1 score:  0.7560312918613052
              precision    recall  f1-score   support

           1       0.87      0.78      0.82       750
           2       0.79      0.52      0.63       750
           3       0.64      0.80      0.71       750
           4       0.80      0.94      0.86       750

    accuracy                           0.76      3000
   macro avg       0.77      0.76      0.76      3000
weighted avg       0.77      0.76      0.76      3000

