# Notebook to train LinearSVC model on data and analyse using ELI5
sources: \
https://medium.com/@gaurishah143/xg-boost-for-text-classification-9c8b1f8f24aa \
https://github.com/salonipriyani/eli5-article/blob/main/NLP-eli5.ipynb \
https://eli5.readthedocs.io/en/latest/tutorials/black-box-text-classifiers.html \
\
About LinearSVC:
- stands for Linear Support Vector Classification
- Both LinearSVC and SVC of sklearn are based on Support Vector Machine (SVM)
- sklearn documentation on LinearSVC: https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html
- on SVM: https://www.ibm.com/topics/support-vector-machine

### Imports

In [17]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from scipy.sparse import hstack, csr_matrix, vstack

### Read in the data

In [18]:
# --- Original dataset: training corpus ---
data_original = pd.read_csv('dataset/fulltrain.csv')  # DataFrame
texts, labels = data_original['Text'], data_original['Label']  # two Series

# --- Original dataset: separate test file ---
# Uses the balancedtest file; comment this out when evaluating on a
# partition of fulltrain instead (see the train_test_split line below).
test_data = pd.read_csv('dataset/balancedtest.csv')
texts_test, labels_test = test_data['Text'], test_data['Label']

# # --- Augmented dataset (alternative to the two sections above) ---
# data_augmented = pd.read_csv('dataset/merged_final_df_with_topics_new.csv')
# texts = data_augmented['text']
# labels = data_augmented['label']
# # test file for the augmented dataset
# test_data = pd.read_csv('dataset/test_final_with_topics_new.csv')
# texts_test = test_data['text']
# labels_test = test_data['label']

# # Alternative: hold out 20% of the training corpus as the test set.
# texts_train, texts_test, labels_train, labels_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Default: the entire training corpus is used for training.
texts_train, labels_train = texts, labels

### Preprocessing & Vectorize

In [19]:
# Report the row counts of the raw train/test DataFrames.
num_rows_train = data_original.shape[0]
print("Number of rows in train dataset:", num_rows_train)
num_rows_test = test_data.shape[0]
print("Number of rows in test dataset:", num_rows_test)

# # Check the number of rows of augmented data using the shape attribute
# num_rows_train = data_augmented.shape[0]
# print("Number of rows in train dataset:", num_rows_train)
# num_rows_test = test_data.shape[0]
# print("Number of rows in test dataset:", num_rows_test)

# NOTE: this cell rebinds texts_train / texts_test from raw text to TF-IDF
# matrices, so it is not idempotent -- re-run the data-loading cell before
# re-running this one.
vec = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))

# fit_transform learns the vocabulary/IDF weights and vectorizes the training
# corpus in a single pass; fit() followed by transform() on the same texts
# yields the same matrix but tokenizes the whole corpus twice.
texts_train = vec.fit_transform(texts_train)  # becomes a csr_matrix (sparse matrix)
vectorizer_fit = vec  # alias kept for compatibility: fit() returns the vectorizer itself

# Get the feature names (terms)
train_feature_names = vec.get_feature_names_out()
# Get the number of features (terms)
num_features = len(train_feature_names)
print("Number of features in train dataset:", num_features)

# The test texts are only transformed, using the vocabulary fitted on the training texts.
texts_test = vec.transform(texts_test)


# # incorporate extra features (columns) in augmented dataset
# encoded_df = pd.get_dummies(data_augmented, columns = ['has_swear_word', 'severity', 'topic'])
# dropped_df = encoded_df.drop(['label', 'text', 'processed_text'], axis=1)
# sparse = csr_matrix(dropped_df)
# sparse_2 = csr_matrix(texts_train)
# texts_train_combined = hstack([sparse_2, sparse])

# encoded_df = pd.get_dummies(test_data, columns = ['has_swear_word', 'severity', 'topic'])
# dropped_df = encoded_df.drop(['label', 'text', 'processed_text'], axis=1)
# sparse = csr_matrix(dropped_df)
# sparse_2 = csr_matrix(texts_test)
# texts_test_combined = hstack([sparse_2, sparse])

# def pad_columns(matrix1, matrix2):
#     matrix1_rows = matrix1.shape[0]
#     matrix2_rows = matrix2.shape[0]
#     matrix1_cols = matrix1.shape[1]
#     matrix2_cols = matrix2.shape[1]
#     diff = matrix1_cols - matrix2_cols
#     if (diff < 0):
#         # Need to pad columns to matrix 1
#         diff = diff * -1
#         zero_matrix = csr_matrix((matrix1_rows, diff))
#         matrix1 = hstack([matrix1, zero_matrix])
#     elif (diff > 0):
#         # Need to pad columns to matrix 2
#         zero_matrix = csr_matrix((matrix2_rows, diff))
#         matrix2 = hstack([matrix2, zero_matrix])
#     return (matrix1, matrix2)

# texts_train, texts_test = pad_columns(texts_train_combined, texts_test_combined)

Number of rows in train dataset: 48854
Number of rows in test dataset: 3000
Number of features in train dataset: 7388647


### Hyper-parameter tuning

In [20]:
# This cell is intentionally disabled: it is a 5-fold cross-validated grid
# search over LinearSVC's C parameter. Per the note on the 'C' entry below,
# C=10.0 was the best value, and that value is hard-coded where the model is
# constructed in the "Run model" cell. Uncomment to repeat the search.

# from sklearn.model_selection import GridSearchCV

# # Define the parameter grid
# params = {
#     'C': [0.1, 1.0, 10.0], # C=10.0 is the best
#     'penalty': ['l2'], # default value
#     'loss': ['squared_hinge'], # default value
#     'dual': [True] # default value
# }

# # Hyper-parameters fine tuning
# grid_search = GridSearchCV(estimator=LinearSVC(), param_grid=params, cv=5, verbose=2) # cv means cross-validation
# grid_search.fit(texts_train, labels_train)
# best_params = grid_search.best_params_
# print(best_params)

### Run model

In [21]:
# Linear SVM with C=10.0 (the value noted as best in the grid-search cell).
svm = LinearSVC(C=10.0)

# Wrap the SVM in CalibratedClassifierCV so that predict_proba is available,
# reference: https://stackoverflow.com/questions/26478000/converting-linearsvcs-decision-function-to-probabilities-scikit-learn-python
clf = CalibratedClassifierCV(svm)
clf.fit(texts_train, labels_train)

# Make predictions on the test set
predictions = clf.predict(texts_test)

# Evaluate the model performance
accuracy = accuracy_score(labels_test, predictions)
print("Accuracy: ", accuracy)
# Store the score under its own name: assigning it to `f1_score` would replace
# the imported sklearn function with a float, and re-running this cell would
# then fail with "TypeError: ... object is not callable".
macro_f1 = f1_score(labels_test, predictions, average='macro')
print("f1 score: ", macro_f1)

print(classification_report(labels_test, predictions, labels=[1, 2, 3, 4]))

Accuracy:  0.7576666666666667
f1 score:  0.7524138201939159
              precision    recall  f1-score   support

           1       0.89      0.74      0.81       750
           2       0.77      0.54      0.63       750
           3       0.65      0.81      0.72       750
           4       0.77      0.95      0.85       750

    accuracy                           0.76      3000
   macro avg       0.77      0.76      0.75      3000
weighted avg       0.77      0.76      0.75      3000



### Save prediction results

In [22]:
# # output prediction results for original dataset in csv for further analysis
# results_df = pd.DataFrame({
#     'Text': test_data['Text'],
#     'Original Label': labels_test,
#     'Predicted Label': predictions
# })
# results_df.to_csv('dataset/linearsvc_model_predictions_original_dataset.csv', index=False)

# # output prediction results for augmented dataset in csv for further analysis
# results_df = pd.DataFrame({
#     'Text': test_data['text'],
#     'Original Label': labels_test,
#     'Predicted Label': predictions
# })
# results_df.to_csv('dataset/linearsvc_model_predictions_augmented_dataset.csv', index=False)

### Model interpretability

In [23]:
import numpy as np
import scipy
import scipy.stats  # load the submodule explicitly; `import scipy` alone does not guarantee `scipy.stats` is available


def monkeypath_itemfreq(sampler_indices):
    """Return an iterator of ``(value, count)`` pairs for ``sampler_indices``.

    Each distinct value appears once, in the sorted order produced by
    ``np.unique``, paired with the number of times it occurs. The result is a
    one-shot ``zip`` iterator, not a list or array.
    """
    values, counts = np.unique(sampler_indices, return_counts=True)
    return zip(values, counts)


# Install the replacement as scipy.stats.itemfreq before eli5 is imported.
# NOTE(review): presumably eli5 looks up scipy.stats.itemfreq, which the
# installed SciPy no longer provides -- confirm against the eli5/SciPy versions in use.
scipy.stats.itemfreq = monkeypath_itemfreq

import eli5
from eli5 import explain_weights, explain_prediction

# doc = test_data['Text'][0]
# print(doc)
doc = "Currently ELI5 allows to explain weights and predictions of scikit-learn linear classifiers and regressors, print decision trees as text or as SVG, show feature importances and explain predictions of decision trees and tree-based ensembles."

# Display names for the classes, paired positionally with the columns of
# clf.predict_proba.
# NOTE(review): assumes the classifier's classes are 1, 2, 3, 4 in that order,
# matching the order of the names below -- confirm against clf.classes_.
target_names = ['Satire', 'Hoax', 'Propaganda', 'Reliable News']
# `labels` is still assigned so that existing snippets passing `labels` as
# target names keep working. Be aware that this rebinds the name that the
# data-loading cell uses for the training-label Series.
labels = list(target_names)


def print_prediction(doc, target_names=target_names):
    """Print the calibrated class probabilities predicted for one document.

    Parameters
    ----------
    doc : str
        Raw text to classify; it is vectorized with the fitted ``vec``.
    target_names : sequence of str, optional
        Display name for each probability column, in column order. The default
        is bound when the function is defined, so later rebinding of the
        global ``labels`` (e.g. by re-running the data-loading cell) cannot
        change what is printed.

    Prints one line per class in the form ``"<prob> <name>"`` with the
    probability rounded to three decimals. Returns ``None``.
    """
    # transform() on a one-element list already yields a single-row matrix,
    # which is the 2-D input predict_proba expects.
    doc_transformed = vec.transform([doc])
    y_pred = clf.predict_proba(doc_transformed)
    for target, prob in zip(target_names, y_pred[0]):
        print("{:.3f} {}".format(prob, target))


print_prediction(doc)

# from eli5.lime import TextExplainer

# # doc_2d = np.reshape(doc, (-1, 1))  # change 1D array to 2D array. still does not work.
# te = TextExplainer(random_state=42)
# te.fit(doc, clf.predict_proba)  # using doc, error msg: ValueError: Expected 2D array, got 1D array instead. using doc_2d, error msg: AttributeError: 'numpy.ndarray' object has no attribute 'lower'
# te.show_prediction(target_names=labels)

# # shows results but not useful as "features" are just hex that represent the words
# eli5.show_prediction(clf, test_data['text'][0], target_names=['Satire', 'Hoax', 'Propaganda', 'Reliable News'], vec=vec)

0.536 Satire
0.270 Hoax
0.158 Propaganda
0.035 Reliable News
