In [1]:
# %pip install xgboost
# %pip install catboost
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from scipy.sparse import hstack
import warnings

warnings.simplefilter(action='ignore')
# warnings.resetwarnings() to reset

## VECTORIZATION
### TFIDF

In [2]:
df_train = pd.read_csv("cleaned_data.csv", index_col=0)
df_train.head()

Unnamed: 0,Text,Author,length,preprocessed_text
0,Scoring in PROC DISCRIM is as easy as validati...,AM,215,"['scoring', 'proc', 'discrim', 'easy', 'valida..."
1,"In the GLM procedure, you may have used LSMEAN...",AM,782,"['glm', 'procedure', 'may', 'used', 'lsmeans',..."
2,"The first problem, accuracy of the data file, ...",AM,990,"['first', 'problem', 'accuracy', 'data', 'file..."
3,If the homogeneity of covariance matrices assu...,AM,934,"['homogeneity', 'covariance', 'matrix', 'assum..."
4,"With a CONTRAST statement, you specify L, in t...",AM,1490,"['contrast', 'statement', 'specify', 'l', 'cas..."


In [3]:
df_train.shape

(816, 4)

In [4]:
X = df_train[["length", "preprocessed_text"]]
y = df_train[["Author"]]

In [5]:
# train test split on dataset

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=1, stratify=y, shuffle=True)

<div class="alert alert-info" role="alert">
Creating a validation set from the training data helps ensure that the final test set remains completely unseen, avoiding any data leakage and giving a true measure of the model's performance.
</div>

In [6]:
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train['preprocessed_text'])
vectorizer.get_feature_names_out()

array(['0average', '10th', '10unit', ..., 'zero', 'zip', 'zscores'],
      dtype=object)

<div class="alert alert-info" role="alert">
    The reason TFIDF was used over bag of words is because bag of words just creates a set of embeddings which show the word occurrences whereas TF-IDF shows the relative importance of a term to a document in a collection of documents. While bag of words is easy to interpret, majority of the time TF-IDF performs better in machine learning models
</div>

<div class="alert alert-danger" role="alert">
array(['0average', '0no', '10th', ..., 'zip', 'zscores', 'zt'],
      dtype=object)
YOU DIDNT CLEAN PROPERLY 
</div>

In [7]:
scaler = StandardScaler()
X_train_length_scaled = scaler.fit_transform(X_train[['length']])
X_train_length_scaled[:5] # printing first 5 responses

array([[-0.48717892],
       [ 1.11741945],
       [ 0.73314064],
       [ 1.78475241],
       [-0.89957569]])

<div class="alert alert-info" role="alert">
TF-IDF gives out normalized values, this means the other features used should also be scaled to ensure that they are contributing appropriately. 
</div>

In [8]:
# Combine TF-IDF feature with 'length' feature
X_train_combined = hstack([X_train_tfidf, X_train_length_scaled])
X_train_combined

<612x5690 sparse matrix of type '<class 'numpy.float64'>'
	with 26959 stored elements in COOrdinate format>

In [9]:
# Doing the same for validation set
X_val_tfidf = vectorizer.transform(X_val['preprocessed_text'])
X_val_length_scaled = scaler.transform(X_val[['length']])

X_val_combined = hstack([X_val_tfidf, X_val_length_scaled])

In [10]:
X_val_combined

<204x5690 sparse matrix of type '<class 'numpy.float64'>'
	with 8125 stored elements in COOrdinate format>

## IMPLEMENTING LOGISTIC REGRESSION MODEL

In [11]:
logreg = LogisticRegression()
logreg.fit(X_train_combined, y_train)

In [12]:
y_val_pred = logreg.predict(X_val_combined)

report = classification_report(y_val, y_val_pred)
print(report)

              precision    recall  f1-score   support

          AM       1.00      0.62      0.77        16
          CD       0.84      0.93      0.88        44
          DM       0.92      0.94      0.93        36
          DO       0.92      0.75      0.83        16
          FE       0.94      0.97      0.96        35
          TK       0.97      1.00      0.98        57

    accuracy                           0.92       204
   macro avg       0.93      0.87      0.89       204
weighted avg       0.93      0.92      0.92       204



<div class="alert alert-info" role="alert">
Since the data is imbalanced, we will be looking at the weighted avg f1 score as the main metric as it multiplies the F1-Score by the proportion of the samples that belong to each author class. 
</div>

## IMPLEMENTING NAIVE BAYES MODEL

In [13]:
from sklearn.naive_bayes import ComplementNB
from sklearn.preprocessing import MinMaxScaler

# complement naive bayes will be implemented due to the class imbalance

cnb = ComplementNB()

# ValueError: Negative values in data passed to ComplementNB (input X)
# This means that the data has to be scaled to 0 to 1 range, need to use minmaxscaler


scaler = MinMaxScaler()
X_train_dense = X_train_combined.toarray()
X_train_scaled = scaler.fit_transform(X_train_dense)


# fitting the data on the model
cnb.fit(X_train_scaled, y_train)

In [14]:
y_val_pred = cnb.predict(X_val_combined)

report = classification_report(y_val, y_val_pred)
print(report)

              precision    recall  f1-score   support

          AM       1.00      0.75      0.86        16
          CD       0.89      0.95      0.92        44
          DM       0.97      1.00      0.99        36
          DO       0.94      0.94      0.94        16
          FE       0.97      0.97      0.97        35
          TK       1.00      1.00      1.00        57

    accuracy                           0.96       204
   macro avg       0.96      0.94      0.95       204
weighted avg       0.96      0.96      0.96       204



<div class="alert alert-info" role="alert">
Naive bayes seems to be performing much better in almost all categories, which is a good sign. 
</div>

## IMPLEMENTING SVM MODEL

In [15]:
from sklearn import svm

svm_clf = svm.SVC()
svm_clf.fit(X_train_combined, y_train)

In [16]:
y_val_pred = svm_clf.predict(X_val_combined)


<div class="alert alert-info" role="alert">
Not the best weighted avg f1 score, we can train on different kernels to see if there is a particular one that performs better
</div>

In [17]:
from sklearn import svm
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# Define the kernels to test
kernels = ['linear', 'rbf', 'poly', 'sigmoid']

# Train and evaluate an SVM with each kernel
for kernel in kernels:
    print(f"{kernel} is being used:")
    clf = svm.SVC(kernel=kernel)
    clf.fit(X_train_combined, y_train)
    y_val_pred = clf.predict(X_val_combined)

    # Evaluate the model
    print(classification_report(y_val, y_val_pred))
    print("======================================================")

linear is being used:
              precision    recall  f1-score   support

          AM       1.00      0.69      0.81        16
          CD       0.86      0.95      0.90        44
          DM       0.97      1.00      0.99        36
          DO       0.93      0.81      0.87        16
          FE       0.97      1.00      0.99        35
          TK       1.00      1.00      1.00        57

    accuracy                           0.95       204
   macro avg       0.96      0.91      0.93       204
weighted avg       0.95      0.95      0.95       204

rbf is being used:
              precision    recall  f1-score   support

          AM       1.00      0.56      0.72        16
          CD       0.82      0.91      0.86        44
          DM       0.94      0.94      0.94        36
          DO       0.92      0.69      0.79        16
          FE       0.89      0.97      0.93        35
          TK       0.95      1.00      0.97        57

    accuracy                        

<div class="alert alert-info" role="alert">
The linear kernel performs the best with a weighted average f1 score of 95%. With further hyperparameter tuning, these models will be able to perform much better. Before that lets try out a few other models including tree based and ensemble methods. 
</div>

## DECISION TREE

In [40]:
from sklearn import tree

dc_clf = tree.DecisionTreeClassifier().fit(X_train_combined, y_train)
y_val_pred = dc_clf.predict(X_val_combined)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

          AM       0.40      0.38      0.39        16
          CD       0.78      0.82      0.80        44
          DM       0.75      0.75      0.75        36
          DO       0.71      0.75      0.73        16
          FE       0.64      0.71      0.68        35
          TK       0.96      0.86      0.91        57

    accuracy                           0.76       204
   macro avg       0.71      0.71      0.71       204
weighted avg       0.77      0.76      0.76       204



## Random Forest

In [44]:
from sklearn.ensemble import RandomForestClassifier


rf_clf = RandomForestClassifier()
rf_clf.fit(X_train_combined, y_train)
y_val_pred = rf_clf.predict(X_val_combined)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

          AM       1.00      0.69      0.81        16
          CD       0.85      0.93      0.89        44
          DM       0.97      0.97      0.97        36
          DO       0.93      0.81      0.87        16
          FE       0.89      0.97      0.93        35
          TK       0.98      0.98      0.98        57

    accuracy                           0.93       204
   macro avg       0.94      0.89      0.91       204
weighted avg       0.94      0.93      0.93       204



## Boosting Algorithms (GBM, Hist GBM, XGB, Adaboost, Catboost)

In [51]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

gbm_clf = GradientBoostingClassifier()
gbm_clf.fit(X_train_combined, y_train)
y_val_pred = gbm_clf.predict(X_val_combined)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

          AM       0.92      0.69      0.79        16
          CD       0.79      0.95      0.87        44
          DM       0.97      0.86      0.91        36
          DO       0.93      0.81      0.87        16
          FE       0.86      0.91      0.89        35
          TK       1.00      0.98      0.99        57

    accuracy                           0.91       204
   macro avg       0.91      0.87      0.89       204
weighted avg       0.91      0.91      0.91       204



In [55]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

hgb_clf = HistGradientBoostingClassifier()
hgb_clf.fit(X_train_combined.toarray(), y_train)
y_val_pred = hgb_clf.predict(X_val_combined.toarray())
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

          AM       0.73      0.69      0.71        16
          CD       0.86      0.84      0.85        44
          DM       0.89      0.92      0.90        36
          DO       0.80      0.75      0.77        16
          FE       0.83      0.83      0.83        35
          TK       0.95      0.98      0.97        57

    accuracy                           0.87       204
   macro avg       0.84      0.83      0.84       204
weighted avg       0.87      0.87      0.87       204



In [62]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

xgb_clf = XGBClassifier(objective='multi:softmax')

xgb_clf.fit(X_train_combined, y_train_encoded)

y_val_pred_encoded = xgb_clf.predict(X_val_combined)

y_val_pred = label_encoder.inverse_transform(y_val_pred_encoded)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

          AM       0.82      0.56      0.67        16
          CD       0.78      0.91      0.84        44
          DM       0.88      0.83      0.86        36
          DO       0.81      0.81      0.81        16
          FE       0.81      0.74      0.78        35
          TK       0.95      1.00      0.97        57

    accuracy                           0.86       204
   macro avg       0.84      0.81      0.82       204
weighted avg       0.86      0.86      0.85       204



In [81]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Assuming X_train, y_train, X_val, and y_val are already defined

# Initialize Logistic Regression
log_reg = LogisticRegression()

# Initialize AdaBoostClassifier with Logistic Regression as the base estimator
ada_clf = AdaBoostClassifier(
    base_estimator=log_reg,
    n_estimators=1000,
    learning_rate=1.0,
    algorithm="SAMME.R"
)

# Train the model
ada_clf.fit(X_train_combined, y_train)

# Make predictions
y_val_pred = ada_clf.predict(X_val_combined)

# Print classification report
print(classification_report(y_val, y_val_pred))


              precision    recall  f1-score   support

          AM       0.85      0.69      0.76        16
          CD       1.00      0.23      0.37        44
          DM       1.00      0.08      0.15        36
          DO       0.93      0.81      0.87        16
          FE       0.23      1.00      0.38        35
          TK       1.00      0.25      0.39        57

    accuracy                           0.42       204
   macro avg       0.83      0.51      0.49       204
weighted avg       0.85      0.42      0.41       204



In [86]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report

# Assuming X_train, y_train, X_val, and y_val are already defined

# Initialize CatBoostClassifier
# The parameters here are basic; you might need to tune them based on your dataset
cat_clf = CatBoostClassifier()

# Train the model
cat_clf.fit(X_train_combined, y_train)

# Make predictions
y_val_pred = cat_clf.predict(X_val_combined)

# Print classification report
print(classification_report(y_val, y_val_pred))

Learning rate set to 0.07717
0:	learn: 1.7284563	total: 81.7ms	remaining: 1m 21s
1:	learn: 1.6855807	total: 99.7ms	remaining: 49.7s
2:	learn: 1.6369820	total: 118ms	remaining: 39.2s
3:	learn: 1.5883775	total: 137ms	remaining: 34s
4:	learn: 1.5479156	total: 155ms	remaining: 30.9s
5:	learn: 1.5118765	total: 173ms	remaining: 28.7s
6:	learn: 1.4842768	total: 191ms	remaining: 27.1s
7:	learn: 1.4457485	total: 210ms	remaining: 26.1s
8:	learn: 1.4111487	total: 229ms	remaining: 25.2s
9:	learn: 1.3799144	total: 248ms	remaining: 24.5s
10:	learn: 1.3515181	total: 266ms	remaining: 23.9s
11:	learn: 1.3241570	total: 284ms	remaining: 23.4s
12:	learn: 1.2937076	total: 302ms	remaining: 22.9s
13:	learn: 1.2692514	total: 320ms	remaining: 22.5s
14:	learn: 1.2525150	total: 338ms	remaining: 22.2s
15:	learn: 1.2313487	total: 356ms	remaining: 21.9s
16:	learn: 1.2153719	total: 374ms	remaining: 21.6s
17:	learn: 1.1977806	total: 392ms	remaining: 21.4s
18:	learn: 1.1768064	total: 411ms	remaining: 21.2s
19:	learn: 