In [24]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from scipy.sparse import hstack
import warnings

warnings.simplefilter(action='ignore')
# warnings.resetwarnings() to reset

## VECTORIZATION
### TFIDF

In [25]:
# Load the cleaned dataset, using the CSV's first column as the row index.
df_train = pd.read_csv(filepath_or_buffer="cleaned_data.csv", index_col=0)
# Preview the first rows to confirm the expected columns were read.
df_train.head()

Unnamed: 0,Text,Author,length,preprocessed_text
0,Scoring in PROC DISCRIM is as easy as validati...,AM,215,"['scoring', 'proc', 'discrim', 'easy', 'valida..."
1,"In the GLM procedure, you may have used LSMEAN...",AM,782,"['glm', 'procedure', 'may', 'used', 'lsmeans',..."
2,"The first problem, accuracy of the data file, ...",AM,990,"['first', 'problem', 'accuracy', 'data', 'file..."
3,If the homogeneity of covariance matrices assu...,AM,934,"['homogeneity', 'covariance', 'matrix', 'assum..."
4,"With a CONTRAST statement, you specify L, in t...",AM,1490,"['contrast', 'statement', 'specify', 'l', 'cas..."


In [26]:
# Dimensions of the loaded frame as (rows, columns).
df_train.shape

(816, 4)

In [27]:
# Features: the `length` and `preprocessed_text` columns.
X = df_train.loc[:, ["length", "preprocessed_text"]]
# Target: the `Author` column, kept as a single-column DataFrame (list selector).
y = df_train.loc[:, ["Author"]]

In [28]:
# Hold out 25% of the rows as a validation set. The split is shuffled with a
# fixed seed for reproducibility and stratified on the author label so that
# class proportions are preserved in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    stratify=y,
    random_state=1,
)

<div class="alert alert-info" role="alert">
Creating a validation set from the training data helps ensure that the final test set remains completely unseen, avoiding any data leakage and giving a true measure of the model's performance.
</div>

In [29]:
# Learn the vocabulary and IDF weights from the training text only, and encode
# that same text as a sparse TF-IDF matrix in one pass.
vectorizer = TfidfVectorizer()
train_text = X_train["preprocessed_text"]
X_train_tfidf = vectorizer.fit_transform(train_text)
# Inspect the learned vocabulary terms.
vectorizer.get_feature_names_out()

array(['0average', '10th', '10unit', ..., 'zero', 'zip', 'zscores'],
      dtype=object)

<div class="alert alert-info" role="alert">
    TF-IDF was chosen over bag of words because bag of words only produces count vectors that record word occurrences, whereas TF-IDF reflects the relative importance of a term to a document within a collection of documents. While bag of words is easy to interpret, TF-IDF performs better in machine learning models the majority of the time.
</div>

<div class="alert alert-danger" role="alert">
array(['0average', '0no', '10th', ..., 'zip', 'zscores', 'zt'],
      dtype=object)
YOU DIDN'T CLEAN PROPERLY: the vocabulary above still contains noisy tokens such as '0average', '0no' and 'zt'.
</div>

In [30]:
# Standardize the `length` feature with statistics fitted on the training split.
scaler = StandardScaler()
train_length = X_train[["length"]]  # list selector keeps the input 2-D
X_train_length_scaled = scaler.fit_transform(train_length)
X_train_length_scaled[:5]  # show the first five scaled values

array([[-0.48717892],
       [ 1.11741945],
       [ 0.73314064],
       [ 1.78475241],
       [-0.89957569]])

<div class="alert alert-info" role="alert">
TF-IDF outputs normalized values, so the other features used should also be scaled to ensure that they contribute appropriately.
</div>

In [31]:
# Place the scaled length column to the right of the TF-IDF columns so both
# feature sets live in a single sparse training matrix.
train_blocks = [X_train_tfidf, X_train_length_scaled]
X_train_combined = hstack(train_blocks)
X_train_combined

<612x5690 sparse matrix of type '<class 'numpy.float64'>'
	with 26959 stored elements in COOrdinate format>

In [32]:
# Encode the validation split with the vectorizer and scaler fitted on the
# training split (transform only, so nothing is re-fitted on validation data).
val_text = X_val["preprocessed_text"]
val_length = X_val[["length"]]
X_val_tfidf = vectorizer.transform(val_text)
X_val_length_scaled = scaler.transform(val_length)

# Same column layout as the training matrix: TF-IDF first, scaled length last.
val_blocks = [X_val_tfidf, X_val_length_scaled]
X_val_combined = hstack(val_blocks)

In [33]:
# Display the combined validation matrix (shape and stored-element count).
X_val_combined

<204x5690 sparse matrix of type '<class 'numpy.float64'>'
	with 8125 stored elements in COOrdinate format>

## IMPLEMENTING LOGISTIC REGRESSION MODEL

In [34]:
# Logistic-regression classifier with scikit-learn's default settings.
logreg = LogisticRegression()
# Train on the stacked TF-IDF + scaled-length training matrix. The labels are
# held in a single-column DataFrame, so flatten them to the 1-D array that
# scikit-learn expects instead of relying on its implicit conversion.
logreg.fit(X_train_combined, y_train.to_numpy().ravel())

In [35]:
# Predict authors for the held-out validation rows.
y_val_pred = logreg.predict(X_val_combined)

# Per-class precision / recall / F1 on the validation split. The report is a
# pre-formatted string, so it is printed rather than shown as a bare expression.
report = classification_report(y_true=y_val, y_pred=y_val_pred)
print(report)

              precision    recall  f1-score   support

          AM       1.00      0.62      0.77        16
          CD       0.84      0.93      0.88        44
          DM       0.92      0.94      0.93        36
          DO       0.92      0.75      0.83        16
          FE       0.94      0.97      0.96        35
          TK       0.97      1.00      0.98        57

    accuracy                           0.92       204
   macro avg       0.93      0.87      0.89       204
weighted avg       0.93      0.92      0.92       204

