In [1]:
# This notebook trains a TF-IDF bag-of-words logistic regression model and saves it for later use.
import nbimporter
import sys
import import_ipynb
sys.path.append('../')
from Preprocessing import Train_Test_split as ts  
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import json
import joblib

Training data shape: (452068, 3)
Test data shape: (113018, 3)
Training samples: 361654, Validation samples: 90414, Test samples: 113018


In [2]:
# Bag-of-words features via TF-IDF.
# The vocabulary and IDF weights are learned from the training split only;
# the validation and test splits are then projected onto that fitted vocabulary.
tfidf_params = dict(
    lowercase=True,
    stop_words='english',
    max_features=10000,
    ngram_range=(1, 3),  # unigrams, bigrams and trigrams
)
vectorizer = TfidfVectorizer(**tfidf_params)

X_train_vec = vectorizer.fit_transform(ts.X_train)
X_val_vec, X_test_vec = (
    vectorizer.transform(texts) for texts in (ts.X_val, ts.X_test)
)

# Report the resulting matrix shapes as a quick sanity check.
print(
    "TF-IDF vectorization completed.",
    f"Shape of training data: {X_train_vec.shape}",
    f"Shape of validation data: {X_val_vec.shape}",
    f"Shape of test data: {X_test_vec.shape}",
    sep="\n",
)

TF-IDF vectorization completed.
Shape of training data: (361654, 10000)
Shape of validation data: (90414, 10000)
Shape of test data: (113018, 10000)


In [3]:
# Fit a logistic regression classifier on the TF-IDF training features.
# NOTE(review): the recorded reports below show very low recall for class 2;
# it may be worth evaluating class_weight='balanced' — not changed here.
logreg_params = {
    "max_iter": 1000,
    "C": 1.1,
    "random_state": 42,
}
model = LogisticRegression(**logreg_params).fit(X_train_vec, ts.y_train)
print("Model training completed.")

Model training completed.


In [4]:
# Predict labels for the training, validation and test splits (in that order)
# using the fitted model.
y_train_pred, y_val_pred, y_test_pred = (
    model.predict(features)
    for features in (X_train_vec, X_val_vec, X_test_vec)
)

In [5]:
# Display the array of labels predicted for the test split.
y_test_pred

array([1, 3, 3, ..., 3, 1, 3], dtype=int64)

In [6]:
# Print a per-class precision/recall/F1 report for each split,
# in the order: training, validation, test.
evaluation_runs = (
    ("\nPerformance on the training set:", ts.y_train, y_train_pred),
    ("\nPerformance on the validation set:", ts.y_val, y_val_pred),
    ("\nPerformance on the test set:", ts.y_test, y_test_pred),
)
for heading, y_true, y_pred in evaluation_runs:
    print(heading)
    print(classification_report(y_true, y_pred))


Performance on the training set:
              precision    recall  f1-score   support

           1       0.88      0.94      0.91    148525
           2       0.62      0.09      0.16     18473
           3       0.92      0.95      0.94    194656

    accuracy                           0.90    361654
   macro avg       0.81      0.66      0.67    361654
weighted avg       0.89      0.90      0.89    361654


Performance on the validation set:
              precision    recall  f1-score   support

           1       0.88      0.93      0.90     37131
           2       0.47      0.07      0.12      4619
           3       0.92      0.95      0.93     48664

    accuracy                           0.89     90414
   macro avg       0.75      0.65      0.65     90414
weighted avg       0.88      0.89      0.88     90414


Performance on the test set:
              precision    recall  f1-score   support

           1       0.88      0.93      0.90     46415
           2       0.47      

In [7]:
# Persist both fitted artifacts to the current working directory.
# The vectorizer is saved alongside the classifier so the same fitted
# vocabulary can be reloaded together with the model.
vectorizer_path = 'tfidf_vectorizer.joblib'
model_path = 'logreg_bow_model.joblib'

joblib.dump(vectorizer, vectorizer_path)
joblib.dump(model, model_path)

['logreg_bow_model.joblib']