# 1. Setup and Dependencies

In [100]:
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix


## 2. Load TF-IDF matrix and vectorizer

In [101]:
# Locations of the pickled TF-IDF matrix and its vectorizer.
# NOTE: joblib artifacts are pickles; only load files from a trusted source.
tfidf_matrix_path = '../data/processed/tfidf_matrix.pkl'
tfidf_vectorizer_path = '../models/tfidf_vectorizer.pkl'

# X is the feature matrix consumed by the train/test split below.
X = joblib.load(tfidf_matrix_path)
vectorizer = joblib.load(tfidf_vectorizer_path)

## 3. Load labels

In [102]:
# Read the cleaned dataset and pull out the target column.
# Presumably its rows line up one-to-one with the rows of X; confirm against
# the preprocessing step that produced both files.
df = pd.read_csv('../data/processed/cleaned_data.csv')
labels = df.loc[:, 'label']



## 4. Split data (stratified)

In [103]:
# Hold out 20% of the rows for evaluation. Stratifying on the labels keeps the
# class proportions the same in both partitions, and the fixed seed makes the
# split repeatable across runs.
split_options = dict(test_size=0.2, random_state=42, stratify=labels)
X_train, X_test, y_train, y_test = train_test_split(X, labels, **split_options)

## 5. Train Logistic Regression model

In [104]:
# class_weight='balanced' weights each class inversely to its frequency in
# y_train, which is how this notebook compensates for the label imbalance.
# max_iter is raised to 1000 to give the solver more iterations to converge.
# fit() returns the estimator itself, so the construction and fit can be chained.
model = LogisticRegression(class_weight='balanced', max_iter=1000).fit(X_train, y_train)
print(model)

LogisticRegression(class_weight='balanced', max_iter=1000)


## 6. Predict and evaluate

In [105]:
# Score the held-out split with the fitted classifier.
y_pred = model.predict(X_test)

# Per-class precision/recall/F1, then the confusion matrix
# (rows are true labels, columns are predicted labels).
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
print(report)
print(matrix)

              precision    recall  f1-score   support

           0       0.91      0.97      0.94       118
           1       0.60      0.35      0.44        17

    accuracy                           0.89       135
   macro avg       0.76      0.66      0.69       135
weighted avg       0.87      0.89      0.88       135

[[114   4]
 [ 11   6]]


## 7. Save trained model

In [106]:
# Persist the fitted classifier to disk as a joblib pickle.
joblib.dump(model, '../models/logistic_regression_model.pkl')

# The model was trained with class_weight='balanced'; no SMOTE resampling is
# applied in this notebook, so the status message must not claim it was.
print("Model saved (trained with class_weight='balanced').")

# Unrelated to saving: drop columns whose name starts with 'Unnamed'
# (presumably leftover index columns from an earlier CSV export; confirm).
# This runs after `labels` was extracted, so it does not affect the trained model.
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]


Model saved with SMOTE.
