In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib

In [2]:
# Seed passed to the train/validation/test splits.
RANDOM_STATE = 42

# Inputs read from the "checkpoint 1" directory.
PREPROCESSED_DATA_FILE = "FakeReviewDataPreprocessed.csv"
CHECKPOINT_1_OUTPUT_DIR = "../checkpoint 1/output"
FEATURE_NAMES_PATH = os.path.join(
    "..", "checkpoint 1", "models", "tfidf_feature_names.pkl"
)

# Where this notebook writes the fitted model.
MODEL_DIR = "models"
MODEL_PATH = os.path.join(MODEL_DIR, "logistic_regression_model.pkl")

In [3]:
# Resolve the CSV location once, then fail early if it has not been generated yet.
preprocessed_path = os.path.join(CHECKPOINT_1_OUTPUT_DIR, PREPROCESSED_DATA_FILE)
if not os.path.exists(preprocessed_path):
    raise FileNotFoundError(
        f"Error: The file {PREPROCESSED_DATA_FILE} was not found in {CHECKPOINT_1_OUTPUT_DIR}. Ensure preprocessing is completed first."
    )

data = pd.read_csv(preprocessed_path)
print("Preprocessed data loaded successfully.")

Preprocessed data loaded successfully.


In [4]:
# Encode the target column as integers before it is used as `y`.
encoder = LabelEncoder()
if 'label' not in data.columns:
    # Fail here with a clear message instead of silently skipping the encoding:
    # the feature/target preparation that follows reads data['label'] unconditionally.
    raise ValueError("No 'label' column was found. Please verify the preprocessing output.")
data['label'] = encoder.fit_transform(data['label'])
print("Label column label-encoded.")

Label column label-encoded.


In [5]:
# Everything except the target and these two named columns is used as a model input.
columns_to_exclude = ['label', 'text', 'category']
feature_cols = data.columns[~data.columns.isin(columns_to_exclude)].tolist()
if len(feature_cols) == 0:
    raise ValueError("No feature columns were found. Please verify the preprocessing output.")

X = data.loc[:, feature_cols].to_numpy()
y = data['label']
print("Dataset prepared for training.")

# First carve off the held-out test set (10% of all rows).
test_fraction = 0.1
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=test_fraction,
    random_state=RANDOM_STATE,
)
print("Dataset split into training+validation and test sets (90% train+val, 10% test).")

# 1/9 of the remaining 90% equals 10% of the full dataset, leaving 80% for training.
val_fraction_of_remainder = 1/9
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=val_fraction_of_remainder,
    random_state=RANDOM_STATE,
)
print("Dataset split into training and validation sets (80% train, 10% val).")

Dataset prepared for training.
Dataset split into training+validation and test sets (90% train+val, 10% test).
Dataset split into training and validation sets (80% train, 10% val).


In [6]:
# Load the saved TF-IDF feature names so they can be attached to the fitted model.
if not os.path.exists(FEATURE_NAMES_PATH):
    raise FileNotFoundError(f"Error: The file {FEATURE_NAMES_PATH} was not found. Please run preprocessing first")
# NOTE(security): joblib.load unpickles arbitrary objects; only load files you produced yourself.
feature_names = joblib.load(FEATURE_NAMES_PATH)

# The model's feature names are the TF-IDF names followed by 'rating'.
# NOTE(review): this assumes the CSV feature columns appear in exactly that order — confirm.
# np.asarray(...) accepts a plain list as well as an ndarray.
model_feature_names = np.array(np.asarray(feature_names).tolist()+['rating'])

# Check the name count before the fit: a mismatch would silently attach the wrong
# labels to the coefficients via feature_names_in_.
if model_feature_names.shape[0] != X_train.shape[1]:
    raise ValueError(
        f"Feature name count ({model_feature_names.shape[0]}) does not match the number of "
        f"training columns ({X_train.shape[1]}). Please verify the preprocessing output."
    )

# Use the shared seed constant rather than a second hard-coded 42.
best_model = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE, solver='saga')
best_model.fit(X_train, y_train)
# X_train is a plain ndarray, so scikit-learn records no names itself; attach them manually.
best_model.feature_names_in_ = model_feature_names

In [7]:
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists; otherwise the save fails on a fresh checkout.
os.makedirs(MODEL_DIR, exist_ok=True)

# Reuse the path defined in the configuration cell instead of rebuilding it.
model_filename = MODEL_PATH
joblib.dump(best_model, model_filename)
print(f"Model saved to {model_filename}")

Model saved to models\logistic_regression_model.pkl


In [8]:
def evaluate_model(model, X, y):
    """Score a fitted classifier on one dataset split.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix passed to ``model.predict``.
        y: True labels compared against the predictions.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``. Precision, recall and F1 use ``average='binary'`` and
        ``zero_division=0``.
    """
    predictions = model.predict(X)
    # Keyword arguments shared by the three binary-averaged scorers.
    binary_options = {"average": 'binary', "zero_division": 0}
    return {
        "accuracy": accuracy_score(y, predictions),
        "precision": precision_score(y, predictions, **binary_options),
        "recall": recall_score(y, predictions, **binary_options),
        "f1": f1_score(y, predictions, **binary_options),
    }

# Score the fitted model on every split first, then report the results together.
train_metrics, val_metrics, test_metrics = [
    evaluate_model(best_model, split_X, split_y)
    for split_X, split_y in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
]

for split_label, split_metrics in (
    ("Train", train_metrics),
    ("Validation", val_metrics),
    ("Test", test_metrics),
):
    print(f"{split_label} Metrics: {split_metrics}")



Train Metrics: {'accuracy': 0.8813071976255256, 'precision': 0.8713707973682622, 'recall': 0.8942021803766105, 'f1': 0.8826388676591972}
Validation Metrics: {'accuracy': 0.8474282888229476, 'precision': 0.8432141107300343, 'recall': 0.852824578790882, 'f1': 0.8479921162847992}
Test Metrics: {'accuracy': 0.8583086053412463, 'precision': 0.8575567358763883, 'recall': 0.8646543330087634, 'f1': 0.8610909090909091}


