In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib

# Seed passed to the train/validation/test splits so they are repeatable.
RANDOM_STATE = 42

# Inputs read from the "checkpoint 1" directory.
PREPROCESSED_DATA_FILE = "FakeReviewDataPreprocessed.csv"
CHECKPOINT_1_OUTPUT_DIR = "../checkpoint 1/output"
FEATURE_NAMES_PATH = os.path.join(
    "..", "checkpoint 1", "models", "tfidf_feature_names.pkl"
)

# Outputs written by this notebook.
MODEL_DIR = "models"
MODEL_PATH = os.path.join(MODEL_DIR, "logistic_regression_model.pkl")

In [2]:
# Resolve the preprocessed CSV location once; it is used for both the
# existence check and the actual read.
preprocessed_path = os.path.join(CHECKPOINT_1_OUTPUT_DIR, PREPROCESSED_DATA_FILE)

# Stop with an explicit message when the expected input file is absent.
if not os.path.exists(preprocessed_path):
    raise FileNotFoundError(f"Error: The file {PREPROCESSED_DATA_FILE} was not found in {CHECKPOINT_1_OUTPUT_DIR}. Ensure preprocessing is completed first.")

# low_memory=False makes pandas read the file in one pass instead of in chunks.
data = pd.read_csv(preprocessed_path, low_memory=False)
print("Preprocessed data loaded successfully.")

Preprocessed data loaded successfully.


In [3]:
# Fail fast when the target column is absent: the training cell reads
# data['label'], so silently skipping the encoding would only defer the
# failure to a less informative KeyError later on.
if 'label' not in data.columns:
    raise ValueError("No 'label' column was found. Please verify the preprocessing output.")

# Replace the label column with its integer encoding (LabelEncoder maps the
# sorted unique values to 0..n_classes-1).
encoder = LabelEncoder()
data['label'] = encoder.fit_transform(data['label'])
print("Label column label-encoded.")

# Every column that is not the target or one of these excluded columns is
# treated as a model feature.
columns_to_exclude = ['label', 'text', 'category']
feature_cols = [col for col in data.columns if col not in columns_to_exclude]

if not feature_cols:
    raise ValueError("No feature columns were found. Please verify the preprocessing output.")

Label column label-encoded.


In [4]:
# Feature matrix as a plain NumPy array; the target is kept as a pandas Series.
X = data[feature_cols].to_numpy()
y = data['label']
print("Dataset prepared for training.")

# First cut: hold out 10% of all rows as the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=RANDOM_STATE,
)
print("Dataset split into training+validation and test sets (90% train+val, 10% test).")

# Second cut: 1/9 of the remaining 90% equals 10% of all rows for validation,
# leaving 80% for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=1/9,
    random_state=RANDOM_STATE,
)
print("Dataset split into training and validation sets (80% train, 10% val).")

# The saved feature names are required later when the model is annotated.
if not os.path.exists(FEATURE_NAMES_PATH):
    raise FileNotFoundError(f"Error: The file {FEATURE_NAMES_PATH} was not found. Please run preprocessing first")
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load artifacts produced by a trusted preprocessing run.
feature_names = joblib.load(FEATURE_NAMES_PATH)

Dataset prepared for training.
Dataset split into training+validation and test sets (90% train+val, 10% test).
Dataset split into training and validation sets (80% train, 10% val).


In [5]:
# Fit a logistic-regression classifier on the training split.
best_model = LogisticRegression(
    max_iter=1000
)
best_model.fit(X_train, y_train)

# X_train is a NumPy array, so the fitted model carries no column names.
# Attach them manually: the loaded feature names followed by 'rating'.
# list(...) accepts an ndarray, a list, or any other iterable of names.
model_feature_names = np.array(list(feature_names) + ['rating'])

# Refuse to persist a model whose name list does not line up with the number
# of columns it was actually trained on; mismatched metadata would otherwise
# be pickled silently and mislead whoever loads the model.
if len(model_feature_names) != best_model.n_features_in_:
    raise ValueError(
        f"Feature name count ({len(model_feature_names)}) does not match the "
        f"number of features the model was trained on ({best_model.n_features_in_})."
    )
best_model.feature_names_in_ = model_feature_names

# Persist the annotated model, creating the target directory if needed.
os.makedirs(MODEL_DIR, exist_ok=True)
joblib.dump(best_model, MODEL_PATH)
print(f"Model saved to {MODEL_PATH}")

Model saved to models\logistic_regression_model.pkl


In [6]:
def evaluate_model(model, X, y):
    """Score a fitted classifier on one data split.

    Args:
        model: Object exposing ``predict(X)``.
        X: Feature matrix passed to ``model.predict``.
        y: True labels compared against the predictions.

    Returns:
        dict: ``accuracy``, ``precision``, ``recall`` and ``f1`` for the
        split. Precision, recall and F1 use binary averaging and evaluate to
        0 instead of warning when the metric is undefined.
    """
    predictions = model.predict(X)
    # Shared settings for the three binary-averaged scorers.
    binary_kwargs = {"average": 'binary', "zero_division": 0}
    return {
        "accuracy": accuracy_score(y, predictions),
        "precision": precision_score(y, predictions, **binary_kwargs),
        "recall": recall_score(y, predictions, **binary_kwargs),
        "f1": f1_score(y, predictions, **binary_kwargs),
    }

# Score the model on each split, in train / validation / test order.
train_metrics, val_metrics, test_metrics = (
    evaluate_model(best_model, split_X, split_y)
    for split_X, split_y in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

# One line per split, emitted by a single print call.
print(
    f"Train Metrics: {train_metrics}",
    f"Validation Metrics: {val_metrics}",
    f"Test Metrics: {test_metrics}",
    sep="\n",
)



Train Metrics: {'accuracy': 0.9038830742552796, 'precision': 0.8945300695074041, 'recall': 0.9158982610310044, 'f1': 0.9050880626223092}
Validation Metrics: {'accuracy': 0.8778796135744364, 'precision': 0.8709990300678953, 'recall': 0.8877904102817598, 'f1': 0.8793145654834761}
Test Metrics: {'accuracy': 0.8776319048798613, 'precision': 0.868382710053424, 'recall': 0.8891098955743412, 'f1': 0.8786240786240787}


