In [None]:
# GridSearchCV runs the exhaustive, cross-validated hyperparameter search.
from sklearn.model_selection import GridSearchCV

# ----------------- Step 1: Set up the Hyperparameter Grid -----------------

# Untuned estimator handed to the search as the template for every candidate.
lr_model_for_tuning = LogisticRegression(random_state=42)

# Candidate settings to evaluate.
# NOTE: 'C' is the INVERSE of regularization strength, so this list runs from
# stronger regularization (0.1) to weaker regularization (10).
# Only the 'liblinear' solver is searched.
param_grid = dict(
    C=[0.1, 0.5, 1, 5, 10],
    solver=["liblinear"],
)


# ----------------- Step 2: Set up and Run GridSearchCV -----------------

# cv=5      -> 5-fold cross-validation for each of the 5 candidate 'C' values.
# n_jobs=-1 -> run the fits in parallel on all available CPU cores.
# verbose=2 -> print progress messages while fitting.
# No `scoring` is passed, so candidates are ranked with the estimator's own
# score method (mean accuracy for scikit-learn classifiers).
grid_search = GridSearchCV(
    lr_model_for_tuning,
    param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training data only; this step may take a while.
print("Starting GridSearchCV for Logistic Regression...")
grid_search.fit(X_train_tfidf, y_train)
print("GridSearchCV complete.")


# ----------------- Step 3: Analyze the Results -----------------

# Report the winning parameter combination and its mean cross-validation score.
print(f"\nBest Parameters found: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

# best_estimator_ is the model configured with the best parameters found
# (with GridSearchCV's default refit behaviour it is already fitted on the
# training data). Score it on the held-out test set.
best_lr_model = grid_search.best_estimator_
y_pred_tuned = best_lr_model.predict(X_test_tfidf)
accuracy_tuned = accuracy_score(y_test, y_pred_tuned)
print(f"\nTest Accuracy of Tuned Logistic Regression Model: {accuracy_tuned:.4f}")


# ----------------- Step 4: Compare with Baseline Model -----------------

# `accuracy` is the baseline Logistic Regression test accuracy; it is not
# computed in this cell and must already exist when the cell runs.
print(
    "\n--- Model Comparison ---",
    f"Logistic Regression (Baseline) Accuracy: {accuracy:.4f}",
    f"Tuned Logistic Regression Accuracy:      {accuracy_tuned:.4f}",
    "------------------------",
    sep="\n",
)

In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
import time

# X_train_tfidf, y_train, X_test_tfidf and y_test are not created here; they
# must already exist (built by earlier cells) when this cell runs.

# --- Step 6: Model Showdown ---

# 1. Define the models we want to test.
#    Every entry is a freshly constructed, unfitted estimator, so each one is
#    trained inside the loop below.
models_to_test = {
    "Linear SVC": LinearSVC(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42, n_jobs=-1),
    "LightGBM": LGBMClassifier(random_state=42),
    "Passive Aggressive": PassiveAggressiveClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
}

# 2. Train, predict and score each model on the same train/test split.
model_performance = []

print("--- Starting Model Showdown ---")

for name, model in models_to_test.items():
    print(f"Training and evaluating {name}...")
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; unlike time.time() it is unaffected by system clock changes.
    start_time = time.perf_counter()

    # Train on the training split, then predict on the held-out test split.
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    # Use a loop-specific name so the notebook-level `accuracy` variable
    # (the baseline Logistic Regression score printed by the tuning cell)
    # is not overwritten by this loop.
    model_accuracy = accuracy_score(y_test, y_pred)

    # Elapsed time covers training, prediction and scoring -- not training alone.
    elapsed_time = time.perf_counter() - start_time

    # Store the results
    model_performance.append({
        "Model": name,
        "Accuracy": model_accuracy,
        "Time (s)": elapsed_time,
    })

    print(f"  -> {name} Accuracy: {model_accuracy:.4f}")
    print(f"  -> Time taken: {elapsed_time:.2f} seconds\n")

print("--- Model Showdown Complete ---")

# 3. Display the results as a table, best accuracy first, indexed by model name.
performance_df = pd.DataFrame(model_performance)
performance_df = performance_df.sort_values(by="Accuracy", ascending=False)
performance_df = performance_df.set_index("Model")

print("\n--- Final Model Performance Comparison ---")
print(performance_df)

# After the descending sort, the first row is the most accurate model.
best_model_name = performance_df.index[0]
best_model_accuracy = performance_df.iloc[0]['Accuracy']

# \U0001F3C6 is the trophy emoji; written as an escape so the source file stays
# ASCII-safe (the literal character had been corrupted by a bad re-encoding).
print(f"\n\U0001F3C6 New Champion Found: {best_model_name} with an accuracy of {best_model_accuracy:.4f}!")