In [1]:
# Movie Genre Classification - Model Training with Hyperparameter Tuning
# CodSoft ML Internship - Task 1

import json
import os
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report, 
                             confusion_matrix, f1_score, precision_score, 
                             recall_score, make_scorer)
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
warnings.filterwarnings('ignore')

# Run banner.
print("=" * 60, "MOVIE GENRE CLASSIFICATION - MODEL TRAINING", "=" * 60, sep="\n")

# Load the feature matrices and label vectors saved under ../artifacts
# (presumably written by an earlier preprocessing notebook -- confirm).
print("\nüìÇ Loading preprocessed data...")
X_train, X_val, y_train, y_val = (
    np.load('../artifacts/X_train_tfidf.npy'),
    np.load('../artifacts/X_val_tfidf.npy'),
    np.load('../artifacts/y_train.npy'),
    np.load('../artifacts/y_val.npy'),
)

# NOTE(review): pickle.load can execute arbitrary code; only load an encoder
# file that this project produced itself.
with open('../artifacts/label_encoder.pkl', 'rb') as encoder_file:
    label_encoder = pickle.load(encoder_file)

print("‚úÖ Training data shape:", X_train.shape)
print("‚úÖ Validation data shape:", X_val.shape)
print("‚úÖ Number of genres:", len(label_encoder.classes_))

# Shared containers filled in by later cells:
#   results    -> metrics per "<model> (Baseline|Tuned)" key
#   all_models -> fitted tuned estimators keyed by model name
results, all_models = {}, {}

MOVIE GENRE CLASSIFICATION - MODEL TRAINING

üìÇ Loading preprocessed data...
‚úÖ Training data shape: (43371, 5000)
‚úÖ Validation data shape: (10843, 5000)
‚úÖ Number of genres: 27


In [2]:
# BASELINE MODELS 

# Section banner: a blank line, then the title framed by two 60-character rules.
print(
    "",
    "=" * 60,
    "PHASE 1: BASELINE MODELS (Without Hyperparameter Tuning)",
    "=" * 60,
    sep="\n",
)

def train_baseline_model(model, model_name):
    """Fit an untuned estimator and score it on the validation split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``
    arrays, prints the four headline metrics, and records them in the
    notebook-level ``results`` dict under ``"<model_name> (Baseline)"``.

    Args:
        model: Unfitted estimator exposing ``fit`` and ``predict``.
        model_name: Display name used in the log line and in the ``results`` key.

    Returns:
        tuple: ``(model, y_pred)`` -- the fitted estimator and its predictions
        for ``X_val``.
    """
    print(f"\nüîÑ Training {model_name} (Baseline)...")

    # Train, then predict on the held-out validation split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    # A class that is never predicted has undefined precision/F1. The sklearn
    # default ("warn") also scores it as 0 but emits a warning that the
    # notebook's global warning filter hides; state the fallback explicitly.
    weighted = {'average': 'weighted', 'zero_division': 0}
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred, **weighted)
    recall = recall_score(y_val, y_pred, **weighted)
    f1 = f1_score(y_val, y_pred, **weighted)

    print(f"  Accuracy:  {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall:    {recall:.4f}")
    print(f"  F1-Score:  {f1:.4f}")

    # Cast to built-in floats so the dict can be serialised with json later.
    results[f"{model_name} (Baseline)"] = {
        'accuracy': float(accuracy),
        'precision': float(precision),
        'recall': float(recall),
        'f1_score': float(f1),
        'model_type': 'baseline'
    }

    return model, y_pred


PHASE 1: BASELINE MODELS (Without Hyperparameter Tuning)


In [3]:
# 1. Logistic Regression Baseline
# Untuned logistic regression; max_iter is raised to 1000 and the seed is fixed.
lr_baseline, lr_pred_baseline = train_baseline_model(
    model=LogisticRegression(random_state=42, max_iter=1000),
    model_name="Logistic Regression",
)


üîÑ Training Logistic Regression (Baseline)...
  Accuracy:  0.5771
  Precision: 0.5561
  Recall:    0.5771
  F1-Score:  0.5358


In [4]:
# 2. Naive Bayes Baseline
# Untuned multinomial Naive Bayes with library-default settings.
nb_baseline, nb_pred_baseline = train_baseline_model(
    model=MultinomialNB(),
    model_name="Naive Bayes",
)



üîÑ Training Naive Bayes (Baseline)...
  Accuracy:  0.5239
  Precision: 0.5087
  Recall:    0.5239
  F1-Score:  0.4464


In [5]:
# 3. Linear SVM Baseline
# Untuned linear SVM; the seed is fixed and max_iter is set to 1000.
svm_baseline, svm_pred_baseline = train_baseline_model(
    model=LinearSVC(max_iter=1000, random_state=42),
    model_name="Linear SVM",
)


üîÑ Training Linear SVM (Baseline)...
  Accuracy:  0.5653
  Precision: 0.5355
  Recall:    0.5653
  F1-Score:  0.5416


In [None]:
# HYPERPARAMETER TUNING

# Section banner: a blank line, then the title framed by two 60-character rules.
print(
    "",
    "=" * 60,
    "PHASE 2: HYPERPARAMETER TUNING (Finding Best Parameters)",
    "=" * 60,
    sep="\n",
)

# Weighted F1 is the metric every grid search below optimises.
scoring = make_scorer(score_func=f1_score, average='weighted')

In [None]:
# 1. LOGISTIC REGRESSION - Grid Search

print("\nüîç Tuning Logistic Regression...")
print("   Testing different regularization strengths and solvers...")

lr_param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga'],
    'penalty': ['l1', 'l2'],
    'max_iter': [1000]
}

lr_grid = GridSearchCV(
    LogisticRegression(random_state=42),
    lr_param_grid,
    cv=3,  # Reduced from 5 to save memory
    scoring=scoring,
    n_jobs=2,  # Limited parallel jobs to avoid memory issues
    verbose=1
)

print("   Running Grid Search (this may take a few minutes)...")
lr_grid.fit(X_train, y_train)

print(f"\n‚úÖ Best Parameters: {lr_grid.best_params_}")
print(f"‚úÖ Best CV Score: {lr_grid.best_score_:.4f}")

In [None]:
# Evaluate best Logistic Regression
# Score the refit best estimator on the validation split.
lr_best = lr_grid.best_estimator_
y_pred_lr = lr_best.predict(X_val)

lr_accuracy = accuracy_score(y_val, y_pred_lr)
# Precision, recall and F1 are all weighted averages, computed in that order.
lr_precision, lr_recall, lr_f1 = (
    score_fn(y_val, y_pred_lr, average='weighted')
    for score_fn in (precision_score, recall_score, f1_score)
)

print(
    "\nüìä Validation Results:",
    f"   Accuracy:  {lr_accuracy:.4f}",
    f"   Precision: {lr_precision:.4f}",
    f"   Recall:    {lr_recall:.4f}",
    f"   F1-Score:  {lr_f1:.4f}",
    sep="\n",
)

# Built-in floats keep the record JSON-serialisable.
results["Logistic Regression (Tuned)"] = dict(
    accuracy=float(lr_accuracy),
    precision=float(lr_precision),
    recall=float(lr_recall),
    f1_score=float(lr_f1),
    best_params=lr_grid.best_params_,
    cv_score=float(lr_grid.best_score_),
    model_type='tuned',
)

all_models['Logistic Regression'] = lr_best

In [None]:
# 2. NAIVE BAYES - Grid Search

print(
    "\n" + "-" * 60,
    "üîç Tuning Naive Bayes...",
    "   Testing different smoothing parameters...",
    sep="\n",
)

# Search space: additive smoothing strength x whether class priors are learned.
nb_param_grid = dict(
    alpha=[0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
    fit_prior=[True, False],
)

nb_grid = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid=nb_param_grid,
    scoring=scoring,
    cv=3,      # Reduced from 5 to save memory
    n_jobs=2,  # Limited parallel jobs to avoid memory issues
    verbose=1,
)

print("   Running Grid Search...")
nb_grid.fit(X_train, y_train)

print("\n‚úÖ Best Parameters:", nb_grid.best_params_)
print("‚úÖ Best CV Score: {:.4f}".format(nb_grid.best_score_))

In [None]:
# Evaluate best Naive Bayes
# Score the refit best estimator on the validation split.
nb_best = nb_grid.best_estimator_
y_pred_nb = nb_best.predict(X_val)

nb_accuracy = accuracy_score(y_val, y_pred_nb)
# Precision, recall and F1 are all weighted averages, computed in that order.
nb_precision, nb_recall, nb_f1 = (
    score_fn(y_val, y_pred_nb, average='weighted')
    for score_fn in (precision_score, recall_score, f1_score)
)

print(
    "\nüìä Validation Results:",
    f"   Accuracy:  {nb_accuracy:.4f}",
    f"   Precision: {nb_precision:.4f}",
    f"   Recall:    {nb_recall:.4f}",
    f"   F1-Score:  {nb_f1:.4f}",
    sep="\n",
)

# Built-in floats keep the record JSON-serialisable.
results["Naive Bayes (Tuned)"] = dict(
    accuracy=float(nb_accuracy),
    precision=float(nb_precision),
    recall=float(nb_recall),
    f1_score=float(nb_f1),
    best_params=nb_grid.best_params_,
    cv_score=float(nb_grid.best_score_),
    model_type='tuned',
)

all_models['Naive Bayes'] = nb_best

In [None]:
# 3. LINEAR SVM - Grid Search

print(
    "\n" + "-" * 60,
    "üîç Tuning Linear SVM...",
    "   Testing different regularization parameters...",
    sep="\n",
)

# Search space: regularisation strength x loss function x iteration budget.
svm_param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],
    loss=['hinge', 'squared_hinge'],
    max_iter=[1000, 2000],
)

svm_grid = GridSearchCV(
    estimator=LinearSVC(random_state=42),
    param_grid=svm_param_grid,
    scoring=scoring,
    cv=3,      # Reduced from 5 to save memory
    n_jobs=2,  # Limited parallel jobs to avoid memory issues
    verbose=1,
)

print("   Running Grid Search...")
svm_grid.fit(X_train, y_train)

print("\n‚úÖ Best Parameters:", svm_grid.best_params_)
print("‚úÖ Best CV Score: {:.4f}".format(svm_grid.best_score_))

In [None]:
# Evaluate best SVM
# Score the refit best estimator on the validation split.
svm_best = svm_grid.best_estimator_
y_pred_svm = svm_best.predict(X_val)

svm_accuracy = accuracy_score(y_val, y_pred_svm)
# Precision, recall and F1 are all weighted averages, computed in that order.
svm_precision, svm_recall, svm_f1 = (
    score_fn(y_val, y_pred_svm, average='weighted')
    for score_fn in (precision_score, recall_score, f1_score)
)

print(
    "\nüìä Validation Results:",
    f"   Accuracy:  {svm_accuracy:.4f}",
    f"   Precision: {svm_precision:.4f}",
    f"   Recall:    {svm_recall:.4f}",
    f"   F1-Score:  {svm_f1:.4f}",
    sep="\n",
)

# Built-in floats keep the record JSON-serialisable.
results["Linear SVM (Tuned)"] = dict(
    accuracy=float(svm_accuracy),
    precision=float(svm_precision),
    recall=float(svm_recall),
    f1_score=float(svm_f1),
    best_params=svm_grid.best_params_,
    cv_score=float(svm_grid.best_score_),
    model_type='tuned',
)

all_models['Linear SVM'] = svm_best


In [None]:
# 4. RANDOM FOREST - Grid Search 

print(
    "\n" + "-" * 60,
    "üîç Tuning Random Forest (Bonus Model)...",
    "   Testing different tree configurations...",
    "   Note: Using smaller parameter grid to avoid memory issues",
    sep="\n",
)

# Deliberately small search space: tree count x depth cap x split threshold.
rf_param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[10, 20],
    min_samples_split=[5, 10],
    max_features=['sqrt'],
)

rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_param_grid,
    scoring=scoring,
    cv=3,      # Reduced from 5 to save memory
    n_jobs=2,  # Limited parallel jobs to avoid memory issues
    verbose=1,
)

print("   Running Grid Search...")
rf_grid.fit(X_train, y_train)

print("\n‚úÖ Best Parameters:", rf_grid.best_params_)
print("‚úÖ Best CV Score: {:.4f}".format(rf_grid.best_score_))

In [None]:
# Evaluate best Random Forest
# Score the refit best estimator on the validation split.
rf_best = rf_grid.best_estimator_
y_pred_rf = rf_best.predict(X_val)

rf_accuracy = accuracy_score(y_val, y_pred_rf)
# Precision, recall and F1 are all weighted averages, computed in that order.
rf_precision, rf_recall, rf_f1 = (
    score_fn(y_val, y_pred_rf, average='weighted')
    for score_fn in (precision_score, recall_score, f1_score)
)

print(
    "\nüìä Validation Results:",
    f"   Accuracy:  {rf_accuracy:.4f}",
    f"   Precision: {rf_precision:.4f}",
    f"   Recall:    {rf_recall:.4f}",
    f"   F1-Score:  {rf_f1:.4f}",
    sep="\n",
)

# Built-in floats keep the record JSON-serialisable.
results["Random Forest (Tuned)"] = dict(
    accuracy=float(rf_accuracy),
    precision=float(rf_precision),
    recall=float(rf_recall),
    f1_score=float(rf_f1),
    best_params=rf_grid.best_params_,
    cv_score=float(rf_grid.best_score_),
    model_type='tuned',
)

all_models['Random Forest'] = rf_best

In [None]:
# MODEL COMPARISON & SELECTION

print(
    "",
    "=" * 60,
    "PHASE 3: MODEL COMPARISON & BEST MODEL SELECTION",
    "=" * 60,
    sep="\n",
)

# Keep only the entries recorded by the tuning cells.
tuned_results = {
    label: scores
    for label, scores in results.items()
    if scores['model_type'] == 'tuned'
}

# Fixed-width table: a 30-character name column followed by four 12-character
# metric columns, separated by single spaces.
print("\nüìä Tuned Models Performance:")
print("-" * 80)
print(
    f"{'Model':<30}",
    *(f"{heading:<12}" for heading in ('Accuracy', 'Precision', 'Recall', 'F1-Score')),
)
print("-" * 80)

for model_name, metrics in tuned_results.items():
    print(
        f"{model_name:<30}",
        *(f"{metrics[key]:<12.4f}" for key in ('accuracy', 'precision', 'recall', 'f1_score')),
    )

# The winner is the tuned model with the highest validation F1
# (better for multi-class); ties go to the first one recorded.
best_model_name = max(tuned_results, key=lambda label: tuned_results[label]['f1_score'])
# Strip the " (Tuned)" suffix to get the key used in all_models.
best_model_base = best_model_name.replace(" (Tuned)", "")
best_model = all_models[best_model_base]
best_metrics = tuned_results[best_model_name]

print(
    "\n" + "=" * 60,
    "üèÜ BEST MODEL SELECTED",
    "=" * 60,
    f"Model: {best_model_name}",
    f"Accuracy:  {best_metrics['accuracy']:.4f}",
    f"Precision: {best_metrics['precision']:.4f}",
    f"Recall:    {best_metrics['recall']:.4f}",
    f"F1-Score:  {best_metrics['f1_score']:.4f}",
    "\nBest Hyperparameters:",
    sep="\n",
)
for param, value in best_metrics['best_params'].items():
    print(f"  {param}: {value}")

In [None]:
# SAVE BEST MODEL

print("\nüíæ Saving best model...")
with open('../models/model.pkl', 'wb') as f:
    pickle.dump(best_model, f)
print("‚úÖ Best model saved: ../models/model.pkl")

# Save all models (optional)
print("\nüíæ Saving all trained models...")
for name, model in all_models.items():
    filename = f"../models/{name.lower().replace(' ', '_')}_model.pkl"
    with open(filename, 'wb') as f:
        pickle.dump(model, f)
    print(f"‚úÖ {name} saved: {filename}")

In [None]:
# DETAILED EVALUATION OF BEST MODEL

print("\n" + "="*60)
print("DETAILED EVALUATION OF BEST MODEL")
print("="*60)

# Predict with the selected estimator itself instead of looking up a cached
# prediction array by model name: a hand-maintained name -> array chain has to
# be extended for every new model, and its catch-all branch would silently
# report another model's predictions for any name it does not recognise.
best_pred = best_model.predict(X_val)

# Per-genre precision / recall / F1, labelled with the encoder's class names.
print("\nüìã Classification Report:")
print("="*60)
report = classification_report(y_val, best_pred, 
                               target_names=label_encoder.classes_,
                               digits=4)
print(report)

In [None]:
# Save detailed report
# Write the plain-text performance report next to the other artifacts.
# The path must be '../artifacts/...': every other artifact in this notebook is
# read from or written to that directory, and the confirmation message below
# names it. A bare 'artifacts/...' resolves inside the notebook's own folder.
# The encoding is pinned so non-ASCII genre names cannot trip a platform default.
with open('../artifacts/classification_report.txt', 'w', encoding='utf-8') as f:
    # Header and headline numbers for the selected model.
    f.write("="*60 + "\n")
    f.write("MOVIE GENRE CLASSIFICATION - MODEL PERFORMANCE\n")
    f.write("="*60 + "\n\n")
    f.write(f"Best Model: {best_model_name}\n")
    f.write(f"Overall Accuracy: {best_metrics['accuracy']:.4f}\n")
    f.write(f"F1-Score: {best_metrics['f1_score']:.4f}\n\n")
    f.write("Best Hyperparameters:\n")
    for param, value in best_metrics['best_params'].items():
        f.write(f"  {param}: {value}\n")

    # Per-class report produced by classification_report.
    f.write("\n" + "="*60 + "\n")
    f.write("Classification Report:\n")
    f.write("="*60 + "\n")
    f.write(report)

    # Every recorded model, baseline and tuned alike.
    f.write("\n\n" + "="*60 + "\n")
    f.write("All Models Comparison:\n")
    f.write("="*60 + "\n")
    for model_name, metrics in results.items():
        f.write(f"\n{model_name}:\n")
        f.write(f"  Accuracy:  {metrics['accuracy']:.4f}\n")
        f.write(f"  Precision: {metrics['precision']:.4f}\n")
        f.write(f"  Recall:    {metrics['recall']:.4f}\n")
        f.write(f"  F1-Score:  {metrics['f1_score']:.4f}\n")
        # Only tuned entries carry hyperparameters.
        if 'best_params' in metrics:
            f.write(f"  Best Parameters: {metrics['best_params']}\n")

print("‚úÖ Detailed report saved: ../artifacts/classification_report.txt")

In [None]:
# VISUALIZATIONS

print("\nüìä Creating visualizations...")

# 1. Confusion Matrix
plt.figure(figsize=(12, 10))
cm = confusion_matrix(y_val, best_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_,
            cbar_kws={'label': 'Count'})
plt.title(f'Confusion Matrix - {best_model_name}\nAccuracy: {best_metrics["accuracy"]:.4f}')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.savefig('../artifacts/confusion_matrix.png', dpi=300, bbox_inches='tight')
print("‚úÖ Confusion matrix saved: ../artifacts/confusion_matrix.png")
plt.close()

# 2. Model Comparison - Tuned Models
# Grouped bars: accuracy and F1 side by side for every tuned model.
plt.figure(figsize=(12, 6))
tuned_names = list(tuned_results)
tuned_accuracies = [tuned_results[name]['accuracy'] for name in tuned_names]
tuned_f1_scores = [tuned_results[name]['f1_score'] for name in tuned_names]

width = 0.35
x = np.arange(len(tuned_names))

# Offset each series by half a bar width so each pair straddles its tick.
bars1 = plt.bar(x - width/2, tuned_accuracies, width, label='Accuracy', alpha=0.8)
bars2 = plt.bar(x + width/2, tuned_f1_scores, width, label='F1-Score', alpha=0.8)

plt.title('Tuned Models Performance Comparison')
plt.xlabel('Models')
plt.ylabel('Score')
plt.xticks(
    x,
    [name.replace(' (Tuned)', '') for name in tuned_names],
    rotation=15,
    ha='right',
)
plt.legend()
plt.ylim([0, 1])
plt.grid(axis='y', alpha=0.3)

# Add value labels on bars
for bar in [*bars1, *bars2]:
    height = bar.get_height()
    plt.text(
        bar.get_x() + bar.get_width()/2.,
        height,
        f'{height:.3f}',
        ha='center',
        va='bottom',
        fontsize=9,
    )

plt.tight_layout()
plt.savefig('../artifacts/model_comparison.png', dpi=300, bbox_inches='tight')
print("‚úÖ Model comparison saved: ../artifacts/model_comparison.png")
plt.close()

# 3. Baseline vs Tuned Comparison
# Grouped bars of validation F1 before and after tuning, annotated with the
# relative change. Only these three models have a baseline entry in `results`;
# Random Forest was trained in the tuning phase only.
plt.figure(figsize=(14, 6))
all_model_types = ['Logistic Regression', 'Naive Bayes', 'Linear SVM']
baseline_scores = []
tuned_scores = []

for model_type in all_model_types:
    baseline_key = f"{model_type} (Baseline)"
    tuned_key = f"{model_type} (Tuned)"
    
    baseline_scores.append(results[baseline_key]['f1_score'])
    tuned_scores.append(results[tuned_key]['f1_score'])

x = np.arange(len(all_model_types))
width = 0.35

bars1 = plt.bar(x - width/2, baseline_scores, width, label='Baseline', alpha=0.8, color='lightcoral')
bars2 = plt.bar(x + width/2, tuned_scores, width, label='Tuned', alpha=0.8, color='lightgreen')

plt.xlabel('Models')
plt.ylabel('F1-Score')
plt.title('Baseline vs Hyperparameter Tuned Models')
plt.xticks(x, all_model_types, rotation=15, ha='right')
plt.legend()
plt.ylim([0, 1])
plt.grid(axis='y', alpha=0.3)

# Add value labels and the relative change between each pair of bars
for i, (bar1, bar2) in enumerate(zip(bars1, bars2)):
    h1 = bar1.get_height()
    h2 = bar2.get_height()
    
    plt.text(bar1.get_x() + bar1.get_width()/2., h1,
            f'{h1:.3f}', ha='center', va='bottom', fontsize=9)
    plt.text(bar2.get_x() + bar2.get_width()/2., h2,
            f'{h2:.3f}', ha='center', va='bottom', fontsize=9)
    
    # Tuning can make a model worse, so let the format spec choose the sign
    # (a hard-coded '+' would render a drop as '+-1.2%'). A zero baseline has
    # no meaningful relative change, so skip the division in that case.
    if h1:
        improvement = ((h2 - h1) / h1) * 100
        change_label = f'{improvement:+.1f}%'
    else:
        change_label = 'n/a'
    
    # Place the change label midway between the two bar tops
    mid_x = x[i]
    mid_y = (h1 + h2) / 2
    plt.text(mid_x, mid_y, change_label, 
            ha='center', va='center', fontsize=8, 
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

plt.tight_layout()
plt.savefig('../artifacts/baseline_vs_tuned.png', dpi=300, bbox_inches='tight')
print("‚úÖ Baseline vs Tuned comparison saved: ../artifacts/baseline_vs_tuned.png")
plt.close()


In [None]:
# SAVE METRICS AND PARAMETERS

print("\nüíæ Saving metrics and parameters...")

# Save all results
with open('../artifacts/metrics.json', 'w') as f:
    json.dump({
        'best_model': best_model_name,
        'best_model_type': best_model_base,
        'all_results': results,
        'best_hyperparameters': best_metrics['best_params']
    }, f, indent=4)
print("‚úÖ Metrics saved: ../artifacts/metrics.json")

# Save training parameters
params = {
    'best_model': best_model_name,
    'best_model_type': best_model_base,
    'num_features': X_train.shape[1],
    'num_classes': len(label_encoder.classes_),
    'training_samples': int(len(y_train)),
    'validation_samples': int(len(y_val)),
    'classes': label_encoder.classes_.tolist(),
    'hyperparameter_tuning': 'enabled',
    'cv_folds': 3,  # 3-fold CV for memory efficiency
    'parallel_jobs': 2  # Limited to avoid memory issues
}

with open('../artifacts/params.json', 'w') as f:
    json.dump(params, f, indent=4)
print("‚úÖ Parameters saved: ../artifacts/params.json")

In [None]:
# SUMMARY

# Closing banner.
print(
    "",
    "=" * 60,
    "‚úÖ TRAINING COMPLETED SUCCESSFULLY!",
    "=" * 60,
    sep="\n",
)

# List of the files this notebook writes.
print("\nüìÅ Generated Files:")
print("   ‚úÖ models/model.pkl (Best model)")
for name in all_models:
    filename = "{}_model.pkl".format(name.lower().replace(' ', '_'))
    print(f"   ‚úÖ models/{filename}")
print(
    "   ‚úÖ ../artifacts/classification_report.txt",
    "   ‚úÖ ../artifacts/confusion_matrix.png",
    "   ‚úÖ ../artifacts/model_comparison.png",
    "   ‚úÖ ../artifacts/baseline_vs_tuned.png",
    "   ‚úÖ ../artifacts/metrics.json",
    "   ‚úÖ ../artifacts/params.json",
    sep="\n",
)

# Headline numbers for the selected model.
print(
    "\nüìä Best Model Summary:",
    f"   Model: {best_model_name}",
    f"   Accuracy: {best_metrics['accuracy']:.4f}",
    f"   F1-Score: {best_metrics['f1_score']:.4f}",
    sep="\n",
)

print(
    "\nüöÄ Next Steps:",
    "   1. Run experiments.ipynb to test the model",
    "   2. Use app.py for interactive predictions",
    "   3. Check artifacts/ folder for visualizations",
    sep="\n",
)

print("\n" + "=" * 60)