# 🤖 ML Project Template

A complete machine learning pipeline template that takes you from raw data to a trained, evaluated, and saved model that is ready for deployment.

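**Perfect for**: Classification, Regression, or any supervised learning project. The worked example below is a binary classification task; for regression, swap in the regressors (`LinearRegression`, `RandomForestRegressor`) and metrics (`mean_squared_error`, `r2_score`) that the setup cell already imports.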

## 📦 Import & Setup

In [None]:
# 🐼 Data handling
import pandas as pd
import numpy as np

# 📊 Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# 🤖 Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score

# ⚙️ Config
# Seed NumPy's global RNG so any np.random-based sampling is repeatable.
np.random.seed(42)

# The 'seaborn-v0_8' style name only exists on Matplotlib >= 3.6; older
# releases ship the same look under the plain 'seaborn' name. Use the first
# one this installation provides, and fall back to Matplotlib's built-in
# 'default' style instead of raising OSError when neither is present.
preferred_styles = ('seaborn-v0_8', 'seaborn')
plot_style = next(
    (style for style in preferred_styles if style in plt.style.available),
    'default',
)
plt.style.use(plot_style)

print("🚀 ML Environment Ready!")

## 📥 Data Loading & Initial Exploration

In [None]:
# 📁 Load your dataset
# df = pd.read_csv('your_data.csv')

# 🧪 Demo with built-in dataset
from sklearn.datasets import make_classification

# Synthetic two-class problem: 1000 samples and 10 features, of which
# 5 are informative and 2 are redundant; random_state keeps it repeatable.
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    n_redundant=2,
    n_classes=2,
    random_state=42,
)

# Wrap the arrays in a single DataFrame (features + 'target' column)
n_features = X.shape[1]
feature_names = [f'feature_{i}' for i in range(n_features)]
df = pd.DataFrame(X, columns=feature_names).assign(target=y)

# The target column is not a feature, hence the "- 1"
n_rows, n_cols = df.shape
print(f"📊 Dataset: {n_rows} rows, {n_cols - 1} features")
print("🎯 Target distribution:")
print(df['target'].value_counts())

display(df.head())

## 🔍 Exploratory Data Analysis

In [None]:
# 📊 Basic statistics
print("📈 Feature Statistics:")
display(df.describe())

# 🕳️ Missing values check
print(f"\n🕳️ Missing values: {df.isnull().sum().sum()}")

# Numeric columns feed both the correlation heatmap and the sample histogram.
numeric_df = df.select_dtypes(include=[np.number])

# Pick the sample feature from df itself rather than from the demo-only
# `feature_names` list, so this cell also works after loading your own data
# (and never tries to histogram a non-numeric column).
numeric_features = [col for col in numeric_df.columns if col != 'target']

# 📊 Target distribution visualization
plt.figure(figsize=(12, 4))

plt.subplot(1, 3, 1)
df['target'].value_counts().plot(kind='bar', color=['skyblue', 'lightcoral'])
plt.title('Target Distribution')
plt.xticks(rotation=0)

# Feature correlations (the numeric target is included, so its row/column
# shows how each feature correlates with the label)
plt.subplot(1, 3, 2)
correlation_matrix = numeric_df.corr()
sns.heatmap(correlation_matrix, cmap='coolwarm', center=0, square=True)
plt.title('Feature Correlations')

# Sample feature distribution
plt.subplot(1, 3, 3)
if numeric_features:
    sample_feature = numeric_features[0]
    df[sample_feature].hist(bins=20, alpha=0.7, color='green')
    plt.title(f'{sample_feature} Distribution')
else:
    # Nothing numeric to plot: show a labelled blank panel instead of failing
    plt.axis('off')
    plt.title('No numeric features to plot')

plt.tight_layout()
plt.show()

## 🧹 Data Preprocessing

In [None]:
# 🎯 Separate features and target
y = df['target']
X = df.drop(columns=['target'])

print(f"✅ Features: {X.shape}")
print(f"✅ Target: {y.shape}")

# 🔄 Handle missing values (if any)
# X.fillna(X.median(), inplace=True)  # For numerical
# X.fillna(X.mode().iloc[0], inplace=True)  # For categorical

# 🏷️ Encode categorical variables (if any)
# le = LabelEncoder()
# for col in X.select_dtypes(include=['object']).columns:
#     X[col] = le.fit_transform(X[col])

# ✂️ Split the data: 80/20 hold-out, stratified so both splits keep the
# same class proportions as the full target
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"📊 Training set: {X_train.shape}")
print(f"📊 Test set: {X_test.shape}")

# 📏 Scale features
# The scaler learns its statistics from the training split only and then
# applies them unchanged to both splits, so nothing from the test set
# leaks into preprocessing.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("✅ Features scaled and ready!")

## 🤖 Model Training & Comparison

In [None]:
# 🎯 Candidate models, keyed by the display name used in reports and plots
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

# 📊 Fit every candidate and record its hold-out performance
results = {}

for name, model in models.items():
    print(f"\n🔧 Training {name}...")

    # Logistic Regression is trained on the standardized arrays; the
    # Random Forest is trained on the unscaled DataFrames.
    use_scaled = 'Logistic' in name
    train_features = X_train_scaled if use_scaled else X_train
    test_features = X_test_scaled if use_scaled else X_test

    model.fit(train_features, y_train)
    y_pred = model.predict(test_features)

    # Score on the held-out test split
    accuracy = accuracy_score(y_test, y_pred)

    results[name] = {
        'model': model,
        'accuracy': accuracy,
        'predictions': y_pred
    }

    print(f"✅ {name} Accuracy: {accuracy:.4f}")

# 🏆 The winner is the entry with the highest test accuracy
# (on a tie, the model defined first is kept)
best_model_name = max(results, key=lambda k: results[k]['accuracy'])
best_entry = results[best_model_name]
best_model = best_entry['model']
best_accuracy = best_entry['accuracy']

print(f"\n🏆 Best Model: {best_model_name} (Accuracy: {best_accuracy:.4f})")

## 📊 Model Evaluation & Insights

In [None]:
from sklearn.metrics import confusion_matrix

# 📋 Detailed evaluation of best model
best_predictions = results[best_model_name]['predictions']

print(f"🎯 Detailed Evaluation - {best_model_name}")
print("=" * 50)
print(classification_report(y_test, best_predictions))

# 📊 Visualization: model comparison | feature importance | confusion matrix
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Model comparison
model_names = list(results.keys())
accuracies = [results[name]['accuracy'] for name in model_names]

axes[0].bar(model_names, accuracies, color=['skyblue', 'lightcoral'])
axes[0].set_title('Model Comparison')
axes[0].set_ylabel('Accuracy')
axes[0].tick_params(axis='x', rotation=45)

# Feature importance
# Label the bars from the training frame's own columns so this also works
# when the data did not come from the demo cell (where `feature_names` is
# defined).
importance_columns = list(X_train.columns)

if hasattr(best_model, 'feature_importances_'):
    # Tree ensembles expose importances directly
    importance_values = best_model.feature_importances_
    importance_label = 'Importance'
elif hasattr(best_model, 'coef_'):
    # Linear models: use the mean absolute coefficient per feature (averaged
    # over classes when coef_ is 2-D). The magnitudes are comparable here
    # because the linear model was fitted on standardized features.
    importance_values = np.abs(np.atleast_2d(best_model.coef_)).mean(axis=0)
    importance_label = 'Mean |coefficient|'
else:
    importance_values = None
    importance_label = None

if importance_values is not None:
    feature_imp = pd.DataFrame({
        'feature': importance_columns,
        'importance': importance_values
    }).sort_values('importance', ascending=True)

    # Ascending sort + tail keeps the 10 largest, biggest bar on top
    top_features = feature_imp.tail(10)
    axes[1].barh(top_features['feature'], top_features['importance'])
    axes[1].set_title('Top 10 Feature Importance')
    axes[1].set_xlabel(importance_label)
else:
    # Say so explicitly instead of leaving an unexplained empty frame
    axes[1].text(
        0.5, 0.5,
        'Feature importance not available\nfor this model',
        ha='center', va='center', transform=axes[1].transAxes
    )
    axes[1].set_title('Top 10 Feature Importance')
    axes[1].set_axis_off()

# Confusion matrix visualization (rows: actual class, columns: predicted)
cm = confusion_matrix(y_test, best_predictions)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[2])
axes[2].set_title('Confusion Matrix')
axes[2].set_xlabel('Predicted')
axes[2].set_ylabel('Actual')

plt.tight_layout()
plt.show()

## 🔧 Model Optimization (Optional)

In [None]:
# 🎯 Hyperparameter tuning for best model
print(f"🔧 Optimizing {best_model_name}...")

if 'Random Forest' not in best_model_name:
    # Only a Random Forest search is wired up in this template
    print("💡 Add hyperparameter tuning for your chosen model!")

else:
    # 3 x 4 x 3 = 36 parameter combinations, each scored with 3-fold CV
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7, None],
        'min_samples_split': [2, 5, 10]
    }

    base_forest = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(
        estimator=base_forest,
        param_grid=param_grid,
        scoring='accuracy',
        cv=3,
        n_jobs=-1
    )

    # The search sees the training split only; the test split stays held out
    grid_search.fit(X_train, y_train)

    print(f"🏆 Best parameters: {grid_search.best_params_}")
    print(f"🎯 Best CV score: {grid_search.best_score_:.4f}")

    # Test optimized model on the held-out split and compare with the
    # untuned winner from the comparison step
    tuned_forest = grid_search.best_estimator_
    optimized_pred = tuned_forest.predict(X_test)
    optimized_accuracy = accuracy_score(y_test, optimized_pred)

    print(f"✅ Optimized test accuracy: {optimized_accuracy:.4f}")
    print(f"📈 Improvement: {optimized_accuracy - best_accuracy:.4f}")

## 💾 Model Saving & Deployment Prep

In [None]:
import joblib
from datetime import datetime

# 💾 Persist the best model together with the fitted scaler.
# Both files share one timestamp so a matching pair is easy to identify.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model_filename = f'best_model_{timestamp}.joblib'
scaler_filename = f'scaler_{timestamp}.joblib'

# Save model and scaler
joblib.dump(best_model, model_filename)
joblib.dump(scaler, scaler_filename)

print(f"💾 Model saved as: {model_filename}")
print(f"💾 Scaler saved as: {scaler_filename}")

# 🧪 Round-trip check: reload both artifacts from disk and predict with them
loaded_model = joblib.load(model_filename)
loaded_scaler = joblib.load(scaler_filename)

# Make a sample prediction on one test row.
# Double brackets keep the row as a one-row DataFrame (2-D input).
sample_idx = 0
sample_row = X_test.iloc[[sample_idx]]

# The logistic model was trained on scaled features, so its input must go
# through the reloaded scaler first; other models take the row as-is.
model_input = (
    loaded_scaler.transform(sample_row)
    if 'Logistic' in best_model_name
    else sample_row
)
prediction = loaded_model.predict(model_input)[0]

actual = y_test.iloc[sample_idx]

print(f"\n🧪 Test prediction:")
print(f"   Predicted: {prediction}")
print(f"   Actual: {actual}")
print(f"   ✅ {'Correct!' if prediction == actual else 'Incorrect'}")

## 📋 Project Summary & Next Steps

### 🎯 Model Performance Summary:
- **Best Model**: [Fill in after running]
- **Test Accuracy**: [Fill in after running]
- **Key Features**: [Fill in top 3-5 important features]

### 🔮 Business Insights:
- [ ] Key finding #1
- [ ] Key finding #2 
- [ ] Key finding #3

### 🚀 Next Steps:
- [ ] Collect more data for specific scenarios
- [ ] Try additional algorithms (XGBoost, Neural Networks)
- [ ] Feature engineering improvements
- [ ] Deploy model to production
- [ ] Set up monitoring and retraining pipeline

### 📊 Technical Improvements:
- [ ] Cross-validation for more robust evaluation
- [ ] Advanced preprocessing techniques
- [ ] Ensemble methods
- [ ] Handle class imbalance (if applicable)
