# Handwritten Digit Recognition Analysis

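This notebook trains a Random Forest classifier on handwritten-digit pixel data and analyzes the result: label distribution, sample images, confusion matrix, feature importance, misclassified examples, and prediction confidence.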

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
import joblib
import cv2
import plotly.express as px
import plotly.graph_objects as go

## 1. Data Loading and Preprocessing

In [None]:
def load_data(train_path, test_path):
    """Read the train and test CSV files and split each into features and labels.

    Both files must contain a 'label' column; every other column is treated
    as a feature.

    Args:
        train_path: Location of the training CSV (anything ``pd.read_csv`` accepts).
        test_path: Location of the test CSV (anything ``pd.read_csv`` accepts).

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``: the X values are the
        frames without the 'label' column, the y values are the 'label' columns.
    """
    # Read both files first, then separate the label column from the features
    frames = [pd.read_csv(path) for path in (train_path, test_path)]
    (X_train, y_train), (X_test, y_test) = (
        (frame.drop(columns='label'), frame['label']) for frame in frames
    )
    return X_train, X_test, y_train, y_test

# Read the training and test splits from their CSV files
X_train, X_test, y_train, y_test = load_data('Train Data.csv', 'Test Data.csv')

# Standardize the features: the scaler is fitted on the training split only
# and the same fitted scaler is then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 2. Data Visualization

In [None]:
def plot_digit_distribution(y_train):
    """Plot how many training samples there are for each digit.

    Draws a count plot of the labels, saves it to
    'static/digit_distribution.png' and displays it.

    Args:
        y_train: Digit labels as a pandas Series or any 1-D array-like.
    """
    # Build the frame from the raw label values. Passing a *named* Series
    # together with ``columns=['digit']`` makes pandas select a column called
    # 'digit' from the Series, which does not exist, so the resulting frame
    # (and therefore the plot) would be empty.
    digits = pd.DataFrame({'digit': np.asarray(y_train)})

    plt.figure(figsize=(12, 6))
    sns.countplot(data=digits, x='digit')
    plt.title('Distribution of Digits in Training Data')
    plt.xlabel('Digit')
    plt.ylabel('Count')
    plt.savefig('static/digit_distribution.png')
    plt.show()

def plot_sample_digits(X_train, y_train, num_samples=10):
    """Plot the first ``num_samples`` digits of the dataset as 28x28 images.

    The figure is saved to 'static/sample_digits.png' and displayed.

    Args:
        X_train: Pixel features, one sample per row (DataFrame or 2-D
            array-like). Each row is reshaped to 28x28, so it must hold
            784 values.
        y_train: Labels aligned positionally with the rows of ``X_train``
            (Series or 1-D array-like).
        num_samples: Number of digits to draw. The grid is five columns wide
            and grows by rows; the default of 10 gives a 2x5 grid.
    """
    # Work on plain arrays so rows are addressed by position. Indexing a
    # DataFrame with ``X_train[i]`` would look up a *column* named ``i``.
    pixels = np.asarray(X_train)
    labels = np.asarray(y_train)

    n_shown = max(0, min(num_samples, len(pixels)))
    n_cols = 5
    n_rows = max(1, -(-n_shown // n_cols))  # ceiling division, at least one row

    # squeeze=False keeps ``axes`` two-dimensional even for a single row
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 3 * n_rows), squeeze=False)
    for i, ax in enumerate(axes.flat):
        # Hide the frame on every panel, including unused ones
        ax.axis('off')
        if i < n_shown:
            ax.imshow(pixels[i].reshape(28, 28), cmap='gray')
            ax.set_title(f'Digit: {labels[i]}')
    plt.savefig('static/sample_digits.png')
    plt.show()

# Draw the label histogram, then a grid of example digit images
plot_digit_distribution(y_train=y_train)
plot_sample_digits(X_train=X_train, y_train=y_train)

## 3. Model Training and Evaluation

In [None]:
def train_model(X_train_scaled, y_train):
    """Fit a Random Forest classifier on the scaled training data.

    The forest uses 100 trees and a fixed ``random_state`` of 42.

    Returns:
        The fitted classifier.
    """
    forest_params = {'n_estimators': 100, 'random_state': 42}
    # ``fit`` returns the estimator itself, so it can be returned directly
    return RandomForestClassifier(**forest_params).fit(X_train_scaled, y_train)

def evaluate_model(model, X_test_scaled, y_test):
    """Predict on the test split, plot the confusion matrix and print a report.

    The confusion-matrix heatmap is saved to 'static/confusion_matrix.png'
    and displayed; the classification report is printed to stdout.

    Returns:
        The predictions produced by ``model.predict(X_test_scaled)``.
    """
    predicted = model.predict(X_test_scaled)

    # Confusion-matrix heatmap with integer cell annotations
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(confusion_matrix(y_test, predicted),
                annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    fig.savefig('static/confusion_matrix.png')
    plt.show()

    # Text summary of the per-class metrics
    print('\nClassification Report:')
    print(classification_report(y_test, predicted))

    return predicted

# Fit the classifier on the scaled training split, then score it on the test split
model = train_model(X_train_scaled=X_train_scaled, y_train=y_train)
y_pred = evaluate_model(model=model, X_test_scaled=X_test_scaled, y_test=y_test)

# Persist the fitted classifier together with the scaler it was trained with
joblib.dump(value=model, filename='models/digit_model.joblib')
joblib.dump(value=scaler, filename='models/scaler.joblib')

## 4. Feature Importance Analysis

In [None]:
def analyze_feature_importance(model, X_train):
    """Plot the (up to) 20 features the fitted model ranks as most important.

    Bars are ordered from most to least important and each bar is labelled
    with the positional index of its feature. The figure is saved to
    'static/feature_importance.png' and displayed.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        X_train: Training features; only ``X_train.shape[1]`` (the number of
            features) is used.
    """
    ranking = pd.DataFrame({
        'feature': range(X_train.shape[1]),
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    # head() also copes with models that have fewer than 20 features, where a
    # fixed range(20) would not match the number of bar heights
    top = ranking.head(20)
    positions = list(range(len(top)))

    plt.figure(figsize=(12, 6))
    plt.bar(positions, top['importance'].tolist())
    # Label each bar with its actual feature index; without this the axis
    # only shows the rank 0..N-1 despite being titled 'Feature Index'
    plt.xticks(positions, top['feature'].tolist(), rotation=90)
    plt.title(f'Top {len(top)} Most Important Features')
    plt.xlabel('Feature Index')
    plt.ylabel('Importance')
    plt.savefig('static/feature_importance.png')
    plt.show()

# Rank the features by the importance the fitted forest assigns to them
analyze_feature_importance(model=model, X_train=X_train)

## 5. Misclassification Analysis

In [None]:
def analyze_misclassifications(X_test, y_test, y_pred):
    """Show up to ten test samples whose prediction differs from the true label.

    Each misclassified sample is drawn as a 28x28 image titled with its true
    and predicted label. The figure is saved to
    'static/misclassified_examples.png' and displayed.

    Args:
        X_test: Test features, one sample per row (DataFrame or 2-D
            array-like); each row must hold 784 values.
        y_test: True labels (Series or 1-D array-like).
        y_pred: Predicted labels, aligned positionally with ``y_test``.
    """
    # Plain arrays give positional access for DataFrames/Series and ndarrays
    # alike and avoid any index alignment in the comparison below
    pixels = np.asarray(X_test)
    actual = np.asarray(y_test)
    predicted = np.asarray(y_pred)
    wrong = np.flatnonzero(actual != predicted)

    fig, axes = plt.subplots(2, 5, figsize=(15, 6))
    for panel, ax in enumerate(axes.flat):
        # Hide the frame on every panel so unused ones stay blank
        ax.axis('off')
        if panel >= len(wrong):
            continue
        idx = wrong[panel]
        ax.imshow(pixels[idx].reshape(28, 28), cmap='gray')
        ax.set_title(f'True: {actual[idx]}\nPred: {predicted[idx]}')
    plt.savefig('static/misclassified_examples.png')
    plt.show()

# Display test digits the model labelled incorrectly
analyze_misclassifications(X_test=X_test, y_test=y_test, y_pred=y_pred)

## 6. Model Confidence Analysis

In [None]:
def analyze_prediction_confidence(model, X_test_scaled):
    """Plot how confident the model is in its test-set predictions.

    Confidence is the highest class probability ``model.predict_proba``
    assigns to a sample. Two figures are produced:

    * a 50-bin histogram of the confidences, saved to
      'static/confidence_distribution.png' and displayed;
    * a bar chart of the mean confidence per predicted digit, displayed only.
    """
    top_probability = np.max(model.predict_proba(X_test_scaled), axis=1)

    # Figure 1: overall confidence histogram
    hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
    sns.histplot(top_probability, bins=50, ax=hist_ax)
    hist_ax.set_title('Distribution of Prediction Confidence')
    hist_ax.set_xlabel('Confidence')
    hist_ax.set_ylabel('Count')
    hist_fig.savefig('static/confidence_distribution.png')
    plt.show()

    # Figure 2: mean confidence grouped by the digit the model predicted
    per_sample = pd.DataFrame({
        'digit': model.predict(X_test_scaled),
        'confidence': top_probability
    })
    mean_by_digit = per_sample.groupby('digit')['confidence'].mean()

    bar_fig, bar_ax = plt.subplots(figsize=(10, 6))
    mean_by_digit.plot(kind='bar', ax=bar_ax)
    bar_ax.set_title('Average Prediction Confidence by Digit')
    bar_ax.set_xlabel('Digit')
    bar_ax.set_ylabel('Average Confidence')
    bar_fig.tight_layout()
    plt.show()

# Inspect how confident the forest is on the scaled test split
analyze_prediction_confidence(model=model, X_test_scaled=X_test_scaled)

## 7. Interactive Visualization with Plotly

In [None]:
def create_interactive_confusion_matrix(y_test, y_pred):
    """Show an interactive (plotly) confusion-matrix heatmap.

    Rows are true labels, columns are predicted labels, and every cell is
    annotated with its count.

    Args:
        y_test: True labels (1-D array-like).
        y_pred: Predicted labels, aligned positionally with ``y_test``.
    """
    # Derive the label set from the data instead of hard-coding '0'..'9'. If
    # a digit is missing from both inputs the matrix is smaller than 10x10
    # and fixed axis labels would no longer match it.
    labels = np.unique(np.concatenate([np.ravel(y_test), np.ravel(y_pred)]))
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    tick_names = [str(label) for label in labels]

    fig = px.imshow(cm,
                    labels=dict(x="Predicted", y="True", color="Count"),
                    x=tick_names,
                    y=tick_names)

    # Print the count inside each cell
    fig.update_traces(text=cm, texttemplate="%{z}")
    fig.update_layout(title='Interactive Confusion Matrix')
    fig.show()

# Render the plotly version of the confusion matrix
create_interactive_confusion_matrix(y_test=y_test, y_pred=y_pred)