# In [None]:
# Complete Jupyter Notebook combining all the steps

# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Input, Dropout, BatchNormalization
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2

# Seed NumPy's and TensorFlow's global random generators with one shared
# value so repeated runs start from the same random state.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# 1. Load Data
# Read the training and test CSV files into DataFrames, then report the
# (rows, columns) shape of each frame.
print("Loading data...")
csv_paths = {'train': 'data/train.csv', 'test': 'data/test.csv'}
train_data = pd.read_csv(csv_paths['train'])
test_data = pd.read_csv(csv_paths['test'])

print(
    f"Training data shape: {train_data.shape}",
    f"Test data shape: {test_data.shape}",
    sep="\n",
)

# 2. Explore Data
# Print the head of the training frame, then chart how many rows carry
# each value of the 'label' column.
print("\nFirst few rows of training data:")
preview_rows = train_data.head()
print(preview_rows)

# Count rows per label and order the bars by label value, not by frequency.
label_counts = train_data['label'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 5))
label_counts.plot(kind='bar', ax=ax)
ax.set_title('Distribution of Digits in Training Set')
ax.set_xlabel('Digit')
ax.set_ylabel('Count')
plt.xticks(rotation=0)  # keep the tick labels horizontal
ax.grid(True, alpha=0.3)
plt.show()

# 3. Preprocess Data
print("\nPreprocessing data...")

# Separate features and labels.
# The label column is selected by name (as the label-distribution plot does)
# rather than by position, so the split no longer depends on 'label' being
# the first column of the training file.
y = train_data['label'].values
X = train_data.drop(columns='label').values
# Every column of the test frame is used as a feature.
X_test = test_data.values

# Normalize: cast to float32 and divide by 255.
# NOTE(review): assumes the raw values are 0-255 pixel intensities - confirm
# against the data source.
X = X.astype('float32') / 255.0
X_test = X_test.astype('float32') / 255.0

# Reshape each row into a 28x28 single-channel image. This requires exactly
# 784 feature values per row; reshape raises ValueError otherwise.
X = X.reshape(-1, 28, 28, 1)
X_test = X_test.reshape(-1, 28, 28, 1)

# One-hot encode labels into 10 classes
y = to_categorical(y, num_classes=10)

# Split into train and validation: 20% of the rows are held out, with a
# fixed random_state so the partition is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"X_test shape: {X_test.shape}")

# 4. Visualize some samples
# Draw the first ten training images on a 2x5 grid, each titled with the
# class index recovered from its one-hot label row.
fig, axes = plt.subplots(2, 5, figsize=(12, 5))
axes = axes.ravel()

for i, ax in enumerate(axes):
    ax.imshow(X_train[i].reshape(28, 28), cmap='gray')
    ax.set_title(f"Label: {np.argmax(y_train[i])}")
    ax.axis('off')

fig.suptitle('Sample Training Images', fontsize=16)
fig.tight_layout()
plt.show()

# 5. Build Model
print("\nBuilding model...")

# Network definition: 28x28x1 inputs are flattened and passed through three
# ReLU dense layers (256 -> 128 -> 64 units) before a 10-unit softmax output.
# The 256- and 128-unit layers carry an L2 kernel penalty of 0.001 and are
# each followed by batch normalization and dropout at rate 0.3; the 64-unit
# layer is followed by dropout at rate 0.2 only.
input_layer = Input(shape=(28, 28, 1))
flatten_layer = Flatten()
wide_block = [
    Dense(256, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.3),
]
middle_block = [
    Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.3),
]
narrow_block = [
    Dense(64, activation='relu'),
    Dropout(0.2),
]
output_layer = Dense(10, activation='softmax')

model = Sequential(
    [input_layer, flatten_layer, *wide_block, *middle_block, *narrow_block, output_layer]
)

# Adam optimizer with categorical cross-entropy (the labels are one-hot
# vectors); accuracy is tracked as the reported metric.
compile_options = {
    'optimizer': 'adam',
    'loss': 'categorical_crossentropy',
    'metrics': ['accuracy'],
}
model.compile(**compile_options)

model.summary()

# 6. Define Callbacks
# Stop training once val_loss has gone 10 epochs without improving, and
# request that the best weights seen be restored.
stop_when_stalled = EarlyStopping(
    monitor='val_loss', patience=10, restore_best_weights=True, verbose=1
)

# Write the model to disk only when val_accuracy improves.
save_best_model = ModelCheckpoint(
    filepath='models/digit_recognizer.h5',
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1,
)

# Multiply the learning rate by 0.5 after 5 epochs without val_loss
# improvement, with min_lr as the lower bound.
shrink_lr_on_plateau = ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=5, min_lr=0.00001, verbose=1
)

callbacks = [stop_when_stalled, save_best_model, shrink_lr_on_plateau]

# 7. Train Model
print("\nTraining model...")

# Fit for up to 30 epochs in batches of 32, scoring the held-out validation
# split each epoch and running the callbacks defined above the fit call.
fit_options = dict(
    epochs=30,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=callbacks,
    verbose=1,
)
history = model.fit(X_train, y_train, **fit_options)

# 8. Plot Training History
# Two side-by-side panels: accuracy on the left, loss on the right, each
# showing the training curve and the validation curve per epoch.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

curves = history.history

# (axes, training key, validation key, training legend, validation legend,
#  panel title, y-axis label)
panels = (
    (ax1, 'accuracy', 'val_accuracy',
     'Training Accuracy', 'Validation Accuracy',
     'Model Accuracy', 'Accuracy'),
    (ax2, 'loss', 'val_loss',
     'Training Loss', 'Validation Loss',
     'Model Loss', 'Loss'),
)

for panel, train_key, val_key, train_label, val_label, title, y_label in panels:
    panel.plot(curves[train_key], label=train_label)
    panel.plot(curves[val_key], label=val_label)
    panel.set_title(title)
    panel.set_xlabel('Epoch')
    panel.set_ylabel(y_label)
    panel.legend()
    panel.grid(True, alpha=0.3)

fig.suptitle('Training History', fontsize=16)
fig.tight_layout()
plt.show()

# 9. Evaluate Model
print("\nEvaluating model...")

# Score the model on the validation split; evaluate() yields the loss
# followed by the accuracy metric.
validation_scores = model.evaluate(X_val, y_val, verbose=0)
val_loss, val_accuracy = validation_scores
print(f"Validation Accuracy: {val_accuracy*100:.2f}%")

# Turn the predicted class probabilities and the one-hot targets into
# class indices by taking the position of the largest entry in each row.
y_pred_probs = model.predict(X_val, verbose=0)
y_pred = y_pred_probs.argmax(axis=1)
y_true = y_val.argmax(axis=1)

# Confusion matrix drawn as an annotated heatmap with ticks labelled 0-9.
cm = confusion_matrix(y_true, y_pred)
digit_ticks = range(10)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=digit_ticks,
    yticklabels=digit_ticks,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
fig.tight_layout()
plt.show()

# Per-class precision / recall / F1 summary
print("\nClassification Report:")
report_text = classification_report(y_true, y_pred)
print(report_text)

# 10. Analyze Misclassifications
# Positions in the validation split where the predicted class index
# differs from the true one.
misclassified_idx = np.where(y_true != y_pred)[0]

if len(misclassified_idx) == 0:
    print("\nNo misclassifications found!")
else:
    print(f"\nTotal misclassifications: {len(misclassified_idx)}")
    print(f"Misclassification rate: {len(misclassified_idx)/len(y_true)*100:.2f}%")

    # Show up to five of the misclassified images in a single row.
    num_samples = min(5, len(misclassified_idx))
    # squeeze=False always returns a 2-D grid of axes, so a single panel
    # needs no special handling; row 0 holds the panels.
    fig, axes = plt.subplots(1, num_samples, figsize=(15, 3), squeeze=False)
    axes = axes[0]

    for ax, idx in zip(axes, misclassified_idx[:num_samples]):
        ax.imshow(X_val[idx].reshape(28, 28), cmap='gray')
        ax.set_title(f"True: {y_true[idx]}, Pred: {y_pred[idx]}")
        ax.axis('off')

    fig.suptitle('Misclassified Samples', fontsize=16)
    fig.tight_layout()
    plt.show()

# 11. Make Predictions on Test Set
print("\nMaking predictions on test set...")

# Predict class probabilities for every test image and keep the index of
# the most probable class as the predicted label.
test_predictions = model.predict(X_test, verbose=0)
test_pred_labels = np.argmax(test_predictions, axis=1)

# Visualize some test predictions.
# Preview at most five images, capped at the number of test rows (the same
# guard the misclassification preview uses), so a test set with fewer than
# five rows no longer raises IndexError.
num_preview = min(5, len(X_test))
# squeeze=False keeps the axes 2-D even for a single panel; flatten to 1-D.
fig, axes = plt.subplots(1, num_preview, figsize=(15, 3), squeeze=False)
axes = axes.ravel()

for i in range(num_preview):
    img = X_test[i].reshape(28, 28)
    axes[i].imshow(img, cmap='gray')
    axes[i].set_title(f"Predicted: {test_pred_labels[i]}")
    axes[i].axis('off')

plt.suptitle('Test Set Predictions', fontsize=16)
plt.tight_layout()
plt.show()

# 12. Create Submission File
# Pair every predicted label with a 1-based ImageId and write the table to
# submission.csv without the DataFrame index.
image_ids = range(1, len(test_pred_labels) + 1)
submission = pd.DataFrame({'ImageId': image_ids, 'Label': test_pred_labels})

submission.to_csv('submission.csv', index=False)
print("\nSubmission file created: submission.csv")
print("First few predictions:")
print(submission.head())

# Closing banner
banner = "=" * 50
print("\n" + banner)
print("PROJECT COMPLETED SUCCESSFULLY!")
print(banner)