<a href="https://colab.research.google.com/github/farheenfathimaa/digit-recognizer/blob/main/digit_recognizer_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# mounting drive
from google.colab import drive
drive.mount("/content/drive/")

Mounted at /content/drive/


In [2]:
# Unzip the uploaded data into Google Drive
#!unzip "/content/drive/MyDrive/digit-recognizer.zip" -d "/content/drive/MyDrive/digit-recognizer"

#Digit recognizer
This notebook contains my code for the competition held on kaggle [link](https://www.kaggle.com/competitions/digit-recognizer/overview)

In [3]:
# Importing libraries
import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_hub as hub

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import random
import datetime
import matplotlib.pyplot as plt
import io
from IPython.display import FileLink

import warnings
warnings.filterwarnings('ignore')

## Exploratory Data Analysis
### Import data

In [4]:
train_data = pd.read_csv("/content/drive/MyDrive/digit-recognizer/train.csv")
test_data = pd.read_csv("/content/drive/MyDrive/digit-recognizer/train.csv")
train_data

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31544,4,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31545,8,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31546,9,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31547,5,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31549 entries, 0 to 31548
Columns: 785 entries, label to pixel783
dtypes: float64(416), int64(369)
memory usage: 188.9 MB


In [6]:
# Shuffle the data
train_data_shuffled = train_data.sample(frac=1, random_state=42)
train_data_shuffled

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
821,2,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22576,9,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6379,8,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23856,8,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3752,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29802,6,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5390,2,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
860,6,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15795,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
train_data.isna().sum()

Unnamed: 0,0
label,0
pixel0,0
pixel1,0
pixel2,0
pixel3,0
...,...
pixel779,1
pixel780,1
pixel781,1
pixel782,1


In [8]:
train_data_shuffled.dropna()
train_data_shuffled

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
821,2,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22576,9,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6379,8,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23856,8,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3752,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29802,6,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5390,2,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
860,6,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15795,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
train_data_shuffled["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,3504
7,3295
3,3250
2,3176
9,3150
6,3100
4,3096
0,3074
8,3046
5,2858


In [10]:
# Create a small list so we can index onto our training labels so they're human-readable
class_names = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
len(class_names)

10

## Preprocessing
### Split into training and validation data

In [30]:
train_data_shuffled.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
821,2,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22576,9,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6379,8,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23856,8,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3752,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# Set target and predictors
X = train_data_shuffled.drop(["label"], axis=1).to_numpy()
y = train_data_shuffled["label"].to_numpy()

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2)

# Check the shapes
X_train.shape, X_val.shape, y_train.shape, y_val.shape

KeyError: '[0] not found in axis'

## Building a multi-class classification model
For our multi-class classification model, we can use a similar architecture to our binary classifiers, however, we're going to have to tweak a few things:

* **Input shape** = 28 x 28 (the shape of one image)
* **Output shape** = 10 (one per class of clothing)
* **Loss function** = `tf.keras.losses.CategoricalCrossentropy()`
  * If your labels are one-hot encoded, use `CategoricalCrossentropy()`
  * If your labels are integer form use `SparseCategoricalCrossentropy()`
* **Output layer activation** = `Softmax` (not sigmoid)

In [23]:
# Set random seed
tf.random.set_seed(42)

# build the model
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(784,)),
    tf.keras.layers.Dense(4, activation="relu"),
    tf.keras.layers.Dense(4, activation="relu"),
    tf.keras.layers.Dense(10, activation="softmax")
])

# compile the model
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(),
              metrics=["accuracy"])

# fit the model
history = model.fit(X_train,
                    y_train,
                    validation_data=(X_val, y_val),
                    epochs=10)

Epoch 1/10
[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.1024 - loss: nan - val_accuracy: 0.0937 - val_loss: nan
Epoch 2/10
[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.0999 - loss: nan - val_accuracy: 0.0937 - val_loss: nan
Epoch 3/10
[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.0999 - loss: nan - val_accuracy: 0.0937 - val_loss: nan
Epoch 4/10
[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.0999 - loss: nan - val_accuracy: 0.0937 - val_loss: nan
Epoch 5/10
[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.0999 - loss: nan - val_accuracy: 0.0937 - val_loss: nan
Epoch 6/10
[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.0999 - loss: nan - val_accuracy: 0.0937 - val_loss: nan
Epoch 7/10
[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

### Evaluate the model

In [24]:
# construct metrics dictionary
def metrics_dict(y_true, y_pred):
  """Gives the accuracy, precision, recall and f1-score"""
  acc = accuracy_score(y_true, y_pred)
  prec = precision_score(y_true, y_pred)
  recall = recall_score(y_true, y_pred)
  f1 = f1_score(y_true, y_pred)

  metrics = {"Accuracy":acc,
             "Precision":prec,
             "Recall":recall,
             "F1 Score":f1}
  return metrics

In [27]:
model.predict(X_val)

[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]], dtype=float32)

In [26]:
# Get predictions
model_preds = model.predict(X_val)

# Compute the metrics
model_metrics = metrics_dict(y_val, model_preds)
model_metrics

[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step


ValueError: Input y_pred contains NaN.

In [29]:
import numpy as np
import tensorflow as tf

# Set random seed
tf.random.set_seed(42)

# Build the model
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(784,)),  # Example input shape, adjust based on your data
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(10, activation='softmax')  # Assuming 10 classes for classification
])

# Compile the model
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),  # Adjust the loss function as needed
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy'])

# Check for NaNs in your data
if np.any(np.isnan(X_train)) or np.any(np.isnan(y_train)):
    print("Training data contains NaNs")
if np.any(np.isnan(X_val)) or np.any(np.isnan(y_val)):
    print("Validation data contains NaNs")

# Fit the model
history = model.fit(X_train,
                    y_train,
                    validation_data=(X_val, y_val),
                    epochs=10)

# Evaluate the model
def metric_dict(y_true, y_pred):
    """
    Given the accuracy, precision, recall and f1-score
    arc = precision_accuracy_score
    rec = recall_score
    f1 = f1_score
    """
    y_pred = np.argmax(y_pred, axis=1)
    acc = tf.keras.metrics.Accuracy()(y_true, y_pred)
    precision = tf.keras.metrics.Precision()(y_true, y_pred)
    recall = tf.keras.metrics.Recall()(y_true, y_pred)
    f1 = 2 * (precision * recall) / (precision + recall)

    return {"accuracy": acc,
            "precision": precision,
            "recall": recall,
            "f1": f1}

# Make predictions
y_pred = model.predict(X_val)

# Compute the metrics
metrics = metric_dict(y_val, y_pred)
print(metrics)

# Check predictions for NaNs
if np.any(np.isnan(y_pred)):
    print("Predictions contain NaNs")

Training data contains NaNs
Epoch 1/10
[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.5696 - loss: nan - val_accuracy: 0.0937 - val_loss: nan
Epoch 2/10
[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.0999 - loss: nan - val_accuracy: 0.0937 - val_loss: nan
Epoch 3/10
[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.0999 - loss: nan - val_accuracy: 0.0937 - val_loss: nan
Epoch 4/10
[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.0999 - loss: nan - val_accuracy: 0.0937 - val_loss: nan
Epoch 5/10
[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.0999 - loss: nan - val_accuracy: 0.0937 - val_loss: nan
Epoch 6/10
[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.0999 - loss: nan - val_accuracy: 0.0937 - val_loss: nan
Epoch 7/10
[1m789/789[0m [32m━━━━