In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.0-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [2]:
from datasets import load_dataset
# Fetch the "ylecun/mnist" dataset; later cells index the result by split name
# ("train" / "test") and by column ("image" / "label").
data = load_dataset("ylecun/mnist")
# Bare trailing expression so the notebook displays the dataset summary.
data

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/6.97k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/15.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/2.60M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/60000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 60000
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 10000
    })
})

In [3]:
# Preprocessing of the data

import numpy as np

# Materialise both splits as NumPy arrays: the images and their digit labels.
train_images, train_labels = (np.array(data["train"][column]) for column in ("image", "label"))
test_images, y_test = (np.array(data["test"][column]) for column in ("image", "label"))

# Number of samples per split; used below to keep one row per image.
n1 = len(train_images)
n2 = len(test_images)

# Scale every pixel by 1/255 (presumably the maximum 8-bit intensity, which
# would put the features in [0, 1] -- TODO confirm the image dtype).
train_images_norm = train_images / 255
test_images_norm = test_images / 255

# Flatten each image into a single feature row: (samples, everything else).
train_images_flat = train_images_norm.reshape(n1, -1)
x_test = test_images_norm.reshape(n2, -1)

In [4]:
# Seed NumPy's global RNG so the sampled subset is identical on every run.
np.random.seed(42)

# How many training images are kept for fitting the models.
num_labeled = 10_000

# Draw that many distinct row positions from the flattened training set.
labeled_indices = np.random.choice(train_images_flat.shape[0], size=num_labeled, replace=False)

# Keep only the sampled rows (and their labels) as the training data.
x_train = np.take(train_images_flat, labeled_indices, axis=0)
y_train = np.take(train_labels, labeled_indices, axis=0)

## Vanilla Linear Regression

In [23]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import time

# Fit an ordinary least-squares model on the raw pixel features, timing
# construction + fit only.
start_time = time.time()
lin_reg = LinearRegression().fit(x_train, y_train)
end_time = time.time()

# Wall-clock seconds spent fitting.
training_time_model_1 = end_time - start_time

# The regressor emits real numbers: round to the nearest integer, then clamp
# into the valid digit range 0..9 so the values can be scored as class labels.
y_pred = np.clip(np.round(lin_reg.predict(x_test)).astype(int), 0, 9)

In [24]:
# Score the current `y_pred` against `y_test`. All four metrics are stored as
# percentages; precision, recall and F1 use class-weighted averaging.
(
    accuracy_percentage_model_1,
    precision_percentage_model_1,
    recall_percentage_model_1,
    f1_percentage_model_1,
) = (
    100 * accuracy_score(y_test, y_pred),
    100 * precision_score(y_test, y_pred, average="weighted"),
    100 * recall_score(y_test, y_pred, average="weighted"),
    100 * f1_score(y_test, y_pred, average="weighted"),
)

# One report line per metric, plus the fit time measured in the previous cell.
print(
    "Vanilla Linear Regression Stats:",
    f"Accuracy: {accuracy_percentage_model_1:.2f}%",
    f"Precision: {precision_percentage_model_1:.2f}%",
    f"Recall: {recall_percentage_model_1:.2f}%",
    f"F1 Score: {f1_percentage_model_1:.2f}%",
    f"Training time: {training_time_model_1:.2f}sec",
    sep="\n",
)

Vanilla Linear Regression Stats:
Accuracy: 22.42%
Precision: 28.31%
Recall: 22.42%
F1 Score: 22.95%
Training time: 0.70sec


## Polynomial Regression
This can't be performed directly: each digit has 28 * 28 = 784 input features, so a degree-2 expansion needs on the order of 28 * 28 * 28 * 28 coefficients, which is too computationally expensive for the computer to handle. \
So, we can reduce the dimension to 2 or 3 using PCA or t-SNE and then apply polynomial regression.

In [25]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import time

# Project the flattened images onto 2 principal components so the polynomial
# expansion below stays small. PCA is fitted on the training data only and then
# applied to the test data.
pca = PCA(n_components=2)
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)

# The timer covers the polynomial expansion and the fit, not the PCA step.
start_time = time.time()
degree = 5
poly = PolynomialFeatures(degree=degree)
x_train_poly = poly.fit_transform(x_train_pca)
# Reuse the expansion fitted on the training data: test data must only be
# transformed, never used to (re)fit a preprocessing step.
x_test_poly = poly.transform(x_test_pca)

lin_reg_2 = LinearRegression()
lin_reg_2.fit(x_train_poly, y_train)
end_time = time.time()

# Wall-clock seconds for feature expansion + fit.
training_time_model_2 = end_time - start_time

# Round the real-valued predictions to the nearest integer and clamp them into
# the digit range 0..9 so they can be scored as class labels.
y_pred = lin_reg_2.predict(x_test_poly)
y_pred = np.round(y_pred).astype(int)
y_pred = np.clip(y_pred, 0, 9)

In [26]:
# Score the current `y_pred` against `y_test`. All four metrics are stored as
# percentages; precision, recall and F1 use class-weighted averaging.
(
    accuracy_percentage_model_2,
    precision_percentage_model_2,
    recall_percentage_model_2,
    f1_percentage_model_2,
) = (
    100 * accuracy_score(y_test, y_pred),
    100 * precision_score(y_test, y_pred, average="weighted"),
    100 * recall_score(y_test, y_pred, average="weighted"),
    100 * f1_score(y_test, y_pred, average="weighted"),
)

# One report line per metric, plus the time measured in the previous cell.
print(
    "Polynomial Regression Stats:",
    f"Accuracy: {accuracy_percentage_model_2:.2f}%",
    f"Precision: {precision_percentage_model_2:.2f}%",
    f"Recall: {recall_percentage_model_2:.2f}%",
    f"F1 Score: {f1_percentage_model_2:.2f}%",
    f"Training time: {training_time_model_2:.2f}sec",
    sep="\n",
)

Polynomial Regression Stats:
Accuracy: 19.96%
Precision: 24.59%
Recall: 19.96%
F1 Score: 19.82%
Training time: 0.12sec


## Gaussian Basis Function Regression

In [27]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import time

# Collapse every image to a single coordinate (one principal component); the
# projection is fitted on the training data and reused for the test data.
pca = PCA(n_components=1)
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)

# The timer covers building the basis features and fitting the model.
start_time = time.time()
degree = 50   # number of Gaussian basis functions
sigma = 2.0   # standard deviation shared by every Gaussian
centers = np.linspace(-10, 10, degree).reshape(-1, 1)  # evenly spaced centres, as a column

# Feature j of a sample x is exp(-(x - c_j)^2 / (2 * sigma^2)). Broadcasting the
# (n, 1) projections against the (1, degree) row of centres yields (n, degree).
x_train_g, x_test_g = (
    np.exp(-np.square(projection - centers.T) / (2 * sigma**2))
    for projection in (x_train_pca, x_test_pca)
)

# Least-squares fit on the Gaussian features.
lin_reg_3 = LinearRegression().fit(x_train_g, y_train)
end_time = time.time()

# Wall-clock seconds for feature construction + fit.
training_time_model_3 = end_time - start_time

# Round to the nearest integer and clamp into the digit range 0..9.
y_pred = np.clip(np.round(lin_reg_3.predict(x_test_g)).astype(int), 0, 9)

In [28]:
# Score the current `y_pred` against `y_test`; every metric is stored as a
# percentage, and precision / recall / F1 use class-weighted averaging.
accuracy_percentage_model_3 = accuracy_score(y_test, y_pred) * 100
# Precision is undefined for any digit class that is never predicted, which
# makes scikit-learn emit a warning. zero_division=0 states explicitly that such
# classes score 0 (the same value the warning default falls back to).
precision_percentage_model_3 = precision_score(y_test, y_pred, average="weighted", zero_division=0) * 100
recall_percentage_model_3 = recall_score(y_test, y_pred, average="weighted") * 100
f1_percentage_model_3 = f1_score(y_test, y_pred, average="weighted") * 100

print("Guassian Feature Regression Stats:")
print(f"Accuracy: {accuracy_percentage_model_3:.2f}%")
print(f"Precision: {precision_percentage_model_3:.2f}%")
print(f"Recall: {recall_percentage_model_3:.2f}%")
print(f"F1 Score: {f1_percentage_model_3:.2f}%")
print(f"Training time: {training_time_model_3:.2f}sec")

Guassian Feature Regression Stats:
Accuracy: 15.66%
Precision: 20.19%
Recall: 15.66%
F1 Score: 14.26%
Training time: 0.04sec


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Fourier Features Regression

In [29]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import time

# Collapse every image to a single coordinate (one principal component); the
# projection is fitted on the training data and reused for the test data.
pca = PCA(n_components=1)
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)

# The timer covers building the Fourier features and fitting the model.
start_time = time.time()
num_terms = 50  # number of sine terms and, separately, of cosine terms

# Feature layout per sample x: sin(1x) ... sin(num_terms * x), followed by
# cos(1x) ... cos(num_terms * x), i.e. 2 * num_terms columns in total.
x_train_fourier, x_test_fourier = (
    np.hstack([wave(k * projection) for wave in (np.sin, np.cos) for k in range(1, num_terms + 1)])
    for projection in (x_train_pca, x_test_pca)
)

# Least-squares fit on the Fourier features.
lin_reg_4 = LinearRegression().fit(x_train_fourier, y_train)
end_time = time.time()

# Wall-clock seconds for feature construction + fit.
training_time_model_4 = end_time - start_time

# Round to the nearest integer and clamp into the digit range 0..9.
y_pred = np.clip(np.round(lin_reg_4.predict(x_test_fourier)).astype(int), 0, 9)

In [30]:
# Score the current `y_pred` against `y_test`; every metric is stored as a
# percentage, and precision / recall / F1 use class-weighted averaging.
accuracy_percentage_model_4 = accuracy_score(y_test, y_pred) * 100
# Precision is undefined for any digit class that is never predicted, which
# makes scikit-learn emit a warning. zero_division=0 states explicitly that such
# classes score 0 (the same value the warning default falls back to).
precision_percentage_model_4 = precision_score(y_test, y_pred, average="weighted", zero_division=0) * 100
recall_percentage_model_4 = recall_score(y_test, y_pred, average="weighted") * 100
f1_percentage_model_4 = f1_score(y_test, y_pred, average="weighted") * 100

print("Fourier Feature Regression Stats:")
print(f"Accuracy: {accuracy_percentage_model_4:.2f}%")
print(f"Precision: {precision_percentage_model_4:.2f}%")
print(f"Recall: {recall_percentage_model_4:.2f}%")
print(f"F1 Score: {f1_percentage_model_4:.2f}%")
print(f"Training time: {training_time_model_4:.2f}sec")

Fourier Feature Regression Stats:
Accuracy: 8.34%
Precision: 3.42%
Recall: 8.34%
F1 Score: 3.82%
Training time: 0.16sec


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Conclusion
Linear regression gives poor results on this kind of data: the data is non-linear and the classes are not well separated. On top of that, using PCA for dimensionality reduction makes the samples too similar to one another, which reduces the accuracy further.

## Logistic Regression

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import time

# Fit scikit-learn's logistic regression on the raw pixel features, allowing up
# to 2000 solver iterations. The timer covers construction + fit only.
start_time = time.time()
log_reg = LogisticRegression(max_iter=2000).fit(x_train, y_train)
end_time = time.time()
training_time_log_reg = end_time - start_time

# Class labels predicted for the held-out test images.
y_pred = log_reg.predict(x_test)

In [32]:
# Score the current `y_pred` against `y_test`. All four metrics are stored as
# percentages; precision, recall and F1 use class-weighted averaging.
(
    accuracy_percentage_log_reg,
    precision_percentage_log_reg,
    recall_percentage_log_reg,
    f1_percentage_log_reg,
) = (
    100 * accuracy_score(y_test, y_pred),
    100 * precision_score(y_test, y_pred, average="weighted"),
    100 * recall_score(y_test, y_pred, average="weighted"),
    100 * f1_score(y_test, y_pred, average="weighted"),
)

# One report line per metric, plus the fit time measured in the previous cell.
print(
    "Logistic Regression Stats:",
    f"Accuracy: {accuracy_percentage_log_reg:.2f}%",
    f"Precision: {precision_percentage_log_reg:.2f}%",
    f"Recall: {recall_percentage_log_reg:.2f}%",
    f"F1 Score: {f1_percentage_log_reg:.2f}%",
    f"Training time: {training_time_log_reg:.2f}sec",
    sep="\n",
)

Logistic Regression Stats:
Accuracy: 90.83%
Precision: 90.82%
Recall: 90.83%
F1 Score: 90.81%
Training time: 4.43sec


### Implementing logistic regression without using the built-in function.
### We compute the logits and class probabilities, from which we compute the loss and update W and b.

In [33]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time

# Turn the integer labels into dense one-hot rows.
encoder = OneHotEncoder(sparse_output=False)
y_train_onehot = encoder.fit_transform(y_train.reshape(-1, 1))

# Parameters: one weight column and one bias entry per class (10 classes).
# Weights start as small Gaussian noise drawn from NumPy's global RNG.
num_features = x_train.shape[1]
W = 0.01 * np.random.randn(num_features, 10)
b = np.zeros((1, 10))

# Gradient-descent settings.
learning_rate = 0.1
epochs = 500

# Full-batch gradient descent on the softmax cross-entropy objective.
start_time = time.time()
for epoch in range(epochs):
    # Forward pass: Z = XW + b
    logits = np.dot(x_train, W) + b

    # Softmax; subtracting each row's maximum first keeps exp() from overflowing.
    exp_logits = np.exp(logits - logits.max(axis=1, keepdims=True))
    probs = exp_logits / exp_logits.sum(axis=1, keepdims=True)

    # Mean cross-entropy (1e-8 guards against log(0)). The value is only
    # recorded; the updates below do not read it.
    loss = -(y_train_onehot * np.log(probs + 1e-8)).sum(axis=1).mean()

    # Gradients of the mean loss with respect to W and b.
    grad_logits = probs - y_train_onehot
    dW = np.dot(x_train.T, grad_logits) / len(x_train)
    db = grad_logits.mean(axis=0, keepdims=True)

    # In-place parameter update.
    W -= learning_rate * dW
    b -= learning_rate * db
end_time = time.time()
training_time_log_reg_2 = end_time - start_time

# Predict the class with the highest logit for every test image.
logits_test = np.dot(x_test, W) + b
y_pred = logits_test.argmax(axis=1)

In [34]:
# Score the current `y_pred` against `y_test`. All four metrics are stored as
# percentages; precision, recall and F1 use class-weighted averaging.
(
    accuracy_percentage_log_reg_2,
    precision_percentage_log_reg_2,
    recall_percentage_log_reg_2,
    f1_percentage_log_reg_2,
) = (
    100 * accuracy_score(y_test, y_pred),
    100 * precision_score(y_test, y_pred, average="weighted"),
    100 * recall_score(y_test, y_pred, average="weighted"),
    100 * f1_score(y_test, y_pred, average="weighted"),
)

# One report line per metric, plus the time measured in the previous cell.
print(
    "Logistic Regression Stats:",
    f"Accuracy: {accuracy_percentage_log_reg_2:.2f}%",
    f"Precision: {precision_percentage_log_reg_2:.2f}%",
    f"Recall: {recall_percentage_log_reg_2:.2f}%",
    f"F1 Score: {f1_percentage_log_reg_2:.2f}%",
    f"Training time: {training_time_log_reg_2:.2f}sec",
    sep="\n",
)

Logistic Regression Stats:
Accuracy: 90.16%
Precision: 90.13%
Recall: 90.16%
F1 Score: 90.12%
Training time: 17.09sec


## Random Forest

In [35]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time
# Fit a 200-tree random forest on the raw pixel features. random_state pins the
# forest's randomness and n_jobs=-1 is scikit-learn's "use all processors"
# setting. The timer covers construction + fit only.
start_time = time.time()
rf_model = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42).fit(x_train, y_train)
end_time = time.time()
training_time_random_forest = end_time - start_time

# Class labels predicted for the held-out test images.
y_pred = rf_model.predict(x_test)

In [36]:
# Score the random forest's `y_pred` against `y_test`; every metric is stored as
# a percentage, and precision / recall / F1 use class-weighted averaging.
accuracy_percentage_random_forest = accuracy_score(y_test, y_pred) * 100
precision_percentage_random_forest = precision_score(y_test, y_pred, average="weighted") * 100
recall_percentage_random_forest = recall_score(y_test, y_pred, average="weighted") * 100
f1_percentage_random_forest = f1_score(y_test, y_pred, average="weighted") * 100

# Heading names the model actually being reported (it previously said
# "Logistic Regression").
print("Random Forest Stats:")
print(f"Accuracy: {accuracy_percentage_random_forest:.2f}%")
print(f"Precision: {precision_percentage_random_forest:.2f}%")
print(f"Recall: {recall_percentage_random_forest:.2f}%")
print(f"F1 Score: {f1_percentage_random_forest:.2f}%")
print(f"Training time: {training_time_random_forest:.2f}sec")

Logistic Regression Stats:
Accuracy: 95.28%
Precision: 95.28%
Recall: 95.28%
F1 Score: 95.27%
Training time: 6.46sec


## K-Nearest Neighbors (KNN)

In [37]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time

# Fit a k-nearest-neighbours classifier with k = 7 on the raw pixel features
# (n_jobs=-1 is scikit-learn's "use all processors" setting). The timer covers
# construction + fit only; prediction is not included.
start_time = time.time()
knn_model = KNeighborsClassifier(n_neighbors=7, n_jobs=-1).fit(x_train, y_train)
end_time = time.time()
training_time_knn = end_time - start_time

# Class labels predicted for the held-out test images.
y_pred = knn_model.predict(x_test)

In [38]:
# Score the KNN classifier's `y_pred` against `y_test`; every metric is stored
# as a percentage, and precision / recall / F1 use class-weighted averaging.
accuracy_percentage_knn = accuracy_score(y_test, y_pred) * 100
precision_percentage_knn = precision_score(y_test, y_pred, average="weighted") * 100
recall_percentage_knn = recall_score(y_test, y_pred, average="weighted") * 100
f1_percentage_knn = f1_score(y_test, y_pred, average="weighted") * 100

# Heading names the model actually being reported (it previously said
# "Logistic Regression").
print("KNN Stats:")
print(f"Accuracy: {accuracy_percentage_knn:.2f}%")
print(f"Precision: {precision_percentage_knn:.2f}%")
print(f"Recall: {recall_percentage_knn:.2f}%")
print(f"F1 Score: {f1_percentage_knn:.2f}%")
print(f"Training time: {training_time_knn:.2f}sec")

Logistic Regression Stats:
Accuracy: 94.57%
Precision: 94.67%
Recall: 94.57%
F1 Score: 94.55%
Training time: 0.01sec


## Neural Networks

In [39]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time
import numpy as np

# Mini-batch size shared by the training and evaluation loaders.
batch_size = 128

# Training split: float32 features and int64 (long) labels, served in shuffled
# mini-batches.
x_train_tensor = torch.tensor(x_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

# Test split: same dtypes, but served in the original order.
x_test_tensor = torch.tensor(x_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)
test_dataset = TensorDataset(x_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

# Define Neural Network Model
class NeuralNet(nn.Module):
    """Fully connected classifier: 784 -> 128 -> 64 -> 32 -> 16 -> 10.

    ReLU follows each of the four hidden layers. The output layer returns raw
    scores; no softmax is applied inside the model.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 128)  # hidden layer 1
        self.fc2 = nn.Linear(128, 64)   # hidden layer 2
        self.fc3 = nn.Linear(64, 32)    # hidden layer 3
        self.fc4 = nn.Linear(32, 16)    # hidden layer 4
        self.fc5 = nn.Linear(16, 10)    # output layer

    def forward(self, x):
        """Return the 10 output scores for a batch whose last dimension is 784."""
        for hidden in (self.fc1, self.fc2, self.fc3, self.fc4):
            x = F.relu(hidden(x))
        return self.fc5(x)

# Seed PyTorch's RNG before anything random happens. Without it the weight
# initialisation and the training loader's shuffle order change on every run,
# so the reported metrics cannot be reproduced.
torch.manual_seed(42)

# Initialize model, loss function, and optimizer.
# The timer starts here, so it also covers model construction and setup.
start_time = time.time()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = NeuralNet().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
epochs = 10
for epoch in range(epochs):
    model.train()
    # Running sum of the batch losses for this epoch; it is only accumulated
    # here and can be inspected after the cell has run.
    total_loss = 0
    for x_batch, y_batch in train_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)

        # Standard step: clear old gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        outputs = model(x_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
end_time = time.time()
training_time_nn = end_time - start_time

# Inference on the test split: eval mode, with gradient tracking disabled.
model.eval()
y_pred = []
with torch.no_grad():
    for x_batch, y_batch in test_loader:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        outputs = model(x_batch)
        # The predicted class is the index of the largest output score per row.
        predicted = outputs.max(dim=1).indices
        y_pred.extend(predicted.cpu().numpy())
# Collect the per-sample predictions into one NumPy array for the metrics cell.
y_pred = np.array(y_pred)

In [40]:
# Score the neural network's `y_pred` against `y_test`; every metric is stored
# as a percentage, and precision / recall / F1 use class-weighted averaging.
accuracy_percentage_nn = accuracy_score(y_test, y_pred) * 100
precision_percentage_nn = precision_score(y_test, y_pred, average="weighted") * 100
recall_percentage_nn = recall_score(y_test, y_pred, average="weighted") * 100
f1_percentage_nn = f1_score(y_test, y_pred, average="weighted") * 100

# Heading names the model actually being reported (it previously said
# "Logistic Regression").
print("Neural Network Stats:")
print(f"Accuracy: {accuracy_percentage_nn:.2f}%")
print(f"Precision: {precision_percentage_nn:.2f}%")
print(f"Recall: {recall_percentage_nn:.2f}%")
print(f"F1 Score: {f1_percentage_nn:.2f}%")
print(f"Training time: {training_time_nn:.2f}sec")

Logistic Regression Stats:
Accuracy: 92.87%
Precision: 92.92%
Recall: 92.87%
F1 Score: 92.86%
Training time: 6.55sec
