In [1]:
!pip install datasets



In [2]:
from datasets import load_dataset
data = load_dataset("ylecun/mnist")
data

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 60000
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 10000
    })
})

In [3]:
# Preprocessing of the data

import numpy as np

train_images = np.array(data["train"]["image"])
train_labels = np.array(data["train"]["label"])

test_images = np.array(data["test"]["image"])
test_labels = np.array(data["test"]["label"])

# Normalization of all teh data
train_images_norm=train_images/255
test_images_norm=test_images/255

n1=train_images.shape[0]
n2=test_images.shape[0]

# Flattening the data to make it 1-D from 2-D
train_images_flat=train_images_norm.reshape([n1,-1])
test_images_flat=test_images_norm.reshape([n2,-1])

In [7]:
# Dataset to be taken
x_train=train_images_flat[0:10000]
y_train=train_labels[0:10000]
x_test=train_images_flat[0:2000]
y_test=test_labels[0:2000]

## Vanilla Linear Regression

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import time

start_time = time.time()
# Training using vanilla Linear Regression Model
lin_reg = LinearRegression()
lin_reg.fit(x_train, y_train)
end_time = time.time()

# Calculating training time
training_time_model_1 = end_time-start_time

# Predicting the value on test and rounding it off to closest integer and cliping it from 0 to 9
y_pred = lin_reg.predict(x_test)
y_pred = np.round(y_pred).astype(int)
y_pred = np.clip(y_pred, 0, 9)

In [9]:
# Finding accuracy, precision, recall and f1 score for the model
accuracy_percentage_model_1=(accuracy_score(y_test,y_pred))*100
precision_percentage_model_1=(precision_score(y_test, y_pred, average="weighted"))*100
recall_percentage_model_1=(recall_score(y_test,y_pred, average="weighted"))*100
f1_percentage_model_1=(f1_score(y_test,y_pred, average="weighted"))*100

print("Vanilla Linear Regression Stats:")
print(f"Accuracy: {accuracy_percentage_model_1:.2f}%")
print(f"Precision: {precision_percentage_model_1:.2f}%")
print(f"Recall: {recall_percentage_model_1:.2f}%")
print(f"F1 Score: {f1_percentage_model_1:.2f}%")
print(f"Training time: {training_time_model_1:.2f}sec")

Vanilla Linear Regression Stats:
Accuracy: 9.45%
Precision: 9.49%
Recall: 9.45%
F1 Score: 9.04%
Training time: 1.12sec


## Polynomial Regression
This can't be performed as each digit has 28 * 28 parameters which make 28 * 28 * 28 * 28 coefficients for degree 2 amking it intensively high for the computer to handle. \
So, we can reduce the dimension of it to 2 or 3 using PCA or tSNE and then apply polynomial regression.

In [51]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import time

# Apply PCA to reduce to 2 dimensions
pca = PCA(n_components=2)
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)

start_time = time.time()
degree = 5
poly = PolynomialFeatures(degree=degree)
x_train_poly = poly.fit_transform(x_train_pca)
x_test_poly = poly.fit_transform(x_test_pca)

lin_reg_2 = LinearRegression()
lin_reg_2.fit(x_train_poly,y_train)
end_time = time.time()

# Calculating training time
training_time_model_2 = end_time-start_time

# Predicting the value on test and rounding it off to closest integer and cliping it from 0 to 9
y_pred = lin_reg_2.predict(x_test_poly)
y_pred = np.round(y_pred).astype(int)
y_pred = np.clip(y_pred, 0, 9)

In [22]:
# Finding accuracy, precision, recall and f1 score for the model
accuracy_percentage_model_2=(accuracy_score(y_test,y_pred))*100
precision_percentage_model_2=(precision_score(y_test, y_pred, average="weighted"))*100
recall_percentage_model_2=(recall_score(y_test,y_pred, average="weighted"))*100
f1_percentage_model_2=(f1_score(y_test,y_pred, average="weighted"))*100

print("Vanilla Linear Regression Stats:")
print(f"Accuracy: {accuracy_percentage_model_2:.2f}%")
print(f"Precision: {precision_percentage_model_2:.2f}%")
print(f"Recall: {recall_percentage_model_2:.2f}%")
print(f"F1 Score: {f1_percentage_model_2:.2f}%")
print(f"Training time: {training_time_model_2:.2f}sec")

Vanilla Linear Regression Stats:
Accuracy: 10.25%
Precision: 9.33%
Recall: 10.25%
F1 Score: 8.74%
Training time: 0.06sec


## Guassian Basis Function Regression

In [38]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import time

# Apply PCA to reduce to 1 dimensions
pca = PCA(n_components=1)
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)

# Creating Gaussian basis functions
start_time = time.time()
degree = 50
centers = np.linspace(-10, 10, degree).reshape(-1, 1)
sigma = 2.0  # Standard deviation of Gaussians

# Compute Gaussian basis functions
x_train_g = np.exp(-((x_train_pca - centers.T) ** 2) / (2 * sigma**2))
x_test_g = np.exp(-((x_test_pca - centers.T) ** 2) / (2 * sigma**2))

# Train Linear Regression model
lin_reg_3 = LinearRegression()
lin_reg_3.fit(x_train_g, y_train)
end_time = time.time()

# Calculating training time
training_time_model_3 = end_time-start_time

# Make predictions
y_pred = lin_reg_3.predict(x_test_g)
y_pred = np.round(y_pred).astype(int)
y_pred = np.clip(y_pred, 0, 9)

In [39]:
# Finding accuracy, precision, recall and f1 score for the model
accuracy_percentage_model_3=(accuracy_score(y_test,y_pred))*100
precision_percentage_model_3=(precision_score(y_test, y_pred, average="weighted"))*100
recall_percentage_model_3=(recall_score(y_test,y_pred, average="weighted"))*100
f1_percentage_model_3=(f1_score(y_test,y_pred, average="weighted"))*100

print("Vanilla Linear Regression Stats:")
print(f"Accuracy: {accuracy_percentage_model_3:.2f}%")
print(f"Precision: {precision_percentage_model_3:.2f}%")
print(f"Recall: {recall_percentage_model_3:.2f}%")
print(f"F1 Score: {f1_percentage_model_3:.2f}%")
print(f"Training time: {training_time_model_3:.2f}sec")

Vanilla Linear Regression Stats:
Accuracy: 9.35%
Precision: 7.01%
Recall: 9.35%
F1 Score: 6.28%
Training time: 0.12sec


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Fourier Features Regreesion

In [49]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import time

# Apply PCA to reduce to 1 dimensions
pca = PCA(n_components=1)
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)

# Creating Fourier Function
start_time = time.time()
num_terms = 50

# Compute Fourier basis features
x_train_fourier = np.hstack([np.sin((i+1) * x_train_pca) for i in range(num_terms)] +
                            [np.cos((i+1) * x_train_pca) for i in range(num_terms)])

x_test_fourier = np.hstack([np.sin((i+1) * x_test_pca) for i in range(num_terms)] +
                           [np.cos((i+1) * x_test_pca) for i in range(num_terms)])

# Train Linear Regression model
lin_reg_4 = LinearRegression()
lin_reg_4.fit(x_train_fourier, y_train)
end_time = time.time()

# Calculating training time
training_time_model_4 = end_time-start_time

# Make predictions
y_pred = lin_reg_4.predict(x_test_fourier)
y_pred = np.round(y_pred).astype(int)
y_pred = np.clip(y_pred, 0, 9)

In [50]:
# Finding accuracy, precision, recall and f1 score for the model
accuracy_percentage_model_4=(accuracy_score(y_test,y_pred))*100
precision_percentage_model_4=(precision_score(y_test, y_pred, average="weighted"))*100
recall_percentage_model_4=(recall_score(y_test,y_pred, average="weighted"))*100
f1_percentage_model_4=(f1_score(y_test,y_pred, average="weighted"))*100

print("Vanilla Linear Regression Stats:")
print(f"Accuracy: {accuracy_percentage_model_4:.2f}%")
print(f"Precision: {precision_percentage_model_4:.2f}%")
print(f"Recall: {recall_percentage_model_4:.2f}%")
print(f"F1 Score: {f1_percentage_model_4:.2f}%")
print(f"Training time: {training_time_model_4:.2f}sec")

Vanilla Linear Regression Stats:
Accuracy: 9.20%
Precision: 5.53%
Recall: 9.20%
F1 Score: 5.07%
Training time: 0.36sec


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Conclusion
Linear Regression will give bad results for this type of data as the data is non linear and less distinction and we are using PCA for dimensionality reduction making the data too similar which reduces the accuracy.