# Gisette Dataset
#####  GISETTE is a handwritten digit recognition problem.
#####  The problem is to separate the highly confusible digits '4' and '9'.

In [18]:
import pandas as pd
import seaborn as sns
import numpy as np
import csv
import sklearn
import time
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

In [19]:
# Accumulates one [model_name, elapsed_seconds, training_score, testing_score]
# row per evaluated model; the rows are written out as CSV further down.
model_times = []


def calculate_time(model_name, start_time, end_time, training_score, testing_score):
    """Record a model's elapsed time and accuracy scores in ``model_times``.

    Args:
        model_name: Label identifying the model in the results table.
        start_time: Timestamp taken just before the timed work started.
        end_time: Timestamp taken right after the timed work finished.
        training_score: Score obtained on the training data.
        testing_score: Score obtained on the testing data.

    The elapsed time is ``end_time - start_time``. The caller's ``end_time``
    is used exactly as given (it is not re-sampled here), so the stored
    duration matches what the caller measured and excludes any scoring or
    printing done between taking ``end_time`` and calling this function.
    """
    elapsed_time = end_time - start_time
    model_times.append([model_name, elapsed_time, training_score, testing_score])

# Working with the whole dataset:

#### Info regarding the data:

In [20]:
# Echo the dataset description file: the first CSV field of every record
# is printed on its own line beneath a short heading.
print(f'Some info on the provided Gisette data:\n')
with open('./data/gisette.param') as param_file:
    param_records = csv.reader(param_file)
    for fields in param_records:
        print(fields[0])

Some info on the provided Gisette data:

Data type: non-sparse
Number of features: 5000
Number of examples and check-sums:
     	Pos_ex	Neg_ex	Tot_ex	Check_sum
Train	 3000	 3000	 6000	3164568508.00
Valid	  500	  500	 1000	535016668.00
Test	 3250	 3250	 6500	3431572010.00
All  	 6750	 6750	13500	7131157186.00


###### The provided dataset has labels available only for the Train and Valid splits, so we train our models on the Train split (with its labels) and then measure their performance on the Valid split (with its labels), which is referred to as the testing dataset in the rest of this notebook.

#### Reading the Training Dataset and cleaning as needed

In [21]:
# Load the space-separated training features (the file has no header row),
# then discard every column that holds nothing but NaN.
# NOTE(review): the all-NaN column is presumably caused by a trailing
# separator at the end of each line - confirm against the raw file.
gisette_train_data = (
    pd.read_csv('./data/gisette_train.data', sep=' ', header=None)
    .dropna(axis='columns', how='all')
)

gisette_train_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,550,0,495,0,0,0,0,976,0,0,...,0,0,0,991,991,0,0,0,0,983
1,0,0,0,0,0,0,0,976,0,0,...,0,475,0,991,0,0,991,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,742,0,0,0,0,684,0,956,...,0,0,0,0,0,0,674,0,0,838
4,0,0,0,0,0,0,0,608,0,979,...,991,0,0,828,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,0,0,0,0,0,0,0,0,0,0,...,991,0,0,0,0,0,783,0,0,0
5996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,921,0,886,0
5997,0,0,0,0,0,758,0,0,0,522,...,0,901,0,0,0,0,980,0,0,0
5998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,690,0,0,0,0,0


#### Reading the Ground-Truth Labels (+1 = positive, -1 = negative) for the Training Dataset

In [22]:
# Load the training labels; the file has no header row, so pandas numbers
# the column itself.
gisette_train_labels = pd.read_csv(
    './data/gisette_train.labels',
    sep=' ',
    header=None,
)

gisette_train_labels

Unnamed: 0,0
0,1
1,-1
2,1
3,1
4,1
...,...
5995,-1
5996,1
5997,-1
5998,-1


#### Reading the Testing Dataset and cleaning as needed

In [23]:
# Load the space-separated validation features (used as the test set in this
# notebook; the file has no header row), then discard every all-NaN column.
# NOTE(review): the all-NaN column is presumably caused by a trailing
# separator at the end of each line - confirm against the raw file.
gisette_test_data = (
    pd.read_csv('./data/gisette_valid.data', sep=' ', header=None)
    .dropna(axis='columns', how='all')
)

gisette_test_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,688,0,0,0,0,0,0,952,0,870,...,0,0,0,0,0,0,494,0,769,0
1,778,758,0,0,0,0,0,708,0,991,...,0,0,0,770,0,0,0,0,0,0
2,469,0,816,0,0,0,0,0,0,0,...,0,0,0,467,0,0,0,417,0,0
3,0,0,0,0,0,571,991,983,0,983,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,949,0,991,...,0,0,0,0,0,0,0,0,976,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,511,0,0,0,0,0,0,0,0,...,991,0,0,852,0,0,0,0,0,0
996,0,0,599,0,0,0,0,910,0,0,...,0,0,0,0,0,0,0,0,0,707
997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,742
998,0,0,0,0,0,0,0,968,0,0,...,0,0,0,773,0,0,0,0,0,0


#### Reading the Ground-Truth Labels (+1 = positive, -1 = negative) for the Testing Dataset

In [24]:
# Load the validation labels (used as test labels in this notebook); the
# file has no header row, so pandas numbers the column itself.
gisette_test_labels = pd.read_csv(
    './data/gisette_valid.labels',
    sep=' ',
    header=None,
)

gisette_test_labels

Unnamed: 0,0
0,1
1,1
2,-1
3,1
4,1
...,...
995,-1
996,1
997,1
998,-1


#### Training a model using Logistic Regression

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Timestamp taken before fitting; the timed span covers training plus
# prediction on both splits.
start = time.time()

# Fit logistic regression (max_iter=5000) on the training features; the
# label frame is flattened to 1-D before being handed to the estimator.
log_reg = LogisticRegression(max_iter=5000).fit(
    gisette_train_data,
    np.ravel(gisette_train_labels),
)

# Predict on the training and testing features while the clock is running.
train_pred = log_reg.predict(gisette_train_data)
test_pred = log_reg.predict(gisette_test_data)

# Stop the clock before any scoring or printing happens.
end = time.time()

# Accuracy expressed as a percentage.
training_score = 100 * accuracy_score(gisette_train_labels, train_pred)
testing_score = 100 * accuracy_score(gisette_test_labels, test_pred)

print('Percentage Accuracy on the Training Data:', training_score)
print('Percentage Accuracy on the Testing Data:', testing_score)
print('Processing Time (in seconds):', end - start)

# Add this run to the shared results table.
calculate_time("log_reg", start, end, training_score, testing_score)

Percentage Accuracy on the Training Data: 100.0
Percentage Accuracy on the Testing Data: 97.8
Processing Time (in seconds): 8.353672742843628


#### Training a model using a linear kernel Support Vector Machine

In [26]:
from sklearn.svm import SVC

# Timestamp taken before fitting; the timed span covers training plus
# prediction on both splits.
start = time.time()

# Fit a support vector classifier with a linear kernel; the label frame is
# flattened to 1-D before being handed to the estimator.
svm_linear = SVC(kernel='linear').fit(
    gisette_train_data,
    np.ravel(gisette_train_labels),
)

# Predict on the training and testing features while the clock is running.
train_pred = svm_linear.predict(gisette_train_data)
test_pred = svm_linear.predict(gisette_test_data)

# Stop the clock before any scoring or printing happens.
end = time.time()

# Accuracy expressed as a percentage.
training_score = 100 * accuracy_score(gisette_train_labels, train_pred)
testing_score = 100 * accuracy_score(gisette_test_labels, test_pred)

print('Percentage Accuracy on the Training Data:', training_score)
print('Percentage Accuracy on the Testing Data:', testing_score)
print('Processing Time (in seconds):', end - start)

# Add this run to the shared results table.
calculate_time("svm_linear", start, end, training_score, testing_score)


Percentage Accuracy on the Training Data: 100.0
Percentage Accuracy on the Testing Data: 97.6
Processing Time (in seconds): 45.07544302940369


#### Training a model using a degree 2 polynomial kernel Support Vector Machine

In [27]:
# Timestamp taken before fitting; the timed span covers training plus
# prediction on both splits.
start = time.time()

# Fit a support vector classifier with a degree-2 polynomial kernel; the
# label frame is flattened to 1-D before being handed to the estimator.
svm_poly_2 = SVC(kernel='poly', degree=2).fit(
    gisette_train_data,
    np.ravel(gisette_train_labels),
)

# Predict on the training and testing features while the clock is running.
train_pred = svm_poly_2.predict(gisette_train_data)
test_pred = svm_poly_2.predict(gisette_test_data)

# Stop the clock before any scoring or printing happens.
end = time.time()

# Accuracy expressed as a percentage.
training_score = 100 * accuracy_score(gisette_train_labels, train_pred)
testing_score = 100 * accuracy_score(gisette_test_labels, test_pred)

print('Percentage Accuracy on the Training Data:', training_score)
print('Percentage Accuracy on the Testing Data:', testing_score)
print('Processing Time (in seconds):', end - start)

# Add this run to the shared results table.
calculate_time("svm_poly_2", start, end, training_score, testing_score)


Percentage Accuracy on the Training Data: 99.8
Percentage Accuracy on the Testing Data: 98.0
Processing Time (in seconds): 75.71789193153381


#### Training a model using a degree 3 polynomial kernel Support Vector Machine

In [28]:
# Timestamp taken before fitting; the timed span covers training plus
# prediction on both splits.
start = time.time()

# Fit a support vector classifier with a degree-3 polynomial kernel; the
# label frame is flattened to 1-D before being handed to the estimator.
svm_poly_3 = SVC(kernel='poly', degree=3).fit(
    gisette_train_data,
    np.ravel(gisette_train_labels),
)

# Predict on the training and testing features while the clock is running.
train_pred = svm_poly_3.predict(gisette_train_data)
test_pred = svm_poly_3.predict(gisette_test_data)

# Stop the clock before any scoring or printing happens.
end = time.time()

# Accuracy expressed as a percentage.
training_score = 100 * accuracy_score(gisette_train_labels, train_pred)
testing_score = 100 * accuracy_score(gisette_test_labels, test_pred)

print('Percentage Accuracy on the Training Data:', training_score)
print('Percentage Accuracy on the Testing Data:', testing_score)
print('Processing Time (in seconds):', end - start)

# Add this run to the shared results table.
calculate_time("svm_poly_3", start, end, training_score, testing_score)


Percentage Accuracy on the Training Data: 99.78333333333333
Percentage Accuracy on the Testing Data: 96.7
Processing Time (in seconds): 292.9827210903168


#### Training a model using a degree 4 polynomial kernel Support Vector Machine

In [29]:
# Timestamp taken before fitting; the timed span covers training plus
# prediction on both splits.
start = time.time()

# Fit a support vector classifier with a degree-4 polynomial kernel; the
# label frame is flattened to 1-D before being handed to the estimator.
svm_poly_4 = SVC(kernel='poly', degree=4).fit(
    gisette_train_data,
    np.ravel(gisette_train_labels),
)

# Predict on the training and testing features while the clock is running.
train_pred = svm_poly_4.predict(gisette_train_data)
test_pred = svm_poly_4.predict(gisette_test_data)

# Stop the clock before any scoring or printing happens.
end = time.time()

# Accuracy expressed as a percentage.
training_score = 100 * accuracy_score(gisette_train_labels, train_pred)
testing_score = 100 * accuracy_score(gisette_test_labels, test_pred)

print('Percentage Accuracy on the Training Data:', training_score)
print('Percentage Accuracy on the Testing Data:', testing_score)
print('Processing Time (in seconds):', end - start)

# Add this run to the shared results table.
calculate_time("svm_poly_4", start, end, training_score, testing_score)


Percentage Accuracy on the Training Data: 99.2
Percentage Accuracy on the Testing Data: 94.1
Processing Time (in seconds): 336.70822286605835


In [30]:
# Write the model_times list to a CSV file
import os

# Create the output directory first so that open() does not fail with
# FileNotFoundError when ./outputs is missing (e.g. on a fresh checkout).
os.makedirs('./outputs', exist_ok=True)

# newline='' leaves line-ending handling to the csv writer.
with open('./outputs/whole_data_model_times.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    # Title row, then the column headers, then one row per recorded model.
    writer.writerow(['whole_data_model_times', ''])
    writer.writerow(['name', 'time (seconds)', 'training score', 'testing score'])
    writer.writerows(model_times)