---
<h1 style="text-align:center;"> Fashion MNIST</h1>

---

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import warnings


from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, learning_curve, validation_curve
from sklearn.decomposition import PCA

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
train_data = pd.read_csv('fashion-mnist_train.csv')
test_data = pd.read_csv('fashion-mnist_test.csv')

In [3]:
# split our data to X and y
X_train = train_data.drop("label", axis=1).values
y_train = train_data["label"].values
X_test = test_data.drop("label", axis=1).values
y_test = test_data["label"].values

In [4]:
X_train = X_train.astype('float')
X_test = X_test.astype('float')

#normalising the data
X_train /= 255 
X_test /= 255

---
<h3 style="text-align:center;"> Models</h3>

---

Check some models on normalized data

In [5]:
X_train_train, X_train_val, y_train_train, y_train_val = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train, random_state=9)

In [6]:
# Suppress all warnings
warnings.filterwarnings("ignore")

In [6]:
models = {
    'Naive Bayes': GaussianNB(),
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'SVM': SVC(),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier()
}


In [8]:
def perform_cross_validation(model, X, y, n_folds=5):
    try:
        start_time = time.time()

        cv_scores = cross_val_score(model, X, y, cv=n_folds)

        end_time = time.time()

        elapsed_time = end_time - start_time

        print("Cross-Validation Scores:", cv_scores)
        print("Average CV Score:", cv_scores.mean())
        print(f"Elapsed Time: {elapsed_time} seconds")
    except Exception as e:
        print(f"An error occurred while cross-validating {type(model).__name__}: {e}")

In [10]:
for model_name, model in models.items():
    print(f"=== {model_name} ===")
    perform_cross_validation(model, X_train, y_train, n_folds=5)
    print("================\n")

=== Naive Bayes ===
Cross-Validation Scores: [0.59783333 0.57675    0.60958333 0.59758333 0.58616667]
Average CV Score: 0.5935833333333334
Elapsed Time: 13.132946729660034 seconds

=== Logistic Regression ===
Cross-Validation Scores: [0.85558333 0.85425    0.85825    0.85416667 0.85233333]
Average CV Score: 0.8549166666666667
Elapsed Time: 79.93741965293884 seconds

=== Decision Tree ===
Cross-Validation Scores: [0.79241667 0.79233333 0.79333333 0.79616667 0.79233333]
Average CV Score: 0.7933166666666667
Elapsed Time: 349.01535987854004 seconds

=== SVM ===
Cross-Validation Scores: [0.88775    0.88691667 0.8945     0.88491667 0.88441667]
Average CV Score: 0.8876999999999999
Elapsed Time: 2523.4591178894043 seconds

=== Random Forest ===
Cross-Validation Scores: [0.87908333 0.8815     0.88275    0.87833333 0.88258333]
Average CV Score: 0.88085
Elapsed Time: 850.1695411205292 seconds

=== XGBoost ===
Cross-Validation Scores: [0.90133333 0.9025     0.90425    0.90058333 0.89908333]
Averag

Check some models on redused data (pca)

In [7]:
# start of pca
pca = PCA()
X_train_pca = pca.fit_transform(X_train)

# keep 95% of the variance
target_variance = 0.95
num_components = next(i for i, cum_var in enumerate(pca.explained_variance_ratio_.cumsum()) if cum_var >= target_variance)

# Apply PCA with the selected number of components
pca = PCA(n_components=num_components)


In [8]:
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [9]:
X_train_pca.shape, X_test_pca.shape

((60000, 186), (10000, 186))

In [14]:
for model_name, model in models.items():
    print(f"=== {model_name} ===")
    perform_cross_validation(model, X_train_pca, y_train, n_folds=5)
    print("================\n")

=== Naive Bayes ===
Cross-Validation Scores: [0.74666667 0.74641667 0.74808333 0.74391667 0.75008333]
Average CV Score: 0.7470333333333333
Elapsed Time: 1.8628146648406982 seconds

=== Logistic Regression ===
Cross-Validation Scores: [0.85416667 0.85283333 0.85441667 0.85041667 0.85291667]
Average CV Score: 0.8529500000000001
Elapsed Time: 18.182113647460938 seconds

=== Decision Tree ===
Cross-Validation Scores: [0.75733333 0.75683333 0.76108333 0.75791667 0.75725   ]
Average CV Score: 0.7580833333333332
Elapsed Time: 363.16396045684814 seconds

=== SVM ===
Cross-Validation Scores: [0.89091667 0.89391667 0.89591667 0.88825    0.89133333]
Average CV Score: 0.8920666666666666
Elapsed Time: 975.1259844303131 seconds

=== Random Forest ===
Cross-Validation Scores: [0.86408333 0.86333333 0.86891667 0.85883333 0.86166667]
Average CV Score: 0.8633666666666666
Elapsed Time: 1777.7181723117828 seconds

=== XGBoost ===
Cross-Validation Scores: [0.88083333 0.88066667 0.88666667 0.87883333 0.8805

##### Random forest (fast route) on normal data

In [22]:
rf_params = {
    'n_estimators': [50, 100, 200],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

In [23]:
rf_model = RandomForestClassifier()
random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=rf_params, n_iter=10, cv=5, scoring='accuracy', random_state=9)
random_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)


Best Hyperparameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 20, 'criterion': 'gini', 'bootstrap': False}


In [24]:
start_time = time.time()

best_rf_model = RandomForestClassifier(**best_params)
best_rf_model.fit(X_train, y_train)

end_time = time.time()
elapsed_time = end_time - start_time

best_rf_predictions = best_rf_model.predict(X_test)

print(classification_report(y_test, best_rf_predictions))
print(f"Elapsed Time: {elapsed_time} seconds")

              precision    recall  f1-score   support

           0       0.82      0.86      0.84      1000
           1       0.99      0.97      0.98      1000
           2       0.81      0.81      0.81      1000
           3       0.89      0.93      0.91      1000
           4       0.80      0.87      0.83      1000
           5       0.98      0.95      0.96      1000
           6       0.76      0.63      0.69      1000
           7       0.93      0.94      0.94      1000
           8       0.95      0.98      0.97      1000
           9       0.94      0.95      0.95      1000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

Elapsed Time: 418.7962918281555 seconds


##### SVM (Slow route more accurate) 

**Note to team**
- the 3 models was trained by mistake on the 784 colums. I didnt delete them for reference , and for validation that the PCA dataset will have the same parameters as those models

In [10]:
# SVM parameters
svm_params = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}

In [11]:
start_time = time.time()
# Create SVM model
svm_model = SVC()
random_search_svm = RandomizedSearchCV(estimator=svm_model, param_distributions=svm_params, n_iter=10, cv=5, scoring='accuracy', random_state=9)
random_search_svm.fit(X_train, y_train)

end_time_svm = time.time()
elapsed_time_svm = end_time_svm - start_time

# Get the best hyperparameters for SVM
best_params_svm = random_search_svm.best_params_
print("Best Hyperparameters for SVM:", best_params_svm)

Best Hyperparameters for SVM: {'kernel': 'rbf', 'C': 10}


In [12]:
print(elapsed_time_svm)

24118.504173517227


In [13]:
start_time = time.time()
# Create and train the best SVM model
best_svm_model = SVC(**best_params_svm)
best_svm_model.fit(X_train, y_train)

# Make predictions and evaluate the SVM model
best_svm_predictions = best_svm_model.predict(X_test)

print("Classification Report for SVM:")
print(classification_report(y_test, best_svm_predictions))

# Calculate elapsed time
end_time_svm = time.time()
elapsed_time_svm = end_time_svm - start_time

print(f"Elapsed Time for SVM: {elapsed_time_svm} seconds")

Classification Report for SVM:
              precision    recall  f1-score   support

           0       0.84      0.88      0.86      1000
           1       0.99      0.98      0.99      1000
           2       0.85      0.83      0.84      1000
           3       0.91      0.92      0.91      1000
           4       0.85      0.88      0.86      1000
           5       0.98      0.96      0.97      1000
           6       0.79      0.73      0.76      1000
           7       0.94      0.96      0.95      1000
           8       0.98      0.98      0.98      1000
           9       0.96      0.97      0.96      1000

    accuracy                           0.91     10000
   macro avg       0.91      0.91      0.91     10000
weighted avg       0.91      0.91      0.91     10000

Elapsed Time for SVM: 617.405758857727 seconds


Second try wiht more parameters

In [15]:
# SVM parameters
svm_params = {
    'C': [10],
    'kernel': ['rbf'],
    'degree': [2, 3, 4],
}

In [16]:
start_time = time.time()
# Create SVM model
svm_model = SVC()
random_search_svm = RandomizedSearchCV(estimator=svm_model, param_distributions=svm_params, n_iter=10, cv=5, scoring='accuracy', random_state=9)
random_search_svm.fit(X_train, y_train)

end_time_svm = time.time()
elapsed_time_svm = end_time_svm - start_time

# Get the best hyperparameters for SVM
best_params_svm = random_search_svm.best_params_
print("Best Hyperparameters for SVM:", best_params_svm)
print(elapsed_time_svm)

Best Hyperparameters for SVM: {'kernel': 'rbf', 'degree': 2, 'C': 10}
5732.634997367859


In [17]:
start_time = time.time()
# Create and train the best SVM model
best_svm_model = SVC(**best_params_svm)
best_svm_model.fit(X_train, y_train)

# Make predictions and evaluate the SVM model
best_svm_predictions = best_svm_model.predict(X_test)

print("Classification Report for SVM:")
print(classification_report(y_test, best_svm_predictions))

# Calculate elapsed time
end_time_svm = time.time()
elapsed_time_svm = end_time_svm - start_time

print(f"Elapsed Time for SVM: {elapsed_time_svm} seconds")

Classification Report for SVM:
              precision    recall  f1-score   support

           0       0.84      0.88      0.86      1000
           1       0.99      0.98      0.99      1000
           2       0.85      0.83      0.84      1000
           3       0.91      0.92      0.91      1000
           4       0.85      0.88      0.86      1000
           5       0.98      0.96      0.97      1000
           6       0.79      0.73      0.76      1000
           7       0.94      0.96      0.95      1000
           8       0.98      0.98      0.98      1000
           9       0.96      0.97      0.96      1000

    accuracy                           0.91     10000
   macro avg       0.91      0.91      0.91     10000
weighted avg       0.91      0.91      0.91     10000

Elapsed Time for SVM: 456.52554082870483 seconds


Third try wiht one more parameter

In [18]:
# SVM parameters
svm_params = {
    'C': [10],
    'kernel': ['rbf'],
    'degree': [2],
    'gamma': ['scale', 'auto', 0.1, 1, 10],
}

In [19]:
start_time = time.time()
# Create SVM model
svm_model = SVC()
random_search_svm = RandomizedSearchCV(estimator=svm_model, param_distributions=svm_params, n_iter=10, cv=5, scoring='accuracy', random_state=9)
random_search_svm.fit(X_train, y_train)

end_time_svm = time.time()
elapsed_time_svm = end_time_svm - start_time

# Get the best hyperparameters for SVM
best_params_svm = random_search_svm.best_params_
print("Best Hyperparameters for SVM:", best_params_svm)
print(elapsed_time_svm)

Best Hyperparameters for SVM: {'kernel': 'rbf', 'gamma': 'scale', 'degree': 2, 'C': 10}
77782.5097591877


In [20]:
start_time = time.time()
# Create and train the best SVM model
best_svm_model = SVC(**best_params_svm)
best_svm_model.fit(X_train, y_train)

# Make predictions and evaluate the SVM model
best_svm_predictions = best_svm_model.predict(X_test)

print("Classification Report for SVM:")
print(classification_report(y_test, best_svm_predictions))

# Calculate elapsed time
end_time_svm = time.time()
elapsed_time_svm = end_time_svm - start_time

print(f"Elapsed Time for SVM: {elapsed_time_svm} seconds")

Classification Report for SVM:
              precision    recall  f1-score   support

           0       0.84      0.88      0.86      1000
           1       0.99      0.98      0.99      1000
           2       0.85      0.83      0.84      1000
           3       0.91      0.92      0.91      1000
           4       0.85      0.88      0.86      1000
           5       0.98      0.96      0.97      1000
           6       0.79      0.73      0.76      1000
           7       0.94      0.96      0.95      1000
           8       0.98      0.98      0.98      1000
           9       0.96      0.97      0.96      1000

    accuracy                           0.91     10000
   macro avg       0.91      0.91      0.91     10000
weighted avg       0.91      0.91      0.91     10000

Elapsed Time for SVM: 492.48690605163574 seconds


---

---

Selected model SVM with PCA

In [11]:
# SVM parameters
svm_params = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
    
}

In [12]:
start_time = time.time()
# Create SVM model
svm_model = SVC()
random_search_svm = RandomizedSearchCV(estimator=svm_model, param_distributions=svm_params, n_iter=10, cv=5, scoring='accuracy', random_state=9)
random_search_svm.fit(X_train_pca, y_train)

end_time_svm = time.time()
elapsed_time_svm = end_time_svm - start_time

# Get the best hyperparameters for SVM
best_params_svm = random_search_svm.best_params_
print("Best Hyperparameters for SVM:", best_params_svm)
print(elapsed_time_svm)

Best Hyperparameters for SVM: {'kernel': 'rbf', 'C': 10}
10273.709419488907


In [13]:
start_time = time.time()
# Create and train the best SVM model
svm_model = SVC(kernel= 'rbf', C= 10, probability=True)
svm_model.fit(X_train_pca, y_train)

# Make predictions and evaluate the SVM model
svm_predictions = svm_model.predict(X_test_pca)

print("Classification Report for SVM:")
print(classification_report(y_test, svm_predictions))

# Calculate elapsed time
end_time_svm = time.time()
elapsed_time_svm = end_time_svm - start_time

print(f"Elapsed Time for SVM: {elapsed_time_svm} seconds")

Classification Report for SVM:
              precision    recall  f1-score   support

           0       0.83      0.87      0.85      1000
           1       0.99      0.98      0.99      1000
           2       0.85      0.84      0.85      1000
           3       0.91      0.92      0.92      1000
           4       0.86      0.87      0.86      1000
           5       0.98      0.96      0.97      1000
           6       0.79      0.74      0.76      1000
           7       0.94      0.96      0.95      1000
           8       0.98      0.98      0.98      1000
           9       0.97      0.97      0.97      1000

    accuracy                           0.91     10000
   macro avg       0.91      0.91      0.91     10000
weighted avg       0.91      0.91      0.91     10000

Elapsed Time for SVM: 1088.9808773994446 seconds
