In [9]:
"""
Run once ever

Downloads and extracts the dataset
"""


import urllib.request
import tarfile
import os

mp3file = urllib.request.urlopen("https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz")
with open('tmp','wb') as output:
  output.write(mp3file.read())

file = tarfile.open("tmp")
file.extractall(".")
file.close()
os.remove("tmp")
os.remove("cifar-100-python/file.txt~")

  file.extractall(".")


In [10]:
"""
Imports
"""

import pandas as pd
import matplotlib.pyplot as plt
import threading
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score, mean_squared_error, zero_one_loss

In [11]:
"""
load the dataset into memory
"""

# labels = "coarse"
labels = "fine"

# read in the training data
tmp = pd.read_pickle("./cifar-100-python/train")
tmp.pop("batch_label")
tmp.pop("filenames")
d = tmp.pop("data")

data = pd.DataFrame.from_dict(tmp)
d = pd.DataFrame(d)
data = pd.concat([data, d], axis=1)

X_train = data[(x for x in range(3072))]
y_train = data[labels + "_labels"]


# read in the testing data
tmp = pd.read_pickle("./cifar-100-python/test")
tmp.pop("batch_label")
tmp.pop("filenames")
d = tmp.pop("data")

data = pd.DataFrame.from_dict(tmp)
d = pd.DataFrame(d)
data = pd.concat([data, d], axis=1)

X_test = data[(x for x in range(3072))]
y_test = data[labels + "_labels"]

label_names = pd.read_pickle("./cifar-100-python/meta")[labels + "_label_names"]

In [6]:
"""
K-nearest

Achieved slightly better than random (7% accuracy on coarse labels, 1.28% accuracy on fine labels)
"""

scalar = StandardScaler()

X_train_scaled = scalar.fit_transform(X_train)
X_test_scaled = scalar.fit_transform(X_test)

# Train KNN

neighbors_list = [1,5,10,15,25,33,50]
# knn = KNeighborsRegressor(n_neighbors=150)
# knn.fit(X_train_scaled, y_train)
best_neighbors = 0
best_r2 = 0
for i in neighbors_list:
    knn = KNeighborsRegressor(n_neighbors=i)
    knn.fit(X_train_scaled, y_train)

    y_pred = knn.predict(X_test_scaled)

    r2 = r2_score(y_test, y_pred)
    print(f"Neighbors: {i}, R2: {r2}")
    if r2 > best_r2:
        best_neighbors = i
        best_r2 = r2

Neighbors: 1, R2: -0.547445304530453
Neighbors: 5, R2: -0.040033872187218655
Neighbors: 10, R2: 0.01389046264626459
Neighbors: 15, R2: 0.029376024002400247
Neighbors: 25, R2: 0.04373827686768683
Neighbors: 33, R2: 0.04497737024391146
Neighbors: 50, R2: 0.04431375371137092


In [7]:
# Test KNN

knn = KNeighborsRegressor(n_neighbors=best_neighbors)
knn.fit(X_train_scaled, y_train)

y_pred = knn.predict(X_test_scaled)

# r2 = r2_score(y_test, y_pred)
# print("R2 score:", r2)
mse = mean_squared_error(y_test, y_pred)
print(f"mse: {mse}")
y_pred = y_pred.round()

correct = 0
for i in range(len(y_pred)):
    if(y_pred[i] == y_test[i]):
        correct += 1
print("Accuracy: ", correct/len(y_pred))

# print(y_pred)

mse: 795.7726062442608
Accuracy:  0.0128


In [8]:
"""
Logistic Regression
"""

'\nLogistic Regression\n'

In [9]:
"""
Basic Decision Tree

Did much better than KNN
achieving 17.17% accuracy with depth of 52 on coarse labels
          8.41% accuracy with depth of 103 on fine labels
    That was without a depth limit
    With depth limiting, achieved:
        19.27% with depth of 9 on coarse labels
         9.15% with depth of 14 on fine labels
None of these values will be exactly replicatable because of some randomness in the training process, and I did not use a set seed to get these
"""

tree = DecisionTreeClassifier(max_depth=(9 if labels == "coarse" else 14))
_ = tree.fit(X_train, y_train)
print(f"Depth: {tree.get_depth()}")

Depth: 14


In [10]:
# Try a bunch of tree depths to see when overfitting starts
depths = [x for x in range(1,55)]

def train_tree(depth: int):
    tree = DecisionTreeClassifier(max_depth=depth)
    _ = tree.fit(X_train, y_train)
    # compute metrics
    y_pred = tree.predict(X_test)
    print(f"Depth: {depth}, \
            mse: {mean_squared_error(y_test, y_pred)}, \
            r2: {r2_score(y_test, y_pred)}, \
            accuracy: {1 - zero_one_loss(y_test, y_pred)}")

threads = []
for i in depths:
    threads.append(threading.Thread(target=train_tree, args=(i,)))
    threads[len(threads) - 1].start()
    # limit to 10 threads being used
    # adjust to fit your computer
    if len(threads) > 10:
        threads[0].join()
        threads.pop(0)
for thread in threads:
    thread.join()

Depth: 1,             mse: 926.596,             r2: -0.11202640264026398,             accuracy: 0.017199999999999993
Depth: 2,             mse: 1382.4003,             r2: -0.6590462646264625,             accuracy: 0.027699999999999947
Depth: 3,             mse: 1858.6179,             r2: -1.2305645364536453,             accuracy: 0.042200000000000015
Depth: 4,             mse: 1982.0481,             r2: -1.3786955895589559,             accuracy: 0.05030000000000001
Depth: 5,             mse: 2146.6318,             r2: -1.5762157815781577,             accuracy: 0.06259999999999999
Depth: 6,             mse: 1661.3088,             r2: -0.9937699369936994,             accuracy: 0.0675
Depth: 7,             mse: 1546.5719,             r2: -0.856071887188719,             accuracy: 0.07569999999999999
Depth: 8,             mse: 1549.2119,             r2: -0.8592402040204021,             accuracy: 0.08340000000000003
Depth: 9,             mse: 1559.3482,             r2: -0.8714049804980497,  

In [11]:
# Test decision tree
y_pred = tree.predict(X_test)

r2 = r2_score(y_test, y_pred)
print("R2 score:", r2)
mse = mean_squared_error(y_test, y_pred)
print(f"mse: {mse}")

print("Accuracy:", tree.score(X_test, y_test))

R2 score: -0.8286057005700571
mse: 1523.6857
Accuracy: 0.092


In [36]:
"""
Random Forest

"""
n_classifier_array = [5, 25, 50, 100, 250, 500, 1000]
depth_array = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
best_n = 0
best_d = 0
best_acc = 0

"""for n in n_classifier_array:
    for d in depth_array:
        
        #Prepare random forest
        forest = RandomForestClassifier(n_estimators = n, max_depth = d, random_state = 42)
        forest.fit(X_train, y_train)
        y_pred = forest.predict(X_test)
        accuracy = forest.score(X_test, y_test)
        if accuracy > best_acc:
            best_n = n
            best_d = d
            best_acc = accuracy


"""
#Hard-code solution (for individual testing)
best_n = 250
best_d = 2

#Retain with best n and d
forest = RandomForestClassifier(n_estimators = best_n, max_depth = best_d, random_state = 42)
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)
accuracy = forest.score(X_test, y_test)

#Print metrics
print(f"Best number of estimators: {best_n}")
print(f"Best max depth: {best_d}")
print(f"Accuracy: {accuracy:.3f}")


"""Feature Importance evaluation is buggy
#Prepare for feature importance
forest_importances = forest.feature_importances_
feature_importances_df = pd.DataFrame({"Feature": feature_names, "Importance": forest_importances})

# Sort by importance
feature_importances_df = feature_importances_df.sort_values(by = "Importance", ascending = False)

# Plot the feature importances
plt.figure(figsize=(10, 6))
plt.barh(feature_importances_df["Feature"], feature_importances_df["Importance"], color = "skyblue")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.title("Random Forest Feature Importances")
plt.gca().invert_yaxis()  # Invert y-axis to show most important features at the top
plt.show()
"""

Best number of estimators: 250
Best max depth: 2
Accuracy: 0.037


'Feature Importance evaluation is buggy\n#Prepare for feature importance\nforest_importances = forest.feature_importances_\nfeature_importances_df = pd.DataFrame({"Feature": feature_names, "Importance": forest_importances})\n\n# Sort by importance\nfeature_importances_df = feature_importances_df.sort_values(by = "Importance", ascending = False)\n\n# Plot the feature importances\nplt.figure(figsize=(10, 6))\nplt.barh(feature_importances_df["Feature"], feature_importances_df["Importance"], color = "skyblue")\nplt.xlabel("Importance")\nplt.ylabel("Feature")\nplt.title("Random Forest Feature Importances")\nplt.gca().invert_yaxis()  # Invert y-axis to show most important features at the top\nplt.show()\n'

In [None]:
"""
AdaBoost and XGBoost
"""