# MNIST Dataset

In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml, fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import codecs
from dotenv import load_dotenv
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier


load_dotenv()

True

In [2]:
path_to_data = os.environ.get("MNIST_DATAPATH")
mnist_files = os.listdir(path_to_data)
mnist_files = [x for x in mnist_files if x.endswith("ubyte")]


def convert_to_int(byte):
    integer = int(codecs.encode(byte, 'hex'), 16)
    return integer


dataset = {}
for file in mnist_files:
    print("Reading", file)
    with open(path_to_data + file, "rb") as f:
        data = f.read()
        type_of_data = convert_to_int(data[:4])
        length = convert_to_int(data[4:8])
        if type_of_data == 2051:
            category = "images"
            number_of_rows = convert_to_int(data[8:12])
            number_of_columns = convert_to_int(data[12:16])
            parsed = np.frombuffer(data, dtype=np.uint8, offset=16)
            parsed = parsed.reshape(length, number_of_rows, number_of_columns)
        if type_of_data == 2049:
            category = "labels"
            parsed = np.frombuffer(data, dtype=np.uint8, offset=8)
            parsed = parsed.reshape(length)
        if length == 60000:
            set = "train"
        if length == 10000:
            set = "test"
        dataset[set + '_' + category] = parsed

print(dataset["train_images"][0, :, :])

Reading t10k-images-idx3-ubyte
Reading t10k-labels-idx1-ubyte
Reading train-images-idx3-ubyte
Reading train-labels-idx1-ubyte
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   3  18  18  18 126 136
  175  26 166 255 247 127   0   0   0   0]
 [  0   0   0   0   0   0   0   0  30  36  94 154 170 253 253 253 253 253
  225 172 253 242 195  64   0   0   0   0]
 [  0   0   0   0   0   0   0  49 238 253 253 253 253 2

In [4]:
train_images = dataset['train_images']
train_labels = dataset['train_labels']
train_images_flattened = train_images.reshape(60000, -1)
train_images_flattened = train_images_flattened / 255

### L2 Regularization Model

In [5]:
log_reg = LogisticRegression(penalty="l2", solver="lbfgs", max_iter=100, multi_class="multinomial", verbose=1, n_jobs=-1)
log_reg.fit(train_images_flattened, train_labels)

y_pred_log = log_reg.predict(dataset['test_images'].reshape(10000, -1))
log_accuracy = accuracy_score(dataset['test_labels'], y_pred_log)
print(f'Logistic Regression accuracy on MNIST dataset: {log_accuracy:.4f}')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         7850     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.30259D+00    |proj g|=  6.37317D-02


 This problem is unconstrained.



At iterate   50    f=  2.62748D-01    |proj g|=  2.53447D-03

At iterate  100    f=  2.42439D-01    |proj g|=  9.02920D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
 7850    100    107      1     0     0   9.029D-04   2.424D-01
  F =  0.24243853456327233     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
Logistic Regression accuracy on MNIST dataset: 0.8699


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [6]:
top_features_log = np.argsort(np.abs(log_reg.coef_).max(axis=-0))[-30:]
print(f"Top 30 important features for Logistic Regression (pixel indices):", top_features_log)

Top 30 important features for Logistic Regression (pixel indices): [249 465 221  97 459 387 377 348 369 621 434 708 742 332 388 376 404 370
 343 248 473 360 358 501 277 359 333 276 305 304]


### Decision Tree Model

In [7]:
dt_model = DecisionTreeClassifier(max_depth=10, random_state=97)
dt_model.fit(train_images_flattened, train_labels)

y_pred_dt = dt_model.predict(dataset['test_images'].reshape(10000, -1))
dt_accuracy = accuracy_score(dataset['test_labels'], y_pred_dt)
print(f"Decision Tree accuracy on MNSIT dataset: {dt_accuracy:.4f}")

Decision Tree accuracy on MNSIT dataset: 0.8283


In [8]:
used_features = np.where(dt_model.feature_importances_ > 0)[0]

print(f"Number of unique features used in Decision Tree: {len(used_features)}")
print("Feature indices:", used_features)

top_features_dt = np.argsort(dt_model.feature_importances_)[-30:]
print(f"Top 30 important features for Decision Tree (pixel indices): ", top_features_dt)

Number of unique features used in Decision Tree: 328
Feature indices: [ 67  70  73  92  95  96  97  98 100 101 102 103 123 125 126 127 128 129
 132 133 147 148 149 150 152 153 154 155 156 158 159 161 162 164 172 176
 177 178 179 180 182 183 185 186 187 188 190 203 206 207 208 209 210 211
 212 213 214 215 216 217 218 220 228 230 232 234 235 236 237 238 239 240
 241 242 243 244 247 261 262 263 264 265 266 267 268 269 270 271 272 273
 276 286 287 289 290 291 292 293 294 295 296 297 298 299 300 301 314 315
 316 317 318 319 320 321 322 323 324 325 326 328 329 342 343 344 345 346
 347 348 349 350 351 352 353 354 355 356 357 358 360 370 371 372 373 374
 375 376 377 378 379 380 381 382 383 384 385 386 396 400 401 402 403 404
 405 406 407 408 409 410 411 412 413 416 426 427 428 429 430 431 432 433
 434 435 436 437 438 439 440 441 442 443 453 454 455 456 457 458 459 460
 461 462 463 464 465 466 467 468 469 470 473 482 483 484 485 486 487 488
 489 490 491 492 493 494 496 497 498 500 510 512 513 5

# Spambase Dataset

In [11]:
spambase = fetch_openml(name="Spambase", version=1, parser="pandas")
features_sb = spambase.data
labels_sb = spambase.target.astype(int)

print(f"Dataset shape: {len(labels_sb)}")
print("Label distribution:", labels_sb.value_counts())

Dataset shape: 4601
Label distribution: class
0    2788
1    1813
Name: count, dtype: int64


In [12]:
features_sb_train, features_sb_test, labels_sb_train, labels_sb_test = train_test_split(features_sb, labels_sb, test_size=0.2, random_state=97)
print(f"Training set shape: {features_sb_train.shape}")
print(f"Test set shape: {features_sb_test.shape}")

Training set shape: (3680, 57)
Test set shape: (921, 57)


### L2 regularized Logistic Regression

In [13]:
log_reg_sb = LogisticRegression(penalty="l2", solver="lbfgs", max_iter=1000, verbose=1, n_jobs=-1)
log_reg_sb.fit(features_sb_train, labels_sb_train)
y_pred_log_sb = log_reg_sb.predict(features_sb_test)

log_accuracy_sb = accuracy_score(labels_sb_test, y_pred_log_sb)
print(f'Logistic Regression Accuracy on Spambase: {log_accuracy_sb:.4f}')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =           58     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  6.93147D-01    |proj g|=  4.30524D+01

At iterate   50    f=  2.90396D-01    |proj g|=  3.12759D+00

At iterate  100    f=  2.43916D-01    |proj g|=  1.68112D-01

At iterate  150    f=  2.33950D-01    |proj g|=  5.36825D-01

At iterate  200    f=  2.17091D-01    |proj g|=  1.16747D-01

At iterate  250    f=  2.13982D-01    |proj g|=  6.24229D-03

At iterate  300    f=  2.13660D-01    |proj g|=  1.51554D-01

At iterate  350    f=  2.12783D-01    |proj g|=  1.33006D-01

At iterate  400    f=  2.11331D-01    |proj g|=  9.97603D-02

At iterate  450    f=  2.10776D-01    |proj g|=  1.95436D-02

At iterate  500    f=  2.08648D-01    |proj g|=  1.39754D-02

At iterate  550    f=  2.08364D-01    |proj g|=  4.87844D-01

At iterate  600    f=  2.08112D-01    |proj g|=  1.86070D-01

At iterate  650    f=  2.0

 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy on Spambase: 0.9121


In [14]:
top_features_log_sb = np.argsort(np.abs(log_reg_sb.coef_).flatten())[-30:]
print("Top 30 important features for Logistic Regression (feature indices):", top_features_log_sb)

Top 30 important features for Logistic Regression (feature indices): [42 29 16 14  3  5 35 46 44 30 32 38 53  7 25 15 19 48 34 28 47 40 43 45
 24 41  6 22 52 26]


### Decision Tree Model

In [15]:
dt_model_sb = DecisionTreeClassifier(max_depth=10, random_state=42)
dt_model_sb.fit(features_sb_train, labels_sb_train)

y_pred_dt_sb = dt_model_sb.predict(features_sb_test)

dt_accuracy_sb = accuracy_score(y_pred_dt_sb, labels_sb_test)
print(f'Decision Tree Accuracy on Spambase: {dt_accuracy_sb:.4f}')

Decision Tree Accuracy on Spambase: 0.9077


In [16]:
top_features_dt_sb = np.argsort(dt_model_sb.feature_importances_)[-30:]
print("Top 30 important features for Decision Tree (feature indices):", top_features_dt_sb)

Top 30 important features for Decision Tree (feature indices): [32 29 53 25 12 10  9 22  7 11 44 18 27 41 16 36 56 49  4 54 20 45 26 23
 52 24 15 55  6 51]


# 20NG Dataset

In [5]:
categories = ["alt.atheism", "sci.med", "sci.electronics", "comp.graphics", "talk.politics.guns", "sci.crypt"]
newsgroups_train = fetch_20newsgroups(categories=categories, remove=('headers', 'footers', 'quotes'))
X = newsgroups_train.data
y = newsgroups_train.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=97)

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)


# newsgroups_test = fetch_20newsgroups(subset="test", categories=categories, remove=("headers", "footers", "quotes"))
# X_test = newsgroups_test.data
# y_test = newsgroups_test.target

# vectorizer_1 = TfidfVectorizer()

print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

Training set shape: (2712, 35163)
Test set shape: (678, 35163)


In [6]:
logistic_regression = LogisticRegression(penalty="l2", solver="lbfgs", max_iter=1000, multi_class="multinomial", verbose=0, n_jobs=-1)
logistic_regression.fit(X_train, y_train)
y_prediction_logistic_reg = logistic_regression.predict(X_test)
logistic_regression_accuracy = accuracy_score(y_test, y_prediction_logistic_reg)
print(f'Logistic Regression Accuracy on 20 Newsgroups: {logistic_regression_accuracy:.4f}')



Logistic Regression Accuracy on 20 Newsgroups: 0.8304


In [7]:
top_30_features = np.argsort(np.abs(logistic_regression.coef_).sum(axis=0))[-30:]
feature_names = np.array(vectorizer.get_feature_names_out())
top_words_log = feature_names[top_30_features]

print("Top 30 important words for Logistic Regression:", top_words_log)

Top 30 important words for Logistic Regression: ['algorithm' 'he' '3d' 'files' 'use' 'circuit' 'keys' 'they' 'religion'
 'thanks' 'file' 'is' 'people' 'you' 'nsa' 'chip' 'msg' 'weapons' 'image'
 'of' 'guns' 'that' 'god' 'the' 'clipper' 'encryption' 'government' 'key'
 'graphics' 'gun']


### Decision Tree

In [9]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
y_prediction_decision_tree = decision_tree.predict(X_test)

dt_acc = accuracy_score(y_test, y_prediction_decision_tree)
print(f'Decision Tree Accuracy on 20 Newsgroups: {dt_acc:.4f}')

Decision Tree Accuracy on 20 Newsgroups: 0.5855


In [10]:
top_features_decision_tree = np.argsort(decision_tree.feature_importances_)[-30:]

top_words_decision_tree = feature_names[top_features_decision_tree]

print("Top 30 important words for Decision Tree:", top_words_decision_tree)

Top 30 important words for Decision Tree: ['of' 'treatment' 'electronics' 'morality' 'image' 'it' 'is' 'in' 'and'
 'this' 'that' 'power' 'thanks' 'files' 'msg' 'fbi' 'disease' 'religion'
 'weapons' 'to' 'government' 'doctor' 'clipper' 'circuit' 'gordon' 'god'
 'key' 'graphics' 'gun' 'encryption']


In [15]:
dt_model_small = DecisionTreeClassifier(max_depth=5, random_state=42)
dt_model_small.fit(X_train, y_train)

dt_model_large = DecisionTreeClassifier(max_depth=200, random_state=42)
dt_model_large.fit(X_train, y_train)

y_pred_dt_small = dt_model_small.predict(X_test)
y_pred_dt_large = dt_model_large.predict(X_test)

acc_small = accuracy_score(y_test, y_pred_dt_small)
acc_large = accuracy_score(y_test, y_pred_dt_large)

print(f'Decision Tree Accuracy (max_depth=5): {acc_small:.4f}')
print(f'Decision Tree Accuracy (max_depth=15): {acc_large:.4f}')

Decision Tree Accuracy (max_depth=5): 0.3702
Decision Tree Accuracy (max_depth=15): 0.5855
