#### PROBLEM 1: Supervised Classification Libraries: Regression, Decision Tree
6 Runs of Supervised Training / Testing : 3 datasets (MNIST, Spambase, 20NG) x 2 Classification Algorithms (L2-reg Logistic Regression, Decision Trees). You can use a library for the classification algorithms, and also can use any library/script to process data in appropriate formats.
You are required to explain/analyze the model trained in terms of features : for each of the 6 runs list the top F=30 features. For the Regression these correspond to the highest-absolute-value F coefficients; for Decision Tree they are the first F splits. In particular for Decision Tree on 20NG, report performance for two tree sizes ( by depths of the tree, or number of leaves, or number of splits )

MNIST

In [8]:
import numpy as np
import keras
import random
from keras.datasets import mnist
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

(train_images, train_labels), (test_images, test_labels) = keras.datasets.mnist.load_data()

#reshaping images
train_images = np.reshape(train_images, (-1, 784))
test_images = np.reshape(test_images, (-1, 784))

# normalize
train_images = train_images.astype('float32') / 255
test_images = test_images.astype('float32') / 255

random_sample_indices = random.sample(range(train_images.shape[0]), 20000)
train_images_25 = train_images[random_sample_indices]
train_labels_25 = train_labels[random_sample_indices]


train_images_final_80, validation_images_final_10, train_labels_final_80,validation_labels_final_10 = train_test_split(train_images_25, train_labels_25, test_size=0.1, random_state=42)

print("Final train dataset size: ", train_images_final_80.shape)
print("Final validation dataset size: ", validation_images_final_10.shape)


Final train dataset size:  (18000, 784)
Final validation dataset size:  (2000, 784)


In [9]:
model = LogisticRegression(solver='saga', multi_class='multinomial')
model.fit(train_images_final_80, train_labels_final_80)



In [10]:
accuracy = model.score(test_images, test_labels)
print("LR Accuracy MNIST:", accuracy)

f_coeff_ind = abs(model.coef_).argsort()[0][-30:]

f_coeff = model.coef_[0][f_coeff_ind]
print("fcoeff: ", f_coeff)




LR Accuracy MNIST: 0.9156
fcoeff:  [ 0.75757825 -0.76162463 -0.7650207   0.7702444  -0.77875584  0.7834242
  0.81925917  0.8280954   0.8308882   0.8390245  -0.8447779  -0.88116956
 -0.8865749  -0.90081567  0.912313   -0.91808105  0.9280532  -0.9292194
  0.9326742  -0.9451164   0.9489257   0.9928934  -1.036317   -1.0842602
 -1.0938262   1.1030852  -1.1611779  -1.2281928  -1.3089774  -1.4567184 ]


In [11]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()
clf.fit(train_images_final_80, train_labels_final_80)

y_pred = clf.predict(test_images)
accuracy = accuracy_score(test_labels, y_pred)
print("DT Accuracy MNIST:", accuracy)

f_splits = clf.feature_importances_
indices = np.argsort(f_splits)[::-1]

# Get the top 30 feature splits
top_30_indices = indices[:30]
top_30_f_split = f_splits[top_30_indices]

print("T30 F splits: ", top_30_f_split)

DT Accuracy MNIST: 0.8358
T30 F splits:  [0.05772584 0.04452095 0.04447108 0.04031646 0.03390461 0.03366337
 0.03255168 0.03093454 0.02935279 0.01707411 0.01669946 0.0165759
 0.01576505 0.0154793  0.01533729 0.01479319 0.01429434 0.01255257
 0.01145947 0.01087847 0.01083782 0.00944018 0.00942062 0.00838922
 0.00835544 0.00803158 0.00716175 0.00697    0.00669281 0.00606568]


20NG

In [12]:
import numpy as np
import random
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

vectorizer = TfidfVectorizer(stop_words='english')

train_data_vector = vectorizer.fit_transform(newsgroups_train.data)
test_data_vector = vectorizer.fit_transform(newsgroups_test.data)

train_labels = newsgroups_train.target
test_labels = newsgroups_test.target

random_sample_indices = random.sample(range(train_data_vector.shape[0]), 3000)
train_data_20 = train_data_vector[random_sample_indices]
train_labels_20 = train_labels[random_sample_indices]

train_data_final_80, test_data_final_10, train_labels_final_80,test_labels_final_10 = train_test_split(train_data_20, train_labels_20, test_size=0.1, random_state=42)

In [13]:
model2 = LogisticRegression()
model2.fit(train_data_final_80, train_labels_final_80)

In [14]:
test_pred = model2.predict(test_data_final_10)

accuracy = model2.score(test_data_final_10, test_labels_final_10)
print("LR Accuracy 20NG:", accuracy)

f_coeff_ind = abs(model2.coef_).argsort()[0][-30:]

f_coeff = model2.coef_[0][f_coeff_ind]
print("fcoeff: ", f_coeff)

LR Accuracy 20NG: 0.8566666666666667
fcoeff:  [0.91532437 0.92240668 0.92241387 0.92426311 0.9398334  0.94389121
 0.94569611 0.96094287 0.97861929 0.98984247 1.02281557 1.02453323
 1.03358254 1.05005661 1.05920848 1.07873861 1.09826462 1.10974902
 1.19159874 1.24994386 1.28547957 1.40370314 1.40488974 1.4372766
 1.53382311 1.60818404 1.61063752 1.95426715 2.51805292 2.98404774]


In [15]:
clf2 = DecisionTreeClassifier(max_depth=8)
clf2.fit(train_data_final_80, train_labels_final_80)

y_pred = clf2.predict(test_data_final_10)
accuracy = accuracy_score(test_labels_final_10, y_pred)
print("DT Accuracy 20NG:", accuracy)


f_splits = clf2.feature_importances_
indices = np.argsort(f_splits)[::-1]

# Get the top 30 feature splits
top_30_indices = indices[:30]
top_30_f_split = f_splits[top_30_indices]

print("T30 F splits: ", top_30_f_split)

DT Accuracy 20NG: 0.21333333333333335
T30 F splits:  [0.14266684 0.12284707 0.11491116 0.10693462 0.09459648 0.08931108
 0.08661822 0.08086217 0.01484863 0.01062068 0.00945059 0.00863675
 0.0085604  0.0082465  0.00703842 0.00701765 0.00691024 0.00685567
 0.00681183 0.00665764 0.00635587 0.00621798 0.00503261 0.00483204
 0.00441991 0.00394635 0.00363889 0.00363139 0.00362288 0.00359609]


Spambase

In [16]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

dataset_path = 'spambase.data'
df = pd.read_csv(dataset_path, header=None)

X = df.iloc[:, :-1]  # All columns except the last one
y = df.iloc[:, -1]   # Last column

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [18]:
model3 = LogisticRegression()
model3.fit(X_train_scaled, y_train)

In [21]:

from sklearn.metrics import r2_score
y_pred = model3.predict(X_test_scaled)

accuracy = accuracy_score(y_test.to_numpy(), y_pred)
print("LR Accuracy Spambase:", accuracy)


f_coeff_ind = abs(model3.coef_).argsort()[0][-30:]

f_coeff = model3.coef_[0][f_coeff_ind]
print("fcoeff: ", f_coeff)

LR Accuracy Spambase: 0.9196525515743756
fcoeff:  [ 0.27572044 -0.29124024 -0.31191256 -0.39870543  0.40805579  0.41034278
 -0.426562    0.43306716  0.44123747  0.45301971  0.63453786 -0.73160824
 -0.77656561 -0.79440478  0.79749098  0.79904244  0.85784772 -0.8662908
  0.89126107  0.89796792  0.98401877 -0.9911781  -1.01771114 -1.15577101
  1.23395593 -1.28144088 -1.48178192 -1.68677479 -2.79647274 -4.47254885]


In [20]:
clf3 = DecisionTreeClassifier()
clf3.fit(X_train_scaled, y_train)

y_pred = clf3.predict(X_test_scaled)
accuracy = accuracy_score(y_test.to_numpy(), y_pred)
print("DT Accuracy Spambase:", accuracy)

f_splits = clf3.feature_importances_
indices = np.argsort(f_splits)[::-1]

# Get the top 30 feature splits
top_30_indices = indices[:30]
top_30_f_split = f_splits[top_30_indices]

print("T30 F splits: ", top_30_f_split)

DT Accuracy Spambase: 0.9098805646036916
T30 F splits:  [0.33971147 0.15577538 0.08775222 0.06149761 0.05025442 0.03464847
 0.03050636 0.02584688 0.022009   0.01681025 0.01473176 0.01286078
 0.01162578 0.01105047 0.01012196 0.00893628 0.00876501 0.00722972
 0.0062277  0.00620862 0.00585309 0.0056136  0.00534979 0.00519072
 0.00491891 0.00473048 0.00451871 0.00402582 0.00375304 0.00319552]
