In [None]:
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt 
import os
!pip install Pillow
import seaborn as sn

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn import ensemble
import xgboost as xgb

%pip install git+https://github.com/mwalmsley/galaxy_mnist.git
from galaxy_mnist import GalaxyMNIST

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Load both GalaxyMNIST splits from ML_DP/gal_mnist (download=True presumably
# fetches the files when they are missing). The first call leaves `train` at
# the constructor's default; the second requests the other split explicitly
# with train=False.
train_dataset = GalaxyMNIST(root='ML_DP/gal_mnist', download=True)
test_dataset = GalaxyMNIST(root='ML_DP/gal_mnist', download=True, train=False)

# Pull images and labels out of each split. The train=False split is referred
# to as "val" throughout the rest of the notebook.
images_train, y_train = train_dataset.data, train_dataset.targets
images_val, y_val = test_dataset.data, test_dataset.targets

# Flatten every image into a single feature vector: the 4-D array
# (nsamples, n, nx, ny) becomes a 2-D (nsamples, n*nx*ny) table.
print(images_train[0].shape)

nsamples, n, nx, ny = images_train.shape
flat_shape = (nsamples, n * nx * ny)
d2_images_train = images_train.reshape(flat_shape)

# Same flattening for the validation images (sizes re-read from that array).
nsamples, n, nx, ny = images_val.shape
flat_shape = (nsamples, n * nx * ny)
d2_images_val = images_val.reshape(flat_shape)

# ***Gini***

In [None]:
# Baseline run: grow a Gini tree with no cap on its depth to see how large it
# becomes before it stops splitting on its own.
classifier_gini_nomax = tree.DecisionTreeClassifier(
    criterion='gini',
    random_state=0,
).fit(d2_images_train, y_train)

# Size of the fitted tree: total node count on the first line, depth on the second.
n_nodes = classifier_gini_nomax.tree_.node_count
print(n_nodes, classifier_gini_nomax.tree_.max_depth, sep='\n')

In [None]:
# 5-fold cross-validation over max_depth for the Gini tree.
# The upper limit of the sweep is 27 (taken from the unconstrained run above).
# np.arange excludes its stop value, so the stop is 28 to actually include 27.
values = np.arange(3, 28, 3)  # 3, 6, ..., 27
cv_scs = {}
for depth in values:
    clf = tree.DecisionTreeClassifier(criterion='gini', max_depth=depth, random_state=0)
    # One accuracy score per fold (array of length 5).
    cv_scs[depth] = cross_val_score(clf, d2_images_train, y_train, cv=5, n_jobs=-1)

# Plot the mean cross-validated error for each depth, with the fold-to-fold
# standard deviation as error bars. Plotting the raw per-fold arrays would draw
# five unlabeled curves, which makes the best depth hard to read off.
error_score = [1 - fold_scores.mean() for fold_scores in cv_scs.values()]
error_spread = [fold_scores.std() for fold_scores in cv_scs.values()]
plt.errorbar(list(cv_scs.keys()), error_score, yerr=error_spread, marker='o', capsize=3)
plt.xlabel('max_depth')
plt.ylabel('5-fold CV error (1 - accuracy)')
plt.title('Decision tree (gini): cross-validated error vs. depth')
plt.show()

In [None]:
# Finer sweep around max_depth = 9 (the best value, according to the coarse
# sweep in the previous step), again with 5-fold cross-validation.
values_new = [6,7,8,9,10,11]

cv_scs_new = {}
for depth in values_new:
    clf = tree.DecisionTreeClassifier(criterion='gini', max_depth=depth, random_state=0)
    # One accuracy score per fold (array of length 5).
    cv_scs_new[depth] = cross_val_score(clf, d2_images_train, y_train, cv=5, n_jobs=-1)

# Mean cross-validated error per depth, with the fold-to-fold standard
# deviation as error bars, so the best depth can be read directly off the plot.
error_score_new = [1 - fold_scores.mean() for fold_scores in cv_scs_new.values()]
error_spread_new = [fold_scores.std() for fold_scores in cv_scs_new.values()]
plt.errorbar(list(cv_scs_new.keys()), error_score_new, yerr=error_spread_new, marker='o', capsize=3)
plt.xlabel('max_depth')
plt.ylabel('5-fold CV error (1 - accuracy)')
plt.title('Decision tree (gini): cross-validated error near the optimum')
plt.show()

In [None]:
depthmax = 3 # this parameter can be changed to explore different results

# Fit the depth-limited Gini tree on the flattened training images.
classifier_gini = tree.DecisionTreeClassifier(
    criterion='gini',
    max_depth=depthmax,
    random_state=0,
).fit(d2_images_train, y_train)

# Size of the fitted tree: node count on the first line, depth on the second.
n_nodes = classifier_gini.tree_.node_count
print(n_nodes, classifier_gini.tree_.max_depth, sep='\n')

# Predict on the data the tree was fitted on and on the held-out split.
y_pred_train_gini = classifier_gini.predict(d2_images_train)
y_pred_gini = classifier_gini.predict(d2_images_val)

# Report both accuracies with four decimals.
for message, truth, predicted in (
    ('Training-set accuracy score: {0:0.4f}', y_train, y_pred_train_gini),
    ('Model accuracy score with criterion gini index: {0:0.4f}', y_val, y_pred_gini),
):
    print(message.format(accuracy_score(truth, predicted)))

In [None]:
# Confusion matrix of the Gini tree on the held-out split
# (scikit-learn convention: rows = true class, columns = predicted class).
classes = GalaxyMNIST.classes
cf_matrix = confusion_matrix(y_val, y_pred_gini)

# Label both axes with the GalaxyMNIST class names.
class_labels = list(classes)
df_cm = pd.DataFrame(cf_matrix, index=class_labels, columns=class_labels)

# Annotated heatmap of the raw counts, printed without decimals.
plt.figure(figsize=(10, 6))
sn.heatmap(df_cm, annot=True, fmt='.0f')

In [None]:
# Draw the fitted Gini tree. The classifier is already trained at this point,
# so it is handed to plot_tree as-is instead of calling fit() a second time
# (refitting would only repeat the training without changing the plot).
plt.figure(figsize=(15,15)) # size for the following tree plot (still too hard to read)

tree.plot_tree(classifier_gini)
# Render the figure explicitly so the cell does not echo plot_tree's return value.
plt.show()

# ***Entropy***

In [None]:
# Baseline run: grow an entropy-criterion tree with no cap on its depth to see
# how large it becomes before it stops splitting on its own.
classifier_entropy_nomax = tree.DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
).fit(d2_images_train, y_train)

# Size of the fitted tree: total node count on the first line, depth on the second.
n_nodes = classifier_entropy_nomax.tree_.node_count
print(n_nodes, classifier_entropy_nomax.tree_.max_depth, sep='\n')

In [None]:
# 5-fold cross-validation over max_depth for the entropy tree.
# The upper limit of the sweep is 20 (taken from the unconstrained run above).
# np.arange excludes its stop value, so the stop is 21 to actually include 20.
values = np.arange(2, 21, 3)  # 2, 5, ..., 20

cv_scs = {}
for depth in values:
    clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=depth, random_state=0)
    # One accuracy score per fold (array of length 5).
    cv_scs[depth] = cross_val_score(clf, d2_images_train, y_train, cv=5, n_jobs=-1)

# Plot the mean cross-validated error for each depth, with the fold-to-fold
# standard deviation as error bars. Plotting the raw per-fold arrays would draw
# five unlabeled curves, which makes the best depth hard to read off.
error_score = [1 - fold_scores.mean() for fold_scores in cv_scs.values()]
error_spread = [fold_scores.std() for fold_scores in cv_scs.values()]
plt.errorbar(list(cv_scs.keys()), error_score, yerr=error_spread, marker='o', capsize=3)
plt.xlabel('max_depth')
plt.ylabel('5-fold CV error (1 - accuracy)')
plt.title('Decision tree (entropy): cross-validated error vs. depth')
plt.show()

In [None]:
# Finer sweep around max_depth = 8 (the best value, according to the coarse
# sweep in the previous step), again with 5-fold cross-validation.
# cross_val_score is already imported in the notebook's first cell, so it is
# not re-imported here.
values_new = [6,7,8,9,10]
cv_scs_new = {}
for depth in values_new:
    clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=depth, random_state=0)
    # One accuracy score per fold (array of length 5).
    cv_scs_new[depth] = cross_val_score(clf, d2_images_train, y_train, cv=5, n_jobs=-1)

# Mean cross-validated error per depth, with the fold-to-fold standard
# deviation as error bars, so the best depth can be read directly off the plot.
error_score_new = [1 - fold_scores.mean() for fold_scores in cv_scs_new.values()]
error_spread_new = [fold_scores.std() for fold_scores in cv_scs_new.values()]
plt.errorbar(list(cv_scs_new.keys()), error_score_new, yerr=error_spread_new, marker='o', capsize=3)
plt.xlabel('max_depth')
plt.ylabel('5-fold CV error (1 - accuracy)')
plt.title('Decision tree (entropy): cross-validated error near the optimum')
plt.show()

In [None]:
depthmax = 8 # this parameter can be changed to explore different results

# Fit the depth-limited entropy tree on the flattened training images.
classifier_entropy = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_depth=depthmax,
    random_state=0,
).fit(d2_images_train, y_train)

# Size of the fitted tree: node count on the first line, depth on the second.
n_nodes = classifier_entropy.tree_.node_count
print(n_nodes, classifier_entropy.tree_.max_depth, sep='\n')

# Predict on the data the tree was fitted on and on the held-out split.
y_pred_train_entropy = classifier_entropy.predict(d2_images_train)
y_pred_entropy = classifier_entropy.predict(d2_images_val)

# Report both accuracies with four decimals.
for message, truth, predicted in (
    ('Training-set accuracy score: {0:0.4f}', y_train, y_pred_train_entropy),
    ('Model accuracy score with criterion entropy index: {0:0.4f}', y_val, y_pred_entropy),
):
    print(message.format(accuracy_score(truth, predicted)))

In [None]:
# Confusion matrix of the entropy tree on the held-out split
# (scikit-learn convention: rows = true class, columns = predicted class).
classes = GalaxyMNIST.classes
cf_matrix = confusion_matrix(y_val, y_pred_entropy)

# Label both axes with the GalaxyMNIST class names.
class_labels = list(classes)
df_cm = pd.DataFrame(cf_matrix, index=class_labels, columns=class_labels)

# Annotated heatmap of the raw counts, printed without decimals.
plt.figure(figsize=(10, 6))
sn.heatmap(df_cm, annot=True, fmt='.0f')

# ***Bagging***

In [None]:
# Bagging ensemble of 15 decision trees capped at depth 9. Each tree is given
# half of the samples and half of the features (max_samples / max_features =
# 0.50); `bootstrap` is left at its default, which is True.
bagging_model = ensemble.BaggingClassifier(
    tree.DecisionTreeClassifier(max_depth=9),
    n_estimators=15,
    max_samples=0.50,
    max_features=0.50,
    random_state=42,
)
# Train the ensemble on the flattened training images.
bagging_model.fit(d2_images_train, y_train)

In [None]:
# Predictions of the bagging ensemble on the training data and on the held-out split.
y_train_predBag = bagging_model.predict(d2_images_train)
y_val_predBag = bagging_model.predict(d2_images_val)

# Report both accuracies with four decimals.
for message, truth, predicted in (
    ('Training-set accuracy score: {0:0.4f}', y_train, y_train_predBag),
    ('Model accuracy score with bagging method: {0:0.4f}', y_val, y_val_predBag),
):
    print(message.format(accuracy_score(truth, predicted)))

In [None]:
# Confusion matrix of the bagging ensemble on the held-out split
# (scikit-learn convention: rows = true class, columns = predicted class).
classes = GalaxyMNIST.classes
cf_matrix = confusion_matrix(y_val, y_val_predBag)

# Label both axes with the GalaxyMNIST class names.
class_labels = list(classes)
df_cm = pd.DataFrame(cf_matrix, index=class_labels, columns=class_labels)

# Annotated heatmap of the raw counts, printed without decimals.
plt.figure(figsize=(10, 6))
sn.heatmap(df_cm, annot=True, fmt='.0f')

# ***XGBoost***

In [None]:
depthmax = 3 # this parameter can be changed to explore different results

# Gradient-boosted trees with the histogram tree method. The training device
# follows the `device` chosen at the top of the notebook ("cuda" when a GPU is
# available, "cpu" otherwise) instead of being hard-coded to CUDA, so the cell
# also runs on machines without a GPU.
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',
    tree_method='hist',
    device=device.type,
    max_depth=depthmax,
    random_state=42,
)
xgb_model.fit(d2_images_train, y_train)

In [None]:
# Predictions of the XGBoost model on the training data and on the held-out split.
y_train_pred = xgb_model.predict(d2_images_train)
y_val_pred = xgb_model.predict(d2_images_val)

# Report both accuracies with four decimals.
for message, truth, predicted in (
    ('Training-set accuracy score: {0:0.4f}', y_train, y_train_pred),
    ('Model accuracy score with XGBoost: {0:0.4f}', y_val, y_val_pred),
):
    print(message.format(accuracy_score(truth, predicted)))

In [None]:
# Confusion matrix of the XGBoost model on the held-out split
# (scikit-learn convention: rows = true class, columns = predicted class).
classes = GalaxyMNIST.classes
cf_matrix = confusion_matrix(y_val, y_val_pred)

# Label both axes with the GalaxyMNIST class names.
class_labels = list(classes)
df_cm = pd.DataFrame(cf_matrix, index=class_labels, columns=class_labels)

# Annotated heatmap of the raw counts, printed without decimals.
plt.figure(figsize=(10, 6))
sn.heatmap(df_cm, annot=True, fmt='.0f')