In [1]:
# ==================================== Import Libraries ============================================
import pandas as pd
import numpy as np
from collections import OrderedDict
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from random import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from tqdm import tqdm
import time
from hyperopt import fmin, tpe, hp
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Input, Dense
from keras.models import Model

In [2]:
# Read data of initial dataset
tsv_file = "Raw_common18704genes_antiTNF_normalized.tsv"
df = pd.read_csv(tsv_file, sep="\t")
print(df.head(10))


      Gene    A_Wt.1    A_Wt.2    A_Wt.3    A_Wt.4    A_Wt.5    A_Wt.6  \
0     A1bg  3.070662  3.439500  3.156002  3.090952  2.995979  3.082066   
1     A1cf  4.203059  4.163411  4.090821  4.063431  4.244164  4.304573   
2    A2ld1  5.688889  5.985790  6.008787  7.747365  7.666683  7.691302   
3      A2m  5.643947  6.761445  5.575969  5.979892  5.427408  6.560586   
4  A3galt2  4.848963  4.872971  4.875028  4.532527  4.888920  4.799635   
5   A4galt  7.968874  7.840223  8.045192  7.943682  7.862246  7.870894   
6    A4gnt  4.766360  4.508001  4.703926  4.669750  4.655340  4.617132   
7     Aaas  6.955212  7.195978  6.989937  7.357529  7.634138  7.262624   
8     Aacs  7.917138  8.385624  8.005533  6.794933  7.004142  7.143298   
9    Aadac  3.028571  2.869241  2.999758  2.924827  3.060868  3.189684   

     A_Wt.7    A_Wt.8    A_Wt.9  ...  G_Ther_Cim.1  G_Ther_Cim.2  \
0  3.056484  4.211936  4.176905  ...      2.953014      2.972845   
1  4.019474  4.075409  4.092110  ...      4.29359

In [3]:
# ---------------------------------------- PREPARE & ORGANIZE DATA INTO NEW DATASET --------------------------------
genes = df['Gene'].values
print(genes)

classes = df.columns.to_list()
# Exclude "Gene" class
classes = classes[1:]
print(classes)

# Modify class names 
for i in range(0, len(classes)):
    if '.' in classes[i]:
        parts = classes[i].split('.')
        classes[i] = parts[0]

# Create final dataframe with gene names as column headers
final_df = pd.DataFrame(columns=genes)
for i in range(1, len(classes)+1):
    values = df.iloc[:, i].tolist()
    new_data = {column: [value] for column, value in zip(genes, values)}
    final_df = final_df._append(pd.DataFrame(new_data), ignore_index=True)

# Create class of numeric labels
final_df['label']=classes
final_df['label'] = pd.factorize(final_df['label'])[0]

# Create list of row indices
rows = list(final_df.index)

# Randomly shuffle list of row indices
shuffle(rows)

# Reorder rows and reset index
final_df = final_df.loc[rows].reset_index(drop=True)

# Display first 20 rows of new dataframe
final_df.head(20)


['A1bg' 'A1cf' 'A2ld1' ... 'Zyx' 'Zzef1' 'Zzz3']
['A_Wt.1', 'A_Wt.2', 'A_Wt.3', 'A_Wt.4', 'A_Wt.5', 'A_Wt.6', 'A_Wt.7', 'A_Wt.8', 'A_Wt.9', 'A_Wt.10', 'B_Tg.1', 'B_Tg.2', 'B_Tg.3', 'B_Tg.4', 'B_Tg.5', 'B_Tg.6', 'B_Tg.7', 'B_Tg.8', 'B_Tg.9', 'B_Tg.10', 'B_Tg.11', 'B_Tg.12', 'B_Tg.13', 'C_Proph_Ther_Rem.1', 'C_Proph_Ther_Rem.2', 'C_Proph_Ther_Rem.3', 'D_Ther_Rem.1', 'D_Ther_Rem.2', 'D_Ther_Rem.3', 'D_Ther_Rem.4', 'D_Ther_Rem.5', 'D_Ther_Rem.6', 'D_Ther_Rem.7', 'D_Ther_Rem.8', 'D_Ther_Rem.9', 'D_Ther_Rem.10', 'E_Ther_Hum.1', 'E_Ther_Hum.2', 'E_Ther_Hum.3', 'E_Ther_Hum.4', 'E_Ther_Hum.5', 'E_Ther_Hum.6', 'E_Ther_Hum.7', 'E_Ther_Hum.8', 'E_Ther_Hum.9', 'E_Ther_Hum.10', 'F_Ther_Enb.1', 'F_Ther_Enb.2', 'F_Ther_Enb.3', 'F_Ther_Enb.4', 'F_Ther_Enb.5', 'F_Ther_Enb.6', 'F_Ther_Enb.7', 'F_Ther_Enb.8', 'F_Ther_Enb.9', 'F_Ther_Enb.10', 'G_Ther_Cim.1', 'G_Ther_Cim.2', 'G_Ther_Cim.3', 'G_Ther_Cim.4', 'G_Ther_Cim.5', 'G_Ther_Cim.6', 'G_Ther_Cim.7', 'G_Ther_Cim.8', 'G_Ther_Cim.9', 'G_Ther_Cim.10']


  final_df = final_df._append(pd.DataFrame(new_data), ignore_index=True)


Unnamed: 0,A1bg,A1cf,A2ld1,A2m,A3galt2,A4galt,A4gnt,Aaas,Aacs,Aadac,...,Zwint,Zxda,Zxdb,Zxdc,Zyg11a,Zyg11b,Zyx,Zzef1,Zzz3,label
0,4.32107,4.021391,7.369673,6.121501,5.147899,9.822775,4.682857,6.962096,6.852633,3.196332,...,7.763792,4.622556,6.151181,8.119445,4.354939,8.772263,8.774053,8.267863,8.159677,5
1,2.919909,4.354033,7.474313,6.002996,4.779167,8.321367,4.740331,7.694418,6.988879,3.186709,...,8.125422,6.525314,7.21495,7.679649,4.684149,8.911109,8.567431,8.192386,8.862444,6
2,3.206897,4.073325,7.57164,5.777709,4.939243,7.90905,4.710475,7.804725,7.004807,3.160626,...,8.096005,6.545766,7.083403,7.623476,4.47781,8.999331,8.539705,8.004594,8.658114,4
3,2.972845,4.315601,7.525415,6.544025,4.813948,8.262546,4.765568,7.950718,6.773098,2.998891,...,8.166265,6.605128,7.16998,7.472457,4.534786,8.796447,8.509368,7.969855,8.860479,6
4,4.587729,4.045415,7.357232,4.798315,5.096405,9.446098,4.802614,7.014394,6.723467,3.038146,...,7.911696,4.689726,6.434438,7.93709,4.246788,9.232032,8.47488,8.148693,8.176003,6
5,4.221542,4.0149,8.189553,6.411558,5.145728,9.073336,4.246255,7.951498,7.118635,3.345147,...,8.483164,5.696999,7.139869,8.09463,4.410693,8.927373,8.792674,8.044391,8.235345,4
6,3.232535,4.254009,7.269357,5.429067,4.710475,7.989371,4.81127,7.922743,7.020712,3.072391,...,8.164049,6.594374,7.176467,7.481956,4.080114,8.683916,8.246039,8.113242,8.77952,1
7,4.224174,4.016924,7.612807,6.044607,5.060565,8.913594,3.998422,8.379063,7.132631,3.275255,...,8.502794,5.76263,6.980724,7.866077,4.308622,8.56349,9.262654,8.266389,8.349752,3
8,4.145197,4.089815,7.252869,5.059333,5.060895,9.600157,4.441106,7.277305,6.976342,3.060018,...,7.814398,4.690094,6.861327,7.954723,4.022942,8.830288,8.707521,8.191723,8.009167,6
9,3.156002,4.090821,6.008787,5.575969,4.875028,8.045192,4.703926,6.989937,8.005533,2.999758,...,8.244772,5.653223,7.061028,7.93786,6.376859,9.284346,8.051423,8.702022,9.122607,0


In [4]:
# ------------------------------------- SPLIT DATASET INTO TRAINING & TESTING DATA ---------------------------------------
y=final_df['label']

# Remove label column from dataframe
final_df.drop(['label'], 
              axis=1,  # Removing a column
              inplace=True) # Modification happens in-place --> original dataframe is changed

''' Assign remaining data to x:
x contains gene expression values for all the genes across samples. 
Each row represents a sample / Each column represents a gene.
'''
x = final_df

''' Split data into training and validation sets:
x_train: Training data for the features (gene expression values)
x_val: Validation data for the features (gene expression values)
y_train: Training data for the labels (class labels)
y_val: Validation data for the labels (class labels)
'''
x_train, x_val, y_train, y_val = train_test_split(x, y, 
                                                  test_size=0.3, # 30% of data will be used as testing/validation set
                                                  random_state=10) # Seed for the random number generator -> Makes split reproducible



In [5]:
x_train.head(10)

Unnamed: 0,A1bg,A1cf,A2ld1,A2m,A3galt2,A4galt,A4gnt,Aaas,Aacs,Aadac,...,Zwilch,Zwint,Zxda,Zxdb,Zxdc,Zyg11a,Zyg11b,Zyx,Zzef1,Zzz3
53,4.310112,4.058462,7.664548,6.591627,4.83954,8.732336,4.107264,8.555904,7.29628,3.348826,...,5.628507,8.495026,5.772443,7.070543,7.832033,4.210723,8.501608,8.959582,8.241116,8.230142
10,3.201218,3.724559,5.512228,4.909472,4.936379,7.509391,4.766212,7.561179,7.074273,3.222948,...,6.415518,8.009167,5.489534,7.18707,7.755945,4.289164,8.990053,8.351559,8.598761,9.207717
46,2.734864,3.928354,5.959974,6.163791,4.811149,7.825811,4.943579,7.033396,7.397792,3.105009,...,5.852874,8.101346,5.784241,7.121693,7.926408,5.524635,9.334896,8.358463,8.669588,9.039516
44,3.086808,4.161198,6.092865,5.717453,4.827016,7.701078,4.868709,6.993167,7.97759,2.912311,...,5.98384,8.114938,5.594973,7.158818,7.950397,5.556018,9.380916,7.934088,8.623265,9.006445
35,3.009204,4.073736,7.518545,5.986411,4.804974,7.86743,4.605314,7.722711,6.978855,3.070662,...,6.827716,8.241116,6.633596,7.151413,7.648888,4.553253,8.939947,8.580009,7.951099,8.688412
18,4.35307,4.080114,7.399227,5.857244,4.995655,9.70745,4.420678,7.162189,6.860641,3.199159,...,5.083333,7.526748,4.614155,6.577314,7.946963,3.966876,8.866846,8.764438,8.316751,8.146425
4,4.587729,4.045415,7.357232,4.798315,5.096405,9.446098,4.802614,7.014394,6.723467,3.038146,...,5.39989,7.911696,4.689726,6.434438,7.93709,4.246788,9.232032,8.47488,8.148693,8.176003
31,2.995979,4.244164,7.666683,5.427408,4.88892,7.862246,4.65534,7.634138,7.004142,3.060868,...,6.381893,8.174538,6.567327,7.101713,7.685325,4.293126,9.134554,8.274243,8.046331,8.786375
1,2.919909,4.354033,7.474313,6.002996,4.779167,8.321367,4.740331,7.694418,6.988879,3.186709,...,6.151492,8.125422,6.525314,7.21495,7.679649,4.684149,8.911109,8.567431,8.192386,8.862444
12,4.211936,4.163802,8.161904,6.092865,5.000407,8.641947,4.070842,7.917748,6.99464,3.257637,...,5.41371,8.58268,5.821607,6.95995,8.127945,4.304869,9.01853,9.180087,8.035402,8.269289


In [6]:
''' Gaussian Naive Bayes Classifier
Based on applying Bayes' theorem with the “naive” assumption of independence between every pair of features.
'''
nbc = GaussianNB()

# Train model
nbc.fit(x_train,y_train)

In [7]:
''' Support Vector Classifier
Aims to find the hyperplane in an N-dimensional space that distinctly classifies the data points.
To separate two classes, the SVM finds the hyperplane with the maximum margin,
which is the maximum distance between data points of both classes. 
'''
svc = SVC()

# Train model
svc.fit(x_train, y_train)

In [8]:
''' Desicion Tree Classifier
Uses a decision tree to go from observations about an item to conclusions about the item's target value.
It's a simple flowchart-like structure where each internal node represents a “test” on an attribute,
each branch represents the outcome of the test,and each leaf node represents a class label.
The paths from root to leaf represent classification rules.
'''
tree = DecisionTreeClassifier()

# Train model
tree.fit(x_train, y_train)

In [9]:
''' Random Forest Classifier
Builds multiple decision trees and merges them together to get a more accurate and stable prediction. 
Often more accurate than the prediction of any individual tree,
because the forest corrects for the overfitting of individual trees to their training set.
'''
rf = RandomForestClassifier()

# Train model
rf.fit(x_train, y_train)

In [10]:
# Make predictions using models
y_pred_nbc = nbc.predict(x_val)
y_pred_svc = svc.predict(x_val)
y_pred_tree = tree.predict(x_val)
y_pred_rf = rf.predict(x_val)

In [11]:
# Evaluate predictions and decide which model is best fit for our dataset

''' Calculate F1 score between true labels and labels from model's prediction.
The average=parameter argument uses the value 'micro' to calculate the F1 score by aggregating the contributions of all classes.
[Calculating metrics globally by counting the total true positives, false negatives, and false positives.]
It's useful when you have a multi-class classification problem and want to get an overall F1 score across all classes.
'''
parameter = 'micro'
s1 = f1_score(y_val, y_pred_nbc, average = parameter)

# Calculate accuracy of model prediction
accuracy = accuracy_score(y_val, y_pred_nbc)

print(f"GaussianNB F1 score: {s1} \nGaussianNB accuracy: {accuracy}")

GaussianNB F1 score: 0.3 
GaussianNB accuracy: 0.3


In [12]:
# Calculate F1 & accuracy scores for SVC
s2=f1_score(y_val, y_pred_svc, average=parameter)
accuracy2 = accuracy_score(y_val, y_pred_svc)
print(f"SVC F1 score: {s2} \nSVC accuracy: {accuracy2}")

SVC F1 score: 0.25 
SVC accuracy: 0.25


In [13]:
# Calculate F1 & accuracy scores for Decision Tree Classifier
s3=f1_score(y_val, y_pred_tree, average=parameter)
accuracy3 = accuracy_score(y_val, y_pred_svc)
print(f"Decision Tree F1 score: {s3} \nDecision Tree accuracy: {accuracy3}")

Decision Tree F1 score: 0.5 
Decision Tree accuracy: 0.25


In [14]:
# Calculate F1 & accuracy scores for Random Forest Classifier
s4=f1_score(y_val, y_pred_rf, average=parameter)
accuracy4 = accuracy_score(y_val, y_pred_svc)
print(f"Random Forest F1 score: {s4} \nRandom Forest accuracy: {accuracy4}")

Random Forest F1 score: 0.3 
Random Forest accuracy: 0.25


In [15]:
'''Confusion Matrix Calculation for Random Forest Classifier model predictions:
A confusion matrix is a table used to evaluate the performance of a classification model
by showing how well the predicted values match the actual values.
'''
cnf_matrix = confusion_matrix(y_val, y_pred_rf)
print(cnf_matrix)

[[0 0 0 2 0 0 0]
 [0 4 0 0 0 1 0]
 [1 0 0 0 0 0 0]
 [0 2 0 1 0 0 0]
 [1 1 0 1 0 0 0]
 [0 1 0 1 1 1 1]
 [0 1 0 0 0 0 0]]


In [16]:
'''Because of the small amount of data that we have, if we execute again the above cells, we may get slightly different results.
SOLUTION: run these experiments multiple times to get average results.
'''
# Define function for trials
def average_trainer(trials):
    '''The function calculates the minimum, maximum, and average F1 scores for each classifier based on the predictions made on a validation dataset.
    The F1 score ranges from 0 (worst performance) to 1 (perfect performance).
    '''
    # Initialize values for min variable for each classifier
    min1 = 1
    min2 = 1
    min3 = 1
    min4 = 1
    
    # Initialize values for max variable for each classifier
    max1 = 0
    max2 = 0
    max3 = 0
    max4 = 0

    # Initialize values for avg variable for each classifier
    avg1 = 0
    avg2 = 0
    avg3 = 0
    avg4 = 0

    # Set parameter to 'micro'
    parameter = 'micro'

    for i in tqdm(range(trials), desc="Processing", unit="iteration"):

        # Split dataset to training and validation sets for features and labels
        x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.3, random_state=10)

        # Define & Train Classifiers
        nbc = GaussianNB()
        nbc.fit(x_train,y_train)
        svc = SVC()
        svc.fit(x_train, y_train)
        tree = DecisionTreeClassifier()
        tree.fit(x_train, y_train)
        rf = RandomForestClassifier()
        rf.fit(x_train, y_train)

        # Make predictions using classifiers
        y_pred_nbc = nbc.predict(x_val)
        y_pred_svc = svc.predict(x_val)
        y_pred_tree = tree.predict(x_val)
        y_pred_rf = rf.predict(x_val)

        # Calculate F1 scores for GaussianNB classifier
        s1=f1_score(y_val, y_pred_nbc, average=parameter)
        # Assign f1 score to min or max value for classifier
        if(s1<min1):
            min1=s1
        if(s1>max1):
            max1=s1
        # Add to average of f1 values for classifier
        avg1 += s1


        # Calculate F1 scores for SVM classifier
        s2=f1_score(y_val, y_pred_svc, average=parameter)
        # Assign f1 score to min or max value for classifier
        if(s2<min2):
            min2=s2
        if(s2>max2):
            max2=s2
        # Add to average of f1 values for classifier
        avg2 += s2

        # Calculate F1 scores for Decission Tree classifier
        s3=f1_score(y_val, y_pred_tree, average=parameter)
        # Assign f1 score to min or max value for classifier
        if(s3<min3):
            min3=s3
        if(s3>max3):
            max3=s3
        # Add to average of f1 values for classifier
        avg3 += s3

        # Calculate F1 scores for Random Forest classifier
        s4=f1_score(y_val, y_pred_rf, average=parameter)
        # Assign f1 score to min or max value for classifier
        if(s4<min4):
            min4=s4
        if(s4>max4):
            max4=s4
        # Add to average of f1 values for classifier
        avg4 += s4

    # Fix averages
    avg1=avg1/trials
    avg2=avg2/trials
    avg3=avg3/trials
    avg4=avg4/trials   

    # Print metrics of classifiers' performance
    print(f"GNB: max: {max1}, min: {min1}, average: {avg1}")
    print(f"SVC: max: {max2}, min: {min2}, average: {avg2}")
    print(f"DecisionTree: max: {max3}, min: {min3}, average: {avg3}")
    print(f"RandomForest: max: {max4}, min: {min4}, average: {avg4}")


In [17]:
average_trainer(50)

Processing: 100%|██████████| 50/50 [05:57<00:00,  7.15s/iteration]

GNB: max: 0.3, min: 0.3, average: 0.30000000000000027
SVC: max: 0.25, min: 0.25, average: 0.25
DecisionTree: max: 0.6, min: 0.3, average: 0.4379999999999999
RandomForest: max: 0.5, min: 0.25, average: 0.36200000000000004





In [18]:
# Check how different model parameters change results --> Select the best possible random forest classifier

def objective(params : dict) -> int :
    ''' Function that evaluates the performance of a Random Forest Classifier
    with a specific set of hyperparameters.
    It returns a negative score.
    '''
    # Dictionary of hyperparameters
    n_estimators = params['n_estimators'] # Number of trees in the Random Forest
    max_depth = params['max_depth'] # Maximum depth of each tree
    min_samples_split = params['min_samples_split']  # Minimum number of samples required to split an internal node
    min_samples_leaf = params['min_samples_leaf'] # Minimum numbere of samples required to be at a leaf node

    # Initialize Classifier
    clf = RandomForestClassifier(
        n_estimators = n_estimators,
        max_depth = max_depth, 
        min_samples_split = min_samples_split,
        min_samples_leaf = min_samples_leaf,
        random_state = 42
    )

    # Initialize final_score variable
    final_score = 0
    
    for i in range(0,20):

        # Split data into training and validation sets of features and labels
        x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.3, random_state=10)

        # Fit model
        clf.fit(x_train, y_train)

        # Model prediction of labels
        y_pred = clf.predict(x_val)

        # Calculate F1 scores
        parameter = 'micro'
        score = f1_score(y_val, y_pred, average = parameter)
        final_score += score
    
    final_score = -(final_score / 20) # Negative value because it will be used for the fmin() function which minimizes the score for optimization purposes.

    return final_score


In [19]:
''' Define search space (space dictionary) for possible hyperparameter values the objective() function will use.
hp.choice() function allows for discrete choices within a specific range.
'''
space = {
    'n_estimators': hp.choice('n_estimators', range(10, 101)),
    'max_depth' : hp.choice('max_depth', range(1,21)),
    'min_samples_split' : hp.choice('min_samples_split', range(2,11)),
    'min_samples_leaf' : hp.choice('min_samples_leaf', range(1, 11))
}

''' Hyperparameter Optimization:
The fmin() function aims to minimize the output of the objective() function, 
which is negative, so it will lead to the highest F1 score.
'''
best = fmin(fn = objective, # Function being minimized: Trains a RF using the hyperparameters defined by space and returns a negative F1 score
            space = space,
            algo = tpe.suggest, # Optimization algorithm: Tree-structured Parzen Estimator [a Bayesian optimization algorithm]
            max_evals = 100, # Maximum number of evaluations / trials for the optimization
            ) # fmin will try 100 combinations of hyperparameters before it finishes

print(f"Best hyperparameters: {best}")

100%|██████████| 100/100 [27:18<00:00, 16.38s/trial, best loss: -0.4000000000000001]
Best hyperparameters: {'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 1, 'n_estimators': 6}


In [20]:
''' Try PCA for dimensionality reduction to check if we get better results:
The number of principal components (n_components) is a new hyperparameter that is added to the search space.
The model will search for the best number of components to retain during PCA by exploring values between 2 and 100.
'''
def pca_objective(params):
    n_components = params['n_components']
    n_estimators = params['n_estimators']
    max_depth = params['max_depth']
    min_samples_split = params['min_samples_split']
    min_samples_leaf = params['min_samples_leaf']

    clf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )

    final_score=0

    for i in range(0,20):
      x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.3, random_state=10)

      # Scale training and validation data to normalize features
      scaler = StandardScaler()
      x_train_scaled = scaler.fit_transform(x_train) # Learns scaling parameters from training data and applies them
      x_val_scaled = scaler.transform(x_val) # Only applies learned parameters to validation data (!Model should not learn from validation data)

      pca = PCA(n_components=n_components)
      x_train_pca = pca.fit_transform(x_train_scaled)
      x_val_pca = pca.transform(x_val_scaled)
      clf.fit(x_train_pca, y_train)
      y_pred = clf.predict(x_val_pca)
      parameter='micro'
      score=f1_score(y_val, y_pred, average=parameter)
      final_score=final_score+score
    final_score=final_score/20

    return -final_score

# Define the search space for hyperparameters
space = {
    'n_components': hp.choice('n_components', range(2, 101)),
    'n_estimators': hp.choice('n_estimators', range(10, 101)),
    'max_depth': hp.choice('max_depth', range(1, 21)),
    'min_samples_split': hp.choice('min_samples_split', range(2, 11)),
    'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 11))
}

# Use Tree-structured Parzen Estimator (TPE) as the optimization algorithm
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100)


100%|██████████| 100/100 [26:09<00:00, 15.70s/trial, best loss: -0.45]             


In [21]:
print(f"Best hyperparameters: {best}")

Best hyperparameters: {'max_depth': 4, 'min_samples_leaf': 2, 'min_samples_split': 0, 'n_components': 22, 'n_estimators': 17}


NOTE: We do not get any better results using PCA

In [22]:
# Using Neural Networks 

# Split dataset into training and testing data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=10)

# Split training dataset into training and validation data
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.3, random_state=10)

# Define model
model = Sequential([
    Dense(64, activation = 'relu', input_shape = (18703,)),
    Dense(32, activation = 'relu'),
    Dense(7, activation = 'softmax')
])

# Compile model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [23]:
# Train model
model.fit(x_train, y_train, epochs=100, batch_size=2, validation_data=(x_val, y_val), verbose=2)

Epoch 1/100
16/16 - 8s - 509ms/step - accuracy: 0.0938 - loss: 127.0492 - val_accuracy: 0.1429 - val_loss: 28.6411
Epoch 2/100
16/16 - 1s - 49ms/step - accuracy: 0.0625 - loss: 34.8420 - val_accuracy: 0.2143 - val_loss: 9.3806
Epoch 3/100
16/16 - 1s - 42ms/step - accuracy: 0.1562 - loss: 14.1368 - val_accuracy: 0.2143 - val_loss: 10.3651
Epoch 4/100
16/16 - 1s - 32ms/step - accuracy: 0.1875 - loss: 10.0614 - val_accuracy: 0.1429 - val_loss: 6.0540
Epoch 5/100
16/16 - 1s - 48ms/step - accuracy: 0.2188 - loss: 5.0448 - val_accuracy: 0.1429 - val_loss: 6.8176
Epoch 6/100
16/16 - 1s - 43ms/step - accuracy: 0.0625 - loss: 6.6973 - val_accuracy: 0.2143 - val_loss: 7.1860
Epoch 7/100
16/16 - 1s - 35ms/step - accuracy: 0.2500 - loss: 7.4285 - val_accuracy: 0.3571 - val_loss: 6.1970
Epoch 8/100
16/16 - 1s - 35ms/step - accuracy: 0.0938 - loss: 6.6395 - val_accuracy: 0.2143 - val_loss: 6.5573
Epoch 9/100
16/16 - 1s - 34ms/step - accuracy: 0.1250 - loss: 6.3141 - val_accuracy: 0.1429 - val_loss: 

<keras.src.callbacks.history.History at 0x29250f65490>

In [24]:
# Evaluate model on test set
test_loss, test_accuracy = model.evaluate(x_test, y_test)
print(f"Test Accuracy: {test_accuracy * 100:.2f} %")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 379ms/step - accuracy: 0.2000 - loss: 2.3527
Test Accuracy: 20.00 %


In [25]:
# Use objective() function and PCA for potential optimization of the neural network model 

# Define function that returns a neural network model
def neural_network(input_size, n_hidden, hidden_size):
    # Define input shape as a first layer in the form of an Input(shape) object 
    input_layer = Input(shape = (input_size,))

    # Define architecture of hidden layers as it is defined from the Input(shape) object layer
    hidden_layer = input_layer

    # Define hidden layers
    for _ in range(0, n_hidden):
        hidden_layer = Dense(hidden_size, activation='relu')(hidden_layer)

    # Define output layer
    output_layer = Dense(7, activation = 'softmax')(hidden_layer)

    # Define model
    model = Model(inputs=input_layer, outputs=output_layer)

    return model

In [26]:
def neural_pca_objective(params):
    ''' Optimization function with dimensionality reduction of data with PCA.
    Returns negative test accuracy result of model evaluation for further use in the optimization function.
    '''
    n_components = params['n_components']
    n_hidden = params['n_hidden']
    hidden_size = params['hidden_size']

    # Split dataset into training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=10)

    # Split training dataset into training and validation sets
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.3, random_state=10)

    # Normalize datasets
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(x_train)
    x_val_scaled = scaler.transform(x_val)
    x_test_scaled = scaler.transform(x_test)

    # Perform PCA on training and validation data
    pca = PCA(n_components=n_components)
    x_train_pca = pca.fit_transform(x_train_scaled)
    x_val_pca = pca.transform(x_val_scaled)

    # Call neural_network function to create model 
    clf=neural_network(n_components, n_hidden, hidden_size)

    # Compile model
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Train model
    model.fit(x_train, y_train, epochs=20, batch_size=2, validation_data=(x_val, y_val), verbose=2)

    # Test model's prediction
    test_loss, test_accuracy = model.evaluate(x_test, y_test)

    return -test_accuracy

In [27]:
# Define the search space for hyperparameters
space = {
    'n_components': hp.choice('n_components', range(2, 32)),
    'n_hidden': hp.choice('n_hidden', range(1, 10)),
    'hidden_size': hp.choice('hidden_size', [32, 64, 128, 256, 512, 1024]),
}

# Use Tree-structured Parzen Estimator (TPE) as the optimization algorithm
best = fmin(fn=neural_pca_objective, space=space, algo=tpe.suggest, max_evals=100)

Epoch 1/20                                             

16/16 - 5s - 287ms/step - accuracy: 0.1875 - loss: 4.7214 - val_accuracy: 0.0714 - val_loss: 3.6927

Epoch 2/20                                             

16/16 - 1s - 37ms/step - accuracy: 0.1250 - loss: 4.0582 - val_accuracy: 0.2143 - val_loss: 2.8100

Epoch 3/20                                             

16/16 - 1s - 43ms/step - accuracy: 0.2188 - loss: 2.3580 - val_accuracy: 0.1429 - val_loss: 3.7164

Epoch 4/20                                             

16/16 - 1s - 77ms/step - accuracy: 0.2500 - loss: 2.5986 - val_accuracy: 0.2143 - val_loss: 3.4354

Epoch 5/20                                             

16/16 - 1s - 35ms/step - accuracy: 0.1875 - loss: 2.3727 - val_accuracy: 0.2857 - val_loss: 2.4380

Epoch 6/20                                             

16/16 - 1s - 42ms/step - accuracy: 0.1562 - loss: 2.8887 - val_accuracy: 0.2857 - val_loss: 2.4802

Epoch 7/20                                             



In [28]:
print(f"Best hyperparameters: {best}")

Best hyperparameters: {'hidden_size': 4, 'n_components': 3, 'n_hidden': 3}
