In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from pycaret.classification import *
from sklearn.ensemble import RandomForestClassifier
from tune_sklearn import TuneSearchCV

from data_prepare_func import convert_to_array

In [2]:
x,y = convert_to_array("data_train/",size=28)

In [3]:
scaler = StandardScaler()
x = scaler.fit_transform(x)

In [4]:
c = np.cov(x)
c

array([[ 1.62124166e+00,  3.79524355e-01, -3.10492170e-01, ...,
        -1.26921901e-01, -3.14365556e-02, -9.54611695e-02],
       [ 3.79524355e-01,  1.01841765e+00, -1.01058590e-01, ...,
        -7.35167836e-02, -7.55400382e-03, -1.50609188e-03],
       [-3.10492170e-01, -1.01058590e-01,  1.27935792e+00, ...,
        -2.87762279e-02,  1.17776895e-01, -5.27562330e-02],
       ...,
       [-1.26921901e-01, -7.35167836e-02, -2.87762279e-02, ...,
         6.58434127e-01,  7.81557680e-02,  1.44305384e-01],
       [-3.14365556e-02, -7.55400382e-03,  1.17776895e-01, ...,
         7.81557680e-02,  5.86020646e-01,  1.27598651e-03],
       [-9.54611695e-02, -1.50609188e-03, -5.27562330e-02, ...,
         1.44305384e-01,  1.27598651e-03,  4.39831231e-01]])

In [5]:
eigenvalues, eigenvectors = np.linalg.eig(c)
print('Eigen values:\n', eigenvalues)
print('Eigen values Shape:', eigenvalues.shape)
print('Eigen Vector Shape:', eigenvectors.shape)

Eigen values:
 [ 1.02777874e+02+0.00000000e+00j  8.80250774e+01+0.00000000e+00j
  6.64228107e+01+0.00000000e+00j ...  1.73562535e-32+4.59556231e-33j
  1.73562535e-32-4.59556231e-33j -1.01893447e-47+0.00000000e+00j]
Eigen values Shape: (1753,)
Eigen Vector Shape: (1753, 1753)


In [6]:
idx = eigenvalues.argsort()[::-1]
 
# Sort the eigenvalues in descending order
eigenvalues = eigenvalues[idx]
 
# sort the corresponding eigenvectors accordingly
eigenvectors = eigenvectors[:,idx]

In [7]:
explained_var = np.cumsum(eigenvalues) / np.sum(eigenvalues)
explained_var

array([0.06049262+0.00000000e+00j, 0.11230209+0.00000000e+00j,
       0.15139698+0.00000000e+00j, ..., 1.        +1.21117459e-18j,
       1.        +0.00000000e+00j, 1.        +0.00000000e+00j])

In [8]:
n_components = np.argmax(explained_var >= 0.80) + 1
n_components

140

In [9]:
pca = PCA(n_components=0.8)
pca_x = pca.fit_transform(x)

In [10]:
pca_x.shape

(1753, 136)

In [11]:
x_train, x_test, y_train, y_test = train_test_split(pca_x, y, test_size=0.3, random_state=42, stratify=y)

In [12]:
s = setup(x_train, target = y_train, session_id = 10,fold=5,preprocess=False)

In [13]:
best = compare_models()

In [14]:
pull()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.7914,0.9728,0.7914,0.8007,0.7914,0.7682,0.7691,0.42
svm,SVM - Linear Kernel,0.7856,0.0,0.7856,0.7978,0.7863,0.7617,0.7628,0.024
lda,Linear Discriminant Analysis,0.7856,0.972,0.7856,0.7918,0.7839,0.7617,0.7627,0.022
lightgbm,Light Gradient Boosting Machine,0.7832,0.9691,0.7832,0.7887,0.7808,0.7591,0.7603,0.87
rf,Random Forest Classifier,0.7646,0.9594,0.7646,0.7713,0.7629,0.7384,0.7396,0.108
et,Extra Trees Classifier,0.7646,0.9576,0.7646,0.7642,0.7586,0.7384,0.7396,0.078
ridge,Ridge Classifier,0.7599,0.0,0.7599,0.7632,0.755,0.7332,0.7346,0.024
knn,K Neighbors Classifier,0.7459,0.941,0.7459,0.7718,0.7381,0.7176,0.722,0.178
gbc,Gradient Boosting Classifier,0.7133,0.9565,0.7133,0.7275,0.7148,0.6814,0.6828,3.638
nb,Naive Bayes,0.6993,0.9538,0.6993,0.7109,0.6976,0.6658,0.6673,0.02


In [15]:
best.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 1000,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 10,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [16]:
rf = RandomForestClassifier(bootstrap=True,ccp_alpha=0.0,class_weight=None,criterion='gini',max_depth=None,
                            max_features='sqrt',max_leaf_nodes=None,max_samples=None,min_impurity_decrease=0.0,
                            min_samples_leaf=1,min_samples_split=2,min_weight_fraction_leaf=0.0,
                            n_estimators=100,n_jobs=-1,oob_score=False,random_state=10,verbose=0,warm_start=False)

In [17]:
rf.fit(x_train,y_train)

train_yhat = rf.predict(x_train)
train_f1 = f1_score(train_yhat,y_train,average='macro')

test_yhat = rf.predict(x_test)
test_f1 = f1_score(test_yhat,y_test,average='macro')

print('Train Score\n',classification_report(y_train,train_yhat))
print('Test Score\n',classification_report(y_test,test_yhat))
print(f"AVG F1-Score Train: {train_f1}\nAVG F1-Score Test: {test_f1}")

Train Score
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       121
           1       1.00      1.00      1.00       122
           2       1.00      1.00      1.00       124
           3       1.00      1.00      1.00       125
           4       1.00      1.00      1.00       122
           5       1.00      1.00      1.00       122
           6       1.00      1.00      1.00       122
           7       1.00      1.00      1.00       122
           8       1.00      1.00      1.00       122
           9       1.00      1.00      1.00       125

    accuracy                           1.00      1227
   macro avg       1.00      1.00      1.00      1227
weighted avg       1.00      1.00      1.00      1227

Test Score
               precision    recall  f1-score   support

           0       0.89      0.92      0.91        52
           1       0.92      0.87      0.89        53
           2       0.89      0.89      0.89        53

In [18]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

#train the neural network model
model = MLPClassifier(hidden_layer_sizes=(100, ), activation='relu', solver='adam', random_state=42)
model.fit(x_train, y_train)

train_yhat = model.predict(x_train)
train_score = classification_report(y_train, train_yhat)
print("Train Score:\n", train_score)

# Test Score
test_yhat = model.predict(x_test)
test_score = classification_report(y_test, test_yhat)
print("Test Score:\n", test_score)

# F1-Score
train_f1 = f1_score(y_train, train_yhat, average='weighted')
print(f"AVG F1-Score Train: {train_f1}")



Train Score:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       121
           1       1.00      1.00      1.00       122
           2       1.00      1.00      1.00       124
           3       1.00      1.00      1.00       125
           4       1.00      1.00      1.00       122
           5       1.00      1.00      1.00       122
           6       1.00      1.00      1.00       122
           7       1.00      1.00      1.00       122
           8       1.00      1.00      1.00       122
           9       1.00      1.00      1.00       125

    accuracy                           1.00      1227
   macro avg       1.00      1.00      1.00      1227
weighted avg       1.00      1.00      1.00      1227

Test Score:
               precision    recall  f1-score   support

           0       0.91      0.94      0.92        52
           1       0.96      0.89      0.92        53
           2       0.92      0.91      0.91        

In [19]:
#predictions
y_pred = model.predict(x_test)


print("Accuracy_nn")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy_nn
              precision    recall  f1-score   support

           0       0.91      0.94      0.92        52
           1       0.96      0.89      0.92        53
           2       0.92      0.91      0.91        53
           3       0.94      0.94      0.94        53
           4       0.77      0.71      0.74        52
           5       0.90      0.81      0.85        53
           6       0.98      0.90      0.94        52
           7       0.79      0.94      0.86        53
           8       0.73      0.77      0.75        52
           9       0.84      0.89      0.86        53

    accuracy                           0.87       526
   macro avg       0.87      0.87      0.87       526
weighted avg       0.87      0.87      0.87       526

Confusion Matrix:
[[49  1  0  1  0  0  0  0  1  0]
 [ 3 47  0  1  0  0  1  1  0  0]
 [ 0  1 48  0  1  1  0  1  0  1]
 [ 0  0  0 50  0  0  0  1  0  2]
 [ 2  0  0  0 37  3  0  3  7  0]
 [ 0  0  0  0  3 43  0  0  5  2]
 [ 0  0  4  1

In [20]:
s = setup(x_train, target=y_train, session_id=10, fold=5, preprocess=False)

Unnamed: 0,Description,Value
0,Session id,10
1,Target,target
2,Target type,Multiclass
3,Original data shape,"(1227, 137)"
4,Transformed data shape,"(1227, 137)"
5,Transformed train set shape,"(858, 137)"
6,Transformed test set shape,"(369, 137)"
7,Numeric features,136


In [21]:
# Train a neural network model
nn_model = create_model('mlp')

# Tune the neural network model
tuned_nn = tune_model(nn_model)

# Evaluate the tuned model
evaluate_model(tuned_nn)


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7791,0.9691,0.7791,0.787,0.7766,0.7545,0.7558
1,0.7674,0.9625,0.7674,0.7799,0.7641,0.7416,0.7439
2,0.7558,0.9717,0.7558,0.7691,0.7554,0.7287,0.7302
3,0.7661,0.9639,0.7661,0.777,0.7679,0.7401,0.7412
4,0.8012,0.974,0.8012,0.8055,0.8024,0.7791,0.7793
Mean,0.7739,0.9682,0.7739,0.7837,0.7733,0.7488,0.7501
Std,0.0155,0.0044,0.0155,0.0123,0.0161,0.0172,0.0167


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8663,0.983,0.8663,0.8767,0.8671,0.8514,0.8524
1,0.7849,0.98,0.7849,0.7982,0.7767,0.761,0.7634
2,0.8314,0.9845,0.8314,0.8321,0.8299,0.8127,0.8131
3,0.7719,0.9743,0.7719,0.7728,0.7697,0.7466,0.7472
4,0.8655,0.9813,0.8655,0.8753,0.8672,0.8505,0.8512
Mean,0.824,0.9806,0.824,0.831,0.8221,0.8044,0.8055
Std,0.0395,0.0035,0.0395,0.0413,0.0422,0.0439,0.0436


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8663,0.983,0.8663,0.8767,0.8671,0.8514,0.8524
1,0.7849,0.98,0.7849,0.7982,0.7767,0.761,0.7634
2,0.8314,0.9845,0.8314,0.8321,0.8299,0.8127,0.8131
3,0.7719,0.9743,0.7719,0.7728,0.7697,0.7466,0.7472
4,0.8655,0.9813,0.8655,0.8753,0.8672,0.8505,0.8512
Mean,0.824,0.9806,0.824,0.831,0.8221,0.8044,0.8055
Std,0.0395,0.0035,0.0395,0.0413,0.0422,0.0439,0.0436


Fitting 5 folds for each of 10 candidates, totalling 50 fits


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…