In [54]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from pycaret.classification import *
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from tune_sklearn import TuneSearchCV

from data_prepare_func import convert_to_array

In [55]:
x_train,y_train = convert_to_array("data_train/",size=28)
x_test,y_test = convert_to_array('data_test/',size=28)

In [56]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.fit_transform(x_test)

In [57]:
c = np.cov(x_train)
c

array([[ 1.62124166e+00,  3.79524355e-01, -3.10492170e-01, ...,
        -1.26921901e-01, -3.14365556e-02, -9.54611695e-02],
       [ 3.79524355e-01,  1.01841765e+00, -1.01058590e-01, ...,
        -7.35167836e-02, -7.55400382e-03, -1.50609188e-03],
       [-3.10492170e-01, -1.01058590e-01,  1.27935792e+00, ...,
        -2.87762279e-02,  1.17776895e-01, -5.27562330e-02],
       ...,
       [-1.26921901e-01, -7.35167836e-02, -2.87762279e-02, ...,
         6.58434127e-01,  7.81557680e-02,  1.44305384e-01],
       [-3.14365556e-02, -7.55400382e-03,  1.17776895e-01, ...,
         7.81557680e-02,  5.86020646e-01,  1.27598651e-03],
       [-9.54611695e-02, -1.50609188e-03, -5.27562330e-02, ...,
         1.44305384e-01,  1.27598651e-03,  4.39831231e-01]])

In [58]:
eigenvalues, eigenvectors = np.linalg.eig(c)
print('Eigen values:\n', eigenvalues)
print('Eigen values Shape:', eigenvalues.shape)
print('Eigen Vector Shape:', eigenvectors.shape)

Eigen values:
 [ 1.02777874e+02+0.00000000e+00j  8.80250774e+01+0.00000000e+00j
  6.64228107e+01+0.00000000e+00j ...  1.73562535e-32+4.59556231e-33j
  1.73562535e-32-4.59556231e-33j -1.03615086e-47+0.00000000e+00j]
Eigen values Shape: (1753,)
Eigen Vector Shape: (1753, 1753)


In [59]:
idx = eigenvalues.argsort()[::-1]

# Sort the eigenvalues in descending order
eigenvalues = eigenvalues[idx]

# sort the corresponding eigenvectors accordingly
eigenvectors = eigenvectors[:,idx]

In [60]:
explained_var = np.cumsum(eigenvalues) / np.sum(eigenvalues)
explained_var

array([0.06049262-9.83045445e-35j, 0.11230209-1.82498396e-34j,
       0.15139698-2.46030204e-34j, ..., 1.        +1.21665077e-18j,
       1.        -1.62506681e-33j, 1.        -1.62506681e-33j])

In [61]:
n_components = np.argmax(explained_var >= 0.80) + 1
n_components

140

In [62]:
pca = PCA(n_components=n_components)
pca_x_train = pca.fit_transform(x_train)
pca_x_test = pca.fit_transform(x_test)

In [63]:
pca_x_train.shape

(1753, 140)

In [64]:
pca_x_test.shape

(904, 140)

In [65]:
# x_train, x_test, y_train, y_test = train_test_split(pca_x, y, test_size=0.3, random_state=42,stratify=y)

In [66]:
s = setup(x_train, target = y_train, session_id = 10,fold=5,preprocess=False)

In [67]:
best = compare_models()

In [68]:
pull()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.8321,0.9816,0.8321,0.8371,0.8313,0.8135,0.8142,0.064
lightgbm,Light Gradient Boosting Machine,0.8313,0.982,0.8313,0.8353,0.8292,0.8125,0.8134,0.082
rf,Random Forest Classifier,0.8305,0.9781,0.8305,0.8355,0.8279,0.8116,0.8127,0.062
lr,Logistic Regression,0.8272,0.9812,0.8272,0.8346,0.8271,0.808,0.8089,0.048
svm,SVM - Linear Kernel,0.815,0.0,0.815,0.823,0.8151,0.7944,0.7954,0.04
xgboost,Extreme Gradient Boosting,0.8125,0.9788,0.8125,0.8206,0.8092,0.7917,0.7932,0.046
gbc,Gradient Boosting Classifier,0.7783,0.9731,0.7783,0.7852,0.7776,0.7537,0.7545,0.112
knn,K Neighbors Classifier,0.7229,0.9403,0.7229,0.7791,0.7164,0.6921,0.7005,0.048
nb,Naive Bayes,0.661,0.8963,0.661,0.6829,0.6388,0.6232,0.6312,0.044
dt,Decision Tree Classifier,0.6145,0.7859,0.6145,0.6296,0.6171,0.5717,0.5728,0.048


In [69]:
best.get_params()

{'bootstrap': False,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 10,
 'verbose': 0,
 'warm_start': False}

In [70]:
rf = RandomForestClassifier(bootstrap=True,ccp_alpha=0.0,class_weight=None,criterion='gini',max_depth=None,
                            max_features='sqrt',max_leaf_nodes=None,max_samples=None,min_impurity_decrease=0.0,
                            min_samples_leaf=1,min_samples_split=2,min_weight_fraction_leaf=0.0,
                            n_estimators=100,n_jobs=-1,oob_score=False,random_state=10,verbose=0,warm_start=False)
et = ExtraTreesClassifier(
    bootstrap = False,
 ccp_alpha = 0.0,
 class_weight =  None,
 criterion =  'gini',
 max_depth =  None,
 max_features = 'sqrt',
 max_leaf_nodes = None,
 max_samples = None,
 min_impurity_decrease = 0.0,
 min_samples_leaf = 1,
 min_samples_split = 2,
 min_weight_fraction_leaf = 0.0,
 n_estimators = 100,
 n_jobs = -1,
 oob_score = False,
 random_state = 10,
 verbose = 0,
 warm_start = False
)

In [71]:
et.fit(x_train,y_train)

train_yhat = et.predict(x_train)
train_f1 = f1_score(train_yhat,y_train,average='macro')

test_yhat = et.predict(x_test)
test_f1 = f1_score(test_yhat,y_test,average='macro')

print('Train Score\n',classification_report(y_train,train_yhat))
print('Test Score\n',classification_report(y_test,test_yhat))
print(f"AVG F1-Score Train: {train_f1}\nAVG F1-Score Test: {test_f1}")

Train Score
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       173
           1       1.00      1.00      1.00       175
           2       1.00      1.00      1.00       177
           3       1.00      1.00      1.00       178
           4       1.00      1.00      1.00       174
           5       1.00      1.00      1.00       175
           6       1.00      1.00      1.00       174
           7       1.00      1.00      1.00       175
           8       1.00      1.00      1.00       174
           9       1.00      1.00      1.00       178

    accuracy                           1.00      1753
   macro avg       1.00      1.00      1.00      1753
weighted avg       1.00      1.00      1.00      1753

Test Score
               precision    recall  f1-score   support

           0       0.88      0.66      0.75        91
           1       0.84      0.83      0.84        90
           2       0.84      0.58      0.68        90