In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from pycaret.classification import *
from sklearn.ensemble import RandomForestClassifier
from tune_sklearn import TuneSearchCV

from data_prepare_func import convert_to_array

In [2]:
x,y = convert_to_array("data_train/",size=28)

In [3]:
scaler = StandardScaler()
x = scaler.fit_transform(x)

In [4]:
c = np.cov(x)
c

array([[ 0.20120025, -0.00435104,  0.03794997, ..., -0.02316054,
         0.04359057, -0.00392678],
       [-0.00435104,  0.36546416,  0.01341268, ..., -0.0463836 ,
        -0.03184393, -0.02626015],
       [ 0.03794997,  0.01341268,  0.13745426, ..., -0.01706125,
        -0.00511994, -0.00663046],
       ...,
       [-0.02316054, -0.0463836 , -0.01706125, ...,  0.32393225,
         0.02702315,  0.02491563],
       [ 0.04359057, -0.03184393, -0.00511994, ...,  0.02702315,
         2.5921294 ,  0.0084614 ],
       [-0.00392678, -0.02626015, -0.00663046, ...,  0.02491563,
         0.0084614 ,  0.88151797]])

In [5]:
eigenvalues, eigenvectors = np.linalg.eig(c)
print('Eigen values:\n', eigenvalues)
print('Eigen values Shape:', eigenvalues.shape)
print('Eigen Vector Shape:', eigenvectors.shape)

Eigen values:
 [ 3.68945569e+01+0.00000000e+00j  3.17825234e+01+0.00000000e+00j
  2.64450528e+01+0.00000000e+00j ... -2.05921038e-34-4.96291326e-33j
  1.16148839e-31+0.00000000e+00j -1.45500567e-47+0.00000000e+00j]
Eigen values Shape: (1753,)
Eigen Vector Shape: (1753, 1753)


In [6]:
idx = eigenvalues.argsort()[::-1]
 
# Sort the eigenvalues in descending order
eigenvalues = eigenvalues[idx]
 
# sort the corresponding eigenvectors accordingly
eigenvectors = eigenvectors[:,idx]

In [7]:
explained_var = np.cumsum(eigenvalues) / np.sum(eigenvalues)
explained_var

array([0.02270951+6.89181705e-36j, 0.04227244+1.28287182e-35j,
       0.05855002+1.77685923e-35j, ..., 1.        +3.03477114e-34j,
       1.        +8.32297830e-19j, 1.        +3.03477114e-34j])

In [8]:
n_components = np.argmax(explained_var >= 0.80) + 1
n_components

215

In [9]:
pca = PCA(n_components=0.8)
pca_x = pca.fit_transform(x)

In [10]:
pca_x.shape

(1753, 213)

In [11]:
x_train, x_test, y_train, y_test = train_test_split(pca_x, y, test_size=0.3, random_state=42)

In [12]:
s = setup(x_train, target = y_train, session_id = 10,fold=5,preprocess=False)

Unnamed: 0,Description,Value
0,Session id,10
1,Target,target
2,Target type,Multiclass
3,Original data shape,"(1227, 214)"
4,Transformed data shape,"(1227, 214)"
5,Transformed train set shape,"(858, 214)"
6,Transformed test set shape,"(369, 214)"
7,Numeric features,213


In [13]:
best = compare_models()

In [14]:
pull()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.5326,0.8736,0.5326,0.544,0.5293,0.4803,0.4822,0.118
svm,SVM - Linear Kernel,0.5292,0.0,0.5292,0.5456,0.5312,0.4769,0.4782,0.026
lightgbm,Light Gradient Boosting Machine,0.5222,0.881,0.5222,0.5305,0.5174,0.4689,0.4707,1.152
lda,Linear Discriminant Analysis,0.521,0.8494,0.521,0.5385,0.5192,0.4674,0.4695,0.028
gbc,Gradient Boosting Classifier,0.5187,0.8706,0.5187,0.5286,0.5172,0.465,0.4662,5.668
lr,Logistic Regression,0.5129,0.8401,0.5129,0.5269,0.514,0.4587,0.4598,0.304
xgboost,Extreme Gradient Boosting,0.5116,0.8743,0.5116,0.5179,0.5063,0.4572,0.4588,1.192
ridge,Ridge Classifier,0.4883,0.0,0.4883,0.5028,0.488,0.4311,0.4326,0.02
et,Extra Trees Classifier,0.4826,0.84,0.4826,0.4955,0.4772,0.4244,0.4271,0.086
knn,K Neighbors Classifier,0.3835,0.7941,0.3835,0.5426,0.3607,0.313,0.3332,0.162


In [15]:
best.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 10,
 'verbose': 0,
 'warm_start': False}

In [16]:
rf = RandomForestClassifier(bootstrap=True,ccp_alpha=0.0,class_weight=None,criterion='gini',max_depth=None,
                            max_features='sqrt',max_leaf_nodes=None,max_samples=None,min_impurity_decrease=0.0,
                            min_samples_leaf=1,min_samples_split=2,min_weight_fraction_leaf=0.0,
                            n_estimators=100,n_jobs=-1,oob_score=False,random_state=10,verbose=0,warm_start=False)

In [17]:
rf.fit(x_train,y_train)

train_yhat = rf.predict(x_train)
train_f1 = f1_score(train_yhat,y_train,average='macro')

test_yhat = rf.predict(x_test)
test_f1 = f1_score(test_yhat,y_test,average='macro')

print('Train Score\n',classification_report(y_train,train_yhat))
print('Test Score\n',classification_report(y_test,test_yhat))
print(f"AVG F1-Score Train: {train_f1}\nAVG F1-Score Test: {test_f1}")

Train Score
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       132
           1       1.00      1.00      1.00       121
           2       1.00      1.00      1.00       120
           3       1.00      1.00      1.00       124
           4       1.00      1.00      1.00       121
           5       1.00      1.00      1.00       125
           6       1.00      1.00      1.00       116
           7       1.00      1.00      1.00       124
           8       1.00      1.00      1.00       116
           9       1.00      1.00      1.00       128

    accuracy                           1.00      1227
   macro avg       1.00      1.00      1.00      1227
weighted avg       1.00      1.00      1.00      1227

Test Score
               precision    recall  f1-score   support

           0       0.42      0.68      0.52        41
           1       0.52      0.56      0.54        54
           2       0.72      0.77      0.75        57