In [15]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np 

import pandas as pd
import sklearn as sklearn
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import PrecisionRecallDisplay, RocCurveDisplay
from sklearn.model_selection import StratifiedKFold, ParameterGrid
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils.fixes import loguniform

from utils.randomized_search import logUnifD
from utils.base_set import X_train, y_train, seed
from utils.cross_validation import cross_validate
import utils.metrics as metrics
from utils.estimators import DecisionTree

### Decision Tree

In [2]:
model1 = DecisionTree()

In [79]:
k = 5
param = {
    'criterion':["gini","entropy"],
    'max_depth':[1,2,3,4,5]
}

In [5]:
classifier_Dtree = RandomizedSearchCV(estimator = model1.estimator, 
                                param_distributions = param,
                                cv = k,
                                scoring = "roc_auc"
                               )

In [6]:
classifier_Dtree.fit(X_train,y_train)

RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(),
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [1, 2, 3, 4, 5]},
                   scoring='roc_auc')

In [7]:
best = classifier_Dtree.best_params_
print(best)
auc_roc = classifier_Dtree.best_score_
print(auc_roc)

{'max_depth': 3, 'criterion': 'entropy'}
0.6617407645236837


In [42]:
classifier_Dtree.cv_results_['params']

[{'max_depth': 1, 'criterion': 'gini'},
 {'max_depth': 2, 'criterion': 'gini'},
 {'max_depth': 3, 'criterion': 'gini'},
 {'max_depth': 4, 'criterion': 'gini'},
 {'max_depth': 5, 'criterion': 'gini'},
 {'max_depth': 1, 'criterion': 'entropy'},
 {'max_depth': 2, 'criterion': 'entropy'},
 {'max_depth': 3, 'criterion': 'entropy'},
 {'max_depth': 4, 'criterion': 'entropy'},
 {'max_depth': 5, 'criterion': 'entropy'}]

In [57]:
grid = {
    'random_state' : [seed],
    'auc_roc': ["roc"],
}
keys = grid.keys()
score = classifier_Dtree.cv_results_['mean_test_score']
params = classifier_Dtree.cv_results_['params']

pd.DataFrame(
    score,
    #params
#columns = list(keys) + list(classifier_Dtree.cv_results_['params'][1].keys())
)


Unnamed: 0,0
0,0.614622
1,0.620432
2,0.570875
3,0.568978
4,0.562236
5,0.602087
6,0.63218
7,0.661741
8,0.624766
9,0.589144


In [83]:
res = []
keys = param.keys()
for hs in params: 
    for k in keys: 
        print(hs[k])
    res.append(hs[x] for x in keys)

gini
1
gini
2
gini
3
gini
4
gini
5
entropy
1
entropy
2
entropy
3
entropy
4
entropy
5


In [82]:
param.keys()

dict_keys(['criterion', 'max_depth'])

### KNN 

In [16]:
model2 = KNeighborsClassifier()
K = len(X_train)
param = {
    'n_neighbors': logUnifD(10,K/2)
}

In [17]:
classifier_KNN = RandomizedSearchCV(estimator = model2, 
                                param_distributions = param,
                                cv = k,
                                scoring = "roc_auc", 
                                random_state = seed
                               )

In [18]:
classifier_KNN.fit(X_train,y_train)

RandomizedSearchCV(cv=5, estimator=KNeighborsClassifier(),
                   param_distributions={'n_neighbors': <utils.randomized_search.logUnifD object at 0x76e8b2fa2620>},
                   random_state=8241, scoring='roc_auc')

In [19]:
classifier_KNN.cv_results_

{'mean_fit_time': array([0.00196524, 0.00193357, 0.0019886 , 0.00193868, 0.00195065,
        0.0019105 , 0.00192995, 0.00184841, 0.00182638, 0.00183163]),
 'std_fit_time': array([9.99048840e-05, 1.21960460e-04, 1.10352873e-04, 1.18801909e-04,
        9.81920601e-05, 1.41881564e-04, 5.32004548e-05, 7.15712402e-05,
        3.54577727e-05, 2.69813999e-05]),
 'mean_score_time': array([0.00863423, 0.01019711, 0.01007285, 0.00886674, 0.00959163,
        0.00965133, 0.00890245, 0.00975299, 0.00925045, 0.00840349]),
 'std_score_time': array([1.37415385e-04, 3.33858360e-04, 3.51185455e-04, 8.64537986e-05,
        2.78723389e-04, 2.33225856e-04, 2.72213283e-04, 1.80676739e-04,
        2.42238317e-04, 2.12085858e-04]),
 'param_n_neighbors': masked_array(data=[14, 166, 153, 34, 93, 107, 56, 160, 110, 33],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_neighbors': 14},

### SVM

In [159]:
model3 = svm.SVC()
param = {
    'C': scipy.stats.expon(scale=100), 
    'gamma': scipy.stats.expon(scale=.1),
    'kernel': ["rbf","linear","poly","sigmoid"]
        }

In [160]:
classifier_SVM = RandomizedSearchCV(estimator = model3, 
                                param_distributions = param,
                                cv = k,
                                scoring = "roc_auc", 
                                random_state = seed
                               )

In [161]:
classifier_SVM.fit(X_train,y_train)

RandomizedSearchCV(cv=5, estimator=SVC(),
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7813cb3d77c0>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7813cb3d7490>,
                                        'kernel': ['rbf', 'linear', 'poly',
                                                   'sigmoid']},
                   random_state=8241, scoring='roc_auc')

### LDA 

In [172]:
model4 = LinearDiscriminantAnalysis()
param = {
    'solver': ["svd","lsqr","eigen"], 
    'n_components' : [0,1]
}

In [173]:
classifier_LDA = RandomizedSearchCV(estimator = model4,
                                   param_distributions=param,
                                   cv = k, 
                                   scoring= "roc_auc",
                                   random_state=seed)

In [174]:
classifier_LDA.fit(X_train,y_train)

Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/lib/python3/dist-packages/sklearn/discriminant_analysis.py", line 468, in fit
    self._solve_eigen(X, y, shrinkage=self.shrinkage)
  File "/usr/lib/python3/dist-packages/sklearn/discriminant_analysis.py", line 335, in _solve_eigen
    evals, evecs = linalg.eigh(Sb, Sw)
  File "/usr/lib/python3/dist-packages/scipy/linalg/_decomp.py", line 580, in eigh
    raise LinAlgError('The leading minor of order {} of B is not '
numpy.linalg.LinAlgError: The leading minor of order 141 of B is not positive definite. The factorization of B could not be completed and no eigenvalues or eigenvectors were computed.

Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_par

RandomizedSearchCV(cv=5, estimator=LinearDiscriminantAnalysis(),
                   param_distributions={'n_components': [0, 1],
                                        'solver': ['svd', 'lsqr', 'eigen']},
                   random_state=8241, scoring='roc_auc')

In [175]:
classifier_LDA.best_params_

{'solver': 'svd', 'n_components': 0}

### Naive Bayes 