In [1]:
import numpy as np
from sklearn.pipeline import Pipeline
from blogreg.binary_logistic_regression import BinaryLogisticRegression
from transformers.collinearity import CollinearFeaturesVIFTransformer
from sklearn.preprocessing import  StandardScaler
from scipy.stats import uniform
from hyperparams_tuning.tune import tune_pipe_with_randomsearch

# data 
from data_preparation.preprocessing import load_and_preprocess_heart, load_and_preprocess_bank, load_and_preprocess_banknote_auth, load_and_preprocess_ckd

# log-likelihood analisys
from log_likelihood.utils import visualize_log_likelihood

# comparison
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from comparison.comparison import compare_models
from comparison.vis import plot_comparison_results


# hyperparamters tuning
from hyperparams_tuning.tune import tune_pipe_with_randomsearch
from hyperparams_tuning.vis import plot_acc_vs_hyperparameter


In [2]:
# for tests comparability
np.random.seed(42)

# Common options

In [3]:
# constants
HYPERPARAMS_ITERS = 20
VIF_THRESHOLD = 20

# common steps
selection = 'removing_collinear_features', CollinearFeaturesVIFTransformer(vif_threshold=VIF_THRESHOLD),
scaling =  'scalling', StandardScaler()


# models definitions
irls_model = Pipeline([
    selection,
    scaling,
    ('logistic_regression', BinaryLogisticRegression(method='IRLS'))
])

gd_model = Pipeline([
    selection,
    scaling,
    ('logistic_regression', BinaryLogisticRegression(method='GD'))
])

sgd_model = Pipeline([
    selection,
    scaling,
    ('logistic_regression', BinaryLogisticRegression(method='SGD'))
])

adam_model = Pipeline([
    selection,
    scaling,
    ('logistic_regression', BinaryLogisticRegression(method='ADAM'))
])


# definitons of hyperparameters spaces 
lr_hyperparams = {"logistic_regression__lr": uniform(loc=0.0005, scale=0.0095)}
irls_lambda_hyperparams = {"logistic_regression__irls_lambda": uniform(loc = 0.01, scale = 1.0)}
adam_full_hyperparams = {
    "logistic_regression__lr": uniform(loc = 0.01, scale = 1.0), 
    'logistic_regression__adam_b1': [0.9, 0.99, 0.999, 0.9999, 0.99999], 
    "logistic_regression__adam_b2": [0.9, 0.99, 0.999, 0.9999, 0.99999], 
    "logistic_regression__adam_e": [10**-12, 10**-10, 10**-8, 10**-6]
},


# comparison models
cmp_models = [
    ('lda', Pipeline([
        selection,
        scaling,
        ('lda', LinearDiscriminantAnalysis())
    ])),
    ('qda', Pipeline([
        selection,
        scaling,
        ('qda', QuadraticDiscriminantAnalysis())
    ])),
    ('knn', Pipeline([
        selection,
        scaling,
        ('knn', KNeighborsClassifier())
    ]))
]


# 1. heart.dat dataset

Attribute Information:
1. age
2. sex
3. chest pain type (4 values)
4. resting blood pressure
5. serum cholesterol in mg/dl
6. fasting blood sugar > 120 mg/dl
7. resting electrocardiographic results (values 0,1,2)
8. maximum heart rate achieved
9. exercise induced angina
10. oldpeak = ST depression induced by exercise relative to rest
11. the slope of the peak exercise ST segment
12. number of major vessels (0-3) colored by flourosopy
13. thal: 3 = normal; 6 = fixed defect; 7 = reversable defect
14. absence (1) or presence (2) of heart disease

Attributes types:
Real: 1,4,5,8,10,12
Ordered:11,
Binary: 2,6,9
Nominal: 7,3,13



Number of observations: 270


In [4]:
X_train, X_test, y_train, y_test = load_and_preprocess_heart()
dataset_name = 'heart.dat'
dataset_name = 'heart.dat'

## 1.1 Hyperparamters tuning 
(default 5-fold cross-validation)

### 1.1.1 Iteratively Reweighted Least Squartes (IRLS)

In [5]:
irls_search = tune_pipe_with_randomsearch(irls_model, irls_lambda_hyperparams, X_train, y_train, n_iter=HYPERPARAMS_ITERS)
best_model_irls = irls_search.best_estimator_
plot_acc_vs_hyperparameter(irls_search, "logistic_regression__irls_lambda", dataset_name, 'irls')

### 1.1.2 Gradient Descent (GD)

In [6]:
gd_search = tune_pipe_with_randomsearch(gd_model, lr_hyperparams, X_train, y_train, n_iter=HYPERPARAMS_ITERS)
best_model_gd = gd_search.best_estimator_
plot_acc_vs_hyperparameter(gd_search, "logistic_regression__lr", dataset_name, 'gd')

### 1.1.3 Stochastic Gradient Descent (SGD)

In [7]:
sgd_search = tune_pipe_with_randomsearch(sgd_model, lr_hyperparams, X_train, y_train, n_iter=HYPERPARAMS_ITERS)
best_model_sgd = sgd_search.best_estimator_
plot_acc_vs_hyperparameter(sgd_search, "logistic_regression__lr", dataset_name, 'sgd')

### 1.1.4 Adam

Tuning only learning rate:

In [8]:
adam_search = tune_pipe_with_randomsearch(adam_model, lr_hyperparams, X_train, y_train, n_iter=HYPERPARAMS_ITERS)
best_model_adam = adam_search.best_estimator_
plot_acc_vs_hyperparameter(adam_search, "logistic_regression__lr", dataset_name, 'adam')

Tuning all hyperparameters may be useful:

In [9]:
adam_full_search = tune_pipe_with_randomsearch(adam_model, adam_full_hyperparams, X_train, y_train, n_iter=HYPERPARAMS_ITERS)
adam_full_search.best_estimator_

Pipeline(steps=[('removing_collinear_features',
                 CollinearFeaturesVIFTransformer(vif_threshold=20)),
                ('scalling', StandardScaler()),
                ('logistic_regression',
                 BinaryLogisticRegression(adam_b1=0.9999, adam_b2=0.99,
                                          adam_e=1e-06, lr=0.023364670150314397,
                                          method='ADAM'))])

In [10]:
adam_full_search.best_score_

0.8424947145877377

In [11]:
plot_acc_vs_hyperparameter([gd_search, sgd_search, adam_search], "logistic_regression__lr", dataset_name, model_names=['gd', 'sgd', 'adam'])

## 1.2 Log-likelihood function analysis

In [12]:
visualize_log_likelihood(best_model_irls, best_model_gd, best_model_sgd, best_model_adam, dataset_name)

## 1.3 Models comparison

In [13]:
models = [
    ('log_reg_irls', best_model_irls),
    ('log_reg_gd', best_model_gd),
    ('log_reg_sgd', best_model_gd),
    ('log_reg_adam', best_model_adam)
]

model_cmp_res = compare_models(models, cmp_models, X_train, X_test, y_train, y_test)
model_cmp_res


Variables are collinear



Unnamed: 0,model,accuracy,f1_score,precision,recall
0,log_reg_irls,0.833333,0.823529,0.875,0.777778
3,log_reg_adam,0.814815,0.8,0.833333,0.769231
4,lda,0.814815,0.807692,0.875,0.75
1,log_reg_gd,0.796296,0.792453,0.875,0.724138
2,log_reg_sgd,0.796296,0.792453,0.875,0.724138
6,knn,0.777778,0.76,0.791667,0.730769
5,qda,0.592593,0.685714,1.0,0.521739


In [14]:
plot_comparison_results(model_cmp_res)

# 2. bank dataset

Attribute Information:

1.  age (numeric)
2.  job : type of job (categorical: 'admin.','blue-collar','entrepreneur','housemaid','management','retired','self-employed','services','student','technician','unemployed','unknown')
3.  marital : marital status (categorical: 'divorced','married','single','unknown'; note: 'divorced' means divorced or widowed)
4.  education (categorical: 'basic.4y','basic.6y','basic.9y','high.school','illiterate','professional.course','university.degree','unknown')
5.  default: has credit in default? (categorical: 'no','yes','unknown')
6.  housing: has housing loan? (categorical: 'no','yes','unknown')
7.  loan: has personal loan? (categorical: 'no','yes','unknown')
related with the last contact of the current campaign:
8.  contact: contact communication type (categorical: 'cellular','telephone')
9.  month: last contact month of year (categorical: 'jan', 'feb', 'mar', ..., 'nov', 'dec')
10.  day_of_week: last contact day of the week (categorical: 'mon','tue','wed','thu','fri')
11.  duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.
other attributes:
12.  campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
13.  pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
14.  previous: number of contacts performed before this campaign and for this client (numeric)
15.  poutcome: outcome of the previous marketing campaign (categorical: 'failure','nonexistent','success')
social and economic context attributes
16.  emp.var.rate: employment variation rate - quarterly indicator (numeric)
17.  cons.price.idx: consumer price index - monthly indicator (numeric)
18.  cons.conf.idx: consumer confidence index - monthly indicator (numeric)
19.  euribor3m: euribor 3 month rate - daily indicator (numeric)
20.  nr.employed: number of employees - quarterly indicator (numeric)

Output variable (desired target):
21. - y - has the client subscribed a term deposit? (binary: 'yes','no')

Number of observations: 45211


In [15]:
X_train, X_test, y_train, y_test = load_and_preprocess_bank()
dataset_name = 'bank'

## 2.1 Hyperparamters tuning 
(default 5-fold cross-validation)

### 2.1.1 Iteratively Reweighted Least Squartes (IRLS)

In [16]:
irls_search = tune_pipe_with_randomsearch(irls_model, irls_lambda_hyperparams, X_train, y_train, n_iter=HYPERPARAMS_ITERS)
best_model_irls = irls_search.best_estimator_
plot_acc_vs_hyperparameter(irls_search, "logistic_regression__irls_lambda", dataset_name, 'irls')

### 2.1.2 Gradient Descent (GD)

In [17]:
gd_search = tune_pipe_with_randomsearch(gd_model, lr_hyperparams, X_train, y_train, n_iter=HYPERPARAMS_ITERS)
best_model_gd = gd_search.best_estimator_
plot_acc_vs_hyperparameter(gd_search, "logistic_regression__lr", dataset_name, 'gd')

### 2.1.3 Stochastic Gradient Descent (SGD)

In [18]:
sgd_search = tune_pipe_with_randomsearch(sgd_model, lr_hyperparams, X_train, y_train, n_iter=HYPERPARAMS_ITERS)
best_model_sgd = sgd_search.best_estimator_
plot_acc_vs_hyperparameter(sgd_search, "logistic_regression__lr", dataset_name, 'sgd')

### 2.1.4 Adam

Tuning only learning rate:

In [19]:
adam_search = tune_pipe_with_randomsearch(adam_model, lr_hyperparams, X_train, y_train, n_iter=HYPERPARAMS_ITERS)
best_model_adam = adam_search.best_estimator_
plot_acc_vs_hyperparameter(adam_search, "logistic_regression__lr", dataset_name, 'adam')

Tuning all hyperparameters may be useful:

In [20]:
adam_full_search = tune_pipe_with_randomsearch(adam_model, adam_full_hyperparams, X_train, y_train, n_iter=HYPERPARAMS_ITERS)
adam_full_search.best_estimator_

Pipeline(steps=[('removing_collinear_features',
                 CollinearFeaturesVIFTransformer(vif_threshold=20)),
                ('scalling', StandardScaler()),
                ('logistic_regression',
                 BinaryLogisticRegression(adam_b1=0.999, adam_b2=0.99999,
                                          lr=0.6906813062156666,
                                          method='ADAM'))])

In [21]:
adam_full_search.best_score_

0.8711297310928223

In [22]:
plot_acc_vs_hyperparameter([gd_search, sgd_search, adam_search], "logistic_regression__lr", dataset_name, model_names=['gd', 'sgd', 'adam'])

## 2.2 Log-likelihood function analysis

In [23]:
visualize_log_likelihood(best_model_irls, best_model_gd, best_model_sgd, best_model_adam, dataset_name)

## 2.3 Models comparison

In [24]:
models = [
    ('log_reg_irls', best_model_irls),
    ('log_reg_gd', best_model_gd),
    ('log_reg_sgd', best_model_gd),
    ('log_reg_adam', best_model_adam)
]

model_cmp_res = compare_models(models, cmp_models, X_train, X_test, y_train, y_test)
model_cmp_res

Unnamed: 0,model,accuracy,f1_score,precision,recall
4,lda,0.895028,0.424242,0.336538,0.57377
0,log_reg_irls,0.892818,0.366013,0.269231,0.571429
1,log_reg_gd,0.891713,0.319444,0.221154,0.575
2,log_reg_sgd,0.891713,0.319444,0.221154,0.575
3,log_reg_adam,0.890608,0.277372,0.182692,0.575758
6,knn,0.887293,0.28169,0.192308,0.526316
5,qda,0.845304,0.401709,0.451923,0.361538


In [25]:
plot_comparison_results(model_cmp_res)

# 3. banknote authentication dataset

Attribute Information:
1. variance of Wavelet Transformed image (continuous)
2. skewness of Wavelet Transformed image (continuous)
3. curtosis of Wavelet Transformed image (continuous)
4. entropy of image (continuous)
5. class (integer)

Number of observations: 1372


In [26]:
X_train, X_test, y_train, y_test = load_and_preprocess_banknote_auth()
dataset_name = 'authentication dataset'

## 3.1 Hyperparamters tuning 
(default 5-fold cross-validation)

### 3.1.1 Iteratively Reweighted Least Squartes (IRLS)

In [27]:
irls_search = tune_pipe_with_randomsearch(irls_model, irls_lambda_hyperparams, X_train, y_train, n_iter=HYPERPARAMS_ITERS)
best_model_irls = irls_search.best_estimator_
plot_acc_vs_hyperparameter(irls_search, "logistic_regression__irls_lambda", dataset_name, 'irls')

### 3.1.2 Gradient Descent (GD)

In [28]:
gd_search = tune_pipe_with_randomsearch(gd_model, lr_hyperparams, X_train, y_train, n_iter=HYPERPARAMS_ITERS)
best_model_gd = gd_search.best_estimator_
plot_acc_vs_hyperparameter(gd_search, "logistic_regression__lr", dataset_name, 'gd')

### 3.1.3 Stochastic Gradient Descent (SGD)

In [29]:
sgd_search = tune_pipe_with_randomsearch(sgd_model, lr_hyperparams, X_train, y_train, n_iter=HYPERPARAMS_ITERS)
best_model_sgd = sgd_search.best_estimator_
plot_acc_vs_hyperparameter(sgd_search, "logistic_regression__lr", dataset_name, 'sgd')

### 3.1.4 Adam

Tuning only learning rate:

In [30]:
adam_search = tune_pipe_with_randomsearch(adam_model, lr_hyperparams, X_train, y_train, n_iter=HYPERPARAMS_ITERS)
best_model_adam = adam_search.best_estimator_
plot_acc_vs_hyperparameter(adam_search, "logistic_regression__lr", dataset_name, 'adam')

Tuning all hyperparameters may be useful:

In [31]:
adam_full_search = tune_pipe_with_randomsearch(adam_model, adam_full_hyperparams, X_train, y_train, n_iter=HYPERPARAMS_ITERS)
adam_full_search.best_estimator_

Pipeline(steps=[('removing_collinear_features',
                 CollinearFeaturesVIFTransformer(vif_threshold=20)),
                ('scalling', StandardScaler()),
                ('logistic_regression',
                 BinaryLogisticRegression(adam_b2=0.99, adam_e=1e-06,
                                          lr=0.545302693379227,
                                          method='ADAM'))])

In [32]:
adam_full_search.best_score_

0.9772063096720632

In [33]:
plot_acc_vs_hyperparameter([gd_search, sgd_search, adam_search], "logistic_regression__lr", dataset_name, model_names=['gd', 'sgd', 'adam'])

## 3.2 Log-likelihood function analysis

In [34]:
visualize_log_likelihood(best_model_irls, best_model_gd, best_model_sgd, best_model_adam, dataset_name)

## 3.3 Models comparison

In [35]:
models = [
    ('log_reg_irls', best_model_irls),
    ('log_reg_gd', best_model_gd),
    ('log_reg_sgd', best_model_gd),
    ('log_reg_adam', best_model_adam)
]

model_cmp_res = compare_models(models, cmp_models, X_train, X_test, y_train, y_test)
model_cmp_res

Unnamed: 0,model,accuracy,f1_score,precision,recall
6,knn,0.996364,0.995918,1.0,0.99187
0,log_reg_irls,0.992727,0.991803,0.991803,0.991803
5,qda,0.978182,0.976,1.0,0.953125
4,lda,0.967273,0.964427,1.0,0.931298
3,log_reg_adam,0.956364,0.951613,0.967213,0.936508
1,log_reg_gd,0.927273,0.915966,0.893443,0.939655
2,log_reg_sgd,0.927273,0.915966,0.893443,0.939655


In [36]:
plot_comparison_results(model_cmp_res)

# 4. Chronic Kidney Disease dataset

Attribute Information:
1. Age (yrs)                         
2. Blood Pressure (mm/Hg)            
3. Specific Gravity                  
4. Albumin                           
5. Sugar                             
6. Blood Glucose Random (mgs/dL)     
7. Blood Urea (mgs/dL)               
8. Serum Creatinine (mgs/dL)         
9. Sodium (mEq/L)                    
10. Potassium (mEq/L)                 
11. Hemoglobin (gms)                  
12. Packed Cell Volume                
13. White Blood Cells (cells/cmm)     
14. Red Blood Cells (millions/cmm)    
15. Red Blood Cells: normal           
16. Pus Cells: normal                 
17. Pus Cell Clumps: present          
18. Bacteria: present                 
19. Hypertension: yes                 
20. Diabetes Mellitus: yes            
21. Coronary Artery Disease: yes      
22. Appetite: poor                    
23. Pedal Edema: yes                  
24. Anemia: yes                       
25. Chronic Kidney Disease: yes       

Number of observations: 400


In [37]:
X_train, X_test, y_train, y_test = load_and_preprocess_ckd()
dataset_name = 'chronic kidney disease'

## 4.1 Hyperparamters tuning 
(default 5-fold cross-validation)

### 4.1.1 Iteratively Reweighted Least Squartes (IRLS)

In [38]:
irls_search = tune_pipe_with_randomsearch(irls_model, irls_lambda_hyperparams, X_train, y_train, n_iter=HYPERPARAMS_ITERS)
best_model_irls = irls_search.best_estimator_
plot_acc_vs_hyperparameter(irls_search, "logistic_regression__irls_lambda", dataset_name, 'irls')

### 4.1.2 Gradient Descent (GD)

In [39]:
gd_search = tune_pipe_with_randomsearch(gd_model, lr_hyperparams, X_train, y_train, n_iter=HYPERPARAMS_ITERS)
best_model_gd = gd_search.best_estimator_
plot_acc_vs_hyperparameter(gd_search, "logistic_regression__lr", dataset_name, 'gd')

### 4.1.3 Stochastic Gradient Descent (SGD)

In [40]:
sgd_search = tune_pipe_with_randomsearch(sgd_model, lr_hyperparams, X_train, y_train, n_iter=HYPERPARAMS_ITERS)
best_model_sgd = sgd_search.best_estimator_
plot_acc_vs_hyperparameter(sgd_search, "logistic_regression__lr", dataset_name, 'sgd')

### 4.1.4 Adam

Tuning only learning rate:

In [41]:
adam_search = tune_pipe_with_randomsearch(adam_model, lr_hyperparams, X_train, y_train, n_iter=HYPERPARAMS_ITERS)
best_model_adam = adam_search.best_estimator_
plot_acc_vs_hyperparameter(adam_search, "logistic_regression__lr", dataset_name, 'adam')

Tuning all hyperparameters may be useful:

In [42]:
adam_full_search = tune_pipe_with_randomsearch(adam_model, adam_full_hyperparams, X_train, y_train, n_iter=HYPERPARAMS_ITERS)
adam_full_search.best_estimator_

Pipeline(steps=[('removing_collinear_features',
                 CollinearFeaturesVIFTransformer(vif_threshold=20)),
                ('scalling', StandardScaler()),
                ('logistic_regression',
                 BinaryLogisticRegression(adam_b2=0.9999, adam_e=1e-12,
                                          lr=0.29586705939330604,
                                          method='ADAM'))])

In [43]:
adam_full_search.best_score_

0.95625

In [44]:
plot_acc_vs_hyperparameter([gd_search, sgd_search, adam_search], "logistic_regression__lr", dataset_name, model_names=['gd', 'sgd', 'adam'])

## 4.2 Log-likelihood function analysis

In [45]:
visualize_log_likelihood(best_model_irls, best_model_gd, best_model_sgd, best_model_adam, dataset_name)

## 4.3 Models comparison

In [46]:
models = [
    ('log_reg_irls', best_model_irls),
    ('log_reg_gd', best_model_gd),
    ('log_reg_sgd', best_model_gd),
    ('log_reg_adam', best_model_adam)
]

model_cmp_res = compare_models(models, cmp_models, X_train, X_test, y_train, y_test)
model_cmp_res


Variables are collinear



Unnamed: 0,model,accuracy,f1_score,precision,recall
0,log_reg_irls,0.95,0.958333,0.92,1.0
6,knn,0.9375,0.947368,0.9,1.0
1,log_reg_gd,0.8875,0.901099,0.82,1.0
2,log_reg_sgd,0.8875,0.901099,0.82,1.0
3,log_reg_adam,0.875,0.888889,0.8,1.0
4,lda,0.875,0.888889,0.8,1.0
5,qda,0.625,0.769231,1.0,0.625


In [47]:
plot_comparison_results(model_cmp_res)