In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns

In [None]:
#imputation
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import cross_val_score
import sys

In [None]:
#model
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
#from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import GridSearchCV

from sklearn import metrics
import pickle

In [None]:
#model
from sklearn.model_selection import StratifiedKFold

#classifers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

import lightgbm as lgbm
from lightgbm.sklearn import LGBMClassifier

import xgboost as xgb
from xgboost.sklearn import XGBClassifier

from sklearn.ensemble import RandomForestClassifier

In [None]:
#display setting
# Make IPython echo every top-level expression of a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Render up to 500 columns and never truncate rows when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', None)

In [None]:
#import model data
# NOTE(review): read_pickle deserialises a pickle file; only load files from a trusted source.
mdl_df = pd.read_pickle("./data/mdl_df_121420.pkl")

## categorical features encode

In [None]:
#encoding category variable
#1. cast every categorical feature to the pandas `category` dtype
# Assign the column directly: `mdl_df.loc[:, col] = ...` writes into the existing
# column in place on recent pandas and keeps the old dtype, in which case the
# `.cat` accessor needed by the following step would not exist.
for col in categ_columns3:
    mdl_df[col] = mdl_df[col].astype('category')

In [None]:
#2. replace each category by its integer code (missing values get code -1)
# Direct column assignment replaces the categorical column with the code column;
# an in-place `.loc[:, col] = ...` write would instead try to store the integer
# codes inside the existing categorical column.
for col in categ_columns3:
    mdl_df[col] = mdl_df[col].cat.codes

In [None]:
#3. reset -1 to missing
# Convert to the nullable Int8 dtype first and mask afterwards. Writing np.nan
# into a plain integer column depends on silent upcasting during item
# assignment, which pandas has deprecated.
for col in categ_columns3:
    codes = mdl_df[col].astype('Int8')
    # code -1 marks a value that was missing before encoding
    mdl_df[col] = codes.mask(codes == -1)

## numeric feature transformation

In [None]:
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn import preprocessing

In [None]:
# Natural-log transform of every column listed in num_columns1 (overwrites the column).
# NOTE(review): assumes these columns are strictly positive -- TODO confirm.
for col in num_columns1:
    mdl_df[col] = np.log(mdl_df[col])
    
# Map x_num1 onto a normal distribution with a quantile transform that uses half as
# many quantiles as there are rows; the transformer is fitted on the whole frame.
# NOTE(review): the cell overwrites its own inputs, so running it twice transforms the data twice.
qtranform = QuantileTransformer(n_quantiles = int(mdl_df.shape[0]/2), output_distribution='normal').fit(mdl_df['x_num1'].to_numpy().reshape(-1, 1))
mdl_df['x_num1'] = qtranform.transform(mdl_df['x_num1'].to_numpy().reshape(-1, 1))

# imputation

In [None]:
# Print NumPy arrays in full instead of summarising long arrays.
np.set_printoptions(threshold=sys.maxsize)

### simple imputer

In [None]:
# Simple-imputation preprocessing: most frequent value for the categorical block,
# median for the numeric block.
imputer_cat = SimpleImputer(strategy='most_frequent')
imputer_num = SimpleImputer(strategy='median')

categorical_transformer = Pipeline(steps=[('imputer', imputer_cat)])
numeric_transformer = Pipeline(steps=[('imputer', imputer_num)])

# Columns are addressed by position: 0..12 go to the 'cat' branch, 13..21 to the 'num' branch.
cat_positions = np.arange(13)
num_positions = np.arange(13, 22)

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, cat_positions),
        ('num', numeric_transformer, num_positions),
    ]
)

In [None]:
# Fit the imputation preprocessor on the whole frame and wrap the result back into a DataFrame.
# NOTE(review): reusing mdl_df.columns assumes the transformer output has the same number
# and order of columns as mdl_df -- TODO confirm.
mdl_df_imputed_simple = pd.DataFrame(preprocessor.fit_transform(mdl_df.to_numpy()),columns = mdl_df.columns)

### iterative imputer

In [None]:
#impute_estimator =  ExtraTreesRegressor(n_estimators=10, random_state=0) 
#imputer_all = IterativeImputer(random_state = 0, estimator = impute_estimator,max_iter = 20,tol = 0.001)

In [None]:
#X_train_imputed_iter = imputer_all.fit_transform(X_train[:,range(0,3)])

### knn imputer

In [None]:
# n_neighbors=10 is optimal???
# TODO(review): the choice of 10 neighbours is not validated in any visible cell.
knn_impute = KNNImputer(n_neighbors=10)

In [None]:
# KNNImputer.fit_transform returns a bare ndarray. Wrap it in a DataFrame carrying the
# original column labels so that later cells can select columns by name
# (built the same way as mdl_df_imputed_simple).
mdl_df_imputed_knn = pd.DataFrame(knn_impute.fit_transform(mdl_df.to_numpy()), columns = mdl_df.columns)

In [None]:
# Round the KNN-imputed categorical features to the nearest integer (np.round, not a ceiling).
# NOTE(review): `.loc` and column lookup by name require mdl_df_imputed_knn to be a DataFrame -- confirm.
for col in categ_columns:
    mdl_df_imputed_knn.loc[:,col] = np.round(mdl_df_imputed_knn[col])

In [None]:
#pip install -U imbalanced-learn
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import OneSidedSelection
from imblearn.under_sampling import AllKNN
#from imblearn.under_sampling import InstanceHardnessThreshold
from imblearn.under_sampling import NeighbourhoodCleaningRule
from imblearn.under_sampling import TomekLinks

from collections import Counter

## imbalance: under resampling

In [None]:
def get_modeldt_imb_imp(imb_method,imp_method_name,imputed_dt):
    """Resample an imputed data set with an imbalance-handling sampler.

    Parameters
    ----------
    imb_method : object
        Sampler exposing ``fit_resample(X, y)``; its class name is printed.
    imp_method_name : str
        Label of the imputation method; only used in the printed message.
    imputed_dt : pandas.DataFrame
        Frame holding the columns named in the notebook-level ``feature_columns``
        list and the target column ``'y_var'``.

    Returns
    -------
    The value returned by ``imb_method.fit_resample`` for the feature matrix
    and the target (unpacked by callers as ``X_resampled, y_resampled``).
    """
    print('imbalance resampling method: ',type(imb_method).__name__)
    print('imputation method: ',imp_method_name)
    features = imputed_dt[feature_columns].to_numpy()
    # Hand the sampler a 1-D target rather than an (n, 1) column vector, as the
    # other cells of this notebook do with `.ravel()`.
    target = imputed_dt[['y_var']].to_numpy().ravel()
    return imb_method.fit_resample(features, target)

In [None]:
# grid search, build models
def buildmodel(clsifier,param_grid,n_folds,val_metric):
    """Grid-search a preprocessing + classifier pipeline and print hold-out results.

    The notebook-level names ``preprocessor``, ``X_train``, ``y_train``, ``X_test``
    and ``y_test`` are read as they are bound when the function is called.

    clsifier: classifier placed in the pipeline step named 'classifier'
    param_grid: parameter grid forwarded to GridSearchCV
    n_folds: value forwarded to GridSearchCV as ``cv``
    val_metric: value forwarded to GridSearchCV as ``scoring``

    Returns the fitted GridSearchCV object.
    """
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', clsifier),
    ])
    search = GridSearchCV(pipeline, param_grid=param_grid, cv=n_folds, scoring=val_metric)
    fitted_search = search.fit(X_train, y_train.ravel())

    # hold-out predictions: hard labels and per-class probabilities
    test_labels = fitted_search.predict(X_test)
    test_proba = fitted_search.predict_proba(X_test)

    print(type(clsifier).__name__)
    print("tuned hpyerparameters :(best parameters) ", fitted_search.best_params_)
    print("performance metric :", fitted_search.best_score_)
    # column 1 of the probability matrix is used as the score for the ROC AUC
    print("roc auc:", metrics.roc_auc_score(y_test, test_proba[:, 1]))
    print('classification report:', metrics.classification_report(y_test, test_labels), sep='\n')
    return fitted_search
    

In [None]:
#plot features importance for tree based model
def featr_imprt(tree_mdl,encoded_feature_names):
    """Plot all feature importances and return the ten largest.

    tree_mdl: pipeline whose ``named_steps['classifier']`` exposes ``feature_importances_``
    encoded_feature_names: index labels for the importance values

    Draws a horizontal bar chart of the importances in ascending order and
    returns a Series with the 10 largest importances.
    """
    classifier = tree_mdl.named_steps['classifier']
    importances = pd.Series(classifier.feature_importances_, encoded_feature_names)

    ascending = importances.sort_values(ascending=True)
    ascending.plot(kind='barh', title='Feature Importances', figsize=(10, 20))

    descending = importances.sort_values(ascending=False)
    return descending.nlargest(10)

### pipeline

In [None]:
#pipeline
# Model preprocessing, addressed by column position:
#   columns 0..12  -> one-hot encoded
#   column  13     -> passed through unchanged
#   columns 14..18 -> standardised
numeric_transformer = Pipeline(
    steps=[('scaler', StandardScaler())]
)

# handle_unknown='ignore': during cross-validation a category can be absent from a
# training fold and still occur in the validation fold or the test set; the default
# setting raises on such a value at transform time.
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_transformer, np.arange(13)),
                  ('passthrough','passthrough',np.arange(13,14)),
                  ('num', numeric_transformer, np.arange(14,19,1))])

#preprocessor = make_column_transformer((OneHotEncoder(), np.arange(13)),("passthrough",np.arange(13,14)),(StandardScaler(), np.arange(14,19,1)))

## create model manually/single

In [None]:
#imputed dataset: mdl_df_imputed_simple,mdl_df_imputed_knn
#resampling methods
nm3 = NearMiss(version = 3)
oss = OneSidedSelection(random_state=0)
allknn = AllKNN()
nbcr = NeighbourhoodCleaningRule()
# instantiate the sampler: the bare class has no fitted state and cannot be used as imb_method
tl = TomekLinks()

# configuration used for the single, manually built model
imb_method = allknn
imp_method_name = 'Simple'
imputed_dt = mdl_df_imputed_simple

X_resampled, y_resampled = get_modeldt_imb_imp(imb_method,imp_method_name,imputed_dt)
# flatten first so Counter always receives scalar labels, whatever shape the sampler returns
print("resample classes(class, N):",sorted(Counter(np.ravel(y_resampled)).items()))

### split train /test datasets

In [None]:
#split model data into train and test
# 70/30 hold-out split of the resampled data with a fixed seed.
# NOTE(review): the split is not stratified and is taken after resampling, so the
# test part is resampled as well.
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size = 0.3, random_state=0)

### Logistic regression (ridge)

In [None]:
# Logistic regression with a grid over the classifier's C parameter,
# selected by ROC AUC with 5-fold cross-validation.
lr_estimator = LogisticRegression(max_iter = 1000)
C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
C_grid = dict(classifier__C = C_values)
n_folds = 5
val_metric = "roc_auc"

# buildmodel reads X_train / X_test / y_train / y_test from the notebook namespace.
grid_lr_mdl = buildmodel(lr_estimator,C_grid,n_folds,val_metric)

In [None]:
#grid_lr_mdl.cv_results_
#grid_lr_mdl
#grid_lr_mdl.best_estimator_.named_steps['classifier'].coef_.shape
#grid_lr_mdl.best_estimator_.named_steps['preprocessor'].transformers_[0][1].get_feature_names().reshape(-1,1).shape

# Multi class classification

### KNN classifier

In [None]:
# KNN, LightGBM, SVM, MLP using metrics with macro, micro, ... averaging

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# 70/30 hold-out split (seed 0) of the multi-class frame built from the KNN-imputed data.
# NOTE(review): mdl_df_imputed_knn_Y is not created in any visible cell -- confirm where it is defined.
m_X_train, m_X_test, m_y_train, m_y_test = train_test_split(mdl_df_imputed_knn_Y[feature_columns].to_numpy(), mdl_df_imputed_knn_Y[['y_var']].to_numpy(), test_size = 0.3, random_state=0)

In [None]:
# Class counts in the training split, ordered by class label.
sorted(Counter(m_y_train.ravel()).items())

In [None]:
#classifer estimator
# n_neighbors and weights set here are also listed in the grid below, which is what the search iterates over.
knn_estimator = KNeighborsClassifier(n_neighbors = 10,weights = 'uniform')
m_param_grid = dict(classifier__n_neighbors = [2,5,10], 
                    classifier__weights = ['uniform', 'distance'])

# 2-fold cross-validation scored with the "roc_auc_ovr" scorer
m_n_folds = 2
m_val_metric = "roc_auc_ovr"

In [None]:
# Preprocessing + KNN pipeline; uses the `preprocessor` currently bound in the notebook namespace.
m_estimator = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', knn_estimator)])
#no grid search
#m_grid_knn_mdls = m_estimator.fit(m_X_train, m_y_train.ravel())
m_grid_knn = GridSearchCV(m_estimator, param_grid = m_param_grid, cv = m_n_folds, scoring = m_val_metric)
# with grid search
m_grid_knn_mdls = m_grid_knn.fit(m_X_train, m_y_train.ravel())

In [None]:
# NOTE(review): the plain (no grid search) fit is commented out in the fitting cell, so
# m_grid_knn_mdls is the grid-search result here and this cell reports the same model
# as the "with grid search" cell.
m_y_pred = m_grid_knn_mdls.predict(m_X_test)
print('classification report:', metrics.classification_report(m_y_test, m_y_pred), sep='\n')

In [None]:
# with grid search
# Predict the hold-out labels with the fitted grid search and print the per-class report.
m_y_pred = m_grid_knn_mdls.predict(m_X_test)
print('classification report:', metrics.classification_report(m_y_test, m_y_pred), sep='\n')

### lightGBM

### outcome y has 5 classes/levels

In [None]:
# 70/30 hold-out split (seed 1) of the multi-class frame built from the simple-imputed data.
# NOTE(review): mdl_df_imputed_simple_Y is not created in any visible cell -- confirm where it is defined.
m_X_train, m_X_test, m_y_train, m_y_test = train_test_split(mdl_df_imputed_simple_Y[feature_columns].to_numpy(), mdl_df_imputed_simple_Y[['y_var']].to_numpy(), test_size = 0.3, random_state=1)

In [None]:
# Class counts in the training split, ordered by class label.
sorted(Counter(m_y_train.ravel()).items())

In [None]:
#classifer estimator
# subsample_freq=1: LightGBM applies row subsampling (bagging) only when the bagging
# frequency is non-zero. With the previous value 0 every `subsample` value searched
# below produced the same model, so a third of the grid was redundant work.
m_lgbm_estimator = LGBMClassifier(boosting_type='gbdt', num_leaves=31, learning_rate=0.1, n_estimators=100,
                   subsample_for_bin=200000, objective=None,
                   min_split_gain=0.0, min_child_weight=0.001, min_child_samples=20,
                   subsample=1.0, subsample_freq=1, colsample_bytree=1.0,
                   reg_alpha=0.0, reg_lambda=0.0, random_state=1)
#parameters
# keys are prefixed with the pipeline step name 'classifier'
m_param_grid = dict(classifier__learning_rate = [0.001,0.1,1],
                  classifier__n_estimators = [100,150],
                  classifier__num_leaves = [10,31,100],
                  classifier__subsample = [0.5,0.8,1],
                  classifier__colsample_bytree = [0.5,0.8,1],
                  classifier__min_child_weight = [0.001,1]
                 )
# 5-fold cross-validation scored with the "roc_auc_ovo" scorer
m_n_folds = 5
m_val_metric = "roc_auc_ovo"

In [None]:
# Preprocessing + LightGBM pipeline, tuned by grid search on the multi-class training split.
m_estimator = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', m_lgbm_estimator)])
m_grid_classifier = GridSearchCV(m_estimator, param_grid = m_param_grid, cv = m_n_folds, scoring= m_val_metric)
m_grid_mdls = m_grid_classifier.fit(m_X_train, m_y_train.ravel())

In [None]:
# Hard class predictions on the hold-out split.
m_y_pred = m_grid_mdls.predict(m_X_test)

In [None]:
# Per-class probabilities on the hold-out split (input to the multi-class ROC AUC).
m_prob_pred = m_grid_mdls.predict_proba(m_X_test)

In [None]:
# Hold-out ROC AUC, one-vs-one with macro averaging.
print("roc auc:", metrics.roc_auc_score(m_y_test.ravel(), m_prob_pred, multi_class = 'ovo',average = 'macro'))

In [None]:
# Confusion matrix of the hold-out predictions, shown as a DataFrame.
pd.DataFrame(metrics.confusion_matrix(m_y_test, m_y_pred))

In [None]:
#metrics.confusion_matrix(m_y_test, m_y_pred)
# metrics.plot_confusion_matrix was deprecated and later removed from scikit-learn.
# Building the display object directly works on releases with and without that helper.
class_labels = m_grid_mdls.classes_
cm = metrics.confusion_matrix(m_y_test, m_grid_mdls.predict(m_X_test), labels=class_labels)
# plot() draws the matrix and returns the display object itself
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels).plot()
disp.figure_.suptitle("Confusion Matrix")
print(f"Confusion matrix:\n{disp.confusion_matrix}")
plt.show()

In [None]:
# Summary of the grid search: winning parameters, their mean CV score, and the hold-out report.
print("tuned hpyerparameters :(best parameters) ",m_grid_mdls.best_params_)
print("performance metric :",m_grid_mdls.best_score_)
#print("roc auc:", metrics.roc_auc_score(np.argmax(m_y_test, axis = 1),m_y_pred,multi_class = 'ovo'))
print('classification report:', metrics.classification_report(m_y_test, m_y_pred), sep='\n')

In [None]:
# NOTE(review): echoes the same report as the print in the previous cell, as a raw string.
metrics.classification_report(m_y_test, m_y_pred)

In [None]:
#m_grid_mdls.cv_results_
# Show the scorer object the grid search used.
m_grid_mdls.scorer_

### OutputCodeClassifier strategy

In [None]:
from sklearn.multiclass import OutputCodeClassifier

In [None]:
# LightGBM with fixed hyper-parameters for the output-code experiment.
# NOTE(review): subsample=0.5 is combined with subsample_freq=0; LightGBM's documentation
# ties row subsampling to a non-zero frequency -- confirm whether subsampling is intended here.
m_lgbm_estimator = LGBMClassifier(boosting_type='gbdt', num_leaves=100, learning_rate=0.1, n_estimators=100, \
                   subsample_for_bin=200000, objective=None,\
                   min_split_gain=0.0, min_child_samples=20, \
                   subsample_freq=0,\
                   reg_alpha=0.0, reg_lambda=0.0, random_state=1,subsample=0.5,colsample_bytree=0.5,min_child_weight = 1)

In [None]:
# Preprocessing + LightGBM pipeline used as the base estimator of the output-code classifier.
m_estimator = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', m_lgbm_estimator)])

In [None]:
# Wrap the pipeline in an OutputCodeClassifier with code_size=3 and a fixed seed.
output_lgbm_classifer = OutputCodeClassifier(m_estimator,code_size=3, random_state=0)

In [None]:
# Fit on the multi-class training split (target flattened to 1-D).
output_lgbm_mdl = output_lgbm_classifer.fit(m_X_train, m_y_train.ravel())

In [None]:
# Hard class predictions on the hold-out split; the probability call is left commented out.
m_y_pred = output_lgbm_mdl.predict(m_X_test)
#m_prob_pred = output_lgbm_mdl.predict_proba(m_X_test)

In [None]:
# Only the label-based report is printed; the grid-search and probability-based lines are commented out.
#print("tuned hpyerparameters :(best parameters) ",m1_grid_mdls1.best_params_)
#print("performance metric :",m1_grid_mdls1.best_score_)
#print("roc auc:", metrics.roc_auc_score(m_y_test.ravel(),m_prob_pred,multi_class = 'ovo'))
print('classification report:', metrics.classification_report(m_y_test, m_y_pred), sep='\n')

# THE END OF MULTICLASS MODEL BUILDING

### light GBM

In [None]:
#classifer estimator
# subsample_freq=1: LightGBM applies row subsampling (bagging) only when the bagging
# frequency is non-zero. With the previous value 0 every `subsample` value searched
# below produced the same model, so a third of the grid was redundant work.
lgbm_estimator = LGBMClassifier(boosting_type='gbdt', num_leaves=31, learning_rate=0.1, n_estimators=100,
                   subsample_for_bin=200000, objective=None,
                   min_split_gain=0.0, min_child_weight=0.001, min_child_samples=20,
                   subsample=1.0, subsample_freq=1, colsample_bytree=1.0,
                   reg_alpha=0.0, reg_lambda=0.0, random_state=1)
#parameters
# keys are prefixed with the pipeline step name 'classifier'
param_grid = dict(classifier__learning_rate = [0.001,0.1,1],
                  classifier__n_estimators = [100,150],
                  classifier__num_leaves = [10,31,100],
                  classifier__subsample = [0.5,0.8,1],
                  classifier__colsample_bytree = [0.5,0.8,1],
                  classifier__min_child_weight = [0.001,1]
                 )
n_folds = 5
val_metric = "roc_auc"

# buildmodel reads X_train / X_test / y_train / y_test from the notebook namespace.
grid_lgbm_mdl = buildmodel(lgbm_estimator,param_grid,n_folds,val_metric)

In [None]:
#save model
#mdl_pkl_name = "best_mdl_simple_allknn_lightgbm.pkl"
#with open(mdl_pkl_name, 'wb') as file:
#    pickle.dump(grid_lgbm_mdl, file)
# Load the Model back from file
#with open(mdl_pkl_name, 'rb') as file:
#    lr_mdl = pickle.load(file)

In [None]:
#feature importance
# Plot the importances of the best LightGBM pipeline and keep the ten largest.
# NOTE(review): encoded_feature_names is not defined in any visible cell -- confirm where it is built.
top10features = featr_imprt(grid_lgbm_mdl.best_estimator_,encoded_feature_names)
print('top 10 important features:',top10features)

## Loop searching only for logistic regression
* loop by method of undersampling methods and imputation methods

In [None]:
#imbmethods = [NearMiss(version = 3),
#              OneSidedSelection(random_state=0),
#              AllKNN(),
#              NeighbourhoodCleaningRule()]
#imputednames = ['Simple','KNN']
#imputedDFs = [mdl_df_imputed_simple,mdl_df_imputed_knn]

# Active configuration: a single sampler and a single imputed frame.
imbmethods = [NearMiss(version=3)]
imputednames = ['Simple']
imputedDFs = [mdl_df_imputed_simple]

# logistic regression and the grid over its C parameter
lr_estimator = LogisticRegression(max_iter=1000)
C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
C_grid = {'classifier__C': C_values}
n_folds = 5
val_metric = "roc_auc"
# fitted grid searches, keyed by "<imputation name><sampler class name>"
grid_lr_mdl = {}

for imp_method_name, imputed_dt in zip(imputednames, imputedDFs):
    for imb_method in imbmethods:
        X_resampled, y_resampled = get_modeldt_imb_imp(imb_method, imp_method_name, imputed_dt)
        # buildmodel picks these four names up from the notebook namespace
        X_train, X_test, y_train, y_test = train_test_split(
            X_resampled, y_resampled, test_size=0.3, random_state=0
        )

        run_key = imp_method_name + type(imb_method).__name__
        grid_lr_mdl[run_key] = buildmodel(lr_estimator, C_grid, n_folds, val_metric)

## Loop searching for all classifiers(split train and test datasets after resampling)
* by imputation methods
* by undersampling methods
* by classifier

In [None]:
# 'None' is a sentinel meaning "no resampling"; every other entry is a sampler instance.
imbmethods = ['None',
              NearMiss(version = 3),
              OneSidedSelection(random_state=0),
              AllKNN(),
              NeighbourhoodCleaningRule(),
              TomekLinks()]
imputednames = ['Simple','KNN']
imputedDFs = [mdl_df_imputed_simple,mdl_df_imputed_knn]

# subsample_freq=1: LightGBM applies row subsampling only when the bagging frequency is
# non-zero; with 0 the `subsample` values in the grid below would all give the same model.
classifiers = [LogisticRegression(max_iter = 1000),
              LGBMClassifier(boosting_type='gbdt', num_leaves=31, learning_rate=0.1, n_estimators=100,
                   subsample_for_bin=200000, objective=None,
                   min_split_gain=0.0, min_child_weight=0.001, min_child_samples=20,
                   subsample=1.0, subsample_freq=1, colsample_bytree=1.0,
                   reg_alpha=0.0, reg_lambda=0.0, random_state=1)]
# one grid per classifier, in the same order as `classifiers`
classifiers_paras = [dict(classifier__C = [0.001, 0.01, 0.1, 1, 10, 100, 1000]),
                     dict(classifier__learning_rate = [0.001,0.1,1],
                          classifier__n_estimators = [100,150],
                          classifier__num_leaves = [10,31,100],
                          classifier__subsample = [0.5,0.8,1],
                          classifier__colsample_bytree = [0.5,0.8,1],
                          classifier__min_child_weight = [0.001,1])
                    ]

n_folds = 5
val_metric = "roc_auc"
# fitted grid searches, keyed by "<imputation><sampler><classifier>"
grid_mdl = dict()

for imp_method_name,imputed_dt in zip(imputednames,imputedDFs):
    for imb_method in imbmethods:
        if (imb_method == 'None'):
            # label the run 'None' rather than with the sentinel's type name ('str')
            imb_name = 'None'
            print(imp_method_name)
            print('None sampling')
            X_train, X_test, y_train, y_test = train_test_split(imputed_dt[feature_columns].to_numpy(), imputed_dt[['y_var']].to_numpy(), test_size = 0.3, random_state=0)
        else:
            imb_name = type(imb_method).__name__
            # the whole data set is resampled first and split afterwards
            X_resampled, y_resampled = get_modeldt_imb_imp(imb_method,imp_method_name,imputed_dt)
            X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size = 0.3, random_state=0)

        for param_grid, estimator in zip(classifiers_paras,classifiers):
            # buildmodel reads X_train / X_test / y_train / y_test from the notebook namespace
            grid_mdl[(imp_method_name + imb_name + type(estimator).__name__)] =  buildmodel(estimator,param_grid,n_folds,val_metric)
        

## Loop searching for all classifiers(only resampling train data)
* by imputation methods
* by undersampling methods
* by classifier

In [None]:
# 'None' is a sentinel meaning "no resampling"; every other entry is a sampler instance.
imbmethods = ['None',
              NearMiss(version = 3),
              OneSidedSelection(random_state=0),
              AllKNN(),
              NeighbourhoodCleaningRule(),
              TomekLinks()
             ]
imputednames = ['Simple','KNN']
imputedDFs = [mdl_df_imputed_simple,mdl_df_imputed_knn]

# subsample_freq=1: LightGBM applies row subsampling only when the bagging frequency is
# non-zero; with 0 the `subsample` values in the grid below would all give the same model.
classifiers = [LogisticRegression(max_iter = 1000),
              LGBMClassifier(boosting_type='gbdt', num_leaves=31, learning_rate=0.1, n_estimators=100,
                   subsample_for_bin=200000, objective=None,
                   min_split_gain=0.0, min_child_weight=0.001, min_child_samples=20,
                   subsample=1.0, subsample_freq=1, colsample_bytree=1.0,
                   reg_alpha=0.0, reg_lambda=0.0, random_state=1)]
# one grid per classifier, in the same order as `classifiers`
classifiers_paras = [dict(classifier__C = [0.001, 0.01, 0.1, 1, 10, 100, 1000]),
                     dict(classifier__learning_rate = [0.001,0.1,1],
                          classifier__n_estimators = [100,150],
                          classifier__num_leaves = [10,31,100],
                          classifier__subsample = [0.5,0.8,1],
                          classifier__colsample_bytree = [0.5,0.8,1],
                          classifier__min_child_weight = [0.001,1])
                    ]

n_folds = 5
val_metric = "roc_auc"
# fitted grid searches, keyed by "<imputation><sampler><classifier>"
grid_mdl = dict()

for imp_method_name,imputed_dt in zip(imputednames,imputedDFs):
    print(imp_method_name)

    for imb_method in imbmethods:
        # split again on every pass because X_train / y_train are overwritten below
        X_train, X_test, y_train, y_test = train_test_split(imputed_dt[feature_columns].to_numpy(), imputed_dt[['y_var']].to_numpy(), test_size = 0.3, random_state=0)
        if (imb_method == 'None'):
            # label the run 'None' rather than with the sentinel's type name ('str')
            imb_name = 'None'
            print('None sampling')
        else:
            imb_name = type(imb_method).__name__
            print(imb_name)
            # resample the training part only, passing a 1-D target; the test part stays untouched
            X_train, y_train = imb_method.fit_resample(X_train, y_train.ravel())

        for param_grid, estimator in zip(classifiers_paras,classifiers):
            # buildmodel reads X_train / X_test / y_train / y_test from the notebook namespace
            grid_mdl[(imp_method_name + imb_name + type(estimator).__name__)] =  buildmodel(estimator,param_grid,n_folds,val_metric)
        

# imbalance analysis

In [None]:
def plot_2d_space(X, y, label='Classes'):
    """Scatter-plot the first two columns of X, one colour/marker per class.

    X: array indexed as ``X[mask, 0]`` and ``X[mask, 1]``
    y: array of class labels aligned with the rows of X
    label: text used as the plot title

    Every distinct label in y is drawn. Colours and markers are cycled when there
    are more classes than palette entries; the first two classes keep the
    colours and markers used originally.
    """
    colors = ['#1F77B4', '#FF7F0E', '#2CA02C', '#D62728', '#9467BD', '#8C564B']
    # a different length than `colors`, so colour/marker pairs repeat later
    markers = ['o', 's', '^', 'D', 'v']
    for i, l in enumerate(np.unique(y)):
        in_class = (y == l)
        plt.scatter(
            X[in_class, 0],
            X[in_class, 1],
            c=colors[i % len(colors)],
            label=l,
            marker=markers[i % len(markers)]
        )
    plt.title(label)
    plt.legend(loc='upper right')
    plt.show()

In [None]:
# 2-D view of all model data (population): project the features onto two PCA components.
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
# X holds the 2-component projection, y the flattened target; both feed the plots below.
X = pca.fit_transform(mdl_df_imputed_simple[feature_columns].to_numpy())
y = mdl_df_imputed_simple[['y_var']].to_numpy().ravel()

In [None]:
# Baseline picture of the projected data before any under-sampling.
plot_2d_space(X, y, 'Imbalanced dataset (2 PCA components)')

### TomekLinks

In [None]:
from imblearn.under_sampling import TomekLinks

# Under-sample the 2-D PCA projection with Tomek links.
tl = TomekLinks()
# fit_resample is the sampler entry point used elsewhere in this notebook;
# the older fit_sample alias is no longer available in current imbalanced-learn.
X_tl, y_tl = tl.fit_resample(X, y)

#print('Removed indexes:', id_tl)
# remaining number of rows and the percentage of the original rows kept
print('sampling size%:',X_tl.shape[0], 100 * X_tl.shape[0]/X.shape[0])

plot_2d_space(X_tl, y_tl, 'Tomek links under-sampling')

In [None]:
from imblearn.under_sampling import AllKNN
# Under-sample the 2-D PCA projection with AllKNN.
ak = AllKNN()
# fit_resample is the sampler entry point used elsewhere in this notebook;
# the older fit_sample alias is no longer available in current imbalanced-learn.
X_ak, y_ak = ak.fit_resample(X, y)
# apply the format string with %; passing the values as extra print arguments
# printed the literal "%d" placeholders
print('sampling size: %d, percentage: %d' % (X_ak.shape[0], 100 * X_ak.shape[0] / X.shape[0]))
plot_2d_space(X_ak, y_ak, 'ALLKNN under-sampling')

### compare feature distributions between population and sample

In [None]:
# Under-sample the full feature matrix (not the PCA projection) with the AllKNN sampler `ak`.
# fit_resample is the sampler entry point used elsewhere in this notebook;
# the older fit_sample alias is no longer available in current imbalanced-learn.
X_ak,y_ak = ak.fit_resample(mdl_df_imputed_simple[feature_columns].to_numpy(), mdl_df_imputed_simple[['y_var']].to_numpy().ravel())
# Reassemble features and target into one frame for comparison with the population.
Xy_ak_df = pd.concat([pd.DataFrame(data = X_ak, columns = (feature_columns)), pd.DataFrame(data = y_ak, columns = (['y_var']))], axis = 1)

In [None]:
#one way summary frequency table
from IPython.display import display

dt = mdl_df_imputed_simple
for col in categ_columns3:
    print(col)
    counts = dt[col].value_counts(dropna = False)
    percents = 100 * dt[col].value_counts(dropna = False, normalize = True)
    # An expression inside a loop body is not a top-level cell expression and is
    # never echoed by the notebook, so the table has to be displayed explicitly.
    display(pd.concat([counts, percents], axis = 1, keys = ['count', 'percent']))

In [None]:
# histogram for numeric features (one panel per column, 100 bins each)
dt = mdl_df_imputed_simple
# squeeze=False always yields an array of axes, so a single numeric column works too
fig, axs = plt.subplots(len(num_columns), 1, sharey=False, tight_layout=False, figsize=(10, 20), squeeze=False)
axs = axs.ravel()
for ax, col in zip(axs, num_columns):
    ax.hist(dt[col], 100)
    # name the panel so each histogram can be read on its own
    ax.set_title(col)
plt.tight_layout()
plt.show()