<a href="https://colab.research.google.com/github/chervov/Test01/blob/master/bench_automl_v01_0010.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import os 

# classifiers: 
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import  RandomForestClassifier
import lightgbm as lgb
import xgboost as xgb
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# datatsets: 
import sklearn.datasets

# auxilliary: metrics etc. 
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score,train_test_split



In [0]:
def get_dataset(dataset_id  ):
  '''
  Returns X, y - feature dataframe, and y - target, 
  
  Input:
  dataset_id - identificator of dataset 
  
  '''
  if dataset_id == 'Randn 1000 10 p001':
    X = pd.DataFrame( np.random.randn(1000,10) )
    y = (np.random.rand(1000) <= 0.01 ).astype(float)
    return X,y
  if dataset_id == 'Make Classification 10000 40 20':
    X, y = sklearn.datasets.make_classification(n_samples=10000, n_features=40, n_informative=20, n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2, weights=None, flip_y=0.01, class_sep=1.0, hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=0) 
    X = pd.DataFrame( X )
    return X,y
  
  raise ValueError('Unknown dataset_id')
  

In [0]:
def get_clf(clf_id  ):
  '''
  Returns classifier class  
  Clf supports methods: .fit and .predict_proba
  
  Input:
  clf_id - identificator of classifier  
  '''
  
  if clf_id == 'lgb_1000':
    clf = lgb.LGBMClassifier(n_estimators = 1000)
    return clf 
    
  raise ValueError('Unknown clf_id')
  

In [42]:
start_time_0 = time.time()

list_dataset_id = [ 'Randn 1000 10 p001', 'Make Classification 10000 40 20' ]


list_clf_id = ['lgb_1000'] 

df_stat = pd.DataFrame() # Output statistics 
for dataset_id in list_dataset_id:
  X, y =   get_dataset(dataset_id) 
  for clf_id in list_clf_id:
    clf = get_clf(clf_id  )
    df_stat_random_state = pd.DataFrame()
    for random_state in range(10):
      print(dataset_id, clf_id,  random_state, time.time() - start_time_0, 'Seconds passed ' )
      X_train, X_test, y_train, y_test = train_test_split(X,y, stratify = y, random_state = random_state  )

      start_time_1 = time.time()
      
      clf.fit(X_train,y_train)
      p = clf.predict_proba(X_train)[:,1]
      roc_auc_train = roc_auc_score(y_train,p)
      
      p = clf.predict_proba(X_test)[:,1]
      roc_auc_test = roc_auc_score(y_test,p)

      df_stat_random_state.loc[random_state, 'Time'] = time.time() - start_time_1

      df_stat_random_state.loc[random_state, 'AUC Test'] = roc_auc_test
      df_stat_random_state.loc[random_state, 'AUC Train'] = roc_auc_train
      
  df_stat.loc[clf_id, dataset_id +  ' AUC Test' ] = df_stat_random_state['AUC Test'].mean()
  df_stat.loc[clf_id, dataset_id +  ' AUC Test std' ] = df_stat_random_state['AUC Test'].std()
  df_stat.loc[clf_id, dataset_id +  ' AUC Train' ] = df_stat_random_state['AUC Train'].mean()
  df_stat.loc[clf_id, dataset_id +  ' AUC Train std' ] = df_stat_random_state['AUC Train'].std()
  df_stat.loc[clf_id, dataset_id +  ' Time' ] = df_stat_random_state['Time'].mean()
  
      
print(time.time() - start_time_0, 'Seconds passed ')
df_stat
      
  

Randn 1000 10 p001 lgb_1000 0 0.003739595413208008 Seconds passed 
Randn 1000 10 p001 lgb_1000 1 0.2688436508178711 Seconds passed 
Randn 1000 10 p001 lgb_1000 2 0.5263102054595947 Seconds passed 
Randn 1000 10 p001 lgb_1000 3 0.7810399532318115 Seconds passed 
Randn 1000 10 p001 lgb_1000 4 1.0343003273010254 Seconds passed 
Randn 1000 10 p001 lgb_1000 5 1.276581048965454 Seconds passed 
Randn 1000 10 p001 lgb_1000 6 1.4948639869689941 Seconds passed 
Randn 1000 10 p001 lgb_1000 7 1.7255213260650635 Seconds passed 
Randn 1000 10 p001 lgb_1000 8 1.9869258403778076 Seconds passed 
Randn 1000 10 p001 lgb_1000 9 2.247584342956543 Seconds passed 
Make Classification 10000 40 20 lgb_1000 0 2.5405325889587402 Seconds passed 
Make Classification 10000 40 20 lgb_1000 1 11.829620599746704 Seconds passed 
Make Classification 10000 40 20 lgb_1000 2 21.152600049972534 Seconds passed 
Make Classification 10000 40 20 lgb_1000 3 30.39512324333191 Seconds passed 
Make Classification 10000 40 20 lgb_100

Unnamed: 0,Randn 1000 10 p001 AUC Test,Randn 1000 10 p001 AUC Test std,Randn 1000 10 p001 AUC Train,Randn 1000 10 p001 AUC Train std,Randn 1000 10 p001 Time,Make Classification 10000 40 20 AUC Test,Make Classification 10000 40 20 AUC Test std,Make Classification 10000 40 20 AUC Train,Make Classification 10000 40 20 AUC Train std,Make Classification 10000 40 20 Time
lgb_1000,0.702823,0.155296,1.0,0.0,0.014499,0.990781,0.001434,1.0,0.0,0.047995


In [14]:
X,y 


(array([[ 1.16138583,  0.47466997, -0.30158789, ...,  0.02240581,
          0.89247571, -0.19813418],
        [-1.0631909 ,  0.1417697 ,  1.12540204, ...,  2.06776172,
          0.27274267, -0.14811773],
        [ 2.14781014, -1.11143017,  0.08341629, ...,  0.36842522,
         -0.0679126 , -0.29693638],
        ...,
        [-0.31118917,  0.33440043,  0.38947091, ...,  1.51175227,
         -0.67015941,  1.59362693],
        [-0.73420322, -0.90228013,  0.05314848, ..., -0.71081685,
         -0.9821167 , -0.17993841],
        [ 2.16783149, -0.94022549,  0.34158301, ..., -0.79490623,
          0.33495137, -0.63340859]]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0