In [None]:
!pip install scikit-learn
!pip install pandas
!pip install scipy
!pip install imblearn
!pip install matplotlib
!pip install xgboost
!pip install dill

In [None]:
import csv
import time
import math
import pandas as pd
import numpy as np
import os
import gc
from scipy import stats
from sklearn.feature_selection import SelectFromModel
pd.set_option("display.max_rows", None, "display.max_columns", None)

In [None]:
#models
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
#scalers
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import normalize

In [None]:
#tools
from sklearn.feature_selection import VarianceThreshold
#from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.inspection import permutation_importance
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
import matplotlib.pyplot as plt
import dill

In [None]:
#dimensional reduction & feature selection
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif

In [None]:
#cross_val
from sklearn.model_selection import RepeatedStratifiedKFold

In [None]:
# Read the raw dataset and discard the library identifier column in one chain.
# (An earlier variant also dropped a 'Tags' column; only 'library_name' is dropped here.)
df = pd.read_csv("Dataset.csv", sep=",", encoding='utf-8').drop(columns=['library_name'])

# Report the number of rows, then the number of columns.
row_count, column_count = df.shape
print(row_count)
print(column_count)

# Summary statistics of the raw target, shown as the cell output.
df['reusability'].describe()

In [None]:
#df.iloc[:, 82:] = df.iloc[:, 82:].astype('bool')

In [None]:
# Module-level log of every column removed by data_preprocessing().
# It is appended to on every call and never reset.
dropped_columns = []
def data_preprocessing(df, split):
    """Binarise the target and filter out uninformative feature columns.

    Steps performed:
      1. Add a ``reuse_bin`` column to ``df``: 0 where ``reusability <= split``,
         1 where ``reusability > split`` (rows matching neither condition,
         e.g. NaN, get 0 from ``np.select``'s default).
      2. Drop every column whose Spearman correlation with ``reuse_bin`` is NaN.
      3. Drop every remaining feature whose ``scipy.stats.ttest_ind`` p-value
         against the label vector (rounded to 5 decimals) is greater than 0.05.

    Side effects:
      * ``df`` is modified IN PLACE (``reuse_bin`` is added, NaN-correlation
        columns are removed).
      * Names of all removed columns are appended to the module-level
        ``dropped_columns`` list.
      * Progress counts and the ``dropped_columns`` list are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``reusability`` column; all columns must be usable by
        ``DataFrame.corr``.
    split : number
        Threshold on ``reusability`` separating the low (0) and high (1) class.

    Returns
    -------
    (X, y) : (pandas.DataFrame, pandas.Series)
        The retained feature columns and the binary target.

    Raises
    ------
    ValueError
        If the threshold leaves only one class, in which case no correlation
        with the target can be computed.
    """
    print(len(df.columns))

    # Split high and low on the target variable.
    conditions = [(df['reusability'] <= split), (df['reusability'] > split)]
    values = [False, True]
    df['reuse_bin'] = np.select(conditions, values)

    if df['reuse_bin'].nunique() < 2:
        # With a constant target every correlation is NaN and all columns
        # (including the target itself) would be dropped below.
        raise ValueError(
            "split={!r} puts every row in the same class; "
            "cannot build a binary target".format(split))

    # Remove features whose Spearman correlation with the target is NaN.
    # Selection is done by label through a boolean mask, not by integer position.
    target_corr = df.corr(method='spearman')['reuse_bin']
    nan_corr_columns = target_corr.index[target_corr.isna()].tolist()
    df.drop(columns=nan_corr_columns, inplace=True)
    dropped_columns.extend(nan_corr_columns)

    print(len(df.columns))

    y = df['reuse_bin']
    X = df.drop(['reuse_bin', 'reusability'], axis=1)

    # Remove all insignificant features.
    # NOTE(review): this t-test compares each feature's values against the 0/1
    # label vector as two samples, rather than comparing the feature between
    # the two classes — confirm this is the intended test.
    insignificant = [col for col in X
                     if stats.ttest_ind(X[col], y)[1].round(5) > 0.05]
    X = X.drop(columns=insignificant)
    dropped_columns.extend(insignificant)

    print(len(X.columns))
    print(len(X.index))
    print(dropped_columns)
    return X, y

In [None]:
# Build the feature matrix and binary target used by every experiment below.
# NOTE(review): the variables are suffixed "_17" but the split threshold passed
# here is 21 — confirm which value is intended.
X_17, y_17 = data_preprocessing(df, 21)

In [None]:
class CLF():
    """Cross-validated evaluation of one classifier / preprocessing configuration.

    Constructing an instance immediately runs a repeated stratified k-fold
    loop. For every fold a fresh pipeline
    (resampler, variance threshold, scaler, dimensionality reduction,
    feature selection, model) is built, fitted and used to predict the test
    fold. Depending on ``permute`` the fold is then either scored
    (F1 / precision / recall of class ``'1'`` and accuracy) or its permutation
    importances are appended to a ``PI-*.csv`` file.

    After construction the attributes ``clf``, ``X_train``, ``X_test``,
    ``y_train``, ``y_test`` and ``y_pred`` refer to the LAST fold only.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix (``.iloc`` and ``.columns`` are used).
    y : pandas.Series or numpy.ndarray
        Binary target, row-aligned with ``X``.
    classifier : str
        Model key, see :meth:`model`.
    preprocess_method : str or None
        Scaler key, see :meth:`preprocess`.
    d_reduction : int
        ``1`` enables PCA with ``params['f']`` components; ``0`` disables it.
    f_selection : str or None
        ``'kb'`` (SelectKBest), ``'rf'`` (SelectFromModel) or ``None``.
    seed : int
        Random state of the CV splitter and of permutation importance.
        The pipeline components use the default ``seed`` of their own
        factory methods, not this value.
    vt : bool
        Add a ``VarianceThreshold`` step.
    resampling : bool
        Add a ``RandomOverSampler`` step.
    cv, cv_r : int
        Number of folds and number of repeats.
    permute : bool
        If true, write permutation importances instead of collecting scores.
    top : int or None
        Number of highest-ranked features written per fold when ``permute``
        is true (``None`` writes all of them).
    **params
        Model / selector hyper-parameters read by the factory methods
        (``n``, ``f``, ``l``, ``gamma``, ``kbest_f``, ``fs_n``).
    """

    def __init__(self, X, y, classifier, preprocess_method = None, d_reduction = 0,
                 f_selection = None, seed = 27033074, vt = False, resampling = False, 
                 cv = 4, cv_r = 10, permute = False, top = 10, **params):

        self.X = X
        self.y = y
        self.classifier = classifier
        self.pm = preprocess_method
        self.dr = d_reduction
        self.fs = f_selection
        self.vt = vt
        self.resampling = resampling
        self.p = params
        self.results_array = [[], [], [], []]  #f1, pre, rec, acc

        self.skf = RepeatedStratifiedKFold(n_splits = cv, n_repeats = cv_r, random_state=seed)
        for train_index, test_index in self.skf.split(X, y):
            self._set_fold(train_index, test_index)
            # The pipeline must be built AFTER the fold is set: the 'n'
            # preprocessing option rewrites X_train / X_test while it is built.
            self.clf = self._build_pipeline()
            self.clf.fit(self.X_train, self.y_train)
            self.y_pred = self.clf.predict(self.X_test)
            if permute:
                self._append_fold_importances(seed, top)
            else:
                self._record_fold_scores()

    @staticmethod
    def _take(values, positions):
        """Select rows by integer position from a pandas object or an array."""
        indexer = getattr(values, 'iloc', None)
        return values[positions] if indexer is None else indexer[positions]

    def _set_fold(self, train_index, test_index):
        """Store the train / test partitions for the given positional indices."""
        self.X_train, self.X_test = self.X.iloc[train_index], self.X.iloc[test_index]
        # The splitter yields positions, so y is sliced positionally as well;
        # label-based lookup only works by accident on a default RangeIndex.
        self.y_train = self._take(self.y, train_index)
        self.y_test = self._take(self.y, test_index)

    def _build_pipeline(self):
        """Assemble the six-slot pipeline; disabled slots are passed as None."""
        return make_pipeline(self.sampling(), self.variance_t(), self.preprocess(),
                             self.dim_reduction(), self.fea_selection(), self.model())

    def _record_fold_scores(self):
        """Append the current fold's F1, precision, recall and accuracy."""
        self.results_array[0].append(self.get_f1())
        self.results_array[1].append(self.get_pre())
        self.results_array[2].append(self.get_rec())
        self.results_array[3].append(self.get_acc())

    def _append_fold_importances(self, seed, top):
        """Compute test-fold permutation importance and append the top rows to CSV."""
        print('Computing permutation importance...')
        start = time.time()
        result = permutation_importance(self.clf, self.X_test, self.y_test, random_state=seed, n_jobs=-1)
        print('Process completed at ' + str(round((time.time() - start)/60, 3)) + ' min')
        path = "PI-" + str(self.get_model())[:10] + "_" + str(self.get_preprocess()) + ".csv"
        self._write_importances(path, 'a', result, limit=top, extra=[self.get_f1()])

    def _write_importances(self, path, mode, result, limit = None, extra = ()):
        """Write (feature, mean, std, *extra) rows, most important feature first.

        ``limit`` caps the number of rows (``None`` writes every feature).
        """
        ranked = result.importances_mean.argsort()[::-1][:limit]
        # newline='' is what the csv module requires to avoid blank rows on Windows.
        with open(path, mode, newline='') as csv_file:
            writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            for i in ranked:
                writer.writerow([self.X.columns[i],
                                 round(result.importances_mean[i], 3),
                                 round(result.importances_std[i], 3),
                                 *extra])

    def _positive_class_report(self):
        """Return the classification-report entry of class '1' for the current fold."""
        report = classification_report(self.y_test, self.y_pred, output_dict=True, zero_division=0)
        return report['1']

    def variance_t(self, t = 0.8 * (1 - 0.8)):
        """Return a VarianceThreshold(threshold=t) step, or None when ``vt`` is off."""
        if self.vt:
            return VarianceThreshold(threshold = t)

        return None

    def sampling(self):
        """Return a minority-class RandomOverSampler step, or None when resampling is off."""
        if self.resampling:
            return RandomOverSampler(sampling_strategy='minority', random_state=27033074)

        return None

    def preprocess(self, seed = 27033074):
        """Return the scaler step selected by ``preprocess_method`` (or None).

        Keys: 'ss' StandardScaler, 'mms' MinMaxScaler, 'mas' MaxAbsScaler,
        'rs' RobustScaler, 'pty' PowerTransformer (default method),
        'ptb' PowerTransformer(box-cox), 'qtu' / 'qtn' QuantileTransformer
        (default / normal output), 'n' column-wise ``normalize``.
        Any other value yields None.
        """
        if self.pm == 'ss':
            return StandardScaler()
        elif self.pm == 'mms':
            return MinMaxScaler()
        elif self.pm == 'mas':
            return MaxAbsScaler()
        elif self.pm == 'rs':
            return RobustScaler()
        elif self.pm == 'pty':
            return PowerTransformer()
        elif self.pm == 'ptb':
            return PowerTransformer(method = 'box-cox')
        elif self.pm == 'qtu':
            return QuantileTransformer(random_state = seed)
        elif self.pm == 'qtn':
            return QuantileTransformer(output_distribution = 'normal', random_state = seed)
        elif self.pm == 'n':
            # NOTE(review): unlike the other options this is not a pipeline
            # step. It rewrites the current fold in place, normalises train
            # and test independently (each with its own column norms) and runs
            # before resampling / variance thresholding — confirm intended.
            self.X_train = normalize(self.X_train, axis = 0)
            self.X_test = normalize(self.X_test, axis = 0)
            return None
        else:
            return None

    def dim_reduction(self, seed = 27033074):
        """Return PCA(n_components=params['f']) when ``d_reduction == 1``, else None."""
        if self.dr == 1:
            return PCA(n_components=self.p['f'], random_state=seed)

        return None

    def fea_selection(self, seed = 27033074):
        """Return the feature-selection step for ``f_selection`` (or None).

        'kb' -> SelectKBest(params['kbest_f'], k=params['f']);
        'rf' -> SelectFromModel over a random forest with params['fs_n'] trees,
        keeping params['f'] features.
        """
        if self.fs is None:
            return None
        elif self.fs == 'kb':
            return SelectKBest(self.p['kbest_f'], k=self.p['f'])
        elif self.fs == 'rf':
            return SelectFromModel(RandomForestClassifier(n_estimators = self.p['fs_n'], n_jobs = -1, random_state=seed), 
                                   threshold=-np.inf, max_features=self.p['f'])
        else:
            return None

    def model(self, seed = 27033074):
        """Instantiate the estimator selected by ``classifier``.

        Raises
        ------
        ValueError
            If ``classifier`` is not one of the known keys. Failing here gives
            a clear error instead of a pipeline without a final estimator.
        """
        if self.classifier == 'rf':
            return RandomForestClassifier(n_estimators = self.p['n'], n_jobs = -1, random_state=seed)
        elif self.classifier == 'knn':
            return KNeighborsClassifier(n_neighbors = self.p['n'], n_jobs = -1)
        elif self.classifier == 'mlp':
            return MLPClassifier(random_state = seed, max_iter=10000)
        elif self.classifier == 'dt':
            return DecisionTreeClassifier(random_state = seed)
        elif self.classifier == 'sgd':
            return SGDClassifier(loss = self.p['l'], n_jobs = -1, random_state = seed)
        elif self.classifier == 'r':
            return RidgeClassifierCV(cv = self.p['n'])
        elif self.classifier == 'svm':
            return SVC(gamma = self.p['gamma'], random_state = seed)
        elif self.classifier == 'xg':
            return xgb.XGBClassifier(random_state = seed)
        elif self.classifier == 'gpc':
            return GaussianProcessClassifier(random_state = seed)
        elif self.classifier == 'qda':
            return QuadraticDiscriminantAnalysis()
        elif self.classifier == 'ada':
            return AdaBoostClassifier(random_state = seed)
        elif self.classifier == 'gb':
            return GradientBoostingClassifier(n_estimators=self.p['n'], random_state=seed)
        else:
            raise ValueError('No input model! Unknown classifier key: {!r}'.format(self.classifier))

    def mean_results_array(self):
        """Return one result row: mean scores over all folds plus the configuration.

        The ``support`` entry is taken from the last fold only.
        """
        return [np.mean(self.results_array[0]), np.mean(self.results_array[1]), 
                np.mean(self.results_array[2]), np.mean(self.results_array[3]), 
                self.get_model(), self.num_of_features(), self.get_preprocess(), self.fs, self.dr, self.vt, 
                self._positive_class_report()['support'],
                self.resampling]

    def write_result(self, filename):
        """Append the result row to ``<filename>.csv``, writing the header for a new file."""
        path = filename + '.csv'
        needs_header = not os.path.isfile(path)

        with open(path, 'a', newline='') as csv_file:
            writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            if needs_header:
                writer.writerow(['F1-Score', 'Precision', 'Recall', 'Accuracy', 'Model', 
                                 'Num_Features', 'Preprocessing', 'Fea_Selection', 
                                 'Dim_Reduction', 'Var_Threshold', 'support', 'resampling'])
            writer.writerow(self.mean_results_array())

        return

    def num_of_features(self):
        """Return ``params['f']`` when given, otherwise the number of columns of X."""
        if 'f' in self.p:
            return self.p['f']
        return len(self.X.columns)

    def get_preprocess(self):
        """Return 'normalize' for the 'n' option, else the pipeline's third step.

        Index 2 is the scaler slot in the order used to build the pipeline
        (presumably None steps are kept as placeholders — verify against the
        installed imblearn / scikit-learn version).
        """
        if self.pm == 'n':
            return 'normalize'
        else:
            return self.clf[2]

    def get_model(self):
        """Return the final estimator of the last fitted pipeline."""
        return self.clf[-1]

    def get_model_params(self):
        """Return the parameters of the final estimator."""
        return self.clf[-1].get_params()

    def __str__(self):
        return str(self.clf.steps)

    def get_acc(self):
        """Accuracy of the last fitted pipeline on the current test fold (3 decimals)."""
        return round(self.clf.score(self.X_test, self.y_test), 3)

    def get_f1(self):
        """F1-score of class '1' on the current test fold (3 decimals)."""
        return round(self._positive_class_report()['f1-score'], 3)

    def get_pre(self):
        """Precision of class '1' on the current test fold (3 decimals)."""
        return round(self._positive_class_report()['precision'], 3)

    def get_rec(self):
        """Recall of class '1' on the current test fold (3 decimals)."""
        return round(self._positive_class_report()['recall'], 3)

    def get_cmatrix(self):
        """Confusion matrix of the current test fold."""
        return confusion_matrix(self.y_test, self.y_pred)

    def p_importance(self, train_or_test, seed = 27033074):
        """Write the permutation importance of every feature to a ``PI-*.csv`` file.

        ``train_or_test`` selects the partition of the last fold ('train' or
        'test'); any other value prints a hint and returns without writing.
        The file is overwritten.
        """
        print('Computing permutation importance...')
        start = time.time()
        if train_or_test == 'train':
            result = permutation_importance(self.clf, self.X_train, self.y_train, random_state=seed, n_jobs=-1)
        elif train_or_test == 'test':
            result = permutation_importance(self.clf, self.X_test, self.y_test, random_state=seed, n_jobs=-1)
        else:
            print('Input train or test!')
            return

        print('Process completed at ' + str(round((time.time() - start)/60, 3)) + ' min')

        path = "PI-" + str(self.get_model()) + "_" + str(self.get_preprocess()) + ".csv"
        self._write_importances(path, 'w', result)

        return

    def f_importance(self):
        """Write the final estimator's ``feature_importances_`` to a ``FI-*.csv`` file.

        Known limitation: importances are paired with the columns of the
        ORIGINAL X. When a pipeline step removes or transforms features the
        names no longer line up and ``zip`` silently truncates.
        """
        x = list(zip(self.clf[-1].feature_importances_, self.X.columns.values))
        x = pd.DataFrame(x, columns=["Importance","Feature_Name"])
        x = x.sort_values(by=['Importance'], ascending=False)
        x.to_csv('FI-' + str(self.get_model()) + "_" + str(self.get_preprocess()) +  ".csv", index=False)
        return

    def coefs(self):
        """Alias of :meth:`f_importance` (the two implementations were identical).

        NOTE(review): the name suggests this was meant to export ``coef_`` of
        linear models; it reads ``feature_importances_`` — confirm intent.
        """
        return self.f_importance()

In [None]:
# Baseline grid: every model with every preprocessing option, no resampling and
# no variance threshold. One result row per configuration is appended to
# 'test.csv'. Configurations run model by model, in the order listed below.
BASELINE_MODELS = [
    ('rf',  {'n': 500}),
    ('mlp', {}),
    ('svm', {'gamma': 'auto'}),
    ('knn', {'n': 10}),
    ('sgd', {'l': 'log'}),
    ('dt',  {}),
    ('r',   {'n': None}),
    ('xg',  {}),
    ('gpc', {}),
    ('ada', {}),
    ('gb',  {'n': 500}),
]
# None = no preprocessing; the remaining keys are the scaler codes accepted by CLF.
BASELINE_PREPROCESSORS = [None, 'ss', 'mms', 'mas', 'rs', 'pty', 'qtu', 'qtn', 'n']

for model_key, model_params in BASELINE_MODELS:
    for preprocess_key in BASELINE_PREPROCESSORS:
        CLF(X_17, y_17, model_key, preprocess_key, **model_params).write_result('test')

In [None]:
# Resampled grid: every model with every preprocessing option, with random
# over-sampling enabled. One result row per configuration is appended to
# 'test_resampled.csv'. Configurations run model by model, in the order listed below.
RESAMPLED_MODELS = [
    ('rf',  {'n': 500}),
    ('mlp', {}),
    ('svm', {'gamma': 'auto'}),
    ('knn', {'n': 10}),
    ('sgd', {'l': 'log'}),
    ('dt',  {}),
    ('r',   {'n': None}),
    ('xg',  {}),
    ('gpc', {}),
    ('ada', {}),
    ('gb',  {'n': 500}),
]
# None = no preprocessing; the remaining keys are the scaler codes accepted by CLF.
RESAMPLED_PREPROCESSORS = [None, 'ss', 'mms', 'mas', 'rs', 'pty', 'qtu', 'qtn', 'n']

for model_key, model_params in RESAMPLED_MODELS:
    for preprocess_key in RESAMPLED_PREPROCESSORS:
        CLF(X_17, y_17, model_key, preprocess_key,
            resampling = True, **model_params).write_result('test_resampled')

In [None]:
# Variance-threshold grid: each call below enables vt=True (no resampling) and
# appends one result row to 'test_vt_no_resampling.csv'.
# The 'n' (normalize) variant of every model is commented out in this grid.
# NOTE(review): presumably because column-normalised features fall below the
# variance threshold — confirm before re-enabling.
CLF(X_17, y_17, 'rf', n=500, vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'rf','ss', n=500, vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'rf','mms', n=500, vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'rf','mas', n=500, vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'rf','rs', n=500, vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'rf','pty', n=500, vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'rf','qtu', n=500, vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'rf','qtn', n=500, vt=True).write_result('test_vt_no_resampling')
#CLF(X_17, y_17, 'rf','n', n=500, vt=True).write_result('test_vt_no_resampling')

CLF(X_17, y_17, 'mlp', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'mlp','ss', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'mlp','mms', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'mlp','mas', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'mlp','rs', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'mlp','pty', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'mlp','qtu', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'mlp','qtn', vt=True).write_result('test_vt_no_resampling')
#CLF(X_17, y_17, 'mlp','n', vt=True).write_result('test_vt_no_resampling')

CLF(X_17, y_17, 'svm', gamma='auto', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'svm','ss', gamma='auto', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'svm','mms', gamma='auto', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'svm','mas', gamma='auto', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'svm','rs', gamma='auto', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'svm','pty', gamma='auto', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'svm','qtu', gamma='auto', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'svm','qtn', gamma='auto', vt=True).write_result('test_vt_no_resampling')
#CLF(X_17, y_17, 'svm','n', gamma='auto', vt=True).write_result('test_vt_no_resampling')

CLF(X_17, y_17, 'knn', n=10, vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'knn','ss', n=10, vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'knn','mms', n=10, vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'knn','mas', n=10, vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'knn','rs', n=10, vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'knn','pty', n=10, vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'knn','qtu', n=10, vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'knn','qtn', n=10, vt=True).write_result('test_vt_no_resampling')
#CLF(X_17, y_17, 'knn','n', n=10, vt=True).write_result('test_vt_no_resampling')

CLF(X_17, y_17, 'sgd', l='log', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'sgd','ss', l='log', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'sgd','mms', l='log', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'sgd','mas', l='log', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'sgd','rs', l='log', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'sgd','pty', l='log', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'sgd','qtu', l='log', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'sgd','qtn', l='log', vt=True).write_result('test_vt_no_resampling')
#CLF(X_17, y_17, 'sgd','n', l='log', vt=True).write_result('test_vt_no_resampling')

CLF(X_17, y_17, 'dt', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'dt','ss', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'dt','mms', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'dt','mas', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'dt','rs', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'dt','pty', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'dt','qtu', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'dt','qtn', vt=True).write_result('test_vt_no_resampling')
#CLF(X_17, y_17, 'dt','n', vt=True).write_result('test_vt_no_resampling')


CLF(X_17, y_17, 'r', n=None, vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'r','ss', n=None, vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'r','mms', n=None, vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'r','mas', n=None, vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'r','rs', n=None, vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'r','pty', n=None, vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'r','qtu', n=None, vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'r','qtn', n=None, vt=True).write_result('test_vt_no_resampling')
#CLF(X_17, y_17, 'r','n', n=None, vt=True).write_result('test_vt_no_resampling')

CLF(X_17, y_17, 'xg', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'xg','ss', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'xg','mms', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'xg','mas', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'xg','rs', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'xg','pty', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'xg','qtu', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'xg','qtn', vt=True).write_result('test_vt_no_resampling')
#CLF(X_17, y_17, 'xg','n', vt=True).write_result('test_vt_no_resampling')

CLF(X_17, y_17, 'gpc', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'gpc','ss', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'gpc','mms', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'gpc','mas', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'gpc','rs', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'gpc','pty', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'gpc','qtu', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'gpc','qtn', vt=True).write_result('test_vt_no_resampling')
#CLF(X_17, y_17, 'gpc','n', vt=True).write_result('test_vt_no_resampling')

CLF(X_17, y_17, 'ada', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'ada','ss', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'ada','mms', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'ada','mas', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'ada','rs', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'ada','pty', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'ada','qtu', vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'ada','qtn', vt=True).write_result('test_vt_no_resampling')
#CLF(X_17, y_17, 'ada','n', vt=True).write_result('test_vt_no_resampling')

CLF(X_17, y_17, 'gb', n=500, vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'gb','ss', n=500, vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'gb','mms', n=500, vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'gb','mas', n=500, vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'gb','rs', n=500, vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'gb','pty', n=500, vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'gb','qtu', n=500, vt=True).write_result('test_vt_no_resampling')
CLF(X_17, y_17, 'gb','qtn', n=500, vt=True).write_result('test_vt_no_resampling')
#CLF(X_17, y_17, 'gb','n', n=500, vt=True).write_result('test_vt_no_resampling')

In [None]:
# 'test_vt' grid: every model below is built with vt=True and resampling=True,
# once without a scaler argument and once per scaler code, and
# write_result('test_vt') is called on each instance.
vt_grid_scalers = ('ss', 'mms', 'mas', 'rs', 'pty', 'qtu', 'qtn')  # the 'n' scaler is not run in this grid

# (model code, model-specific keyword arguments), listed in execution order.
vt_grid_models = [
    ('rf', {'n': 500}),
    ('mlp', {}),
    ('svm', {'gamma': 'auto'}),
    ('knn', {'n': 10}),
    ('sgd', {'l': 'log'}),
    ('dt', {}),
    ('r', {'n': None}),
    ('xg', {}),
    ('gpc', {}),
    ('ada', {}),
    ('gb', {'n': 500}),
]
for model_code, model_kwargs in vt_grid_models:
    # First the run without a scaler argument, then one run per scaler code.
    CLF(X_17, y_17, model_code, **model_kwargs, vt=True,
        resampling=True).write_result('test_vt')
    for scaler_code in vt_grid_scalers:
        CLF(X_17, y_17, model_code, scaler_code, **model_kwargs, vt=True,
            resampling=True).write_result('test_vt')

In [None]:
# One hand-picked configuration per model; each is built and write_result('best')
# is called on it, in the order listed.
# NOTE(review): the feature-selection sweep cells further down use different
# settings for some of these models (rf with 'ss'; svm and r without vt;
# ada with 'pty' and no vt; gb with 'mas' and no vt) -- confirm which is intended.
best_runs = [
    # (positional model/scaler codes, keyword options)
    (('rf', 'mas'), {'n': 500, 'vt': True, 'resampling': True}),
    (('mlp', 'pty'), {'vt': True}),
    (('svm', 'qtu'), {'gamma': 'auto', 'vt': True, 'resampling': True}),
    (('knn', 'qtu'), {'n': 10}),
    (('sgd', 'n'), {'l': 'log'}),
    (('dt', 'qtn'), {'resampling': True}),
    (('r', 'qtu'), {'n': None, 'vt': True, 'resampling': True}),
    (('xg', 'n'), {'resampling': True}),
    (('gpc', 'n'), {}),
    (('ada', 'qtu'), {'vt': True, 'resampling': True}),
    (('gb', 'qtn'), {'n': 500, 'vt': True}),
]
for model_and_scaler, options in best_runs:
    CLF(X_17, y_17, *model_and_scaler, **options).write_result('best')




In [None]:
# Sweep CLF's `f` argument from 300 down to 10 (step 10) for 'rf' + 'ss' with
# f_selection='kb'; the mean of results_array[0] is stored per setting under
# the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(300, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'rf', 'ss', n=500, f_selection='kb', kbest_f=f_classif,
                     f=i, vt=True, resampling=True).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("rf-kbf-ss.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('RF (kbf)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('RF-kbf.png')

In [None]:
# Sweep CLF's `f` argument from 300 down to 10 (step 10) for 'rf' + 'ss' with
# f_selection='rf' (fs_n=500); the mean of results_array[0] is stored per
# setting under the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(300, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'rf', 'ss', n=500, f_selection='rf', fs_n=500,
                     f=i, vt=True, resampling=True).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("RF-rf-ss.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('RF (rf)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('RF-rf.png')

In [None]:
# Sweep CLF's `f` argument from 300 down to 10 (step 10) for 'rf' + 'ss' with
# d_reduction=1 (plotted as 'pca'); the mean of results_array[0] is stored per
# setting under the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(300, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'rf', 'ss', n=500, d_reduction=1,
                     f=i, vt=True, resampling=True).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("rf-pca-ss.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('RF (pca)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('RF-pca.png')

In [None]:
# Sweep CLF's `f` argument from 340 down to 10 (step 10) for 'mlp' + 'pty' with
# f_selection='kb'; the mean of results_array[0] is stored per setting under
# the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(340, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'mlp', 'pty', f_selection='kb', kbest_f=f_classif,
                     f=i, vt=True).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("mlp-kbf-pty.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('MLP (kbf)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('MLP-kbf.png')

In [None]:
# Sweep CLF's `f` argument from 340 down to 10 (step 10) for 'mlp' + 'pty' with
# f_selection='rf' (fs_n=500); the mean of results_array[0] is stored per
# setting under the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(340, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'mlp', 'pty', f_selection='rf', fs_n=500,
                     f=i, vt=True).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("mlp-rf-pty.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('MLP (rf)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('MLP-rf.png')

In [None]:
# Sweep CLF's `f` argument from 340 down to 10 (step 10) for 'mlp' + 'pty' with
# d_reduction=1 (plotted as 'pca'); the mean of results_array[0] is stored per
# setting under the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(340, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'mlp', 'pty', d_reduction=1,
                     f=i, vt=True).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("mlp-pca-pty.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('MLP (pca)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('MLP-pca.png')

In [None]:
# Sweep CLF's `f` argument from 330 down to 10 (step 10) for 'svm' + 'qtu' with
# f_selection='kb'; the mean of results_array[0] is stored per setting under
# the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(330, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'svm', 'qtu', gamma='auto', f_selection='kb', kbest_f=f_classif,
                     f=i, resampling=True).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("svm-kbf-qtu.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('SVM (kbf)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('SVM-kbf.png')

In [None]:
# Sweep CLF's `f` argument from 330 down to 10 (step 10) for 'svm' + 'qtu' with
# f_selection='rf' (fs_n=500); the mean of results_array[0] is stored per
# setting under the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(330, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'svm', 'qtu', gamma='auto', f_selection='rf', fs_n=500,
                     f=i, resampling=True).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("svm-rf-qtu.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('SVM (rf)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('SVM-rf.png')

In [None]:
# Sweep CLF's `f` argument from 330 down to 10 (step 10) for 'svm' + 'qtu' with
# d_reduction=1 (plotted as 'pca'); the mean of results_array[0] is stored per
# setting under the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(330, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'svm', 'qtu', gamma='auto', d_reduction=1,
                     f=i, resampling=True).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("svm-pca-qtu.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('SVM (pca)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('SVM-pca.png')

In [None]:
# Sweep CLF's `f` argument from 400 down to 10 (step 10) for 'knn' + 'qtu' with
# f_selection='kb'; the mean of results_array[0] is stored per setting under
# the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(400, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'knn', 'qtu', n=10, f_selection='kb', kbest_f=f_classif,
                     f=i).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("knn-kbf-qtu.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('KNN (kbf)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('KNN-kbf.png')

In [None]:
# Sweep CLF's `f` argument from 400 down to 10 (step 10) for 'knn' + 'qtu' with
# f_selection='rf' (fs_n=500); the mean of results_array[0] is stored per
# setting under the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(400, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'knn', 'qtu', n=10, f_selection='rf', fs_n=500,
                     f=i).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("knn-rf-qtu.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('KNN (rf)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('KNN-rf.png')

In [None]:
# Sweep CLF's `f` argument from 390 down to 10 (step 10) for 'knn' + 'qtu' with
# d_reduction=1 (plotted as 'pca'); the mean of results_array[0] is stored per
# setting under the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(390, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'knn', 'qtu', n=10, d_reduction=1,
                     f=i).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("knn-pca-qtu.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('KNN (pca)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('KNN-pca.png')

In [None]:
# Sweep CLF's `f` argument from 400 down to 10 (step 10) for 'sgd' + 'n' with
# f_selection='kb'; the mean of results_array[0] is stored per setting under
# the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(400, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'sgd', 'n', l='log', f_selection='kb', kbest_f=f_classif,
                     f=i).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("sgd-kbf-n.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('SGD (kbf)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('SGD-kbf.png')

In [None]:
# Sweep CLF's `f` argument from 400 down to 10 (step 10) for 'sgd' + 'n' with
# f_selection='rf' (fs_n=500); the mean of results_array[0] is stored per
# setting under the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(400, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'sgd', 'n', l='log', f_selection='rf', fs_n=500,
                     f=i).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("sgd-rf-n.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('SGD (rf)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('SGD-rf.png')

In [None]:
# Sweep CLF's `f` argument from 390 down to 10 (step 10) for 'sgd' + 'n' with
# d_reduction=1 (plotted as 'pca'); the mean of results_array[0] is stored per
# setting under the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(390, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'sgd', 'n', l='log', d_reduction=1,
                     f=i).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("SGD-pca-n.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('SGD (pca)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('SGD-pca.png')

In [None]:
# Sweep CLF's `f` argument from 400 down to 10 (step 10) for 'dt' + 'qtn' with
# f_selection='kb'; the mean of results_array[0] is stored per setting under
# the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(400, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'dt', 'qtn', f_selection='kb', kbest_f=f_classif,
                     f=i, resampling=True).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("dt-kbf-qtn.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('DT (kbf)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('DT-kbf.png')

In [None]:
# Sweep CLF's `f` argument from 400 down to 10 (step 10) for 'dt' + 'qtn' with
# f_selection='rf' (fs_n=500); the mean of results_array[0] is stored per
# setting under the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(400, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'dt', 'qtn', f_selection='rf', fs_n=500,
                     f=i, resampling=True).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("dt-rf-qtn.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('DT (rf)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('DT-rf.png')

In [None]:
# Sweep CLF's `f` argument from 390 down to 10 (step 10) for 'dt' + 'qtn' with
# d_reduction=1 (plotted as 'pca'); the mean of results_array[0] is stored per
# setting under the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(390, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'dt', 'qtn', d_reduction=1,
                     f=i, resampling=True).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("dt-pca-qtn.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('DT (pca)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('DT-pca.png')

In [None]:
# Sweep CLF's `f` argument from 400 down to 10 (step 10) for 'gpc' + 'n' with
# f_selection='kb'; the mean of results_array[0] is stored per setting under
# the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(400, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'gpc', 'n', f_selection='kb', kbest_f=f_classif,
                     f=i).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("gpc-kbf-n.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('GPC (kbf)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('GPC-kbf.png')

In [None]:
# Sweep CLF's `f` argument from 400 down to 10 (step 10) for 'gpc' + 'n' with
# f_selection='rf' (fs_n=500); the mean of results_array[0] is stored per
# setting under the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(400, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'gpc', 'n', f_selection='rf', fs_n=500,
                     f=i).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("gpc-rf-n.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('GPC (rf)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('GPC-rf.png')

In [None]:
# Sweep CLF's `f` argument from 390 down to 10 (step 10) for 'gpc' + 'n' with
# d_reduction=1 (plotted as 'pca'); the mean of results_array[0] is stored per
# setting under the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(390, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'gpc', 'n', d_reduction=1,
                     f=i).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("gpc-pca-n.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('GPC (pca)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('GPC-pca.png')

In [None]:
# Sweep CLF's `f` argument from 330 down to 10 (step 10) for 'r' + 'qtu' with
# f_selection='kb'; the mean of results_array[0] is stored per setting under
# the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(330, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'r', 'qtu', n=None, f_selection='kb', kbest_f=f_classif,
                     f=i, resampling=True).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("r-kbf-qtu.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('R (kbf)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('R-kbf.png')

In [None]:
# Sweep CLF's `f` argument from 330 down to 10 (step 10) for 'r' + 'qtu' with
# f_selection='rf' (fs_n=500); the mean of results_array[0] is stored per
# setting under the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(330, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'r', 'qtu', n=None, f_selection='rf', fs_n=500,
                     f=i, resampling=True).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("r-rf-qtu.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('R (rf)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('R-rf.png')

In [None]:
# Sweep CLF's `f` argument from 330 down to 10 (step 10) for 'r' + 'qtu' with
# d_reduction=1 (plotted as 'pca'); the mean of results_array[0] is stored per
# setting under the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(330, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'r', 'qtu', n=None, d_reduction=1,
                     f=i, resampling=True).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("r-pca-qtu.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('R (pca)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('R-pca.png')

In [None]:
# Sweep CLF's `f` argument from 400 down to 10 (step 10) for 'xg' + 'n' with
# f_selection='kb'; the mean of results_array[0] is stored per setting under
# the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(400, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'xg', 'n', f_selection='kb', kbest_f=f_classif,
                     f=i, resampling=True).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("xg-kbf-n.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('XG (kbf)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('XG-kbf.png')

In [None]:
# Sweep CLF's `f` argument from 400 down to 10 (step 10) for 'xg' + 'n' with
# f_selection='rf' (fs_n=500); the mean of results_array[0] is stored per
# setting under the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(400, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'xg', 'n', f_selection='rf', fs_n=500,
                     f=i, resampling=True).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("xg-rf-n.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('XG (rf)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('XG-rf.png')

In [None]:
# Sweep CLF's `f` argument from 390 down to 10 (step 10) for 'xg' + 'n' with
# d_reduction=1 (plotted as 'pca'); the mean of results_array[0] is stored per
# setting under the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(390, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'xg', 'n', d_reduction=1,
                     f=i, resampling=True).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("xg-pca-n.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('XG (pca)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('XG-pca.png')

In [None]:
# Sweep CLF's `f` argument from 330 down to 10 (step 10) for 'ada' + 'pty' with
# f_selection='kb'; the mean of results_array[0] is stored per setting under
# the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(330, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'ada', 'pty', f_selection='kb', kbest_f=f_classif,
                     f=i, resampling=True).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("ada-kbf-pty.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('ADA (kbf)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('ADA-kbf.png')

In [None]:
# Sweep CLF's `f` argument from 330 down to 10 (step 10) for 'ada' + 'pty' with
# f_selection='rf' (fs_n=500); the mean of results_array[0] is stored per
# setting under the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(330, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'ada', 'pty', f_selection='rf', fs_n=500,
                     f=i, resampling=True).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("ada-rf-pty.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('ADA (rf)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('ADA-rf.png')

In [None]:
# Sweep CLF's `f` argument from 330 down to 10 (step 10) for 'ada' + 'pty' with
# d_reduction=1 (plotted as 'pca'); the mean of results_array[0] is stored per
# setting under the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(330, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'ada', 'pty', d_reduction=1,
                     f=i, resampling=True).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("ada-pca-pty.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('ADA (pca)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('ADA-pca.png')

In [None]:
# Sweep CLF's `f` argument from 330 down to 10 (step 10) for 'gb' + 'mas' with
# f_selection='kb'; the mean of results_array[0] is stored per setting under
# the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(330, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'gb', 'mas', n=500, f_selection='kb', kbest_f=f_classif,
                     f=i).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("gb-kbf-mas.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('GB (kbf)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('GB-kbf.png')

In [None]:
# Sweep CLF's `f` argument from 330 down to 10 (step 10) for 'gb' + 'mas' with
# f_selection='rf' (fs_n=500); the mean of results_array[0] is stored per
# setting under the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(330, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'gb', 'mas', n=500, f_selection='rf', fs_n=500,
                     f=i).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("gb-rf-mas.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('GB (rf)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('GB-rf.png')

In [None]:
# Sweep CLF's `f` argument from 330 down to 10 (step 10) for 'gb' + 'mas' with
# d_reduction=1 (plotted as 'pca'); the mean of results_array[0] is stored per
# setting under the "F1-score" column.
x = []            # swept `f` values, in evaluation order
y = []            # mean score for the matching entry of x
best = [-1, -1]   # [f value, score] of the best setting seen so far

for i in range(330, 0, -10):
    f1 = np.mean(CLF(X_17, y_17, 'gb', 'mas', n=500, d_reduction=1,
                     f=i).results_array[0])
    x.append(i)
    y.append(f1)
    if f1 > best[1]:  # strict '>' keeps the first (largest f) setting on ties
        best = [i, f1]

df = pd.DataFrame({"Feature Selection Intensity": x, "F1-score": y})
df.to_csv("gb-pca-mas.csv", index=False)

print(best)
# Use a dedicated figure so the saved PNG holds only this sweep's curve, even
# when the backend does not discard the current figure between cells.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('GB (pca)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('F1-score')
fig.savefig('GB-pca.png')

In [None]:
def _log_feature_selection(*clf_args, **clf_kwargs):
    """Build CLF(X_17, y_17, ...) with the given settings and call
    write_result('best-feature-selection') on it, returning that call's value."""
    return CLF(X_17, y_17, *clf_args, **clf_kwargs).write_result('best-feature-selection')


# Every model gets four runs: no selection, f_selection='kb', f_selection='rf',
# and d_reduction=1. Groups that are commented out are currently disabled.

# --- rf (disabled) ---
#_log_feature_selection('rf', 'ss', n=500, vt=True, resampling=True)
#_log_feature_selection('rf', 'ss', n=500, f_selection='kb', kbest_f=f_classif,
#                       f=180, vt=True, resampling=True)
#_log_feature_selection('rf', 'ss', n=500, f_selection='rf', fs_n=500,
#                       f=240, vt=True, resampling=True)
#_log_feature_selection('rf', 'ss', n=500, d_reduction=1,
#                       f=10, vt=True, resampling=True)

# --- mlp ---
_log_feature_selection('mlp', 'pty', vt=True)
_log_feature_selection('mlp', 'pty', f_selection='kb', kbest_f=f_classif,
                       f=50, vt=True)
_log_feature_selection('mlp', 'pty', f_selection='rf', fs_n=500,
                       f=160, vt=True)
_log_feature_selection('mlp', 'pty', d_reduction=1,
                       f=120, vt=True)

# --- svm (disabled) ---
#_log_feature_selection('svm', 'qtu', gamma='auto', resampling=True)
#_log_feature_selection('svm', 'qtu', gamma='auto', f_selection='kb', kbest_f=f_classif,
#                       f=90, resampling=True)
#_log_feature_selection('svm', 'qtu', gamma='auto', f_selection='rf', fs_n=500,
#                       f=60, resampling=True)
#_log_feature_selection('svm', 'qtu', gamma='auto', d_reduction=1,
#                       f=30, resampling=True)

# --- knn ---
_log_feature_selection('knn', 'qtu', n=10)
_log_feature_selection('knn', 'qtu', n=10, f_selection='kb', kbest_f=f_classif,
                       f=310)
_log_feature_selection('knn', 'qtu', n=10, f_selection='rf', fs_n=500,
                       f=320)
_log_feature_selection('knn', 'qtu', n=10, d_reduction=1,
                       f=70)

# --- sgd ---
_log_feature_selection('sgd', 'n', l='log')
_log_feature_selection('sgd', 'n', l='log', f_selection='kb', kbest_f=f_classif,
                       f=250)
_log_feature_selection('sgd', 'n', l='log', f_selection='rf', fs_n=500,
                       f=140)
_log_feature_selection('sgd', 'n', l='log', d_reduction=1,
                       f=40)

# --- dt ---
_log_feature_selection('dt', 'qtn', resampling=True)
_log_feature_selection('dt', 'qtn', f_selection='kb', kbest_f=f_classif,
                       f=270, resampling=True)
_log_feature_selection('dt', 'qtn', f_selection='rf', fs_n=500,
                       f=30, resampling=True)
_log_feature_selection('dt', 'qtn', d_reduction=1,
                       f=40, resampling=True)

# --- r (disabled) ---
#_log_feature_selection('r', 'qtu', n=None, resampling=True)
#_log_feature_selection('r', 'qtu', n=None, f_selection='kb', kbest_f=f_classif,
#                       f=260, resampling=True)
#_log_feature_selection('r', 'qtu', n=None, f_selection='rf', fs_n=500,
#                       f=180, resampling=True)
#_log_feature_selection('r', 'qtu', n=None, d_reduction=1,
#                       f=30, resampling=True)

# --- xg ---
_log_feature_selection('xg', 'n', resampling=True)
_log_feature_selection('xg', 'n', f_selection='kb', kbest_f=f_classif,
                       f=230, resampling=True)
_log_feature_selection('xg', 'n', f_selection='rf', fs_n=500,
                       f=370, resampling=True)
_log_feature_selection('xg', 'n', d_reduction=1,
                       f=50, resampling=True)

# --- gpc ---
_log_feature_selection('gpc', 'n')
_log_feature_selection('gpc', 'n', f_selection='kb', kbest_f=f_classif,
                       f=270)
_log_feature_selection('gpc', 'n', f_selection='rf', fs_n=500,
                       f=320)
_log_feature_selection('gpc', 'n', d_reduction=1,
                       f=90)

# --- ada (disabled) ---
#_log_feature_selection('ada', 'pty', resampling=True)
#_log_feature_selection('ada', 'pty', f_selection='kb', kbest_f=f_classif,
#                       f=150, resampling=True)
#_log_feature_selection('ada', 'pty', f_selection='rf', fs_n=500,
#                       f=140, resampling=True)
#_log_feature_selection('ada', 'pty', d_reduction=1,
#                       f=20, resampling=True)

# --- gb (disabled) ---
#_log_feature_selection('gb', 'mas', n=500)
#_log_feature_selection('gb', 'mas', n=500, f_selection='kb', kbest_f=f_classif,
#                       f=180)
#_log_feature_selection('gb', 'mas', n=500, f_selection='rf', fs_n=500,
#                       f=240)
#_log_feature_selection('gb', 'mas', n=500, d_reduction=1,
#                       f=60)


In [None]:
def _log_tuning(*clf_args, **clf_kwargs):
    """Build CLF(X_17, y_17, ...) with the given settings and call
    write_result('best-model-tunings') on it, returning that call's value."""
    return CLF(X_17, y_17, *clf_args, **clf_kwargs).write_result('best-model-tunings')


# One run per model, in the same order as before.
_log_tuning('rf', 'mas', n=500, f_selection='kb', kbest_f=f_classif,
            f=160, vt=True, resampling=True)
_log_tuning('mlp', 'pty', f_selection='rf', fs_n=500,
            f=160, vt=True)
_log_tuning('svm', 'qtu', gamma='auto', f_selection='kb', kbest_f=f_classif,
            f=160, vt=True, resampling=True)
_log_tuning('knn', 'qtu', n=10, f_selection='rf', fs_n=500,
            f=320)
_log_tuning('sgd', 'n', l='log', d_reduction=1,
            f=40)
_log_tuning('dt', 'qtn', f_selection='rf', fs_n=500,
            f=30, resampling=True)
_log_tuning('r', 'qtu', n=None, d_reduction=1,
            f=20, vt=True, resampling=True)
_log_tuning('xg', 'n', f_selection='rf', fs_n=500,
            f=370, resampling=True)
_log_tuning('gpc', 'n', f_selection='kb', kbest_f=f_classif,
            f=270)
_log_tuning('ada', 'qtu', f_selection='rf', fs_n=500,
            f=210, vt=True, resampling=True)
_log_tuning('gb', 'qtn', n=500,
            vt=True)


In [None]:
def _run_with_permutation(*clf_args, **clf_kwargs):
    """Build and return CLF(X_17, y_17, ...) with permute=True appended to the settings."""
    return CLF(X_17, y_17, *clf_args, **clf_kwargs, permute=True)


# Same per-model configurations as the 'best-model-tunings' runs, this time
# constructed with permute=True. Only the construction matters here; the
# value of the last call is what the cell displays.
_run_with_permutation('rf', 'mas', n=500, f_selection='kb', kbest_f=f_classif,
                      f=160, vt=True, resampling=True)
_run_with_permutation('mlp', 'pty', f_selection='rf', fs_n=500,
                      f=160, vt=True)
_run_with_permutation('svm', 'qtu', gamma='auto', f_selection='kb', kbest_f=f_classif,
                      f=160, vt=True, resampling=True)
_run_with_permutation('knn', 'qtu', n=10, f_selection='rf', fs_n=500,
                      f=320)
_run_with_permutation('sgd', 'n', l='log', d_reduction=1,
                      f=40)
_run_with_permutation('dt', 'qtn', f_selection='rf', fs_n=500,
                      f=30, resampling=True)
_run_with_permutation('r', 'qtu', n=None, d_reduction=1,
                      f=20, vt=True, resampling=True)
_run_with_permutation('xg', 'n', f_selection='rf', fs_n=500,
                      f=370, resampling=True)
_run_with_permutation('gpc', 'n', f_selection='kb', kbest_f=f_classif,
                      f=270)
_run_with_permutation('ada', 'qtu', f_selection='rf', fs_n=500,
                      f=210, vt=True, resampling=True)
_run_with_permutation('gb', 'qtn', n=500,
                      vt=True)




In [None]:
import csv

#Calculating importance frequency and weighted frequency for each individual model
def importance_frequency(filename):
    """Sum the fourth column of ``Importance/<filename>.csv`` per feature and save the ranking.

    Every non-empty input row must have at least four comma-separated fields:
    the first is used as the feature key and the fourth is parsed as a float.
    The values are summed per key, the totals are sorted in descending order,
    and ``key,total`` rows are written to
    ``Importance/Frequency-Count/<filename>-Frequency.csv``.

    Args:
        filename: Base name of the input file, without directory or ``.csv`` suffix.

    Returns:
        None. The ranking is written to disk.

    Raises:
        FileNotFoundError: If the input file does not exist.
        IndexError: If a non-empty row has fewer than four fields.
        ValueError: If a fourth field is not a valid float.
    """
    totals = {}
    # `with` closes the input even when a row fails to parse; csv.reader copes
    # with quoted fields that a plain split(',') would break apart.
    with open('Importance/' + filename + '.csv', 'r', newline='') as source:
        for row in csv.reader(source):
            if not row:
                continue  # skip blank lines instead of failing on row[3]
            totals[row[0]] = totals.get(row[0], 0) + float(row[3])

    # sorted() is stable, so features with equal totals keep first-seen order.
    ranked = sorted(totals.items(), key=lambda item: item[1], reverse=True)

    out_dir = 'Importance/Frequency-Count'
    os.makedirs(out_dir, exist_ok=True)
    # newline='' is what the csv module expects; without it Windows output
    # gets an extra blank line after every row.
    with open(out_dir + '/' + filename + '-Frequency.csv', 'w', newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerows(ranked)


In [None]:
# Build the per-model frequency file for every 'PI-<model>' importance file,
# in the same order as before.
for model_tag in ('ADA', 'DT', 'GB', 'GPC', 'KNN', 'MLP', 'RF', 'R', 'SGD', 'SVC', 'XGB'):
    importance_frequency('PI-' + model_tag)

In [None]:
# Cumulative importance frequency: add up the per-model totals of every
# "*-Frequency.csv" file so each feature gets one score across all models.

# import required module
import csv
import os
# assign directory
directory = 'Importance/Frequency-Count'
cumulative_name = 'Cumulative-Frequency.csv'

# iterate over files in that directory
table = {}
# sorted() makes the summation order (and therefore the float totals and the
# order of tied features) independent of the file system's listing order.
for filename in sorted(os.listdir(directory)):
    path = os.path.join(directory, filename)
    # Only per-model CSV files are inputs. A cumulative file left in this
    # directory by an earlier run must be skipped, otherwise its totals
    # would be added on top of the per-model ones.
    if not os.path.isfile(path) or not filename.endswith('.csv') or filename == cumulative_name:
        continue
    with open(path, 'r', newline='') as source:
        for row in csv.reader(source):
            if not row:
                continue  # tolerate blank lines
            table[row[0]] = table.get(row[0], 0) + float(row[1])

# List of (feature, total) tuples, highest total first; later cells read `list_table`.
list_table = sorted(table.items(), key=lambda item: item[1], reverse=True)

# Write next to (not inside) the scanned directory: this is the path the
# following cells read ("Importance/Cumulative-Frequency.csv"), and it keeps
# the output from being picked up as an input on the next run.
with open("Importance/" + cumulative_name, 'w', newline='') as csv_file:
    writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for item in list_table:
        writer.writerow(item)

In [None]:
# Grand total of the second element of every (key, value) pair in list_table.
count = sum(value for _, value in list_table)
count

In [None]:
#cumulative_weighted = pd.read_csv("Importance/Cumulative-Frequency.csv", sep=",", encoding='utf-8', names=['features', 'w_cumulative'])

#cumulative = pd.read_csv("Importance/Cumulative-Frequency.csv", encoding='utf-8', names=['features', 'cumulative'])
#aggregate = pd.merge(cumulative, cumulative_weighted, left_on='features', right_on='features')
#aggregate = aggregate.sort_values(by='w_cumulative', ascending=False)
#aggregate.to_csv("cumulative.csv", index=False)

In [None]:
# Metric abbreviations grouped by the category they are reported under.
_metrics_by_category = {
    'cohesion': ['LCOM5'],
    'complexity': ['HCPL', 'HDIF', 'HEFF', 'HNDB', 'HPL', 'HPV', 'HTRP', 'HVOL',
                   'MIMS', 'MI', 'MISEI', 'MISM', 'McCC', 'NL', 'NLE', 'WMC'],
    'coupling': ['CBO', 'CBOI', 'NII', 'NOI', 'RFC'],
    'documentation': ['AD', 'CD', 'CLOC', 'DLOC', 'PDA', 'PUA', 'TAD', 'TCD',
                      'TCLOC', 'TPDA', 'TPUA'],
    'inheritance': ['DIT', 'NOA', 'NOC', 'NOD', 'NOP'],
    'size': ['LOC', 'LLOC', 'NA', 'NCL', 'NEN', 'NG', 'NIN', 'NLA', 'NLG', 'NLM',
             'NLPA', 'NLPM', 'NLS', 'NM', 'NPKG', 'NUMPAR', 'NPA', 'NPM', 'NS', 'NOS',
             'TLOC', 'TLLOC', 'TNA', 'TNCL', 'TNDI', 'TNEN', 'TNFI', 'TNG', 'TNIN', 'TNLA',
             'TNLG', 'TNLM', 'TNLPA', 'TNLPM', 'TNLS', 'TNM', 'TNPKG', 'TNPA', 'TNPCL', 'TNPEN',
             'TNPIN', 'TNPM', 'TNS', 'TNOS', 'F', 'M', 'C'],
}

# Flat lookup: metric abbreviation -> category name (same keys, same order).
metrics_to_category_dict = {
    metric: category
    for category, metrics in _metrics_by_category.items()
    for metric in metrics
}

In [None]:
# The cumulative file has no header row, so the column names are supplied here.
importance = pd.read_csv("Importance/Cumulative-Frequency.csv", sep=",", encoding='utf-8', names=['features', 'c_importance'])


def _category_of(feature_name):
    """Return the category of one feature name.

    Names made of exactly two '_'-separated parts are looked up by their second
    part in metrics_to_category_dict (None when that part is not listed);
    every other name is labelled 'external'.
    """
    parts = feature_name.split('_')
    if len(parts) != 2:
        return 'external'
    return metrics_to_category_dict.get(parts[1])


category = [_category_of(feature_name) for feature_name in importance['features']]

importance['category'] = category
importance.to_csv("Importance/cumulative.csv", index=False)

In [None]:
# Total cumulative importance per metric category.
# The input must be the annotated table "Importance/cumulative.csv" (header row
# plus a 'category' column). "Importance/Cumulative-Frequency.csv" has neither,
# so row['category'] could not be resolved from it.
importance_category = pd.read_csv("Importance/cumulative.csv", sep=",", encoding='utf-8')

table = {}
for index, row in importance_category.iterrows():
    table[row['category']] = table.get(row['category'], 0) + row['c_importance']

# (category, total) pairs, highest total first.
list_table = sorted(table.items(), key=lambda item: item[1], reverse=True)


# newline='' is what the csv module expects for files handed to csv.writer.
with open("Importance/importance-category.csv", 'w', newline='') as csv_file:
    writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for i in list_table:
        writer.writerow(i)

list_table

In [None]:
# First character of a feature name -> granularity label.
metrics_granularity_dict = {
    'C': 'class',
    'F': 'file',
    'M': 'method',
    'N': 'repository'
}

# Read the annotated table (it has a header row with 'features'), not the
# header-less "Cumulative-Frequency.csv": row['features'] cannot be resolved
# from the latter. Reading cumulative.csv also keeps the columns it already
# contains when the file is rewritten below.
importance = pd.read_csv("Importance/cumulative.csv", sep=",", encoding='utf-8')

# Any first character not listed above falls back to "repository".
# NOTE(review): a feature that is not a metric but happens to start with
# C, F or M is labelled class/file/method as well — confirm that is intended.
granularity = [
    metrics_granularity_dict.get(feature_name[0], "repository")
    for feature_name in importance['features']
]

importance['granularity'] = granularity
importance.to_csv("Importance/cumulative.csv", index=False)

In [None]:
# Total cumulative importance per granularity label.
# The input must be the annotated table "Importance/cumulative.csv" (header row
# plus a 'granularity' column). "Importance/Cumulative-Frequency.csv" has
# neither, so row['granularity'] could not be resolved from it.
importance_metrics = pd.read_csv("Importance/cumulative.csv", sep=",", encoding='utf-8')

table = {}
for index, row in importance_metrics.iterrows():
    table[row['granularity']] = table.get(row['granularity'], 0) + row['c_importance']

# (granularity, total) pairs, highest total first.
list_table = sorted(table.items(), key=lambda item: item[1], reverse=True)


# newline='' is what the csv module expects for files handed to csv.writer.
with open("Importance/granularity.csv", 'w', newline='') as csv_file:
    writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for i in list_table:
        writer.writerow(i)

list_table

In [None]:
# The import cells visible at the top of the notebook only bring in `dill`,
# so import pickle here to keep this cell runnable on its own.
import pickle

# (output file, positional CLF settings, keyword CLF settings) for every model.
model_dumps = [
    ('RF.pkl', ('rf', 'mas'),
     dict(n=500, f_selection='kb', kbest_f=f_classif, f=160, vt=True, resampling=True)),
    ('MLP.pkl', ('mlp', 'pty'),
     dict(f_selection='rf', fs_n=500, f=160, vt=True)),
    ('SVM.pkl', ('svm', 'qtu'),
     dict(gamma='auto', f_selection='kb', kbest_f=f_classif, f=160, vt=True, resampling=True)),
    ('KNN.pkl', ('knn', 'qtu'),
     dict(n=10, f_selection='rf', fs_n=500, f=320)),
    ('SGD.pkl', ('sgd', 'n'),
     dict(l='log', d_reduction=1, f=40)),
    ('DT.pkl', ('dt', 'qtn'),
     dict(f_selection='rf', fs_n=500, f=30, resampling=True)),
    ('R.pkl', ('r', 'qtu'),
     dict(n=None, d_reduction=1, f=20, vt=True, resampling=True)),
    ('XG.pkl', ('xg', 'n'),
     dict(f_selection='rf', fs_n=500, f=370, resampling=True)),
    ('GPC.pkl', ('gpc', 'n'),
     dict(f_selection='kb', kbest_f=f_classif, f=270)),
    ('ADA.pkl', ('ada', 'qtu'),
     dict(f_selection='rf', fs_n=500, f=210, vt=True, resampling=True)),
    ('GB.pkl', ('gb', 'qtn'),
     dict(n=500, vt=True)),
]

for pkl_path, clf_args, clf_kwargs in model_dumps:
    # Build the model first so a failing CLF never truncates an existing file.
    model = CLF(X_17, y_17, *clf_args, **clf_kwargs)
    # `with` closes (and flushes) the file; a bare open(...) handle was left dangling.
    with open(pkl_path, 'wb') as pkl_file:
        pickle.dump(model, pkl_file)



In [None]:
# Display the first row of the feature matrix.
X_17.iloc[0]

In [None]:
# The import cells visible at the top of the notebook only bring in `dill`,
# so import pickle here to keep this cell runnable on its own.
import pickle

# Load a previously pickled model for a quick manual check.
# NOTE(review): none of the dump cells visible here writes 'model.pkl'
# (they write RF.pkl, MLP.pkl, ...) — confirm this is the intended file.
# Security: pickle.load can execute arbitrary code; only load files you trust.
with open('model.pkl', 'rb') as model_file:
    test_model = pickle.load(model_file)


In [None]:
# Predict row 100 of X_17 with the loaded model's `clf`; the row is wrapped
# in a list so predict() receives a single-sample batch.
print(test_model.clf.predict([X_17.iloc[100]]))

In [None]:
# Number of values in row 100 of X_17.
len(X_17.iloc[100])

In [None]:
# Summary statistics of the LVUsage column.
df.LVUsage.describe()

In [None]:
# Histogram (50 bins) of Cstd_CBO restricted to rows whose 'reusability' exceeds 21.
cbo_of_selected_rows = df.loc[df['reusability'] > 21, 'Cstd_CBO']
plt.hist(cbo_of_selected_rows, bins=50)
plt.title('')
plt.xlabel('CBO of Class (std)')
plt.ylabel('Frequency')
plt.show()
#plt.savefig('GB-pca.png')

In [None]:
# Summary statistics of LVUsage for the rows where it is above 300.
df[df['LVUsage'] > 300].LVUsage.describe()

In [None]:
# Peek at the first rows of the dataset frame.
df.head()

In [None]:
# Fit a random-forest pipeline on the training split and save it with dill.
X_train, X_test, y_train, y_test = train_test_split(X_17, y_17, random_state=27033074)
# make_pipeline comes from imblearn, which is what allows a sampler as a step.
classi = make_pipeline(
    RandomOverSampler(sampling_strategy='minority', random_state=27033074),
    VarianceThreshold(threshold=0.8 * (1 - 0.8)),  # p * (1 - p) with p = 0.8
    MaxAbsScaler(),
    SelectKBest(f_classif, k=160),
    RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=27033074),
)

classi.fit(X_train, y_train)
# `with` closes and flushes the file; a bare open(...) handle was never closed.
with open('test-rf', 'wb') as model_file:
    dill.dump(classi, model_file)

In [None]:
# Fit an MLP pipeline on the training split and save it with dill.
X_train, X_test, y_train, y_test = train_test_split(X_17, y_17, random_state=27033074)
classi = make_pipeline(
    VarianceThreshold(threshold=0.8 * (1 - 0.8)),  # p * (1 - p) with p = 0.8
    PowerTransformer(),
    # threshold=-inf leaves max_features as the only selection criterion.
    SelectFromModel(RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=27033074),
                    threshold=-np.inf, max_features=160),
    MLPClassifier(random_state=27033074, max_iter=10000),
)

classi.fit(X_train, y_train)
# `with` closes and flushes the file; a bare open(...) handle was never closed.
with open('test-mlp', 'wb') as model_file:
    dill.dump(classi, model_file)

In [None]:
# Fit an SVC pipeline on the training split and save it with dill.
X_train, X_test, y_train, y_test = train_test_split(X_17, y_17, random_state=27033074)
# make_pipeline comes from imblearn, which is what allows a sampler as a step.
classi = make_pipeline(
    RandomOverSampler(sampling_strategy='minority', random_state=27033074),
    VarianceThreshold(threshold=0.8 * (1 - 0.8)),  # p * (1 - p) with p = 0.8
    QuantileTransformer(random_state=27033074),
    SelectKBest(f_classif, k=160),
    SVC(gamma='auto', random_state=27033074),
)

classi.fit(X_train, y_train)
# `with` closes and flushes the file; a bare open(...) handle was never closed.
with open('test-svm', 'wb') as model_file:
    dill.dump(classi, model_file)

In [None]:
# Fit a k-nearest-neighbours pipeline on the training split and save it with dill.
X_train, X_test, y_train, y_test = train_test_split(X_17, y_17, random_state=27033074)
classi = make_pipeline(
    QuantileTransformer(random_state=27033074),
    # threshold=-inf leaves max_features as the only selection criterion.
    SelectFromModel(RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=27033074),
                    threshold=-np.inf, max_features=320),
    KNeighborsClassifier(n_neighbors=10, n_jobs=-1),
)

classi.fit(X_train, y_train)
# `with` closes and flushes the file; a bare open(...) handle was never closed.
with open('test-knn', 'wb') as model_file:
    dill.dump(classi, model_file)

In [None]:
# Fit a PCA + SGD pipeline on the (column-normalised) training split and save it with dill.
X_train, X_test, y_train, y_test = train_test_split(X_17, y_17, random_state=27033074)
# NOTE(review): normalize() runs outside the pipeline, so the saved object does
# not include it, and train/test are normalised independently of each other —
# whoever loads 'test-sgd' has to normalise inputs themselves.
X_train = normalize(X_train, axis=0)
X_test = normalize(X_test, axis=0)
classi = make_pipeline(
    PCA(n_components=40, random_state=27033074),
    SGDClassifier(loss='log', n_jobs=-1, random_state=27033074),
)

classi.fit(X_train, y_train)
# `with` closes and flushes the file; a bare open(...) handle was never closed.
with open('test-sgd', 'wb') as model_file:
    dill.dump(classi, model_file)

In [None]:
# Fit a decision-tree pipeline on the training split and save it with dill.
X_train, X_test, y_train, y_test = train_test_split(X_17, y_17, random_state=27033074)
# make_pipeline comes from imblearn, which is what allows a sampler as a step.
classi = make_pipeline(
    RandomOverSampler(sampling_strategy='minority', random_state=27033074),
    QuantileTransformer(output_distribution='normal', random_state=27033074),
    # threshold=-inf leaves max_features as the only selection criterion.
    SelectFromModel(RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=27033074),
                    threshold=-np.inf, max_features=30),
    DecisionTreeClassifier(random_state=27033074),
)

classi.fit(X_train, y_train)
# `with` closes and flushes the file; a bare open(...) handle was never closed.
with open('test-dt', 'wb') as model_file:
    dill.dump(classi, model_file)

In [None]:
# Fit a PCA + ridge-classifier pipeline on the training split and save it with dill.
X_train, X_test, y_train, y_test = train_test_split(X_17, y_17, random_state=27033074)
# make_pipeline comes from imblearn, which is what allows a sampler as a step.
classi = make_pipeline(
    RandomOverSampler(sampling_strategy='minority', random_state=27033074),
    VarianceThreshold(threshold=0.8 * (1 - 0.8)),  # p * (1 - p) with p = 0.8
    QuantileTransformer(random_state=27033074),
    PCA(n_components=20, random_state=27033074),
    RidgeClassifierCV(cv=None),
)

classi.fit(X_train, y_train)
# `with` closes and flushes the file; a bare open(...) handle was never closed.
with open('test-r', 'wb') as model_file:
    dill.dump(classi, model_file)

In [None]:
# Fit an XGBoost pipeline on the (column-normalised) training split and save it with dill.
X_train, X_test, y_train, y_test = train_test_split(X_17, y_17, random_state=27033074)
# NOTE(review): normalize() runs outside the pipeline, so the saved object does
# not include it, and train/test are normalised independently of each other —
# whoever loads 'test-xgb' has to normalise inputs themselves.
X_train = normalize(X_train, axis=0)
X_test = normalize(X_test, axis=0)
# make_pipeline comes from imblearn, which is what allows a sampler as a step.
classi = make_pipeline(
    RandomOverSampler(sampling_strategy='minority', random_state=27033074),
    # threshold=-inf leaves max_features as the only selection criterion.
    SelectFromModel(RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=27033074),
                    threshold=-np.inf, max_features=370),
    xgb.XGBClassifier(random_state=27033074),
)

classi.fit(X_train, y_train)
# `with` closes and flushes the file; a bare open(...) handle was never closed.
with open('test-xgb', 'wb') as model_file:
    dill.dump(classi, model_file)

In [None]:
# Fit a Gaussian-process pipeline on the (column-normalised) training split and save it with dill.
X_train, X_test, y_train, y_test = train_test_split(X_17, y_17, random_state=27033074)
# NOTE(review): normalize() runs outside the pipeline, so the saved object does
# not include it, and train/test are normalised independently of each other —
# whoever loads 'test-gpc' has to normalise inputs themselves.
X_train = normalize(X_train, axis=0)
X_test = normalize(X_test, axis=0)
classi = make_pipeline(
    SelectKBest(f_classif, k=270),
    GaussianProcessClassifier(random_state=27033074),
)

classi.fit(X_train, y_train)
# `with` closes and flushes the file; a bare open(...) handle was never closed.
with open('test-gpc', 'wb') as model_file:
    dill.dump(classi, model_file)

In [None]:
# Fit an AdaBoost pipeline on the training split and save it with dill.
X_train, X_test, y_train, y_test = train_test_split(X_17, y_17, random_state=27033074)
# make_pipeline comes from imblearn, which is what allows a sampler as a step.
classi = make_pipeline(
    RandomOverSampler(sampling_strategy='minority', random_state=27033074),
    VarianceThreshold(threshold=0.8 * (1 - 0.8)),  # p * (1 - p) with p = 0.8
    QuantileTransformer(random_state=27033074),
    # threshold=-inf leaves max_features as the only selection criterion.
    SelectFromModel(RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=27033074),
                    threshold=-np.inf, max_features=210),
    AdaBoostClassifier(random_state=27033074),
)

classi.fit(X_train, y_train)
# `with` closes and flushes the file; a bare open(...) handle was never closed.
with open('test-ada', 'wb') as model_file:
    dill.dump(classi, model_file)

In [None]:
# Fit a gradient-boosting pipeline on the training split and save it with dill.
X_train, X_test, y_train, y_test = train_test_split(X_17, y_17, random_state=27033074)
classi = make_pipeline(
    VarianceThreshold(threshold=0.8 * (1 - 0.8)),  # p * (1 - p) with p = 0.8
    QuantileTransformer(output_distribution='normal', random_state=27033074),
    GradientBoostingClassifier(n_estimators=500, random_state=27033074),
)

classi.fit(X_train, y_train)
# `with` closes and flushes the file; a bare open(...) handle was never closed.
with open('test-gb', 'wb') as model_file:
    dill.dump(classi, model_file)

In [None]:
# NOTE(review): an earlier cell in this notebook performs the same eleven
# pickle dumps to the same file names; one of the two cells can probably go.

# The import cells visible at the top of the notebook only bring in `dill`,
# so import pickle here to keep this cell runnable on its own.
import pickle

# (output file, positional CLF settings, keyword CLF settings) for every model.
model_dumps = [
    ('RF.pkl', ('rf', 'mas'),
     dict(n=500, f_selection='kb', kbest_f=f_classif, f=160, vt=True, resampling=True)),
    ('MLP.pkl', ('mlp', 'pty'),
     dict(f_selection='rf', fs_n=500, f=160, vt=True)),
    ('SVM.pkl', ('svm', 'qtu'),
     dict(gamma='auto', f_selection='kb', kbest_f=f_classif, f=160, vt=True, resampling=True)),
    ('KNN.pkl', ('knn', 'qtu'),
     dict(n=10, f_selection='rf', fs_n=500, f=320)),
    ('SGD.pkl', ('sgd', 'n'),
     dict(l='log', d_reduction=1, f=40)),
    ('DT.pkl', ('dt', 'qtn'),
     dict(f_selection='rf', fs_n=500, f=30, resampling=True)),
    ('R.pkl', ('r', 'qtu'),
     dict(n=None, d_reduction=1, f=20, vt=True, resampling=True)),
    ('XG.pkl', ('xg', 'n'),
     dict(f_selection='rf', fs_n=500, f=370, resampling=True)),
    ('GPC.pkl', ('gpc', 'n'),
     dict(f_selection='kb', kbest_f=f_classif, f=270)),
    ('ADA.pkl', ('ada', 'qtu'),
     dict(f_selection='rf', fs_n=500, f=210, vt=True, resampling=True)),
    ('GB.pkl', ('gb', 'qtn'),
     dict(n=500, vt=True)),
]

for pkl_path, clf_args, clf_kwargs in model_dumps:
    # Build the model first so a failing CLF never truncates an existing file.
    model = CLF(X_17, y_17, *clf_args, **clf_kwargs)
    # `with` closes (and flushes) the file; a bare open(...) handle was left dangling.
    with open(pkl_path, 'wb') as pkl_file:
        pickle.dump(model, pkl_file)

