In [None]:
!pip install scikit-learn
!pip install pandas
!pip install scipy
!pip install imblearn
!pip install matplotlib
!pip install xgboost

In [1]:
import csv
import time
import math
import pandas as pd
import numpy as np
import os
import gc
from scipy import stats
from sklearn.feature_selection import SelectFromModel
pd.set_option("display.max_rows", None, "display.max_columns", None)

In [2]:
#models
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeCV
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [3]:
#scalers
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import normalize

In [4]:
#tools
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.inspection import permutation_importance

In [5]:
#dimensional reduction & feature selection
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import mutual_info_regression

In [6]:
#cross_val
from sklearn.model_selection import RepeatedStratifiedKFold

In [7]:
# Load the ground-truth dataset and discard the library identifier column,
# which is a label rather than a model input.
df = pd.read_csv(
    "Dataset(MavenReuseGroundTruth).csv", sep=",", encoding='utf-8'
).drop(['library_name'], axis=1)

# Report row count, then column count, then summarise the regression target.
n_rows, n_cols = len(df.index), len(df.columns)
print(n_rows)
print(n_cols)
df.LVUsage.describe()

526
439


count     526.000000
mean      216.958175
std       770.215974
min         0.000000
25%         7.000000
50%        22.500000
75%        96.750000
max      9146.000000
Name: LVUsage, dtype: float64

In [8]:
# Names of every column removed by data_preprocessing; accumulates across calls.
dropped_columns = []
def data_preprocessing(df):
    """Split ``df`` into a feature matrix and the ``LVUsage`` target, pruning columns.

    Two filters are applied in order:

    1. Columns whose Spearman correlation with ``LVUsage`` is NaN are removed.
    2. Feature columns for which ``scipy.stats.ttest_ind(feature, target)``
       yields a p-value (rounded to 5 decimals) above 0.05 are removed.

    The names of all removed columns are appended to the module-level
    ``dropped_columns`` list, and the resulting shape plus that list are printed.
    The input frame is NOT modified (the previous version dropped columns from
    the caller's frame in place).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``LVUsage`` column.

    Returns
    -------
    (X, y) : tuple
        ``X`` is the frame of surviving feature columns, ``y`` the ``LVUsage`` Series.
    """
    # Only the LVUsage column of the correlation matrix is needed.
    target_corr = df.corr(method='spearman')['LVUsage']

    # Vectorised NaN test; avoids positional `series[i]` lookups on a
    # label-indexed Series, which pandas has deprecated.
    undefined_corr = target_corr.index[target_corr.isna()].tolist()
    pruned = df.drop(columns=undefined_corr)  # new frame; caller's df stays intact
    dropped_columns.extend(undefined_corr)

    y = pruned['LVUsage']
    X = pruned.drop(['LVUsage'], axis=1)

    # Collect features whose t-test against the target is not significant.
    r_array = [col for col in X
               if stats.ttest_ind(X[col], y)[1].round(5) > 0.05]
    X = X.drop(columns=r_array)
    dropped_columns.extend(r_array)

    print(len(X.columns))
    print(len(X.index))
    print(dropped_columns)
    return X, y

In [9]:
# Build the modelling inputs from the full dataset: data_preprocessing returns
# the surviving feature columns (X_n) and the LVUsage target (y_n).
# The commented-out lines are disabled variants that first restrict the data
# by the `forks` column.
# NOTE(review): df_200 and df_100 use the same `forks < 1000` cut-off, which
# looks like a copy/paste slip; confirm the intended thresholds before re-enabling.
#df_400 = df[df['forks'] < 2500]
#df_200 = df[df['forks'] < 1000]
#df_100 = df[df['forks'] < 1000]
X_n, y_n = data_preprocessing(df)
#X_400, y_400 = data_preprocessing(df_400)
#X_200, y_200 = data_preprocessing(df_200)
#X_100, y_100 = data_preprocessing(df_100)

387
526
['Cmin_NOC', 'Cmed_NOC', 'Cmin_NOD', 'Cmed_NOD', 'Cmin_NLPA', 'Cmin_NLS', 'Cmin_NPA', 'Cmin_NS', 'Cmin_TNLPA', 'Cmin_TNLS', 'Cmin_TNPA', 'Cmin_TNS', 'Fmin_CLOC', 'Fmed_CLOC', 'Fmax_CLOC', 'Fsum_CLOC', 'Fstd_CLOC', 'Mmin_NL', 'Mmin_NLE', 'Mmin_NII', 'Mmin_NOI', 'Mmin_CD', 'Mmin_CLOC', 'Mmin_DLOC', 'Mmin_TCD', 'Mmin_TCLOC', 'Cmax_NII', 'Cmax_RFC', 'Csum_NOC', 'Csum_NOD', 'Csum_NOP', 'Cstd_LOC', 'Csum_NLPA', 'Csum_NLS', 'Cstd_TLLOC', 'Cstd_TLOC', 'Cmax_TNLM', 'Csum_TNLPA', 'Cmax_TNLPM', 'Csum_TNLS', 'Cmax_TNPM', 'No_F', 'Fmax_PUA', 'Fstd_LLOC', 'Mmax_HPV', 'Mmax_MI', 'Mmax_MISEI', 'Mmax_LLOC', 'Mmax_LOC', 'Mmax_TLLOC', 'Mmax_TLOC']


In [10]:
class RGS():
    """Cross-validated evaluation of one regression pipeline configuration.

    Constructing an instance immediately runs repeated stratified k-fold
    cross-validation. For every fold a pipeline of
    (variance threshold, scaler, PCA, feature selector, model) is built, fitted
    on the training part and used to predict the test part. Steps that are
    switched off are passed to ``make_pipeline`` as ``None``.

    After construction the attributes ``rgs``, ``X_train``, ``X_test``,
    ``y_train``, ``y_test`` and ``y_pred`` refer to the LAST fold only.

    NOTE(review): the splitter is ``RepeatedStratifiedKFold``, which stratifies
    on ``y``; with a numeric regression target every distinct value acts as its
    own class. Confirm that this is the intended splitting scheme.
    """

    def __init__(self, X, y, regressor, preprocess_method = None, d_reduction = 0,
                 f_selection = None, seed = 27033074, vt = False,
                 cv = 4, cv_r = 10, permute = False, top = 10, **params):
        """Configure the pipeline and run the cross-validation.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature matrix (rows are selected with ``.iloc``).
        y : pandas.Series or array-like
            Target values aligned with ``X`` by position.
        regressor : str
            Model key understood by :meth:`model`.
        preprocess_method : str or None
            Scaler key understood by :meth:`preprocess`.
        d_reduction : int
            0 disables PCA, 1 enables it (needs ``f`` in ``params``).
        f_selection : str or None
            ``'kb'`` or ``'rf'``; see :meth:`fea_selection`.
        seed : int
            Random state for the CV splitter, the pipeline components and the
            permutation importance.
        vt : bool
            Whether to prepend a ``VarianceThreshold`` step.
        cv, cv_r : int
            Number of folds and number of repetitions.
        permute : bool
            If false, per-fold R2/MAE/RMSE/MedAE are appended to
            ``results_array``. If true, per-fold permutation importances are
            appended to a ``PI-*.csv`` file instead and ``results_array`` stays
            empty.
        top : int
            Number of highest-ranked features written per fold when
            ``permute`` is true.
        **params
            Extra settings read by the component factories
            (``n``, ``f``, ``fs_n``, ``kbest_f``, ``l``, ``gamma``).
        """
        self.X = X
        self.y = y
        self.regressor = regressor
        self.pm = preprocess_method
        self.dr = d_reduction
        self.fs = f_selection
        self.vt = vt
        self.p = params
        # One list per metric, in the order R2, MAE, RMSE, MedAE.
        self.results_array = [[], [], [], []]

        self.skf = RepeatedStratifiedKFold(n_splits = cv, n_repeats = cv_r, random_state=seed)
        for train_index, test_index in self.skf.split(X, y):
            self.X_train, self.X_test = self.X.iloc[train_index], self.X.iloc[test_index]
            # The splitter yields positions, so the target must be indexed
            # positionally too (label lookup breaks on a non-default index).
            self.y_train = self._rows(self.y, train_index)
            self.y_test = self._rows(self.y, test_index)
            # The factories run after the fold is stored because the 'n'
            # preprocess mode rewrites X_train/X_test. The constructor seed is
            # forwarded so every component honours it.
            self.rgs = make_pipeline(self.variance_t(), self.preprocess(seed),
                                     self.dim_reduction(seed), self.fea_selection(seed),
                                     self.model(seed))
            self.rgs.fit(self.X_train, self.y_train)
            self.y_pred = self.rgs.predict(self.X_test)
            if permute:
                self._append_fold_importance(seed, top)
            else:
                self.results_array[0].append(self.get_r2())
                self.results_array[1].append(self.get_mae())
                self.results_array[2].append(self.get_rmse())
                self.results_array[3].append(self.get_medae())

    @staticmethod
    def _rows(data, positions):
        """Select rows by position from a pandas object or a plain array."""
        if hasattr(data, 'iloc'):
            return data.iloc[positions]
        return data[positions]

    def _append_fold_importance(self, seed, top):
        """Append the current fold's ``top`` permutation importances to a CSV file."""
        print('Computing permutation importance...')
        start = time.time()
        result = permutation_importance(self.rgs, self.X_test, self.y_test, random_state=seed, n_jobs=-1)
        print('Process completed at ' + str(round((time.time() - start)/60, 3)) + ' min')
        fold_r2 = self.get_r2()
        path = "PI-" + str(self.get_model())[:10] + "_" + str(self.get_preprocess()) + ".csv"
        # newline='' is what the csv module expects; it avoids blank rows on Windows.
        with open(path, 'a', newline='') as csv_file:
            writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            # Importances are computed on the raw pipeline input, so positions
            # map directly onto the columns of self.X.
            for i in result.importances_mean.argsort()[::-1][:top]:
                writer.writerow([self.X.columns[i],
                                 round(result.importances_mean[i], 3),
                                 round(result.importances_std[i], 3), fold_r2])

    def variance_t(self, t = 0.8 * (1 - 0.8)):
        """Return a ``VarianceThreshold(threshold=t)`` step, or ``None`` when ``vt`` is off."""
        if self.vt:
            return VarianceThreshold(threshold = t)
        return None

    def preprocess(self, seed = 27033074):
        """Return the scaler/transformer step selected by ``self.pm``, or ``None``.

        Keys: 'ss', 'mms', 'mas', 'rs', 'pty' (Yeo-Johnson default),
        'ptb' (Box-Cox), 'qtu', 'qtn', 'n'. Unknown keys yield ``None``.

        The 'n' key has a side effect instead of returning a step: it replaces
        ``self.X_train`` and ``self.X_test`` with column-normalised arrays.
        NOTE(review): train and test are normalised independently of each
        other here; confirm that is intended.
        """
        if self.pm == 'ss':
            return StandardScaler()
        elif self.pm == 'mms':
            return MinMaxScaler()
        elif self.pm == 'mas':
            return MaxAbsScaler()
        elif self.pm == 'rs':
            return RobustScaler()
        elif self.pm == 'pty':
            return PowerTransformer()
        elif self.pm == 'ptb':
            return PowerTransformer(method = 'box-cox')
        elif self.pm == 'qtu':
            return QuantileTransformer(random_state = seed)
        elif self.pm == 'qtn':
            return QuantileTransformer(output_distribution = 'normal', random_state = seed)
        elif self.pm == 'n':
            self.X_train = normalize(self.X_train, axis = 0)
            self.X_test = normalize(self.X_test, axis = 0)
            return None
        else:
            return None

    def dim_reduction(self, seed = 27033074):
        """Return ``PCA(n_components=params['f'])`` when ``self.dr == 1``, else ``None``."""
        if self.dr == 1:
            return PCA(n_components=self.p['f'], random_state=seed)
        return None

    def fea_selection(self, seed = 27033074):
        """Return the feature-selection step chosen by ``self.fs``, or ``None``.

        'kb' uses ``SelectKBest(params['kbest_f'], k=params['f'])``.
        'rf' uses ``SelectFromModel`` around a random forest with
        ``params['fs_n']`` trees, keeping at most ``params['f']`` features.
        """
        if self.fs is None:
            return None
        elif self.fs == 'kb':
            return SelectKBest(self.p['kbest_f'], k=self.p['f'])
        elif self.fs == 'rf':
            return SelectFromModel(RandomForestRegressor(n_estimators = self.p['fs_n'], n_jobs = -1, random_state=seed),
                                   threshold=-np.inf, max_features=self.p['f'])
        else:
            return None

    def model(self, seed = 27033074):
        """Return the estimator selected by ``self.regressor``.

        Raises
        ------
        ValueError
            If the key is unknown. Previously a message was printed and the
            pipeline was built without an estimator, failing later.
        """
        if self.regressor == 'rf':
            return RandomForestRegressor(n_estimators = self.p['n'], n_jobs = -1, random_state=seed)
        elif self.regressor == 'knn':
            return KNeighborsRegressor(n_neighbors = self.p['n'], n_jobs = -1)
        elif self.regressor == 'mlp':
            return MLPRegressor(random_state = seed, max_iter=10000)
        elif self.regressor == 'dt':
            return DecisionTreeRegressor(random_state = seed)
        elif self.regressor == 'sgd':
            return SGDRegressor(loss = self.p['l'], random_state = seed, max_iter=10000)
        elif self.regressor == 'r':
            return RidgeCV(cv = self.p['n'])
        elif self.regressor == 'svm':
            return SVR(gamma = self.p['gamma'])
        elif self.regressor == 'xg':
            return xgb.XGBRegressor(random_state = seed)
        elif self.regressor == 'gpc':
            return GaussianProcessRegressor(random_state = seed)
        elif self.regressor == 'ada':
            return AdaBoostRegressor(random_state = seed)
        elif self.regressor == 'gb':
            return GradientBoostingRegressor(n_estimators=self.p['n'], random_state=seed)
        elif self.regressor == 'lr':
            # NOTE(review): LogisticRegression is a classifier, so each distinct
            # target value is treated as a class. Confirm this is intended.
            return LogisticRegression(random_state=seed)
        else:
            raise ValueError('No input model!')

    def mean_results_array(self):
        """Return the CSV row: mean R2, MAE, RMSE, MedAE followed by the configuration."""
        return [np.mean(self.results_array[0]), np.mean(self.results_array[1]),
                np.mean(self.results_array[2]), np.mean(self.results_array[3]),
                self.get_model(), self.num_of_features(), self.get_preprocess(),
                self.fs, self.dr, self.vt]

    def write_result(self, filename):
        """Append the averaged results to ``<filename>.csv``, writing a header if the file is new."""
        csv_path = filename + '.csv'
        needs_header = not os.path.isfile(csv_path)

        with open(csv_path, 'a', newline='') as csv_file:
            writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            if needs_header:
                writer.writerow(['R-square', 'MAE', 'RSME', 'MedAE', 'Model',
                                 'Num_Features', 'Preprocessing', 'Fea_Selection',
                                 'Dim_Reduction', 'Var_Threshold'])
            writer.writerow(self.mean_results_array())

        return

    def num_of_features(self):
        """Return ``params['f']`` if it was given, else the number of columns in ``X``."""
        if 'f' in self.p:
            return self.p['f']
        return len(self.X.columns)

    def get_preprocess(self):
        """Return the scaler step of the last fitted pipeline ('normalize' for the 'n' mode)."""
        if self.pm == 'n':
            return 'normalize'
        return self.rgs[1]

    def get_model(self):
        """Return the final estimator of the last fitted pipeline."""
        return self.rgs[-1]

    def get_model_params(self):
        """Return ``get_params()`` of the final estimator."""
        return self.rgs[-1].get_params()

    def __str__(self):
        return str(self.rgs.steps)

    def get_r2(self):
        """R2 of the last fold, rounded to 3 decimals."""
        return round(r2_score(self.y_test, self.y_pred), 3)

    def get_mae(self):
        """Mean absolute error of the last fold, rounded to 3 decimals."""
        return round(mean_absolute_error(self.y_test, self.y_pred), 3)

    def get_rmse(self):
        """Root mean squared error of the last fold, rounded to 3 decimals."""
        # Take the root explicitly: the `squared=False` keyword is not
        # available in every scikit-learn release.
        return round(np.sqrt(mean_squared_error(self.y_test, self.y_pred)), 3)

    def get_medae(self):
        """Median absolute error of the last fold, rounded to 3 decimals."""
        return round(median_absolute_error(self.y_test, self.y_pred), 3)

    def p_importance(self, train_or_test, seed = 27033074):
        """Write permutation importances of the last fitted pipeline to a ``PI-*.csv`` file.

        ``train_or_test`` must be 'train' or 'test' and selects which part of
        the last fold is permuted. Any other value prints a hint and returns.
        The file is overwritten and lists every input feature, most important first.
        """
        print('Computing permutation importance...')
        start = time.time()
        if train_or_test == 'train':
            result = permutation_importance(self.rgs, self.X_train, self.y_train, random_state=seed, n_jobs=-1)
        elif train_or_test == 'test':
            result = permutation_importance(self.rgs, self.X_test, self.y_test, random_state=seed, n_jobs=-1)
        else:
            print('Input train or test!')
            return

        print('Process completed at ' + str(round((time.time() - start)/60, 3)) + ' min')

        path = "PI-" + str(self.get_model()) + "_" + str(self.get_preprocess()) + ".csv"
        with open(path, 'w', newline='') as csv_file:
            writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            for i in result.importances_mean.argsort()[::-1]:
                writer.writerow([self.X.columns[i],
                                 round(result.importances_mean[i], 3),
                                 round(result.importances_std[i], 3)])

        return

    def _output_feature_names(self):
        """Return the names of the features that actually reach the final estimator.

        Walks the fitted transformer steps: selector steps (anything exposing
        ``get_support``) filter the names, a step exposing ``n_components_``
        (PCA) replaces them with 'PC1', 'PC2', ...; other steps keep them.
        """
        names = np.asarray(self.X.columns, dtype=object)
        for _, step in self.rgs.steps[:-1]:
            if step is None or isinstance(step, str):
                continue  # disabled step
            if hasattr(step, 'get_support'):
                names = names[np.asarray(step.get_support(), dtype=bool)]
            elif hasattr(step, 'n_components_'):
                names = np.asarray(['PC' + str(k + 1) for k in range(step.n_components_)],
                                   dtype=object)
        return names

    def _write_importance(self, values):
        """Pair ``values`` with the estimator's input names and save them, sorted, as ``FI-*.csv``."""
        table = pd.DataFrame(list(zip(values, self._output_feature_names())),
                             columns=["Importance","Feature_Name"])
        table = table.sort_values(by=['Importance'], ascending=False)
        table.to_csv('FI-' + str(self.get_model()) + "_" + str(self.get_preprocess()) + ".csv", index=False)

    def f_importance(self):
        """Save the final estimator's ``feature_importances_`` to an ``FI-*.csv`` file.

        Names are taken from the features that survive the pipeline steps, so
        they stay aligned when variance thresholding or selection drops columns.
        """
        self._write_importance(self.rgs[-1].feature_importances_)
        return

    def coefs(self):
        """Save the final estimator's coefficients to an ``FI-*.csv`` file.

        Uses ``coef_`` when the estimator exposes it (a 2-D ``coef_`` is reduced
        to the mean absolute value per feature). Estimators without ``coef_``
        fall back to ``feature_importances_``, which was the previous behaviour.
        """
        estimator = self.rgs[-1]
        if hasattr(estimator, 'coef_'):
            weights = np.asarray(estimator.coef_)
            if weights.ndim > 1:
                weights = np.abs(weights).mean(axis=0)
        else:
            weights = estimator.feature_importances_
        self._write_importance(weights)
        return
    

In [None]:
# Baseline grid without variance thresholding: every model is evaluated with
# no scaler and with each scaler key, and one row per run is appended to 0vt.csv.
# The iteration order (model-major, scaler-minor) fixes the row order in the file.
_no_vt_models = [
    ('rf', {'n': 500}),
    ('mlp', {}),
    ('svm', {'gamma': 'auto'}),
    ('knn', {'n': 10}),
    ('lr', {}),
    ('dt', {}),
    ('r', {'n': None}),
    ('xg', {}),
    ('gpc', {}),
    ('ada', {}),
    ('gb', {'n': 500}),
]
_no_vt_scalers = [None, 'ss', 'mms', 'mas', 'rs', 'pty', 'qtu', 'qtn', 'n']

for _model_key, _model_kwargs in _no_vt_models:
    for _scaler_key in _no_vt_scalers:
        RGS(X_n, y_n, _model_key, _scaler_key, **_model_kwargs).write_result('0vt')

In [None]:
# Baseline grid with variance thresholding enabled: every model is evaluated
# with no scaler and with each scaler key, appending one row per run to 1vt.csv.
# The 'n' (normalize) mode is deliberately left out of this grid; those runs
# were disabled in the original cell.
_vt_models = [
    ('rf', {'n': 500}),
    ('mlp', {}),
    ('svm', {'gamma': 'auto'}),
    ('knn', {'n': 10}),
    ('lr', {}),
    ('dt', {}),
    ('r', {'n': None}),
    ('xg', {}),
    ('gpc', {}),
    ('ada', {}),
    ('gb', {'n': 500}),
]
_vt_scalers = [None, 'ss', 'mms', 'mas', 'rs', 'pty', 'qtu', 'qtn']

for _model_key, _model_kwargs in _vt_models:
    for _scaler_key in _vt_scalers:
        RGS(X_n, y_n, _model_key, _scaler_key, vt = True, **_model_kwargs).write_result('1vt')

In [None]:
# Re-run one hand-picked configuration per model (scaler plus whether variance
# thresholding is used) and collect the results in best_vt_novt.csv.
_best_configs = [
    ('rf', 'qtn', {'n': 500, 'vt': True}),
    ('mlp', 'qtu', {'vt': True}),
    ('svm', 'pty', {'gamma': 'auto', 'vt': True}),
    ('knn', 'qtu', {'n': 10, 'vt': True}),
    ('lr', 'qtu', {}),
    ('dt', 'qtn', {}),
    ('r', 'qtu', {'n': None, 'vt': True}),
    ('xg', 'mas', {}),
    ('gpc', 'qtu', {'vt': True}),
    ('ada', 'mas', {}),
    ('gb', 'pty', {'n': 500, 'vt': True}),
]

for _model_key, _scaler_key, _extra in _best_configs:
    RGS(X_n, y_n, _model_key, _scaler_key, **_extra).write_result('best_vt_novt')

In [None]:
import math

import matplotlib.pyplot as plt

# Sweep the number of features kept by SelectKBest(f_regression) for the
# 'gpc' model with the 'qtu' scaler and variance thresholding, recording the
# mean cross-validated MAE for each setting.
x=[]
y=[]
best = [-1, math.inf]  # [feature count, lowest mean MAE seen so far]

for i in range(330, 0, -10):
    mae = np.mean(RGS(X_n, y_n, 'gpc','qtu', f_selection='kb', kbest_f=f_regression, f = i, vt=True).results_array[1])
    x.append(i)
    y.append(mae)
    if mae < best[1]:
        best[0] = i
        best[1] = mae

# Kept under its own name so the dataset frame `df` loaded earlier is not overwritten.
gpc_kbf_sweep = pd.DataFrame({"Feature Selection Intensity" : x, "MAE" : y})
gpc_kbf_sweep.to_csv("gpc_kbf_qtu.csv", index=False)

print(best)

# Explicit figure so the saved image contains only this sweep's curve.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('GPC (kbf)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('MAE')
fig.savefig('GPC-kbf.png')

In [None]:
import math

import matplotlib.pyplot as plt

# Sweep the number of features kept by random-forest-based selection
# (500 trees) for the 'gpc' model with the 'qtu' scaler and variance
# thresholding, recording the mean cross-validated MAE for each setting.
x=[]
y=[]
best = [-1, math.inf]  # [feature count, lowest mean MAE seen so far]

for i in range(330, 0, -10):
    mae = np.mean(RGS(X_n, y_n, 'gpc','qtu', f_selection='rf', fs_n=500, f = i, vt=True).results_array[1])
    x.append(i)
    y.append(mae)
    if mae < best[1]:
        best[0] = i
        best[1] = mae

# Kept under its own name so the dataset frame `df` loaded earlier is not overwritten.
gpc_rf_sweep = pd.DataFrame({"Feature Selection Intensity" : x, "MAE" : y})
gpc_rf_sweep.to_csv("gpc_rf_qtu.csv", index=False)

print(best)

# Explicit figure so the saved image contains only this sweep's curve.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('GPC (rf)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('MAE')
fig.savefig('GPC-rf.png')

In [None]:
import math

import matplotlib.pyplot as plt

# Sweep the number of PCA components for the 'gpc' model with the 'qtu'
# scaler and variance thresholding, recording the mean cross-validated MAE
# for each setting.
x=[]
y=[]
best = [-1, math.inf]  # [component count, lowest mean MAE seen so far]

for i in range(330, 0, -10):
    mae = np.mean(RGS(X_n, y_n, 'gpc','qtu', d_reduction=1, f = i, vt=True).results_array[1])
    x.append(i)
    y.append(mae)
    if mae < best[1]:
        best[0] = i
        best[1] = mae

# Kept under its own name so the dataset frame `df` loaded earlier is not overwritten.
gpc_pca_sweep = pd.DataFrame({"Feature Selection Intensity" : x, "MAE" : y})
gpc_pca_sweep.to_csv("gpc_pca_qtu.csv", index=False)

print(best)

# Explicit figure so the saved image contains only this sweep's curve.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('GPC (pca)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('MAE')
fig.savefig('GPC-pca.png')

In [None]:
import math

import matplotlib.pyplot as plt

# Sweep the number of features kept by SelectKBest(f_regression) for the
# 'ada' model with the 'mas' scaler, recording the mean cross-validated MAE
# for each setting.
x=[]
y=[]
best = [-1, math.inf]  # [feature count, lowest mean MAE seen so far]

for i in range(330, 0, -10):
    mae = np.mean(RGS(X_n, y_n, 'ada', 'mas', f_selection='kb', kbest_f=f_regression, f = i).results_array[1])
    x.append(i)
    y.append(mae)
    if mae < best[1]:
        best[0] = i
        best[1] = mae

# Kept under its own name so the dataset frame `df` loaded earlier is not overwritten.
ada_kbf_sweep = pd.DataFrame({"Feature Selection Intensity" : x, "MAE" : y})
ada_kbf_sweep.to_csv("ada_kbf_mas.csv", index=False)

print(best)

# Explicit figure so the saved image contains only this sweep's curve.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('ADA (kbf)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('MAE')
fig.savefig('ADA-kbf.png')

In [None]:
import math

import matplotlib.pyplot as plt

# Sweep the number of features kept by random-forest-based selection
# (500 trees) for the 'ada' model with the 'mas' scaler, recording the mean
# cross-validated MAE for each setting.
x=[]
y=[]
best = [-1, math.inf]  # [feature count, lowest mean MAE seen so far]

for i in range(330, 0, -10):
    mae = np.mean(RGS(X_n, y_n, 'ada','mas', f_selection='rf', fs_n=500, f = i).results_array[1])
    x.append(i)
    y.append(mae)
    if mae < best[1]:
        best[0] = i
        best[1] = mae

# Kept under its own name so the dataset frame `df` loaded earlier is not overwritten.
ada_rf_sweep = pd.DataFrame({"Feature Selection Intensity" : x, "MAE" : y})
ada_rf_sweep.to_csv("ada_rf_mas.csv", index=False)

print(best)

# Explicit figure so the saved image contains only this sweep's curve.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('ADA (rf)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('MAE')
fig.savefig('ADA-rf.png')

In [None]:
import math

import matplotlib.pyplot as plt

# Sweep the number of PCA components for the 'ada' model with the 'mas'
# scaler, recording the mean cross-validated MAE for each setting.
x=[]
y=[]
best = [-1, math.inf]  # [component count, lowest mean MAE seen so far]

for i in range(330, 0, -10):
    mae = np.mean(RGS(X_n, y_n, 'ada','mas', d_reduction=1, f = i).results_array[1])
    x.append(i)
    y.append(mae)
    if mae < best[1]:
        best[0] = i
        best[1] = mae

# Kept under its own name so the dataset frame `df` loaded earlier is not overwritten.
ada_pca_sweep = pd.DataFrame({"Feature Selection Intensity" : x, "MAE" : y})
ada_pca_sweep.to_csv("ada_pca_mas.csv", index=False)

print(best)

# Explicit figure so the saved image contains only this sweep's curve.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('ADA (pca)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('MAE')
fig.savefig('ADA-pca.png')

In [None]:
import math

import matplotlib.pyplot as plt

# Sweep the number of features kept by SelectKBest(f_regression) for the
# 'gb' model (500 estimators) with the 'pty' scaler and variance
# thresholding, recording the mean cross-validated MAE for each setting.
x=[]
y=[]
best = [-1, math.inf]  # [feature count, lowest mean MAE seen so far]

for i in range(330, 0, -10):
    mae = np.mean(RGS(X_n, y_n, 'gb', 'pty', n=500, f_selection='kb', kbest_f=f_regression, f = i, vt=True).results_array[1])
    x.append(i)
    y.append(mae)
    if mae < best[1]:
        best[0] = i
        best[1] = mae

# Kept under its own name so the dataset frame `df` loaded earlier is not overwritten.
gb_kbf_sweep = pd.DataFrame({"Feature Selection Intensity" : x, "MAE" : y})
gb_kbf_sweep.to_csv("gb_kbf_pty.csv", index=False)

print(best)

# Explicit figure so the saved image contains only this sweep's curve.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('GB (kbf)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('MAE')
fig.savefig('GB-kbf.png')

In [None]:
# Sweep `f` from 330 down to 10 (step 10) for RGS(..., 'gb', 'pty', n=500,
# f_selection='rf', fs_n=500, vt=True) and record, per setting, the mean of
# results_array[1] (called `mae` here and labelled MAE below).
x = []
y = []
best = [-1, math.inf]  # [f with the lowest mean so far, that mean]

for i in range(330, 0, -10):
    mae = np.mean(RGS(X_n, y_n, 'gb', 'pty', n=500, f_selection='rf', fs_n=500, f=i, vt=True).results_array[1])
    x.append(i)
    y.append(mae)
    if mae < best[1]:
        best = [i, mae]

# Kept under its own name: binding this frame to `df` would replace the dataset
# loaded at the top of the notebook and break later cells that still read the
# dataset through `df`.
gb_rf_sweep = pd.DataFrame({"Feature Selection Intensity": x, "MAE": y})
gb_rf_sweep.to_csv("gb_rf_pty.csv", index=False)

print(best)

# Draw on a dedicated figure so the saved PNG holds only this sweep.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('GB (rf)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('MAE')
fig.savefig('GB-rf.png')

In [None]:
# Sweep `f` from 330 down to 10 (step 10) for RGS(..., 'gb', 'pty', n=500,
# d_reduction=1, vt=True) -- the run labelled 'pca' in the outputs below -- and
# record, per setting, the mean of results_array[1] (called `mae` here).
x = []
y = []
best = [-1, math.inf]  # [f with the lowest mean so far, that mean]

for i in range(330, 0, -10):
    mae = np.mean(RGS(X_n, y_n, 'gb', 'pty', n=500, d_reduction=1, f=i, vt=True).results_array[1])
    x.append(i)
    y.append(mae)
    if mae < best[1]:
        best = [i, mae]

# Kept under its own name: binding this frame to `df` would replace the dataset
# loaded at the top of the notebook and break later cells that still read the
# dataset through `df`.
gb_pca_sweep = pd.DataFrame({"Feature Selection Intensity": x, "MAE": y})
gb_pca_sweep.to_csv("gb_pca_pty.csv", index=False)

print(best)

# Draw on a dedicated figure so the saved PNG holds only this sweep.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title('GB (pca)')
ax.set_xlabel('Feature Selection Intensity')
ax.set_ylabel('MAE')
fig.savefig('GB-pca.png')

In [None]:
# Run every model/scaler pair under four set-ups -- no selection,
# f_selection='kb', f_selection='rf' and d_reduction=1 -- and hand each run to
# write_result('best-feature-selection').
#
# One row per model:
#   (model, scaler, options shared by all four runs,
#    f for the 'kb' run, f for the 'rf' run, f for the d_reduction run)
best_fs_grid = [
    ('rf',  'qtn', dict(n=500, vt=True),         70,  20,  20),
    ('mlp', 'qtu', dict(vt=True),                50,  10, 150),
    ('svm', 'pty', dict(gamma='auto', vt=True),  10,  10, 330),
    ('knn', 'qtu', dict(n=10, vt=True),         150,  30,  10),
    ('lr',  'qtu', dict(),                      320, 230, 100),
    ('dt',  'qtn', dict(),                      300,  50,  10),
    ('r',   'qtu', dict(n=None, vt=True),        50,  10,  20),
    ('xg',  'mas', dict(),                      280, 250,  30),
    ('gpc', 'qtu', dict(vt=True),               210, 150,  20),
    ('ada', 'mas', dict(),                      300, 330,  60),
    ('gb',  'pty', dict(n=500, vt=True),        330, 270,  20),
]

# Expand each row into its four runs, keeping the model order and, within a
# model, the order: plain, 'kb', 'rf', d_reduction.
best_fs_runs = [
    (model, scaler, {**shared, **extra})
    for model, scaler, shared, kb_f, rf_f, dr_f in best_fs_grid
    for extra in (
        {},
        dict(f_selection='kb', kbest_f=f_regression, f=kb_f),
        dict(f_selection='rf', fs_n=500, f=rf_f),
        dict(d_reduction=1, f=dr_f),
    )
]

for fs_model, fs_scaler, fs_options in best_fs_runs[:-1]:
    RGS(X_n, y_n, fs_model, fs_scaler, **fs_options).write_result('best-feature-selection')

# The final run stays a bare trailing expression so the cell displays whatever
# write_result returns, exactly as the hand-written list of calls did.
fs_model, fs_scaler, fs_options = best_fs_runs[-1]
RGS(X_n, y_n, fs_model, fs_scaler, **fs_options).write_result('best-feature-selection')

In [None]:
# One chosen configuration per model; every run is handed to
# write_result('best-model-tunings-Maven-regression').
# Each entry: (model, scaler, keyword options for RGS).
best_tuning_runs = [
    ('rf',  'qtn', dict(f_selection='rf', fs_n=500, f=20, n=500, vt=True)),
    ('mlp', 'qtu', dict(f_selection='kb', kbest_f=f_regression, f=50, vt=True)),
    ('svm', 'pty', dict(gamma='auto', f_selection='kb', kbest_f=f_regression, f=10, vt=True)),
    ('knn', 'qtu', dict(n=10, f_selection='rf', fs_n=500, f=30, vt=True)),
    ('lr',  'qtu', dict(d_reduction=1, f=100)),
    ('dt',  'qtn', dict(f_selection='rf', fs_n=500, f=50)),
    ('r',   'qtu', dict(n=None, f_selection='kb', kbest_f=f_regression, f=50, vt=True)),
    ('xg',  'mas', dict(f_selection='rf', fs_n=500, f=250)),
    ('gpc', 'qtu', dict(f_selection='rf', fs_n=500, f=150, vt=True)),
    ('ada', 'mas', dict()),
    ('gb',  'pty', dict(n=500, f_selection='rf', fs_n=500, f=270, vt=True)),
]

for tuning_model, tuning_scaler, tuning_options in best_tuning_runs[:-1]:
    RGS(X_n, y_n, tuning_model, tuning_scaler, **tuning_options).write_result('best-model-tunings-Maven-regression')

# The final run stays a bare trailing expression so the cell displays whatever
# write_result returns, exactly as the hand-written list of calls did.
tuning_model, tuning_scaler, tuning_options = best_tuning_runs[-1]
RGS(X_n, y_n, tuning_model, tuning_scaler, **tuning_options).write_result('best-model-tunings-Maven-regression')

In [None]:
# Build one RGS per model with permute=True, using one configuration per model.
# Each entry: (model, scaler, keyword options for RGS besides permute).
permutation_runs = [
    ('rf',  'qtn', dict(f_selection='rf', fs_n=500, f=20, n=500, vt=True)),
    ('mlp', 'qtu', dict(f_selection='kb', kbest_f=f_regression, f=50, vt=True)),
    ('svm', 'pty', dict(gamma='auto', f_selection='kb', kbest_f=f_regression, f=10, vt=True)),
    ('knn', 'qtu', dict(n=10, f_selection='rf', fs_n=500, f=30, vt=True)),
    ('lr',  'qtu', dict(d_reduction=1, f=100)),
    ('dt',  'qtn', dict(f_selection='rf', fs_n=500, f=50)),
    ('r',   'qtu', dict(n=None, f_selection='kb', kbest_f=f_regression, f=50, vt=True)),
    ('xg',  'mas', dict(f_selection='rf', fs_n=500, f=250)),
    ('gpc', 'qtu', dict(f_selection='rf', fs_n=500, f=150, vt=True)),
    ('ada', 'mas', dict()),
    ('gb',  'pty', dict(n=500, f_selection='rf', fs_n=500, f=270, vt=True)),
]

for permute_model, permute_scaler, permute_options in permutation_runs[:-1]:
    RGS(X_n, y_n, permute_model, permute_scaler, **permute_options, permute=True)

# The last RGS object remains the cell's trailing expression, so the notebook
# still displays it as it did with the hand-written list of calls.
permute_model, permute_scaler, permute_options = permutation_runs[-1]
RGS(X_n, y_n, permute_model, permute_scaler, **permute_options, permute=True)

In [10]:
import csv

#Calculating importance frequency and weighted frequency for each individual model
def importance_frequency(filename):
    """Total the positive importances per feature for one model and save them.

    Reads ``Importance/<filename>.csv`` (every row is data; there is no header).
    For each row whose fourth column is greater than zero, that value is added
    to the running total kept for the name in the first column. The totals are
    written, largest first, to
    ``Importance/Frequency-Count/<filename>-Frequency.csv`` as ``name,total``
    rows without a header.

    Parameters
    ----------
    filename : str
        Base name of the input file, without directory or ``.csv`` extension.

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If ``Importance/<filename>.csv`` does not exist.
    IndexError
        If a non-empty row has fewer than four columns.
    ValueError
        If the fourth column of a row is not a number.
    """
    table = {}
    # `with` closes the input even if a row fails to parse; csv.reader copes
    # with quoted fields that a plain split(',') would cut in the wrong place.
    with open('Importance/' + filename + '.csv', 'r', newline='') as importance_file:
        for record in csv.reader(importance_file):
            if not record:
                # blank line, e.g. a trailing newline at the end of the file
                continue
            value = float(record[3])
            if value > 0:
                table[record[0]] = table.get(record[0], 0) + value

    ranked = sorted(table.items(), key=lambda item: item[1], reverse=True)

    # Make sure the target folder exists so a fresh checkout can run this cell.
    os.makedirs("Importance/Frequency-Count", exist_ok=True)
    # newline='' is what the csv module expects; without it Windows writes an
    # empty line after every row.
    with open("Importance/Frequency-Count/" + filename + "-Frequency.csv", 'w', newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerows(ranked)

In [15]:
# Produce the per-model frequency file for every permutation-importance report,
# in the same order as before.
pi_reports = (
    'PI-ADA', 'PI-DT', 'PI-GPC', 'PI-GB', 'PI-KNN', 'PI-MLP',
    'PI-RF', 'PI-R', 'PI-LR', 'PI-SVR', 'PI-XGB',
)
for pi_report in pi_reports:
    importance_frequency(pi_report)

In [16]:
# Calculating the cumulative importance frequency (every single model combined)

# import required modules
import csv
import os

# directory holding the per-model frequency files
directory = 'Importance/Frequency-Count'

# feature name -> value summed over every file found in `directory`
table = {}
# Sorted so the accumulation order (and with it tie order and float rounding)
# is the same on every machine; os.listdir makes no ordering promise.
for filename in sorted(os.listdir(directory)):
    path = os.path.join(directory, filename)
    # skip sub-directories
    if not os.path.isfile(path):
        continue
    # `with` closes each file as soon as it has been read
    with open(path, 'r', newline='') as frequency_file:
        for record in csv.reader(frequency_file):
            if not record:
                # blank line
                continue
            table[record[0]] = table.get(record[0], 0) + float(record[1])

# (feature, total) pairs, largest total first; reused by the cells below
list_table = sorted(table.items(), key=lambda item: item[1], reverse=True)

# newline='' is what the csv module expects; without it Windows writes an empty
# line after every row.
with open("Importance/Cumulative-Frequency.csv", 'w', newline='') as csv_file:
    writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerows(list_table)

In [17]:
# Add up the second element of every (name, value) pair in list_table.
# A plain running total is kept on purpose so the floating-point additions
# happen in list order.
count = 0
for feature_name, feature_total in list_table:
    count = count + feature_total
count

1000.9200000000001

In [18]:
# Metric abbreviations grouped by the category they are reported under.
# Category order and the order of names inside each category fix the insertion
# order of metrics_to_category_dict below.
_metrics_by_category = {
    'cohesion': [
        'LCOM5',
    ],
    'complexity': [
        'HCPL', 'HDIF', 'HEFF', 'HNDB', 'HPL', 'HPV', 'HTRP', 'HVOL',
        'MIMS', 'MI', 'MISEI', 'MISM', 'McCC', 'NL', 'NLE', 'WMC',
    ],
    'coupling': [
        'CBO', 'CBOI', 'NII', 'NOI', 'RFC',
    ],
    'documentation': [
        'AD', 'CD', 'CLOC', 'DLOC', 'PDA', 'PUA',
        'TAD', 'TCD', 'TCLOC', 'TPDA', 'TPUA',
    ],
    'inheritance': [
        'DIT', 'NOA', 'NOC', 'NOD', 'NOP',
    ],
    'size': [
        'LOC', 'LLOC', 'NA', 'NCL', 'NEN', 'NG', 'NIN', 'NLA', 'NLG', 'NLM',
        'NLPA', 'NLPM', 'NLS', 'NM', 'NPKG', 'NUMPAR', 'NPA', 'NPM', 'NS', 'NOS',
        'TLOC', 'TLLOC', 'TNA', 'TNCL', 'TNDI', 'TNEN', 'TNFI', 'TNG', 'TNIN',
        'TNLA', 'TNLG', 'TNLM', 'TNLPA', 'TNLPM', 'TNLS', 'TNM', 'TNPKG',
        'TNPA', 'TNPCL', 'TNPEN', 'TNPIN', 'TNPM', 'TNS', 'TNOS',
        'F', 'M', 'C',
    ],
}

# Flat lookup: metric abbreviation -> category name.
metrics_to_category_dict = {
    metric: category_name
    for category_name, metric_names in _metrics_by_category.items()
    for metric in metric_names
}

In [19]:
# Load the headerless cumulative totals and give the two columns names.
importance = pd.read_csv("Importance/Cumulative-Frequency.csv", sep=",", encoding='utf-8', names=['features', 'c_importance'])

# A feature name made of exactly two '_'-separated parts is looked up by its
# second part in metrics_to_category_dict (None when the part is not a key);
# every other name is labelled 'external'.
category = []
for feature_name in importance['features']:
    name_parts = feature_name.split('_')
    if len(name_parts) != 2:
        category.append('external')
    else:
        category.append(metrics_to_category_dict.get(name_parts[1]))

importance['category'] = category
importance.to_csv("Importance/cumulative.csv", index=False)

In [20]:
# Total the cumulative importance per metric category.
#
# The labelled table lives in Importance/cumulative.csv (header row with the
# 'c_importance' and 'category' columns). Importance/Cumulative-Frequency.csv
# is the headerless two-column file and has no 'category' column, so reading it
# here fails with a KeyError on a fresh run.
importance_category = pd.read_csv("Importance/cumulative.csv", sep=",", encoding='utf-8')

# category -> summed c_importance
table = {}
for index, row in importance_category.iterrows():
    table[row['category']] = table.get(row['category'], 0) + row['c_importance']

# (category, total) pairs, largest total first
list_table = sorted(table.items(), key=lambda item: item[1], reverse=True)

# newline='' is what the csv module expects; without it Windows writes an empty
# line after every row.
with open("Importance/importance-category.csv", 'w', newline='') as csv_file:
    writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerows(list_table)

list_table

[('size', 457.1890000000002),
 ('documentation', 175.97300000000007),
 ('coupling', 133.45999999999998),
 ('complexity', 118.15199999999994),
 ('inheritance', 66.446),
 ('cohesion', 49.70000000000001)]

In [21]:
# First character of a feature name -> granularity label.
metrics_granularity_dict = {
    'C': 'class',
    'F': 'file',
    'M': 'method',
    'N': 'repository'
}

# Read the table that has a header row with a 'features' column
# (Importance/cumulative.csv). The headerless Cumulative-Frequency.csv has no
# such column, so row['features'] fails with a KeyError on a fresh run; reading
# cumulative.csv also keeps its existing columns when the file is rewritten.
importance = pd.read_csv("Importance/cumulative.csv", sep=",", encoding='utf-8')

# Names whose first character is not a key of the mapping count as 'repository'.
granularity = [
    metrics_granularity_dict.get(feature_name[0], "repository")
    for feature_name in importance['features']
]

importance['granularity'] = granularity
importance.to_csv("Importance/cumulative.csv", index=False)

In [22]:
# Total the cumulative importance per granularity label.
#
# This needs the 'granularity' and 'c_importance' columns, which are stored in
# Importance/cumulative.csv; the headerless Cumulative-Frequency.csv has
# neither, so reading it here fails with a KeyError on a fresh run.
importance_metrics = pd.read_csv("Importance/cumulative.csv", sep=",", encoding='utf-8')

# granularity -> summed c_importance
table = {}
for index, row in importance_metrics.iterrows():
    table[row['granularity']] = table.get(row['granularity'], 0) + row['c_importance']

# (granularity, total) pairs, largest total first
list_table = sorted(table.items(), key=lambda item: item[1], reverse=True)

# newline='' is what the csv module expects; without it Windows writes an empty
# line after every row.
with open("Importance/granularity.csv", 'w', newline='') as csv_file:
    writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerows(list_table)

list_table

[('class', 592.1640000000001),
 ('file', 149.846),
 ('method', 145.698),
 ('repository', 113.21200000000002)]

In [None]:
from sklearn.ensemble import VotingRegressor

# Combine the model returned by get_model() for each configuration into one
# averaging ensemble and score it on a held-out split.

# (name, regressor) pairs for VotingRegressor. Started from an empty list here
# so the cell does not depend on (or append to) a list left over from another cell.
# NOTE(review): 'squared_loss' was renamed 'squared_error' in newer
# scikit-learn releases -- confirm how RGS forwards `l` to SGDRegressor.
estimator = []
estimator.append(('rf', RGS(X_n, y_n, 'rf','qtu', f_selection='kb', kbest_f=f_regression, f = 50, n=500, vt=True).get_model()))
estimator.append(('mlp', RGS(X_n, y_n, 'mlp','qtn', f_selection='rf', fs_n=500, f = 50, n=500, vt=True).get_model()))
estimator.append(('sgdh', RGS(X_n, y_n, 'sgd','rs', l='huber', f_selection='rf', fs_n=500, f = 100, vt=True).get_model()))
estimator.append(('sgdsl', RGS(X_n, y_n, 'sgd','mms', l='squared_loss', d_reduction=1, f = 100, vt=True).get_model()))
estimator.append(('sgdei', RGS(X_n, y_n, 'sgd','ss', l='epsilon_insensitive', f_selection='rf', fs_n=500, f = 100).get_model()))
estimator.append(('sgdsei', RGS(X_n, y_n, 'sgd','n', l='squared_epsilon_insensitive', f_selection='rf', fs_n=500, f = 200).get_model()))
estimator.append(('svr', RGS(X_n, y_n, 'svm','ss', f_selection='kb', kbest_f=f_regression, f = 50, gamma='auto', vt=True).get_model()))
estimator.append(('knn', RGS(X_n, y_n, 'knn','n', f_selection='rf', fs_n=500, f = 50, n=10).get_model()))
estimator.append(('r', RGS(X_n, y_n, 'r','n', n=None, d_reduction=1, f = 100).get_model()))

# VotingRegressor averages its members' predictions; `voting='hard'` is a
# VotingClassifier option and is rejected here.
rgs = VotingRegressor(estimators=estimator)
X_train, X_test, y_train, y_test = train_test_split(X_n, y_n, random_state=27033074)
# Fit the ensemble that is used for prediction below (previously an unrelated
# `clf` was fitted and `rgs` was left unfitted).
rgs.fit(X_train, y_train)
y_pred = rgs.predict(X_test)

# Regression metrics: classification_report does not accept continuous targets.
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R2:", r2_score(y_test, y_pred))
rgs.estimators_

In [None]:
# Number of entries in scores_ on the second-to-last element of q.clf.
# NOTE(review): `q` is not assigned in the nearby cells -- confirm which object
# it refers to before relying on this cell after a kernel restart.
len(q.clf[-2].scores_)

In [None]:
# Pair each value in q.clf[-2].scores_ with the column name at the same
# position in q.X, order the pairs by score (largest first) and save them.
x = pd.DataFrame(
    list(zip(q.clf[-2].scores_, q.X.columns.values)),
    columns=["Importance","Feature_Name"],
).sort_values(by=['Importance'], ascending=False)
x.to_csv('kb_f_classiflog' +  ".csv", index=False)

In [None]:
# Display the estimators_ attribute of `clf`.
# NOTE(review): `clf` is not assigned in the nearby cells; presumably a fitted
# ensemble from another cell -- confirm.
clf.estimators_

In [None]:
#importance = np.abs(classifier.coef_)
#importance = importance[0]
#feature_names = np.array(Xp.columns)

In [None]:
# Collect one prediction per row from the_regressor.rgs, one row at a time.
# NOTE(review): the loop length comes from df['reusability'] while the inputs
# come from X_n, and the sweep cells above rebind `df` to their results frame
# -- confirm `df` is still the dataset (with as many rows as X_n) at this point.
pred_array = []
for i in range(len(df['reusability'])):
    # predict() is given a one-element list; [0] unwraps its single result
    pred_array.append(the_regressor.rgs.predict([X_n.iloc[i]])[0])