In [1]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import resample
from sklearn import preprocessing
import statistics
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, log_loss, average_precision_score
from random import sample, randint, randrange
from copy import deepcopy
from sklearn.model_selection import cross_validate

In [2]:
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

In [3]:
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(newsgroups_train.data)
y_train = newsgroups_train.target
x_test = vectorizer.transform(newsgroups_test.data)
y_test = newsgroups_test.target
classes_count = 20

Change dimensions to 500

In [4]:
svd = TruncatedSVD(n_components=500)
svd.fit(x_train)
svd_x_train = svd.transform(x_train)
svd_x_test = svd.transform(x_test)

In [5]:
svd_x_train = np.concatenate((svd_x_train, svd_x_test))
y_train = np.concatenate((y_train, y_test))

SVM on full data and full features

In [6]:
clf = SVC(probability=True)

In [7]:
scores = cross_validate(clf, svd_x_train, y_train, cv=5, scoring=('accuracy', 
                                                    'neg_log_loss', 
                                                    'neg_mean_squared_error', 
                                                    'roc_auc_ovr', 
                                                    'f1_weighted', 
                                                    'precision_weighted', 
                                                    'recall_weighted'))

In [8]:
scores

{'fit_time': array([769.33637071, 772.59549665, 749.85074735, 752.04376197, 725.95951343])
'score_time': array([63.4211514 , 59.82949114, 59.67970324, 59.71268415, 58.05852199])
'test_accuracy': array([0.87718833, 0.88591138, 0.88750332, 0.83788803, 0.83974529])
'test_neg_log_loss': array([-0.41717589, -0.40097721, -0.39930141, -0.52782357, -0.51695601])
'test_neg_mean_squared_error': array([-6.40159151, -6.63305917, -4.97399841, -9.20960467, -9.45608915])
'test_roc_auc_ovr': array([0.99351403, 0.99341685, 0.99371947, 0.9886126 , 0.98974137])
'test_f1_weighted': array([0.87848693, 0.88579029, 0.88833248, 0.83790885, 0.84019642])
'test_precision_weighted': array([0.88215933, 0.88701031, 0.89080091, 0.84100458, 0.84330728])
'test_recall_weighted': array([0.87718833, 0.88591138, 0.88750332, 0.83788803, 0.83974529])}

10 sub-classifiers

In [9]:
class MinorClassifiers:
    def __init__(self, samp, feat, voting, max_iter=-1):
        self.samp = samp
        self.feat = feat
        self.classifiers = []
        self.predictions = []
        self.cut_features = []
        self.voting = voting
        self.max_iter = max_iter
    
    def get_params(self, deep = False):
        return {
            'samp': self.samp,
            'feat': self.feat,
            'voting': self.voting,
            'max_iter': self.max_iter
        }
    
    def predict(self, X):
        for i in range(len(self.classifiers)):
            classifier = self.classifiers[i]
            f = self.cut_features[i]
            x_test = X[:,f]
            y_pred = classifier.predict(x_test)
            pred = classifier.predict_proba(x_test)
            self.predictions.append((y_pred, pred))
            
        if self.voting == 'average':
            return average_pred(self.predictions)
        
        if self.voting == 'majority':
            return majority_pred(self.predictions)
        
        if self.voting == 'borda':
            return borda_pred(self.predictions)
    
    def predict_proba(self, X):
        if self.voting == 'average':
            return average_pred_proba(self.predictions, len(self.classifiers))
        
        if self.voting == 'majority':
            return majority_pred_proba(self.predictions, len(self.classifiers))
        
        if self.voting == 'borda':
            return borda_pred_proba(self.predictions, len(self.classifiers))
        
    def fit(self, X, Y):
        feature_list = [n for n in range(500)]
        samples_all = X.shape[0]
        features_all = X.shape[1]
        
        for i in range(10):
            f = sample(feature_list, int(features_all * self.feat))
            self.cut_features.append(f)
            x_train_f = X[:,f]
                
            x_train_s, y_train_s = resample(x_train_f, Y, n_samples=int(self.samp * samples_all), replace=False, random_state=0)

            svm_clf = SVC(probability=True, max_iter=self.max_iter)
            svm_clf.fit(x_train_s, y_train_s)
                
            self.classifiers.append(svm_clf)

In [10]:
def average_pred(predictions):
    #predictions = list of tuples 
    m = len(predictions[0][0])
    all_results = [[0 for x in range(classes_count)] for y in range(m)] 
    results = [0] * m
    for (_, pred_proba) in predictions:
        for i in range(m):
            for j in range(classes_count):
                all_results[i][j] += pred_proba[i][j]
    for i in range(m):
        results[i] = all_results[i].index(max(all_results[i]))
    return results

def average_pred_proba(predictions, n_classifiers):
    m = len(predictions[0][0])
    results = [[0 for x in range(classes_count)] for y in range(m)]
    for (_, pred_proba) in predictions:
        for i in range(m):
            for j in range(classes_count):
                results[i][j] += pred_proba[i][j]
    for i in range(m):
        for j in range(classes_count):
            results[i][j] /= n_classifiers            
    return results

def majority_pred(predictions):
    m = len(predictions[0][0])
    results = [0] * m
    for i in range(m):
        all_results = [0 for x in range(classes_count)] 
        for (pred, _) in predictions:
            all_results[pred[i]] += 1
        results[i] = all_results.index(max(all_results))        
    return results

def majority_pred_proba(predictions, n_classifiers):
    m = len(predictions[0][0])
    results = [[0 for x in range(classes_count)] for y in range(m)]
    classifiers_votes_count = [0] * m
    majority_results = majority_pred(predictions)
    for classifier in range(0, n_classifiers):
        for i in range(m):
            voted_class = majority_results[i] 
            (pred, pred_proba) = predictions[classifier]
            if(pred[i] == voted_class):
                classifiers_votes_count[i] += 1
                for j in range(classes_count):
                    results[i][j] += pred_proba[i][j]
    for i in range(m):
        for j in range(classes_count):
            results[i][j] /= classifiers_votes_count[i]
    return results      
    

def borda_pred(predictions):
    m = len(predictions[0][0])
    all_results = [[0 for x in range(classes_count)] for y in range(m)] 
    results = [0] * m
    
    def get_final_borda_points(predictions):
        return np.argsort(np.argsort(predictions)).tolist()

    for (_, pred_proba) in predictions:
        for i in range(m):
            pred_proba[i] = get_final_borda_points(pred_proba[i])
    for (_, pred_proba) in predictions:
        for i in range(m):
            for j in range(classes_count):
                all_results[i][j] += pred_proba[i][j]
    for i in range(m):
        results[i] = all_results[i].index(max(all_results[i]))
    return results

def borda_pred_proba(predictions, n_classifiers):
    m = len(predictions[0][0])
    results = [[0 for x in range(classes_count)] for y in range(m)] 
    
    def get_final_borda_points(predictions):
        return np.argsort(np.argsort(predictions)).tolist()

    def get_points():
        sum = 0
        for i in range(classes_count):
            sum += i
        return sum * n_classifiers
    
    for (_, pred_proba) in predictions:
        for i in range(m):
            pred_proba[i] = get_final_borda_points(pred_proba[i])
    for (_, pred_proba) in predictions:
        for i in range(m):
            for j in range(classes_count):
                results[i][j] += pred_proba[i][j]
    for i in range(m):
        for j in range(classes_count):
            results[i][j] /= get_points()
        
    return results

In [11]:
n_samples = [0.1, 0.35, 0.7]
n_features = [0.25, 0.5, 0.75]
minors = []

In [12]:
for n in n_samples:
    minors.append(MinorClassifiers(n, 1, 'average'))
    
for n in n_features:
    minors.append(MinorClassifiers(1, n, 'average'))

In [13]:
for minor in minors:
    scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
    print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
    print(scores)

Samples: 0.1, features: 1
{'fit_time': array([146.8439374 , 144.80648851, 143.79742622, 145.58774638, 147.87192154])
'score_time': array([80.36888385, 81.76582313, 79.61906075, 80.60673547, 80.74169636])
'test_accuracy': array([0.74801061, 0.75033165, 0.74714778, 0.71185991, 0.7153091 ])
'test_neg_log_loss': array([-1.03524361, -1.00071083, -1.02679065, -1.01529485, -0.98524062]), 'test_neg_mean_squared_error': array([-13.54403183, -13.90342266, -15.05465641, -17.12470151, 16.24383125])
'test_roc_auc_ovr': array([0.96871381, 0.97247651, 0.97267578, 0.96675164, 0.97066632])
'test_f1_weighted': array([0.75050395, 0.75064552, 0.7487207 , 0.71419086, 0.7175339 ]), 'test_precision_weighted': array([0.75794872, 0.75521117, 0.75493644, 0.72484796, 0.72781026]), 'test_recall_weighted': array([0.74801061, 0.75033165, 0.74714778, 0.71185991, 0.7153091 ])}
Samples: 0.35, features: 1
{'fit_time': array([1245.0476675 , 1233.07420564, 1240.46790099, 1298.18706274, 1375.52531981]), 'score_time': arra

In [14]:
minors = []
for n in n_samples:
    minors.append(MinorClassifiers(n, 1, 'majority'))
    
for n in n_features:
    minors.append(MinorClassifiers(1, n, 'majority'))

In [15]:
for minor in minors:
    scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
    print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
    print(scores)

Samples: 0.1, features: 1
{'fit_time': array([148.16808438, 144.66432762, 150.11508965, 146.71330357, 152.67174196])
'score_time': array([79.51726079, 80.98261666, 81.92676306, 80.95609522, 91.16656756])
'test_accuracy': array([0.74217507, 0.7460865 , 0.73839215, 0.70443088, 0.70018573])
'test_neg_log_loss': array([-1.03196103, -1.00154019, -1.02316711, -1.01814237, -0.98306089]), 'test_neg_mean_squared_error': array([-13.57824934, -12.46590608, -13.52241974, -16.59007694,   -15.47704962])
'test_roc_auc_ovr': array([0.96893701, 0.97255294, 0.97254623, 0.96643783, 0.9706301 ])
'test_f1_weighted': array([0.74581488, 0.74695513, 0.73995017, 0.70787399, 0.7053624 ]), 'test_precision_weighted': array([0.76351305, 0.7598769 , 0.75137709, 0.72947445, 0.73186469]), 'test_recall_weighted': array([0.74217507, 0.7460865 , 0.73839215, 0.70443088, 0.70018573])}
Samples: 0.35, features: 1
{'fit_time': array([1358.10335183, 1233.21016026, 1266.36326027, 1428.89073133, 1407.69289255]), 'score_time': a

In [16]:
minors = []
for n in n_samples:
    minors.append(MinorClassifiers(n, 1, 'borda'))
    
for n in n_features:
    minors.append(MinorClassifiers(1, n, 'borda'))

In [17]:
for minor in minors:
    scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
    print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
    print(scores)

Samples: 0.1, features: 1
{'fit_time': array([143.1978569 , 143.10595918, 143.31464124, 144.04252505,
       144.58688164]), 'score_time': array([80.07836199, 79.91525245, 79.21023536, 80.16977859, 80.10908532]), 'test_accuracy': array([0.74827586, 0.7508623 , 0.74661714, 0.71318652, 0.71477846]), 'test_neg_log_loss': array([-2.40542955, -2.40699472, -2.39584182, -2.41391118, -2.38489124]), 'test_neg_mean_squared_error': array([-13.16312997, -14.0498806 , -15.08357655, -17.29981427,
       -15.73282038]), 'test_roc_auc_ovr': array([0.95123034, 0.95686273, 0.95652897, 0.94880725, 0.95190248]), 'test_f1_weighted': array([0.75050928, 0.75082233, 0.74826756, 0.71584472, 0.71695438]), 'test_precision_weighted': array([0.75700721, 0.75539527, 0.75482369, 0.72681102, 0.72742506]), 'test_recall_weighted': array([0.74827586, 0.7508623 , 0.74661714, 0.71318652, 0.71477846])}
Samples: 0.35, features: 1
{'fit_time': array([1222.33014894, 1214.1776011 , 1220.71215248, 1255.46160603,
       1255.329

3a

In [18]:
parts = [0.1, 0.25, 0.5, 0.75] ## for 1.0 already calculated

In [19]:
n_samples = [0.1, 0.35, 0.7]
n_features = [0.25, 0.5, 0.75]
minors = []

In [20]:
for n in n_samples:
    minors.append(MinorClassifiers(n, 1, 'average'))
    
for n in n_features:
    minors.append(MinorClassifiers(1, n, 'average'))

In [21]:
for p in parts:
    new_X = svd_x_train[:int(p*svd_x_train.shape[0])]
    new_Y = y_train[:int(p*y_train.shape[0])]
    print("Data " + str(p))
    clf = SVC(probability=True)
    scores = cross_validate(clf, new_X, new_Y, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
    print("Full")
    print(scores)
    for minor in minors:
        scores = cross_validate(minor, new_X, new_Y, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
        print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
        print(scores)

Data 0.1
Full
{'fit_time': array([14.91580057, 15.06390715, 14.86017299, 15.14462852, 14.84050035]), 'score_time': array([0.81200314, 0.88047385, 0.82571363, 0.86820745, 0.83200431]), 'test_accuracy': array([0.74005305, 0.76392573, 0.75596817, 0.77984085, 0.76329787]), 'test_neg_log_loss': array([-0.92504631, -0.88011954, -0.86235334, -0.91682823, -0.8994923 ]), 'test_neg_mean_squared_error': array([-12.45623342, -11.89124668, -10.86737401, -15.56233422, -10.68882979]), 'test_roc_auc_ovr': array([0.96962844, 0.9748243 , 0.97635786, 0.97246687, 0.97234457]), 'test_f1_weighted': array([0.74120052, 0.76714738, 0.75899161, 0.78198566, 0.76637321]), 'test_precision_weighted': array([0.7736512 , 0.79603742, 0.78608896, 0.80605311, 0.80339669]), 'test_recall_weighted': array([0.74005305, 0.76392573, 0.75596817, 0.77984085, 0.76329787])}
Samples: 0.1, features: 1
{'fit_time': array([2.04304957, 2.02259231, 2.00164914, 2.03755426, 2.02558446]),
'score_time': array([0.97146487, 1.01927519, 1.024

3b

In [22]:
clf = SVC(probability=True, max_iter=300)
scores = cross_validate(clf, svd_x_train, y_train, cv=5, scoring=('accuracy', 
                                                    'neg_log_loss', 
                                                    'neg_mean_squared_error', 
                                                    'roc_auc_ovr', 
                                                    'f1_weighted', 
                                                    'precision_weighted', 
                                                    'recall_weighted'))
print(scores)



{'fit_time': array([562.93116689, 562.83924198, 561.86122465, 558.63007069,
       557.33135796]), 'score_time': array([50.83319187, 51.06287003, 50.92495847, 50.41782594, 50.80374217]), 'test_accuracy': array([0.87188329, 0.88936057, 0.88193155, 0.83921465, 0.8410719 ]), 'test_neg_log_loss': array([-0.42623447, -0.40478624, -0.40388631, -0.53291046, -0.51709512]), 'test_neg_mean_squared_error': array([-6.81909814, -5.86256301, -5.16741841, -8.82674449, -9.32793845]), 'test_roc_auc_ovr': array([0.9933504 , 0.9934361 , 0.99375567, 0.98848411, 0.98978386]), 'test_f1_weighted': array([0.87318645, 0.8891092 , 0.88278395, 0.83897054, 0.84150245]), 'test_precision_weighted': array([0.8771908 , 0.89048686, 0.88506253, 0.84157337, 0.84512648]), 'test_recall_weighted': array([0.87188329, 0.88936057, 0.88193155, 0.83921465, 0.8410719 ])}


In [23]:
clf = SVC(probability=True, max_iter=130)
scores = cross_validate(clf, svd_x_train, y_train, cv=5, scoring=('accuracy', 
                                                    'neg_log_loss', 
                                                    'neg_mean_squared_error', 
                                                    'roc_auc_ovr', 
                                                    'f1_weighted', 
                                                    'precision_weighted', 
                                                    'recall_weighted'))
print(scores)



{'fit_time': array([349.97228909, 332.89484262, 332.94227552, 331.75345349,
       331.66004205]), 'score_time': array([40.90028977, 41.29549599, 41.03101397, 40.17039728, 40.43398356]), 'test_accuracy': array([0.83846154, 0.85062351, 0.85009286, 0.81905015, 0.81666224]), 'test_neg_log_loss': array([-0.62555108, -0.62987654, -0.6286315 , -0.66268624, -0.68706849]), 'test_neg_mean_squared_error': array([ -7.9872679 ,  -7.75723003,  -6.62642611, -10.20801274,
       -11.19899178]), 'test_roc_auc_ovr': array([0.9903969 , 0.9895292 , 0.98983432, 0.98681415, 0.98668778]), 'test_f1_weighted': array([0.83867623, 0.85029454, 0.85089911, 0.8190994 , 0.81751722]), 'test_precision_weighted': array([0.84271581, 0.85269431, 0.85598384, 0.82667508, 0.82855683]), 'test_recall_weighted': array([0.83846154, 0.85062351, 0.85009286, 0.81905015, 0.81666224])}


In [24]:
clf = SVC(probability=True, max_iter=60)
scores = cross_validate(clf, svd_x_train, y_train, cv=5, scoring=('accuracy', 
                                                    'neg_log_loss', 
                                                    'neg_mean_squared_error', 
                                                    'roc_auc_ovr', 
                                                    'f1_weighted', 
                                                    'precision_weighted', 
                                                    'recall_weighted'))
print(scores)



{'fit_time': array([160.55420876, 160.31492424, 159.88336587, 159.85737753,
       161.1360631 ]), 'score_time': array([28.85886645, 28.78711581, 29.04045343, 28.20144725, 28.12108278]), 'test_accuracy': array([0.74960212, 0.74980101, 0.75855665, 0.73494296, 0.75616875]), 'test_neg_log_loss': array([-1.06782259, -1.08355699, -1.08235768, -1.08656571, -1.04117837]), 'test_neg_mean_squared_error': array([-14.85251989, -15.80737596, -13.89519767, -18.45741576,
       -15.74980101]), 'test_roc_auc_ovr': array([0.98045306, 0.97840346, 0.98050336, 0.97573777, 0.97877233]), 'test_f1_weighted': array([0.75122   , 0.752579  , 0.76120373, 0.74565294, 0.75955601]), 'test_precision_weighted': array([0.75937205, 0.76404097, 0.77575887, 0.77977702, 0.77843889]), 'test_recall_weighted': array([0.74960212, 0.74980101, 0.75855665, 0.73494296, 0.75616875])}


In [27]:
#0.1 samples, 1 features:
minor = MinorClassifiers(0.1, 1, 'average', 800)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)

Samples: 0.1, features: 1
{'fit_time': array([143.74664235, 143.4343214 , 142.68970346, 144.4288826 ,
       145.07612896]), 'score_time': array([79.37870693, 79.00381422, 78.88554025, 79.96405578, 79.68736148]), 'test_accuracy': array([0.74721485, 0.7484744 , 0.7460865 , 0.7129212 , 0.71477846]), 'test_neg_log_loss': array([-1.03030245, -1.00152858, -1.02694075, -1.01946636, -0.98732007]), 'test_neg_mean_squared_error': array([-13.12493369, -13.94375166, -14.71610507, -17.35261343,
       -16.24064739]), 'test_roc_auc_ovr': array([0.96907619, 0.97254683, 0.97261814, 0.96660486, 0.97042594]), 'test_f1_weighted': array([0.74962856, 0.74849211, 0.74763184, 0.71551568, 0.71743782]), 'test_precision_weighted': array([0.75651728, 0.75341645, 0.75364292, 0.72618261, 0.72899139]), 'test_recall_weighted': array([0.74721485, 0.7484744 , 0.7460865 , 0.7129212 , 0.71477846])}


In [28]:
#0.1 samples, 1 features:
minor = MinorClassifiers(0.1, 1, 'average', 500)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)

Samples: 0.1, features: 1
{'fit_time': array([143.69359374, 143.53579068, 143.32959175, 144.25927544,
       144.90196466]), 'score_time': array([79.82853341, 79.0423317 , 78.99677014, 79.84514499, 79.67702365]), 'test_accuracy': array([0.74748011, 0.7484744 , 0.74635182, 0.71371717, 0.71504378]), 'test_neg_log_loss': array([-1.02967009, -1.00297844, -1.02586306, -1.02028616, -0.98688472]), 'test_neg_mean_squared_error': array([-12.98435013, -13.82992836, -14.97638631, -17.11276201,
       -16.18758291]), 'test_roc_auc_ovr': array([0.96916692, 0.97259883, 0.97266523, 0.96657797, 0.97043052]), 'test_f1_weighted': array([0.74968528, 0.74841573, 0.74807805, 0.71610509, 0.71757741]), 'test_precision_weighted': array([0.75618164, 0.75338198, 0.75479582, 0.72666075, 0.72917952]), 'test_recall_weighted': array([0.74748011, 0.7484744 , 0.74635182, 0.71371717, 0.71504378])}


In [29]:
#0.1 samples, 1 features:
minor = MinorClassifiers(0.1, 1, 'average', 100)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)





Samples: 0.1, features: 1
{'fit_time': array([161.38426304, 161.85869408, 160.53655028, 163.50446272,
       165.09554029]), 'score_time': array([97.60327721, 96.85600066, 96.70366549, 97.71602917, 97.99591017]), 'test_accuracy': array([0.7469496 , 0.74873972, 0.74661714, 0.71212523, 0.71371717]), 'test_neg_log_loss': array([-1.03075665, -1.00233244, -1.02767706, -1.01985539, -0.98797864]), 'test_neg_mean_squared_error': array([-13.09655172, -14.03502255, -14.95648713, -17.27593526,
       -15.96232422]), 'test_roc_auc_ovr': array([0.96912883, 0.97268161, 0.97269384, 0.96658127, 0.97038774]), 'test_f1_weighted': array([0.74939829, 0.7488493 , 0.74828945, 0.71455978, 0.71651812]), 'test_precision_weighted': array([0.75650198, 0.75368039, 0.7546048 , 0.7251028 , 0.72806585]), 'test_recall_weighted': array([0.7469496 , 0.74873972, 0.74661714, 0.71212523, 0.71371717])}


In [30]:
#0.35 samples, 1 features:
minor = MinorClassifiers(0.35, 1, 'average', 63)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)





Samples: 0.35, features: 1
{'fit_time': array([552.34787416, 541.3920517 , 542.13603902, 542.2991457 ,
       541.66053224]), 'score_time': array([181.61350226, 183.52007818, 182.02906919, 180.00202489,
       181.05537295]), 'test_accuracy': array([0.78249337, 0.77951711, 0.77500663, 0.72751393, 0.74316795]), 'test_neg_log_loss': array([-0.90358993, -0.92197339, -0.91951682, -0.94885831, -0.91105022]), 'test_neg_mean_squared_error': array([-13.90344828, -14.10665959, -12.78588485, -19.70655346,
       -16.65083577]), 'test_roc_auc_ovr': array([0.98358329, 0.98278093, 0.98308693, 0.97780141, 0.97878412]), 'test_f1_weighted': array([0.78781884, 0.78261763, 0.77589813, 0.73747437, 0.7471857 ]), 'test_precision_weighted': array([0.8123188 , 0.81028381, 0.7993413 , 0.77990371, 0.78239465]), 'test_recall_weighted': array([0.78249337, 0.77951711, 0.77500663, 0.72751393, 0.74316795])}


In [31]:
#0.35 samples, 1 features:
minor = MinorClassifiers(0.35, 1, 'average', 38)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)





Samples: 0.35, features: 1
{'fit_time': array([331.37923431, 332.66124415, 331.92640615, 331.51932669,
       331.31372976]), 'score_time': array([150.83965302, 150.49776673, 148.94946837, 148.34948683,
       149.22491384]), 'test_accuracy': array([0.7066313 , 0.70761475, 0.70602282, 0.64367206, 0.63438578]), 'test_neg_log_loss': array([-1.27358719, -1.28382723, -1.28036336, -1.27947631, -1.29149313]), 'test_neg_mean_squared_error': array([-21.37082228, -21.39957548, -20.86176705, -28.08384187,
       -30.21040064]), 'test_roc_auc_ovr': array([0.97558053, 0.97314962, 0.97516072, 0.96727272, 0.96794843]), 'test_f1_weighted': array([0.71497346, 0.71586849, 0.70882662, 0.6768126 , 0.66548412]), 'test_precision_weighted': array([0.76301142, 0.76476431, 0.75454763, 0.76746248, 0.76570663]), 'test_recall_weighted': array([0.7066313 , 0.70761475, 0.70602282, 0.64367206, 0.63438578])}


In [32]:
#0.35 samples, 1 features:
minor = MinorClassifiers(0.35, 1, 'average', 19)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)





Samples: 0.35, features: 1
{'fit_time': array([170.76959729, 170.729491  , 170.73375201, 170.86403394,
       170.97561479]), 'score_time': array([94.92840934, 94.34453869, 94.43074822, 93.24886131, 91.80771637]), 'test_accuracy': array([0.57559682, 0.56964712, 0.56991244, 0.50437782, 0.51074556]), 'test_neg_log_loss': array([-1.81902736, -1.8315846 , -1.81931019, -1.81184169, -1.81899491]), 'test_neg_mean_squared_error': array([-36.22811671, -38.88829928, -38.42876094, -46.78455824,
       -44.21278854]), 'test_roc_auc_ovr': array([0.92840088, 0.93501789, 0.9310363 , 0.94184957, 0.93880368]), 'test_f1_weighted': array([0.58591736, 0.59282698, 0.5800717 , 0.54927694, 0.5518135 ]), 'test_precision_weighted': array([0.65209905, 0.66981066, 0.6459285 , 0.69052462, 0.68374176]), 'test_recall_weighted': array([0.57559682, 0.56964712, 0.56991244, 0.50437782, 0.51074556])}


In [33]:
#0.7 samples, 1 features:
minor = MinorClassifiers(0.7, 1, 'average', 30)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)





Samples: 0.7, features: 1
{'fit_time': array([557.4581275 , 557.87601018, 557.11426044, 558.04654264,
       558.00818896]), 'score_time': array([164.09793568, 159.56093526, 159.1605742 , 156.58893991,
       157.1965394 ]), 'test_accuracy': array([0.64350133, 0.64022287, 0.65587689, 0.60811886, 0.58530114]), 'test_neg_log_loss': array([-1.54497817, -1.54879283, -1.53300453, -1.50880266, -1.52695008]), 'test_neg_mean_squared_error': array([-31.97639257, -28.93579199, -29.01724595, -33.5717697 ,
       -39.46245689]), 'test_roc_auc_ovr': array([0.95203273, 0.95151557, 0.95487603, 0.95894532, 0.95687504]), 'test_f1_weighted': array([0.65978691, 0.65306382, 0.66221334, 0.63523244, 0.62949919]), 'test_precision_weighted': array([0.70726112, 0.69077771, 0.69182468, 0.72518419, 0.73321374]), 'test_recall_weighted': array([0.64350133, 0.64022287, 0.65587689, 0.60811886, 0.58530114])}


In [34]:
#0.7 samples, 1 features:
minor = MinorClassifiers(0.7, 1, 'average', 18)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)





Samples: 0.7, features: 1
{'fit_time': array([341.79230094, 343.3571496 , 342.34976411, 341.98819757,
       341.85069942]), 'score_time': array([106.65240788, 105.72991347, 105.054847  , 102.85468841,
        99.70738578]), 'test_accuracy': array([0.50424403, 0.47731494, 0.4958875 , 0.5017246 , 0.44786415]), 'test_neg_log_loss': array([-2.02426948, -2.06989556, -2.05990069, -1.9497137 , -2.01026221]), 'test_neg_mean_squared_error': array([-53.10557029, -60.4555585 , -52.39771823, -49.18466437,
       -57.6274874 ]), 'test_roc_auc_ovr': array([0.89390592, 0.88854979, 0.88692634, 0.9273053 , 0.90996748]), 'test_f1_weighted': array([0.53647367, 0.52604928, 0.52166639, 0.54521141, 0.49506146]), 'test_precision_weighted': array([0.63707549, 0.65611734, 0.61412632, 0.66747681, 0.6350969 ]), 'test_recall_weighted': array([0.50424403, 0.47731494, 0.4958875 , 0.5017246 , 0.44786415])}


In [35]:
#0.7 samples, 1 features:
minor = MinorClassifiers(0.7, 1, 'average', 8)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)





Samples: 0.7, features: 1
{'fit_time': array([163.48288536, 163.17852306, 162.81571603, 162.85702705,
       162.94164491]), 'score_time': array([52.49703074, 54.60026574, 55.01369143, 54.24818039, 51.63275695]), 'test_accuracy': array([0.31485411, 0.30405943, 0.30379411, 0.37808437, 0.33536747]), 'test_neg_log_loss': array([-2.60395101, -2.62071942, -2.61853389, -2.46254405, -2.50721086]), 'test_neg_mean_squared_error': array([-78.4198939 , -80.33191828, -79.87768639, -63.48766251,
       -69.36747148]), 'test_roc_auc_ovr': array([0.77684977, 0.78127085, 0.78537512, 0.84619761, 0.82530602]), 'test_f1_weighted': array([0.34993632, 0.35013506, 0.34015569, 0.41864387, 0.36073942]), 'test_precision_weighted': array([0.53620605, 0.54795833, 0.53149469, 0.56951726, 0.51844868]), 'test_recall_weighted': array([0.31485411, 0.30405943, 0.30379411, 0.37808437, 0.33536747])}


In [36]:
#1 samples, 0.25 features:
minor = MinorClassifiers(1, 0.25, 'average', 48)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)

Samples: 1, features: 0.25
{'fit_time': array([555.75384307, 559.56569028, 559.6182642 , 558.76342058,
       559.1712389 ]), 'score_time': array([97.22147369, 96.8145082 , 96.0690465 , 94.39497876, 92.1265347 ]), 'test_accuracy': array([0.58302387, 0.60440435, 0.57946405, 0.49907137, 0.54656408]), 'test_neg_log_loss': array([-2.09836408, -2.02466279, -2.1420835 , -2.20278194, -2.09449683]), 'test_neg_mean_squared_error': array([-46.16604775, -44.03210401, -49.33271425, -63.2894667 ,
       -54.99363226]), 'test_roc_auc_ovr': array([0.92454674, 0.93497974, 0.91770921, 0.93153972, 0.93208022]), 'test_f1_weighted': array([0.61046096, 0.63091561, 0.60634022, 0.5351385 , 0.58115379]), 'test_precision_weighted': array([0.70852733, 0.72774097, 0.70339212, 0.7148566 , 0.72475351]), 'test_recall_weighted': array([0.58302387, 0.60440435, 0.57946405, 0.49907137, 0.54656408])}


In [37]:
#1 samples, 0.25 features:
minor = MinorClassifiers(1, 0.25, 'average', 28)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)

Samples: 1, features: 0.25
{'fit_time': array([335.63943529, 332.738415  , 339.48606753, 333.25363588,
       333.44349122]), 'score_time': array([66.02936602, 65.90219331, 66.77987671, 65.8019774 , 64.49197388]), 'test_accuracy': array([0.44854111, 0.37171664, 0.41602547, 0.46404882, 0.43114885]), 'test_neg_log_loss': array([-2.450547  , -2.50481542, -2.48972129, -2.42505484, -2.41978305]), 'test_neg_mean_squared_error': array([-63.50503979, -76.17829663, -70.23268772, -63.31122314,
       -69.75457681]), 'test_roc_auc_ovr': array([0.86284376, 0.84860148, 0.84037468, 0.89849713, 0.88779606]), 'test_f1_weighted': array([0.48742804, 0.41699899, 0.46267296, 0.49553046, 0.4760672 ]), 'test_precision_weighted': array([0.65158164, 0.66446863, 0.66066056, 0.68075653, 0.6807329 ]), 'test_recall_weighted': array([0.44854111, 0.37171664, 0.41602547, 0.46404882, 0.43114885])}


In [38]:
#1 samples, 0.25 features:
minor = MinorClassifiers(1, 0.25, 'average', 13)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)

Samples: 1, features: 0.25
{'fit_time': array([164.39329934, 162.27841687, 167.68766785, 165.13241696,
       167.21511054]), 'score_time': array([38.00784135, 37.08639002, 38.05272174, 36.68836689, 34.82594943]), 'test_accuracy': array([0.33129973, 0.29211993, 0.29105864, 0.30405943, 0.37543115]), 'test_neg_log_loss': array([-2.73221991, -2.76460477, -2.81760962, -2.77883778, -2.68310712]), 'test_neg_mean_squared_error': array([-78.37453581, -81.47068188, -82.17723534, -72.67100027,
       -65.25364818]), 'test_roc_auc_ovr': array([0.77903982, 0.7545565 , 0.73509122, 0.78493461, 0.81122771]), 'test_f1_weighted': array([0.37550804, 0.33968268, 0.3307604 , 0.332058  , 0.40148497]), 'test_precision_weighted': array([0.57764553, 0.58462264, 0.57491473, 0.52942048, 0.59195944]), 'test_recall_weighted': array([0.33129973, 0.29211993, 0.29105864, 0.30405943, 0.37543115])}


In [39]:
#1 samples, 0.5 features:
minor = MinorClassifiers(1, 0.5, 'average', 26)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)

Samples: 1, features: 0.5
{'fit_time': array([568.19140482, 586.37390924, 543.64614224, 547.91092396,
       555.43582177]), 'score_time': array([119.62154579, 111.0768137 , 108.85527968, 109.19729185,
       106.73456955]), 'test_accuracy': array([0.56233422, 0.53091006, 0.53011409, 0.57203502, 0.46670204]), 'test_neg_log_loss': array([-2.01439551, -2.10124608, -2.11982117, -1.94219811, -2.13217459]), 'test_neg_mean_squared_error': array([-47.75278515, -53.95622181, -54.98142743, -45.92730167,
       -63.83788803]), 'test_roc_auc_ovr': array([0.91404782, 0.9061534 , 0.89225259, 0.94186771, 0.91659064]), 'test_f1_weighted': array([0.59710848, 0.57174781, 0.55837735, 0.60626789, 0.52083042]), 'test_precision_weighted': array([0.68701621, 0.69149985, 0.6638014 , 0.71241671, 0.70157132]), 'test_recall_weighted': array([0.56233422, 0.53091006, 0.53011409, 0.57203502, 0.46670204])}


In [40]:
#1 samples, 0.5 features:
minor = MinorClassifiers(1, 0.5, 'average', 15)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)

Samples: 1, features: 0.5
{'fit_time': array([326.70945597, 328.47471452, 326.34596229, 326.71032429,
       327.04601884]), 'score_time': array([70.79618835, 71.21680522, 71.14517403, 69.11581302, 65.8418386 ]), 'test_accuracy': array([0.3867374 , 0.38710533, 0.40196339, 0.49217299, 0.47784558]), 'test_neg_log_loss': array([-2.53810166, -2.47862545, -2.45818872, -2.34341286, -2.30864039]), 'test_neg_mean_squared_error': array([-75.49363395, -75.53409392, -72.07031043, -55.28601751,
       -54.17458212]), 'test_roc_auc_ovr': array([0.81146765, 0.83137794, 0.83139789, 0.88372071, 0.88146139]), 'test_f1_weighted': array([0.43869523, 0.44499755, 0.45288991, 0.525556  , 0.51345627]), 'test_precision_weighted': array([0.62449705, 0.62911972, 0.62526292, 0.65123637, 0.62964143]), 'test_recall_weighted': array([0.3867374 , 0.38710533, 0.40196339, 0.49217299, 0.47784558])}


In [41]:
#1 samples, 0.5 features:
minor = MinorClassifiers(1, 0.5, 'average', 7)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)

Samples: 1, features: 0.5
{'fit_time': array([144.9206686 , 166.44730639, 166.02456474, 165.11131692,
       163.69526696]), 'score_time': array([39.73367906, 39.67960811, 41.04520416, 38.35888076, 37.21409965]), 'test_accuracy': array([0.29469496, 0.26956752, 0.26956752, 0.35791987, 0.36481825]), 'test_neg_log_loss': array([-2.77523933, -2.77462647, -2.84233997, -2.69538992, -2.66544632]), 'test_neg_mean_squared_error': array([-83.50026525, -87.75935261, -86.83841868, -64.85062351,
       -68.06155479]), 'test_roc_auc_ovr': array([0.7398455 , 0.74526962, 0.70472902, 0.79057637, 0.79033756]), 'test_f1_weighted': array([0.32864982, 0.31694594, 0.29976518, 0.38750127, 0.38391606]), 'test_precision_weighted': array([0.54722527, 0.52857479, 0.52732321, 0.55805479, 0.52695154]), 'test_recall_weighted': array([0.29469496, 0.26956752, 0.26956752, 0.35791987, 0.36481825])}


In [42]:
#1 samples, 0.75 features:
minor = MinorClassifiers(1, 0.75, 'average', 18)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)

Samples: 1, features: 0.75
{'fit_time': array([543.2848134 , 542.75682473, 545.0330205 , 543.90491152,
       544.73181605]), 'score_time': array([116.60086894, 114.68820333, 116.33433199, 115.2073648 ,
       109.53170824]), 'test_accuracy': array([0.47427056, 0.4460069 , 0.46166092, 0.49986734, 0.46908994]), 'test_neg_log_loss': array([-2.1958687 , -2.24638702, -2.24491859, -2.09166133, -2.16360361]), 'test_neg_mean_squared_error': array([-60.82281167, -66.05253383, -62.43565933, -55.53356328,
       -60.0628814 ]), 'test_roc_auc_ovr': array([0.87283345, 0.86794023, 0.86200725, 0.91351472, 0.90002562]), 'test_f1_weighted': array([0.52055165, 0.49776334, 0.50178686, 0.54129777, 0.51213135]), 'test_precision_weighted': array([0.65198465, 0.64723404, 0.64109106, 0.67440219, 0.65932564]), 'test_recall_weighted': array([0.47427056, 0.4460069 , 0.46166092, 0.49986734, 0.46908994])}


In [43]:
#1 samples, 0.75 features:
minor = MinorClassifiers(1, 0.75, 'average', 10)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)

Samples: 1, features: 0.75
{'fit_time': array([313.81512237, 315.01926184, 315.6699419 , 316.18299389,
       313.71451807]), 'score_time': array([71.72010326, 71.46488166, 72.00461149, 68.82487416, 65.875911  ]), 'test_accuracy': array([0.33156499, 0.33351021, 0.32475458, 0.4234545 , 0.39877952]), 'test_neg_log_loss': array([-2.60271595, -2.58509251, -2.64581455, -2.43916448, -2.45705621]), 'test_neg_mean_squared_error': array([-80.46525199, -81.33297957, -83.05226851, -63.88511542,
       -65.80366145]), 'test_roc_auc_ovr': array([0.78987905, 0.79998341, 0.77094091, 0.85266154, 0.83826409]), 'test_f1_weighted': array([0.3775018 , 0.39111919, 0.37290265, 0.46704879, 0.43357864]), 'test_precision_weighted': array([0.56420576, 0.58946058, 0.56980663, 0.61955335, 0.58746608]), 'test_recall_weighted': array([0.33156499, 0.33351021, 0.32475458, 0.4234545 , 0.39877952])}


In [44]:
#1 samples, 0.75 features:
minor = MinorClassifiers(1, 0.75, 'average', 4)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)

Samples: 1, features: 0.75
{'fit_time': array([144.6208396 , 145.08518982, 145.80395412, 144.94858265,
       144.84199882]), 'score_time': array([35.57569957, 36.61703134, 34.7765727 , 34.84597135, 31.93229795]), 'test_accuracy': array([0.20557029, 0.21172725, 0.2194216 , 0.2799151 , 0.29981427]), 'test_neg_log_loss': array([-2.87825814, -2.88095561, -2.91086788, -2.78753094, -2.79013936]), 'test_neg_mean_squared_error': array([-81.30344828, -91.65348899, -91.26850624, -68.51499071,
       -71.08224993]), 'test_roc_auc_ovr': array([0.69667086, 0.69754482, 0.67800833, 0.72023747, 0.73043098]), 'test_f1_weighted': array([0.24324792, 0.23637009, 0.2317789 , 0.3001158 , 0.30691546]), 'test_precision_weighted': array([0.48582467, 0.46314854, 0.4409292 , 0.48916202, 0.46735043]), 'test_recall_weighted': array([0.20557029, 0.21172725, 0.2194216 , 0.2799151 , 0.29981427])}


4

In [45]:
def make_noise_data(data, percent):
    m = data.shape[0]
    n = data.shape[1]
    new_data = deepcopy(data)
    for i in range(m):
        for j in range(n):
            rand = randrange(-1, 2, 2) #random integer from {-1, 1}
            new_data[i][j] = (1 + rand * percent) * data[i][j]
    return new_data

In [46]:
def make_noise_label(labels, percent):
    labels_with_noise = deepcopy(labels)
    arr_size = labels.shape[0]
    indexes = [n for n in range(arr_size)]
    indexes_to_change = sample(indexes, int(arr_size * percent))
    
    for i in indexes_to_change:
        old_val = labels[i]
        new_val = randint(0, classes_count-1)
        while old_val == new_val:
            new_val = randint(0, classes_count-1)
        labels_with_noise[i] = new_val
        
    return labels_with_noise

In [47]:
noises = [0.05, 0.1, 0.2, 0.4]

In [48]:
n_samples = [0.1, 0.35, 0.7]
n_features = [0.25, 0.5, 0.75]
minors = []

In [49]:
for n in n_samples:
    minors.append(MinorClassifiers(n, 1, 'average'))
    
for n in n_features:
    minors.append(MinorClassifiers(1, n, 'average'))

In [50]:
for n in noises:
    new_x_train = make_noise_data(svd_x_train, n)
    print("Noise on data: " + str(n))
    clf = SVC(probability=True)
    scores = cross_validate(clf, new_X, new_Y, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
    print("Full")
    print(scores)
    for minor in minors:
        scores = cross_validate(minor, new_x_train, y_train, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
        print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
        print(scores)

Noise on data: 0.05
Full
{'fit_time': array([714.10467768, 699.99005127, 721.55515337, 778.71660495, 795.39420342])
'score_time': array([58.76022601, 57.11110663, 56.64833307, 62.29998899, 64.55566335])
'test_accuracy': array([0.87241379, 0.8861767 , 0.88644203, 0.83470417, 0.84080658])
'test_neg_log_loss': array([-0.42142284, -0.406026  , -0.40029779, -0.5317129 , -0.51451489])
'test_neg_mean_squared_error': array([-6.6602122 , -6.05041125, -5.09339347, -9.07747413, -9.524277  ])
'test_roc_auc_ovr': array([0.99350554, 0.99331718, 0.99374273, 0.98850601, 0.9899363 ])
'test_f1_weighted': array([0.87384865, 0.8860088 , 0.887088  , 0.83452864, 0.84088504])
'test_precision_weighted': array([0.87823768, 0.88739488, 0.88941373, 0.83800338, 0.84422754])
'test_recall_weighted': array([0.87241379, 0.8861767 , 0.88644203, 0.83470417, 0.84080658])}
Samples: 0.1, features: 1
{'fit_time': array([154.17217922, 150.05866861, 153.23058915, 151.83258295,
       156.07881832]), 'score_time': array([84.3

Samples: 1, features: 0.5
{'fit_time': array([3887.04473448, 3979.57309031, 3839.72717118, 3762.76277852,
       3931.58019376]), 'score_time': array([315.352211  , 317.44698024, 317.20530248, 313.24389148,
       318.80638123]), 'test_accuracy': array([0.87427056, 0.88458477, 0.88458477, 0.8315203 , 0.84186787]), 'test_neg_log_loss': array([-0.49783675, -0.49412846, -0.48165129, -0.61265261, -0.57796264]), 'test_neg_mean_squared_error': array([-7.04270557, -5.42504643, -5.10400637, -9.7224728 , -8.83921465]), 'test_roc_auc_ovr': array([0.9925213 , 0.99250592, 0.99320857, 0.98667697, 0.98915912]), 'test_f1_weighted': array([0.8749937 , 0.8840639 , 0.88483512, 0.83140918, 0.84108109]), 'test_precision_weighted': array([0.87692341, 0.88463625, 0.88603959, 0.8327402 , 0.84182772]), 'test_recall_weighted': array([0.87427056, 0.88458477, 0.88458477, 0.8315203 , 0.84186787])}
Samples: 1, features: 0.75
{'fit_time': array([5225.5091362 , 5437.13613319, 5360.36702347, 5503.38156438,
       523

Samples: 0.7, features: 1
{'fit_time': array([4754.13093948, 4343.07299876, 4221.38828945, 4277.95838237,
       4303.87766171]), 'score_time': array([501.75028634, 456.31798673, 454.77140641, 456.92040133,
       459.5886271 ]), 'test_accuracy': array([0.84137931, 0.85195012, 0.86017511, 0.79676307, 0.80711064]), 'test_neg_log_loss': array([-0.55641008, -0.54709967, -0.5256149 , -0.67119989, -0.65484963]), 'test_neg_mean_squared_error': array([ -9.15888594,  -8.13823295,  -6.79623242, -12.48686654,
       -11.6418148 ]), 'test_roc_auc_ovr': array([0.98904029, 0.98917666, 0.99016616, 0.98296026, 0.98423189]), 'test_f1_weighted': array([0.84208157, 0.85108148, 0.86040717, 0.79731334, 0.80726987]), 'test_precision_weighted': array([0.84379865, 0.85102208, 0.86160535, 0.79980474, 0.80919648]), 'test_recall_weighted': array([0.84137931, 0.85195012, 0.86017511, 0.79676307, 0.80711064])}
Samples: 1, features: 0.25
{'fit_time': array([2600.77160215, 2531.09497333, 2682.15796685, 2506.54630566

In [51]:
for n in noises:
    new_y_train = make_noise_label(y_train, n)
    print("Noise on labels: " + str(n))
    clf = SVC(probability=True)
    scores = cross_validate(clf, new_X, new_Y, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
    print("Full")
    print(scores)
    for minor in minors:
        scores = cross_validate(minor, svd_x_train, new_y_train, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
        print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
        print(scores)

Noise on labels: 0.05
Full
{'fit_time': array([806.81425142, 772.13194489, 762.82271099, 752.83477163, 752.52486014]),
'score_time': array([59.92133784, 59.03220415, 58.88771701, 58.68690872, 58.65899467]),
'test_accuracy': array([0.82864721, 0.83523481, 0.838684  , 0.79198726, 0.79702839]),
'test_neg_log_loss': array([-0.77516345, -0.80088014, -0.76551566, -0.87139582, -0.86650792]),
'test_neg_mean_squared_error': array([-10.05676393,  -9.96020164,  -8.69249138, -12.32767312, -12.78376227]),
'test_roc_auc_ovr': array([0.93928688, 0.93279267, 0.94132188, 0.93661633, 0.94360486]),
'test_f1_weighted': array([0.82880489, 0.83422287, 0.83850507, 0.79075368, 0.79608673]),
'test_precision_weighted': array([0.83189081, 0.83472842, 0.84026157, 0.79356081, 0.79853303]),
'test_recall_weighted': array([0.82864721, 0.83523481, 0.838684  , 0.79198726, 0.79702839])}
Samples: 0.1, features: 1
{'fit_time': array([147.3920486 , 145.00769901, 143.99762869, 149.63122702,
       154.37247801]), 'score_tim

Samples: 1, features: 0.5
{'fit_time': array([4653.22140098, 4476.5926919 , 4587.05533099, 4499.70081139,
       4637.06842899]), 'score_time': array([335.92143774, 332.81360936, 335.58371782, 332.94712138,
       337.47496176]), 'test_accuracy': array([0.78037135, 0.78906872, 0.78615017, 0.73202441, 0.74794375]), 'test_neg_log_loss': array([-1.11186426, -1.10133025, -1.10412853, -1.23772095, -1.17725273]), 'test_neg_mean_squared_error': array([-13.34509284, -12.54842133, -12.282303  , -16.49907137,
       -15.77739453]), 'test_roc_auc_ovr': array([0.94062509, 0.9406932 , 0.94178923, 0.92906682, 0.93596105]), 'test_f1_weighted': array([0.78048486, 0.7884564 , 0.78634281, 0.73085363, 0.74749657]), 'test_precision_weighted': array([0.78330802, 0.78966377, 0.78795441, 0.73342932, 0.75044887]), 'test_recall_weighted': array([0.78037135, 0.78906872, 0.78615017, 0.73202441, 0.74794375])}
Samples: 1, features: 0.75
{'fit_time': array([6366.471174  , 6344.73523259, 6602.1999054 , 6388.5762136 

Samples: 0.7, features: 1
{'fit_time': array([6813.23192263, 6201.19724369, 6200.78530812, 6187.20400405,
       6221.60724497]), 'score_time': array([518.99287033, 516.62730479, 518.26143956, 520.71870971,
       518.77536559]), 'test_accuracy': array([0.47612732, 0.47413107, 0.4887238 , 0.46829398, 0.48182542]), 'test_neg_log_loss': array([-2.26653771, -2.26814716, -2.22658226, -2.26174909, -2.22657176]), 'test_neg_mean_squared_error': array([-33.59071618, -33.92677103, -33.54178827, -35.02281772,
       -34.78827275]), 'test_roc_auc_ovr': array([0.77576315, 0.77680069, 0.78608765, 0.7764357 , 0.78123108]), 'test_f1_weighted': array([0.47571956, 0.47390109, 0.48693187, 0.46741745, 0.48093589]), 'test_precision_weighted': array([0.47769613, 0.47683201, 0.48906016, 0.47025034, 0.48615523]), 'test_recall_weighted': array([0.47612732, 0.47413107, 0.4887238 , 0.46829398, 0.48182542])}
Samples: 1, features: 0.25
{'fit_time': array([3601.09884953, 3549.66617465, 3626.84223938, 3633.32772803