In [1]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import resample
from sklearn import preprocessing
import statistics
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, log_loss, average_precision_score
from random import sample, randint, randrange
from copy import deepcopy
from sklearn.model_selection import cross_validate

In [2]:
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

In [3]:
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(newsgroups_train.data)
y_train = newsgroups_train.target
x_test = vectorizer.transform(newsgroups_test.data)
y_test = newsgroups_test.target
classes_count = 20

Change dimensions to 500

In [6]:
svd = TruncatedSVD(n_components=500)
svd.fit(x_train)
svd_x_train = svd.transform(x_train)
svd_x_test = svd.transform(x_test)

In [7]:
svd_x_train = np.concatenate((svd_x_train, svd_x_test))
y_train = np.concatenate((y_train, y_test))

SVM on full data and full features

In [None]:
clf = SVC(probability=True)

In [None]:
scores = cross_validate(clf, svd_x_train, y_train, cv=5, scoring=('accuracy', 
                                                    'neg_log_loss', 
                                                    'neg_mean_squared_error', 
                                                    'roc_auc_ovr', 
                                                    'f1_weighted', 
                                                    'precision_weighted', 
                                                    'recall_weighted'))

In [None]:
scores

10 sub-classifiers

In [6]:
class MinorClassifiers:
    def __init__(self, samp, feat, voting, max_iter=-1):
        self.samp = samp
        self.feat = feat
        self.classifiers = []
        self.predictions = []
        self.cut_features = []
        self.voting = voting
        self.max_iter = max_iter
    
    def get_params(self, deep = False):
        return {
            'samp': self.samp,
            'feat': self.feat,
            'voting': self.voting,
            'max_iter': self.max_iter
        }
    
    def predict(self, X):
        for i in range(len(self.classifiers)):
            classifier = self.classifiers[i]
            f = self.cut_features[i]
            x_test = X[:,f]
            y_pred = classifier.predict(x_test)
            pred = classifier.predict_proba(x_test)
            self.predictions.append((y_pred, pred))
            
        if self.voting == 'average':
            return average_pred(self.predictions)
        
        if self.voting == 'majority':
            return majority_pred(self.predictions)
        
        if self.voting == 'borda':
            return borda_pred(self.predictions)
    
    def predict_proba(self, X):
        if self.voting == 'average':
            return average_pred_proba(self.predictions, len(self.classifiers))
        
        if self.voting == 'majority':
            return majority_pred_proba(self.predictions, len(self.classifiers))
        
        if self.voting == 'borda':
            return borda_pred_proba(self.predictions, len(self.classifiers))
        
    def fit(self, X, Y):
        feature_list = [n for n in range(500)]
        samples_all = X.shape[0]
        features_all = X.shape[1]
        
        for i in range(10):
            f = sample(feature_list, int(features_all * self.feat))
            self.cut_features.append(f)
            x_train_f = X[:,f]
                
            x_train_s, y_train_s = resample(x_train_f, Y, n_samples=int(self.samp * samples_all), replace=False, random_state=0)

            svm_clf = SVC(probability=True, max_iter=self.max_iter)
            svm_clf.fit(x_train_s, y_train_s)
                
            self.classifiers.append(svm_clf)

In [7]:
def average_pred(predictions):
    #predictions = list of tuples 
    m = len(predictions[0][0])
    all_results = [[0 for x in range(classes_count)] for y in range(m)] 
    results = [0] * m
    for (_, pred_proba) in predictions:
        for i in range(m):
            for j in range(classes_count):
                all_results[i][j] += pred_proba[i][j]
    for i in range(m):
        results[i] = all_results[i].index(max(all_results[i]))
    return results

def average_pred_proba(predictions, n_classifiers):
    m = len(predictions[0][0])
    results = [[0 for x in range(classes_count)] for y in range(m)]
    for (_, pred_proba) in predictions:
        for i in range(m):
            for j in range(classes_count):
                results[i][j] += pred_proba[i][j]
    for i in range(m):
        for j in range(classes_count):
            results[i][j] /= n_classifiers            
    return results

def majority_pred(predictions):
    m = len(predictions[0][0])
    results = [0] * m
    for i in range(m):
        all_results = [0 for x in range(classes_count)] 
        for (pred, _) in predictions:
            all_results[pred[i]] += 1
        results[i] = all_results.index(max(all_results))        
    return results

def majority_pred_proba(predictions, n_classifiers):
    m = len(predictions[0][0])
    results = [[0 for x in range(classes_count)] for y in range(m)]
    classifiers_votes_count = [0] * m
    majority_results = majority_pred(predictions)
    for classifier in range(0, n_classifiers):
        for i in range(m):
            voted_class = majority_results[i] 
            (pred, pred_proba) = predictions[classifier]
            if(pred[i] == voted_class):
                classifiers_votes_count[i] += 1
                for j in range(classes_count):
                    results[i][j] += pred_proba[i][j]
    for i in range(m):
        for j in range(classes_count):
            results[i][j] /= classifiers_votes_count[i]
    return results      
    

def borda_pred(predictions):
    m = len(predictions[0][0])
    all_results = [[0 for x in range(classes_count)] for y in range(m)] 
    results = [0] * m
    
    def get_final_borda_points(predictions):
        return np.argsort(np.argsort(predictions)).tolist()

    for (_, pred_proba) in predictions:
        for i in range(m):
            pred_proba[i] = get_final_borda_points(pred_proba[i])
    for (_, pred_proba) in predictions:
        for i in range(m):
            for j in range(classes_count):
                all_results[i][j] += pred_proba[i][j]
    for i in range(m):
        results[i] = all_results[i].index(max(all_results[i]))
    return results

def borda_pred_proba(predictions, n_classifiers):
    m = len(predictions[0][0])
    results = [[0 for x in range(classes_count)] for y in range(m)] 
    
    def get_final_borda_points(predictions):
        return np.argsort(np.argsort(predictions)).tolist()

    def get_points():
        sum = 0
        for i in range(classes_count):
            sum += i
        return sum * n_classifiers
    
    for (_, pred_proba) in predictions:
        for i in range(m):
            pred_proba[i] = get_final_borda_points(pred_proba[i])
    for (_, pred_proba) in predictions:
        for i in range(m):
            for j in range(classes_count):
                results[i][j] += pred_proba[i][j]
    for i in range(m):
        for j in range(classes_count):
            results[i][j] /= get_points()
        
    return results

In [9]:
n_samples = [0.1, 0.35, 0.7]
n_features = [0.25, 0.5, 0.75]
minors = []

In [12]:
for n in n_samples:
    minors.append(MinorClassifiers(n, 1, 'average'))
    
for n in n_features:
    minors.append(MinorClassifiers(1, n, 'average'))

In [13]:
for minor in minors:
    scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
    print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
    print(scores)

Samples: 0.1, features: 1
{'fit_time': array([148.91131902, 146.96860027, 145.08941054, 151.25387406,
       145.18135452]), 'score_time': array([79.84562659, 85.42474294, 78.85297894, 84.54803824, 79.84669614]), 'test_accuracy': array([0.74217507, 0.7460865 , 0.73839215, 0.70443088, 0.70018573]), 'test_neg_log_loss': array([-1.03252602, -1.00095979, -1.02576483, -1.0182766 , -0.98272496]), 'test_neg_mean_squared_error': array([-13.57824934, -12.46590608, -13.52241974, -16.59007694,
       -15.47704962]), 'test_roc_auc_ovr': array([0.96891407, 0.9726225 , 0.97246307, 0.96648893, 0.9706784 ]), 'test_f1_weighted': array([0.74581488, 0.74695513, 0.73995017, 0.70787399, 0.7053624 ]), 'test_precision_weighted': array([0.76351305, 0.7598769 , 0.75137709, 0.72947445, 0.73186469]), 'test_recall_weighted': array([0.74217507, 0.7460865 , 0.73839215, 0.70443088, 0.70018573])}
Samples: 0.35, features: 1
{'fit_time': array([1250.17741156, 1215.59635448, 1221.13495445, 1253.77801323,
       1255.189

Samples: 1, features: 0.5
{'fit_time': array([3869.10345006, 3843.47206473, 3612.73425889, 3794.17781258,
       3604.67475128]), 'score_time': array([314.03146791, 316.82461762, 308.79971576, 313.6022191 ,
       311.20412183]), 'test_accuracy': array([0.8734748 , 0.88087026, 0.88299284, 0.83072433, 0.838684  ]), 'test_neg_log_loss': array([-2.33324503, -2.32945829, -2.33060984, -2.34056075, -2.33796621]), 'test_neg_mean_squared_error': array([-6.32095491, -6.00106129, -5.92332184, -9.60785354, -9.45449721]), 'test_roc_auc_ovr': array([0.98608608, 0.98770734, 0.98684618, 0.98106492, 0.98216681]), 'test_f1_weighted': array([0.87408639, 0.88037591, 0.88350422, 0.83098397, 0.8388703 ]), 'test_precision_weighted': array([0.87629491, 0.88109681, 0.88530988, 0.83342773, 0.84099954]), 'test_recall_weighted': array([0.8734748 , 0.88087026, 0.88299284, 0.83072433, 0.838684  ])}
Samples: 1, features: 0.75
{'fit_time': array([5152.01870775, 5203.89438987, 5332.218431  , 5689.703866  ,
       530

In [None]:
minors = []
for n in n_samples:
    minors.append(MinorClassifiers(n, 1, 'majority'))
    
for n in n_features:
    minors.append(MinorClassifiers(1, n, 'majority'))

In [None]:
for minor in minors:
    scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
    print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
    print(scores)

In [None]:
minors = []
for n in n_samples:
    minors.append(MinorClassifiers(n, 1, 'borda'))
    
for n in n_features:
    minors.append(MinorClassifiers(1, n, 'borda'))

In [None]:
for minor in minors:
    scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
    print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
    print(scores)

3a

In [13]:
parts = [0.1, 0.25, 0.5, 0.75] ## for 1.0 already calculated

In [22]:
n_samples = [0.1, 0.35, 0.7]
n_features = [0.25, 0.5, 0.75]
minors = []

In [23]:
for n in n_samples:
    minors.append(MinorClassifiers(n, 1, 'average'))
    
for n in n_features:
    minors.append(MinorClassifiers(1, n, 'average'))

In [24]:
for p in parts:
    new_X = svd_x_train[:int(p*svd_x_train.shape[0])]
    new_Y = y_train[:int(p*y_train.shape[0])]
    print("Data " + str(p))
    clf = SVC(probability=True)
    scores = cross_validate(clf, new_X, new_Y, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
    print("Full " + str(p) + "%")
    print(scores)
    for minor in minors:
        scores = cross_validate(minor, new_X, new_Y, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
        print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
        print(scores)

Data 0.1
Samples: 0.7, features: 1
{'fit_time': array([76.41327071, 75.94379377, 76.88744974, 73.38574314, 73.52283382]), 'score_time': array([5.86673832, 5.99670053, 5.93612862, 5.71173048, 6.10567951]), 'test_accuracy': array([0.74535809, 0.73474801, 0.72413793, 0.75596817, 0.7287234 ]), 'test_neg_log_loss': array([-1.08154999, -1.05632682, -1.05962249, -1.10169275, -1.11996161]), 'test_neg_mean_squared_error': array([-13.16710875, -11.50928382, -14.54376658, -15.23607427,
       -13.1462766 ]), 'test_roc_auc_ovr': array([0.95871098, 0.96679956, 0.96866598, 0.96282195, 0.95810908]), 'test_f1_weighted': array([0.74606982, 0.73582449, 0.72886921, 0.75608787, 0.72404682]), 'test_precision_weighted': array([0.77087507, 0.75137784, 0.75825003, 0.77764468, 0.73074313]), 'test_recall_weighted': array([0.74535809, 0.73474801, 0.72413793, 0.75596817, 0.7287234 ])}
Data 0.25
Samples: 0.7, features: 1
{'fit_time': array([422.98991966, 442.01739025, 432.92568946, 456.66984463,
       456.4957156

3b

In [11]:
clf = SVC(probability=True, max_iter=300)
scores = cross_validate(clf, svd_x_train, y_train, cv=5, scoring=('accuracy', 
                                                    'neg_log_loss', 
                                                    'neg_mean_squared_error', 
                                                    'roc_auc_ovr', 
                                                    'f1_weighted', 
                                                    'precision_weighted', 
                                                    'recall_weighted'))
print(scores)



{'fit_time': array([562.93116689, 562.83924198, 561.86122465, 558.63007069,
       557.33135796]), 'score_time': array([50.83319187, 51.06287003, 50.92495847, 50.41782594, 50.80374217]), 'test_accuracy': array([0.87188329, 0.88936057, 0.88193155, 0.83921465, 0.8410719 ]), 'test_neg_log_loss': array([-0.42623447, -0.40478624, -0.40388631, -0.53291046, -0.51709512]), 'test_neg_mean_squared_error': array([-6.81909814, -5.86256301, -5.16741841, -8.82674449, -9.32793845]), 'test_roc_auc_ovr': array([0.9933504 , 0.9934361 , 0.99375567, 0.98848411, 0.98978386]), 'test_f1_weighted': array([0.87318645, 0.8891092 , 0.88278395, 0.83897054, 0.84150245]), 'test_precision_weighted': array([0.8771908 , 0.89048686, 0.88506253, 0.84157337, 0.84512648]), 'test_recall_weighted': array([0.87188329, 0.88936057, 0.88193155, 0.83921465, 0.8410719 ])}


In [11]:
clf = SVC(probability=True, max_iter=130)
scores = cross_validate(clf, svd_x_train, y_train, cv=5, scoring=('accuracy', 
                                                    'neg_log_loss', 
                                                    'neg_mean_squared_error', 
                                                    'roc_auc_ovr', 
                                                    'f1_weighted', 
                                                    'precision_weighted', 
                                                    'recall_weighted'))
print(scores)



{'fit_time': array([349.97228909, 332.89484262, 332.94227552, 331.75345349,
       331.66004205]), 'score_time': array([40.90028977, 41.29549599, 41.03101397, 40.17039728, 40.43398356]), 'test_accuracy': array([0.83846154, 0.85062351, 0.85009286, 0.81905015, 0.81666224]), 'test_neg_log_loss': array([-0.62555108, -0.62987654, -0.6286315 , -0.66268624, -0.68706849]), 'test_neg_mean_squared_error': array([ -7.9872679 ,  -7.75723003,  -6.62642611, -10.20801274,
       -11.19899178]), 'test_roc_auc_ovr': array([0.9903969 , 0.9895292 , 0.98983432, 0.98681415, 0.98668778]), 'test_f1_weighted': array([0.83867623, 0.85029454, 0.85089911, 0.8190994 , 0.81751722]), 'test_precision_weighted': array([0.84271581, 0.85269431, 0.85598384, 0.82667508, 0.82855683]), 'test_recall_weighted': array([0.83846154, 0.85062351, 0.85009286, 0.81905015, 0.81666224])}


In [27]:
clf = SVC(probability=True, max_iter=60)
scores = cross_validate(clf, svd_x_train, y_train, cv=5, scoring=('accuracy', 
                                                    'neg_log_loss', 
                                                    'neg_mean_squared_error', 
                                                    'roc_auc_ovr', 
                                                    'f1_weighted', 
                                                    'precision_weighted', 
                                                    'recall_weighted'))
print(scores)



{'fit_time': array([160.55420876, 160.31492424, 159.88336587, 159.85737753,
       161.1360631 ]), 'score_time': array([28.85886645, 28.78711581, 29.04045343, 28.20144725, 28.12108278]), 'test_accuracy': array([0.74960212, 0.74980101, 0.75855665, 0.73494296, 0.75616875]), 'test_neg_log_loss': array([-1.06782259, -1.08355699, -1.08235768, -1.08656571, -1.04117837]), 'test_neg_mean_squared_error': array([-14.85251989, -15.80737596, -13.89519767, -18.45741576,
       -15.74980101]), 'test_roc_auc_ovr': array([0.98045306, 0.97840346, 0.98050336, 0.97573777, 0.97877233]), 'test_f1_weighted': array([0.75122   , 0.752579  , 0.76120373, 0.74565294, 0.75955601]), 'test_precision_weighted': array([0.75937205, 0.76404097, 0.77575887, 0.77977702, 0.77843889]), 'test_recall_weighted': array([0.74960212, 0.74980101, 0.75855665, 0.73494296, 0.75616875])}


In [None]:
n_samples = [0.1, 0.35, 0.7]
n_features = [0.25, 0.5, 0.75]
minors = []

In [None]:
for n in n_samples:
    minors.append(MinorClassifiers(n, 1, 'average'))
    
for n in n_features:
    minors.append(MinorClassifiers(1, n, 'average'))

In [32]:
#0.1 samples, 1 features:
minor = MinorClassifiers(0.1, 1, 'average', 800)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)

Samples: 0.1, features: 1
{'fit_time': array([143.74664235, 143.4343214 , 142.68970346, 144.4288826 ,
       145.07612896]), 'score_time': array([79.37870693, 79.00381422, 78.88554025, 79.96405578, 79.68736148]), 'test_accuracy': array([0.74721485, 0.7484744 , 0.7460865 , 0.7129212 , 0.71477846]), 'test_neg_log_loss': array([-1.03030245, -1.00152858, -1.02694075, -1.01946636, -0.98732007]), 'test_neg_mean_squared_error': array([-13.12493369, -13.94375166, -14.71610507, -17.35261343,
       -16.24064739]), 'test_roc_auc_ovr': array([0.96907619, 0.97254683, 0.97261814, 0.96660486, 0.97042594]), 'test_f1_weighted': array([0.74962856, 0.74849211, 0.74763184, 0.71551568, 0.71743782]), 'test_precision_weighted': array([0.75651728, 0.75341645, 0.75364292, 0.72618261, 0.72899139]), 'test_recall_weighted': array([0.74721485, 0.7484744 , 0.7460865 , 0.7129212 , 0.71477846])}


In [33]:
#0.1 samples, 1 features:
minor = MinorClassifiers(0.1, 1, 'average', 500)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)

Samples: 0.1, features: 1
{'fit_time': array([143.69359374, 143.53579068, 143.32959175, 144.25927544,
       144.90196466]), 'score_time': array([79.82853341, 79.0423317 , 78.99677014, 79.84514499, 79.67702365]), 'test_accuracy': array([0.74748011, 0.7484744 , 0.74635182, 0.71371717, 0.71504378]), 'test_neg_log_loss': array([-1.02967009, -1.00297844, -1.02586306, -1.02028616, -0.98688472]), 'test_neg_mean_squared_error': array([-12.98435013, -13.82992836, -14.97638631, -17.11276201,
       -16.18758291]), 'test_roc_auc_ovr': array([0.96916692, 0.97259883, 0.97266523, 0.96657797, 0.97043052]), 'test_f1_weighted': array([0.74968528, 0.74841573, 0.74807805, 0.71610509, 0.71757741]), 'test_precision_weighted': array([0.75618164, 0.75338198, 0.75479582, 0.72666075, 0.72917952]), 'test_recall_weighted': array([0.74748011, 0.7484744 , 0.74635182, 0.71371717, 0.71504378])}


In [20]:
#0.1 samples, 1 features:
minor = MinorClassifiers(0.1, 1, 'average', 100)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)





Samples: 0.1, features: 1
{'fit_time': array([161.38426304, 161.85869408, 160.53655028, 163.50446272,
       165.09554029]), 'score_time': array([97.60327721, 96.85600066, 96.70366549, 97.71602917, 97.99591017]), 'test_accuracy': array([0.7469496 , 0.74873972, 0.74661714, 0.71212523, 0.71371717]), 'test_neg_log_loss': array([-1.03075665, -1.00233244, -1.02767706, -1.01985539, -0.98797864]), 'test_neg_mean_squared_error': array([-13.09655172, -14.03502255, -14.95648713, -17.27593526,
       -15.96232422]), 'test_roc_auc_ovr': array([0.96912883, 0.97268161, 0.97269384, 0.96658127, 0.97038774]), 'test_f1_weighted': array([0.74939829, 0.7488493 , 0.74828945, 0.71455978, 0.71651812]), 'test_precision_weighted': array([0.75650198, 0.75368039, 0.7546048 , 0.7251028 , 0.72806585]), 'test_recall_weighted': array([0.7469496 , 0.74873972, 0.74661714, 0.71212523, 0.71371717])}


In [34]:
#0.35 samples, 1 features:
minor = MinorClassifiers(0.35, 1, 'average', 63)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)





Samples: 0.35, features: 1
{'fit_time': array([552.34787416, 541.3920517 , 542.13603902, 542.2991457 ,
       541.66053224]), 'score_time': array([181.61350226, 183.52007818, 182.02906919, 180.00202489,
       181.05537295]), 'test_accuracy': array([0.78249337, 0.77951711, 0.77500663, 0.72751393, 0.74316795]), 'test_neg_log_loss': array([-0.90358993, -0.92197339, -0.91951682, -0.94885831, -0.91105022]), 'test_neg_mean_squared_error': array([-13.90344828, -14.10665959, -12.78588485, -19.70655346,
       -16.65083577]), 'test_roc_auc_ovr': array([0.98358329, 0.98278093, 0.98308693, 0.97780141, 0.97878412]), 'test_f1_weighted': array([0.78781884, 0.78261763, 0.77589813, 0.73747437, 0.7471857 ]), 'test_precision_weighted': array([0.8123188 , 0.81028381, 0.7993413 , 0.77990371, 0.78239465]), 'test_recall_weighted': array([0.78249337, 0.77951711, 0.77500663, 0.72751393, 0.74316795])}


In [61]:
#0.35 samples, 1 features:
minor = MinorClassifiers(0.35, 1, 'average', 38)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)





Samples: 0.35, features: 1
{'fit_time': array([331.37923431, 332.66124415, 331.92640615, 331.51932669,
       331.31372976]), 'score_time': array([150.83965302, 150.49776673, 148.94946837, 148.34948683,
       149.22491384]), 'test_accuracy': array([0.7066313 , 0.70761475, 0.70602282, 0.64367206, 0.63438578]), 'test_neg_log_loss': array([-1.27358719, -1.28382723, -1.28036336, -1.27947631, -1.29149313]), 'test_neg_mean_squared_error': array([-21.37082228, -21.39957548, -20.86176705, -28.08384187,
       -30.21040064]), 'test_roc_auc_ovr': array([0.97558053, 0.97314962, 0.97516072, 0.96727272, 0.96794843]), 'test_f1_weighted': array([0.71497346, 0.71586849, 0.70882662, 0.6768126 , 0.66548412]), 'test_precision_weighted': array([0.76301142, 0.76476431, 0.75454763, 0.76746248, 0.76570663]), 'test_recall_weighted': array([0.7066313 , 0.70761475, 0.70602282, 0.64367206, 0.63438578])}


In [62]:
#0.35 samples, 1 features:
minor = MinorClassifiers(0.35, 1, 'average', 19)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)





Samples: 0.35, features: 1
{'fit_time': array([170.76959729, 170.729491  , 170.73375201, 170.86403394,
       170.97561479]), 'score_time': array([94.92840934, 94.34453869, 94.43074822, 93.24886131, 91.80771637]), 'test_accuracy': array([0.57559682, 0.56964712, 0.56991244, 0.50437782, 0.51074556]), 'test_neg_log_loss': array([-1.81902736, -1.8315846 , -1.81931019, -1.81184169, -1.81899491]), 'test_neg_mean_squared_error': array([-36.22811671, -38.88829928, -38.42876094, -46.78455824,
       -44.21278854]), 'test_roc_auc_ovr': array([0.92840088, 0.93501789, 0.9310363 , 0.94184957, 0.93880368]), 'test_f1_weighted': array([0.58591736, 0.59282698, 0.5800717 , 0.54927694, 0.5518135 ]), 'test_precision_weighted': array([0.65209905, 0.66981066, 0.6459285 , 0.69052462, 0.68374176]), 'test_recall_weighted': array([0.57559682, 0.56964712, 0.56991244, 0.50437782, 0.51074556])}


In [55]:
#0.7 samples, 1 features:
minor = MinorClassifiers(0.7, 1, 'average', 30)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)





Samples: 0.7, features: 1
{'fit_time': array([557.4581275 , 557.87601018, 557.11426044, 558.04654264,
       558.00818896]), 'score_time': array([164.09793568, 159.56093526, 159.1605742 , 156.58893991,
       157.1965394 ]), 'test_accuracy': array([0.64350133, 0.64022287, 0.65587689, 0.60811886, 0.58530114]), 'test_neg_log_loss': array([-1.54497817, -1.54879283, -1.53300453, -1.50880266, -1.52695008]), 'test_neg_mean_squared_error': array([-31.97639257, -28.93579199, -29.01724595, -33.5717697 ,
       -39.46245689]), 'test_roc_auc_ovr': array([0.95203273, 0.95151557, 0.95487603, 0.95894532, 0.95687504]), 'test_f1_weighted': array([0.65978691, 0.65306382, 0.66221334, 0.63523244, 0.62949919]), 'test_precision_weighted': array([0.70726112, 0.69077771, 0.69182468, 0.72518419, 0.73321374]), 'test_recall_weighted': array([0.64350133, 0.64022287, 0.65587689, 0.60811886, 0.58530114])}


In [65]:
#0.7 samples, 1 features:
minor = MinorClassifiers(0.7, 1, 'average', 18)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)





Samples: 0.7, features: 1
{'fit_time': array([341.79230094, 343.3571496 , 342.34976411, 341.98819757,
       341.85069942]), 'score_time': array([106.65240788, 105.72991347, 105.054847  , 102.85468841,
        99.70738578]), 'test_accuracy': array([0.50424403, 0.47731494, 0.4958875 , 0.5017246 , 0.44786415]), 'test_neg_log_loss': array([-2.02426948, -2.06989556, -2.05990069, -1.9497137 , -2.01026221]), 'test_neg_mean_squared_error': array([-53.10557029, -60.4555585 , -52.39771823, -49.18466437,
       -57.6274874 ]), 'test_roc_auc_ovr': array([0.89390592, 0.88854979, 0.88692634, 0.9273053 , 0.90996748]), 'test_f1_weighted': array([0.53647367, 0.52604928, 0.52166639, 0.54521141, 0.49506146]), 'test_precision_weighted': array([0.63707549, 0.65611734, 0.61412632, 0.66747681, 0.6350969 ]), 'test_recall_weighted': array([0.50424403, 0.47731494, 0.4958875 , 0.5017246 , 0.44786415])}


In [70]:
#0.7 samples, 1 features:
minor = MinorClassifiers(0.7, 1, 'average', 8)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)





Samples: 0.7, features: 1
{'fit_time': array([163.48288536, 163.17852306, 162.81571603, 162.85702705,
       162.94164491]), 'score_time': array([52.49703074, 54.60026574, 55.01369143, 54.24818039, 51.63275695]), 'test_accuracy': array([0.31485411, 0.30405943, 0.30379411, 0.37808437, 0.33536747]), 'test_neg_log_loss': array([-2.60395101, -2.62071942, -2.61853389, -2.46254405, -2.50721086]), 'test_neg_mean_squared_error': array([-78.4198939 , -80.33191828, -79.87768639, -63.48766251,
       -69.36747148]), 'test_roc_auc_ovr': array([0.77684977, 0.78127085, 0.78537512, 0.84619761, 0.82530602]), 'test_f1_weighted': array([0.34993632, 0.35013506, 0.34015569, 0.41864387, 0.36073942]), 'test_precision_weighted': array([0.53620605, 0.54795833, 0.53149469, 0.56951726, 0.51844868]), 'test_recall_weighted': array([0.31485411, 0.30405943, 0.30379411, 0.37808437, 0.33536747])}


In [None]:
#1 samples, 0.25 features:
minor = MinorClassifiers(1, 0.25, 'average', 48)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)

In [None]:
#1 samples, 0.25 features:
minor = MinorClassifiers(1, 0.25, 'average', 28)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)

In [None]:
#1 samples, 0.25 features:
minor = MinorClassifiers(1, 0.25, 'average', 13)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)

In [None]:
#1 samples, 0.5 features:
minor = MinorClassifiers(1, 0.5, 'average', 26)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)

In [None]:
#1 samples, 0.5 features:
minor = MinorClassifiers(1, 0.5, 'average', 15)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)

In [None]:
#1 samples, 0.5 features:
minor = MinorClassifiers(1, 0.5, 'average', 7)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)

In [None]:
#1 samples, 0.75 features:
minor = MinorClassifiers(1, 0.75, 'average', 18)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)

In [None]:
#1 samples, 0.75 features:
minor = MinorClassifiers(1, 0.75, 'average', 10)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)

In [None]:
#1 samples, 0.75 features:
minor = MinorClassifiers(1, 0.75, 'average', 4)
scores = cross_validate(minor, svd_x_train, y_train, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
print(scores)

4

In [None]:
def make_noise_data(data, percent):
    m = data.shape[0]
    n = data.shape[1]
    new_data = deepcopy(data)
    for i in range(m):
        for j in range(n):
            rand = randrange(-1, 2, 2) #random integer from {-1, 1}
            new_data[i][j] = (1 + rand * percent) * data[i][j]
    return new_data

In [None]:
def make_noise_label(labels, percent):
    labels_with_noise = deepcopy(labels)
    arr_size = labels.shape[0]
    indexes = [n for n in range(arr_size)]
    indexes_to_change = sample(indexes, int(arr_size * percent))
    
    for i in indexes_to_change:
        old_val = labels[i]
        new_val = randint(0, classes_count-1)
        while old_val == new_val:
            new_val = randint(0, classes_count-1)
        labels_with_noise[i] = new_val
        
    return labels_with_noise

In [10]:
noises = [0.05, 0.1, 0.2, 0.4]

In [11]:
n_samples = [0.1, 0.35, 0.7]
n_features = [0.25, 0.5, 0.75]
minors = []

In [12]:
for n in n_samples:
    minors.append(MinorClassifiers(n, 1, 'average'))
    
for n in n_features:
    minors.append(MinorClassifiers(1, n, 'average'))

In [13]:
for n in noises:
    new_x_train = make_noise_data(svd_x_train, n)
    print("Noise on data: " + str(n))
    clf = SVC(probability=True)
    scores = cross_validate(clf, new_X, new_Y, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
    print("Full " + str(p) + "%")
    for minor in minors:
        scores = cross_validate(minor, new_x_train, y_train, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
        print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
        print(scores)

Noise on data: 0.05
Samples: 0.1, features: 1
{'fit_time': array([154.17217922, 150.05866861, 153.23058915, 151.83258295,
       156.07881832]), 'score_time': array([84.33436298, 87.54580879, 84.91181469, 84.54791069, 82.25219893]), 'test_accuracy': array([0.74535809, 0.74767843, 0.74422924, 0.7105333 , 0.71477846]), 'test_neg_log_loss': array([-1.03093654, -1.00874163, -1.02946624, -1.02003515, -0.98535182]), 'test_neg_mean_squared_error': array([-13.14615385, -13.65640754, -15.24860706, -17.15388697,
       -15.84505174]), 'test_roc_auc_ovr': array([0.96907094, 0.97211317, 0.97235117, 0.9662398 , 0.97053474]), 'test_f1_weighted': array([0.74770109, 0.74781188, 0.74619186, 0.71335642, 0.7177339 ]), 'test_precision_weighted': array([0.75464193, 0.75303739, 0.75324298, 0.72433503, 0.72822271]), 'test_recall_weighted': array([0.74535809, 0.74767843, 0.74422924, 0.7105333 , 0.71477846])}
Samples: 0.35, features: 1
{'fit_time': array([1257.66017866, 1234.03320789, 1240.05512547, 1274.67126

Samples: 1, features: 0.5
{'fit_time': array([3887.04473448, 3979.57309031, 3839.72717118, 3762.76277852,
       3931.58019376]), 'score_time': array([315.352211  , 317.44698024, 317.20530248, 313.24389148,
       318.80638123]), 'test_accuracy': array([0.87427056, 0.88458477, 0.88458477, 0.8315203 , 0.84186787]), 'test_neg_log_loss': array([-0.49783675, -0.49412846, -0.48165129, -0.61265261, -0.57796264]), 'test_neg_mean_squared_error': array([-7.04270557, -5.42504643, -5.10400637, -9.7224728 , -8.83921465]), 'test_roc_auc_ovr': array([0.9925213 , 0.99250592, 0.99320857, 0.98667697, 0.98915912]), 'test_f1_weighted': array([0.8749937 , 0.8840639 , 0.88483512, 0.83140918, 0.84108109]), 'test_precision_weighted': array([0.87692341, 0.88463625, 0.88603959, 0.8327402 , 0.84182772]), 'test_recall_weighted': array([0.87427056, 0.88458477, 0.88458477, 0.8315203 , 0.84186787])}
Samples: 1, features: 0.75
{'fit_time': array([5225.5091362 , 5437.13613319, 5360.36702347, 5503.38156438,
       523

Samples: 0.7, features: 1
{'fit_time': array([4754.13093948, 4343.07299876, 4221.38828945, 4277.95838237,
       4303.87766171]), 'score_time': array([501.75028634, 456.31798673, 454.77140641, 456.92040133,
       459.5886271 ]), 'test_accuracy': array([0.84137931, 0.85195012, 0.86017511, 0.79676307, 0.80711064]), 'test_neg_log_loss': array([-0.55641008, -0.54709967, -0.5256149 , -0.67119989, -0.65484963]), 'test_neg_mean_squared_error': array([ -9.15888594,  -8.13823295,  -6.79623242, -12.48686654,
       -11.6418148 ]), 'test_roc_auc_ovr': array([0.98904029, 0.98917666, 0.99016616, 0.98296026, 0.98423189]), 'test_f1_weighted': array([0.84208157, 0.85108148, 0.86040717, 0.79731334, 0.80726987]), 'test_precision_weighted': array([0.84379865, 0.85102208, 0.86160535, 0.79980474, 0.80919648]), 'test_recall_weighted': array([0.84137931, 0.85195012, 0.86017511, 0.79676307, 0.80711064])}
Samples: 1, features: 0.25
{'fit_time': array([2600.77160215, 2531.09497333, 2682.15796685, 2506.54630566

In [14]:
for n in noises:
    new_y_train = make_noise_label(y_train, n)
    print("Noise on labels: " + str(n))
    clf = SVC(probability=True)
    scores = cross_validate(clf, new_X, new_Y, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
    print("Full " + str(p) + "%")
    for minor in minors:
        scores = cross_validate(minor, svd_x_train, new_y_train, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))
        print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
        print(scores)

Noise on labels: 0.05
Samples: 0.1, features: 1
{'fit_time': array([147.3920486 , 145.00769901, 143.99762869, 149.63122702,
       154.37247801]), 'score_time': array([80.12688184, 79.6095295 , 82.08589077, 80.63044739, 94.53331995]), 'test_accuracy': array([0.69708223, 0.71371717, 0.71000265, 0.66569382, 0.6821438 ]), 'test_neg_log_loss': array([-1.31655301, -1.25115119, -1.30059008, -1.27777832, -1.22312223]), 'test_neg_mean_squared_error': array([-16.8403183 , -15.20854338, -16.46059963, -19.75643407,
       -18.91085168]), 'test_roc_auc_ovr': array([0.93925082, 0.9462846 , 0.94184798, 0.93913961, 0.94616271]), 'test_f1_weighted': array([0.69827101, 0.71298166, 0.71034135, 0.66821722, 0.68428299]), 'test_precision_weighted': array([0.70417301, 0.71538463, 0.71374635, 0.67812542, 0.69424198]), 'test_recall_weighted': array([0.69708223, 0.71371717, 0.71000265, 0.66569382, 0.6821438 ])}
Samples: 0.35, features: 1
{'fit_time': array([1306.8132813 , 1265.42759252, 1278.12685347, 1294.755

Samples: 1, features: 0.5
{'fit_time': array([4653.22140098, 4476.5926919 , 4587.05533099, 4499.70081139,
       4637.06842899]), 'score_time': array([335.92143774, 332.81360936, 335.58371782, 332.94712138,
       337.47496176]), 'test_accuracy': array([0.78037135, 0.78906872, 0.78615017, 0.73202441, 0.74794375]), 'test_neg_log_loss': array([-1.11186426, -1.10133025, -1.10412853, -1.23772095, -1.17725273]), 'test_neg_mean_squared_error': array([-13.34509284, -12.54842133, -12.282303  , -16.49907137,
       -15.77739453]), 'test_roc_auc_ovr': array([0.94062509, 0.9406932 , 0.94178923, 0.92906682, 0.93596105]), 'test_f1_weighted': array([0.78048486, 0.7884564 , 0.78634281, 0.73085363, 0.74749657]), 'test_precision_weighted': array([0.78330802, 0.78966377, 0.78795441, 0.73342932, 0.75044887]), 'test_recall_weighted': array([0.78037135, 0.78906872, 0.78615017, 0.73202441, 0.74794375])}
Samples: 1, features: 0.75
{'fit_time': array([6366.471174  , 6344.73523259, 6602.1999054 , 6388.5762136 

Samples: 0.7, features: 1
{'fit_time': array([6813.23192263, 6201.19724369, 6200.78530812, 6187.20400405,
       6221.60724497]), 'score_time': array([518.99287033, 516.62730479, 518.26143956, 520.71870971,
       518.77536559]), 'test_accuracy': array([0.47612732, 0.47413107, 0.4887238 , 0.46829398, 0.48182542]), 'test_neg_log_loss': array([-2.26653771, -2.26814716, -2.22658226, -2.26174909, -2.22657176]), 'test_neg_mean_squared_error': array([-33.59071618, -33.92677103, -33.54178827, -35.02281772,
       -34.78827275]), 'test_roc_auc_ovr': array([0.77576315, 0.77680069, 0.78608765, 0.7764357 , 0.78123108]), 'test_f1_weighted': array([0.47571956, 0.47390109, 0.48693187, 0.46741745, 0.48093589]), 'test_precision_weighted': array([0.47769613, 0.47683201, 0.48906016, 0.47025034, 0.48615523]), 'test_recall_weighted': array([0.47612732, 0.47413107, 0.4887238 , 0.46829398, 0.48182542])}
Samples: 1, features: 0.25
{'fit_time': array([3601.09884953, 3549.66617465, 3626.84223938, 3633.32772803