1.

In [1]:
from sklearn.metrics import (accuracy_score, f1_score)
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.utils import resample
import tensorflow as tf
import numpy as np
from random import sample 
from copy import deepcopy
from random import sample, randint, randrange
import imgaug.augmenters as iaa
from contextlib import contextmanager
from timeit import default_timer
from sklearn.model_selection import cross_validate

In [2]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings raised by the libraries
# used below are hidden as well — consider narrowing it while debugging.
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load Fashion-MNIST through Keras; the call returns a (features, labels) pair
# for the predefined train split and another for the test split.
(x_train_f, y_train_f), (x_test_f, y_test_f) = tf.keras.datasets.fashion_mnist.load_data()

In [4]:
# Flatten every sample to a single feature row (first axis = sample index).
x_train = x_train_f.reshape((x_train_f.shape[0], -1))
x_test = x_test_f.reshape((x_test_f.shape[0], -1))
y_train = y_train_f.reshape((y_train_f.shape[0],))
y_test = y_test_f

# Merge the predefined splits; the experiments below re-split with cross-validation.
X_concat = np.concatenate([x_train, x_test])
Y = np.concatenate([y_train, y_test])

# NOTE(review): the scaler and PCA are fitted on the complete dataset, so every
# cross-validation fold below shares preprocessing statistics with its held-out
# part. Confirm this is acceptable for the comparison being made.
scaler = StandardScaler()
scaler.fit(X_concat)
X_transform = scaler.transform(X_concat)

# Seed PCA so that the projected features are reproducible across kernel
# restarts whenever scikit-learn picks a randomized SVD solver.
pca = PCA(n_components=50, random_state=0)
pca.fit(X_transform)
X = pca.transform(X_transform)

# Number of target classes assumed by the voting and label-noise helpers.
classes_count = 10

2.

Pełne dane

In [None]:
# Baseline model: one SVC with probability estimates enabled (the scorers used
# below include probability-based metrics).
clf = SVC(probability=True)

In [None]:
# 5-fold cross-validation of the baseline SVC on the full dataset.
scores = cross_validate(
    clf, X, Y,
    cv=5,
    scoring=(
        'accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr',
        'f1_weighted', 'precision_weighted', 'recall_weighted',
    ),
)

In [None]:
# Display the cross-validation results of the baseline model.
scores

Niepełne dane i cechy

In [5]:
def average_pred(predictions):
    """Soft voting: pick, for every sample, the class with the largest summed probability.

    Parameters
    ----------
    predictions : list of (labels, probabilities) pairs
        One pair per ensemble member. ``labels`` is only used to determine the
        number of samples; ``probabilities`` is a 2-D array-like of shape
        (samples, classes).

    Returns
    -------
    list of int
        Winning class index per sample. Ties go to the lowest class index.

    The number of classes is taken from the width of the probability arrays,
    so the function does not rely on any module-level constant.
    """
    n_samples = len(predictions[0][0])
    if n_samples == 0:
        return []

    total = None
    for _, proba in predictions:
        # Only the first ``n_samples`` rows are relevant for the vote.
        block = np.asarray(proba, dtype=float)[:n_samples]
        total = block.copy() if total is None else total + block

    # argmax returns the first maximum, matching list.index(max(...)).
    return np.argmax(total, axis=1).tolist()

def average_pred_proba(predictions, n_classifiers):
    """Soft voting probabilities: mean of the members' class probabilities.

    Parameters
    ----------
    predictions : list of (labels, probabilities) pairs
        One pair per ensemble member. ``labels`` is only used to determine the
        number of samples; ``probabilities`` is a 2-D array-like of shape
        (samples, classes).
    n_classifiers : int
        Divisor applied to the summed probabilities (the ensemble size).

    Returns
    -------
    list of list of float
        One row per sample, one column per class.

    The number of classes is taken from the width of the probability arrays,
    so the function does not rely on any module-level constant.
    """
    n_samples = len(predictions[0][0])
    if n_samples == 0:
        return []

    total = None
    for _, proba in predictions:
        block = np.asarray(proba, dtype=float)[:n_samples]
        total = block.copy() if total is None else total + block

    return (total / n_classifiers).tolist()

def majority_pred(predictions):
    """Hard voting: pick, for every sample, the label predicted by most members.

    Parameters
    ----------
    predictions : list of (labels, probabilities) pairs
        One pair per ensemble member. Only ``labels`` (non-negative integer
        class indices, one per sample) is used.

    Returns
    -------
    list of int
        Winning label per sample. Ties go to the lowest label.
    """
    n_samples = len(predictions[0][0])
    if n_samples == 0:
        return []

    # Shape (members, samples); the cast rejects non-integer labels instead of
    # silently truncating them.
    votes = np.stack([
        np.asarray(labels)[:n_samples].astype(np.intp, casting='same_kind')
        for labels, _ in predictions
    ])
    # bincount + argmax counts the votes per label and returns the first
    # (lowest) label with the maximum count.
    return [int(np.bincount(votes[:, i]).argmax()) for i in range(n_samples)]

def majority_pred_proba(predictions, n_classifiers):
    """Probabilities for hard voting.

    For every sample, the class probabilities of the members that voted for
    the majority label (as returned by ``majority_pred``) are averaged; the
    remaining members are ignored for that sample.

    ``predictions`` is a list of (labels, probabilities) pairs, of which the
    first ``n_classifiers`` entries are visited. Returns one row per sample
    with ``classes_count`` columns.
    """
    sample_count = len(predictions[0][0])
    winners = majority_pred(predictions)
    summed = [[0] * classes_count for _ in range(sample_count)]
    supporters = [0] * sample_count

    for row in range(sample_count):
        for member in range(n_classifiers):
            labels, proba = predictions[member]
            if labels[row] != winners[row]:
                continue
            # This member agrees with the majority: add its probabilities.
            supporters[row] += 1
            for cls in range(classes_count):
                summed[row][cls] += proba[row][cls]

    return [
        [value / supporters[row] for value in summed[row]]
        for row in range(sample_count)
    ]
    
def borda_pred(predictions):
    """Borda-count voting: pick, for every sample, the class with most rank points.

    Each member ranks the classes of a sample by its predicted probability;
    the least likely class receives 0 points and the most likely one
    ``classes - 1`` points. Points are summed over all members.

    Parameters
    ----------
    predictions : list of (labels, probabilities) pairs
        One pair per ensemble member. ``labels`` is only used to determine the
        number of samples; ``probabilities`` is a 2-D array-like of shape
        (samples, classes). The input is left untouched.

    Returns
    -------
    list of int
        Winning class index per sample. Ties go to the lowest class index.
    """
    n_samples = len(predictions[0][0])
    if n_samples == 0:
        return []

    points = None
    for _, proba in predictions:
        scores = np.asarray(proba)[:n_samples]
        # argsort of argsort turns every row of scores into 0-based ranks.
        ranks = np.argsort(np.argsort(scores, axis=1), axis=1)
        points = ranks if points is None else points + ranks

    return np.argmax(points, axis=1).tolist()

def borda_pred_proba(predictions, n_classifiers):
    """Borda-count scores normalised so that every row sums to one.

    Each member ranks the classes of a sample by its predicted probability
    (0 points for the least likely class, ``classes - 1`` for the most likely).
    The summed points are divided by the total number of points that
    ``n_classifiers`` members hand out per sample.

    Parameters
    ----------
    predictions : list of (labels, probabilities) pairs
        One pair per ensemble member. ``labels`` is only used to determine the
        number of samples; ``probabilities`` is a 2-D array-like of shape
        (samples, classes). The input is left untouched.
    n_classifiers : int
        Ensemble size used for the normalisation.

    Returns
    -------
    list of list of float
        One row per sample, one column per class.
    """
    n_samples = len(predictions[0][0])
    if n_samples == 0:
        return []

    points = None
    for _, proba in predictions:
        scores = np.asarray(proba)[:n_samples]
        # argsort of argsort turns every row of scores into 0-based ranks.
        ranks = np.argsort(np.argsort(scores, axis=1), axis=1)
        points = ranks if points is None else points + ranks

    # One member distributes 0 + 1 + ... + (classes - 1) points per sample.
    n_classes = points.shape[1]
    max_points = sum(range(n_classes)) * n_classifiers
    return (points / max_points).tolist()

In [76]:
class MinorClassifiers:
    """Ensemble of SVC models trained on random subsets of rows and columns.

    Member outputs are combined by summing class probabilities
    (``average_pred`` / ``average_pred_proba``).

    Parameters
    ----------
    samp : float or sequence of float
        Fraction(s) of the training rows given to a member.
    feat : float or sequence of float
        Fraction(s) of the feature columns given to a member.

    When both values are scalars, 10 members are trained with that single
    setting. When either one is a sequence, one member is trained for every
    (samp, feat) combination.
    """

    # Ensemble size used when a single (samp, feat) setting is given.
    _N_MEMBERS = 10

    def __init__(self, samp, feat):
        # Parameters are stored unmodified so that get_params() hands back
        # exactly what the constructor received.
        self.samp = samp
        self.feat = feat
        self.classifiers = []
        self.predictions = []
        self.cut_features = []

    def get_params(self, deep = False):
        """Return the constructor arguments (used to rebuild the estimator)."""
        return {
            'samp': self.samp,
            'feat': self.feat,
        }

    def _member_settings(self):
        """Return the (sample fraction, feature fraction) pair of every member."""
        if np.ndim(self.samp) == 0 and np.ndim(self.feat) == 0:
            return [(self.samp, self.feat)] * self._N_MEMBERS
        samp_values = np.atleast_1d(self.samp).tolist()
        feat_values = np.atleast_1d(self.feat).tolist()
        return [(s, f) for s in samp_values for f in feat_values]

    def _collect_predictions(self, X):
        """Query every member on its own feature subset of ``X``.

        The result replaces ``self.predictions`` instead of being appended to
        it, so repeated calls never mix outputs of different inputs.
        """
        collected = []
        for classifier, f in zip(self.classifiers, self.cut_features):
            x_test = X[:, f]
            collected.append((classifier.predict(x_test), classifier.predict_proba(x_test)))
        self.predictions = collected
        return collected

    def predict(self, X):
        """Return the soft-voting class index for every row of ``X``."""
        return average_pred(self._collect_predictions(X))

    def predict_proba(self, X):
        """Return the averaged class probabilities for every row of ``X``."""
        # Computed from X directly; does not require predict() to run first.
        return average_pred_proba(self._collect_predictions(X), len(self.classifiers))

    def fit(self, X, Y):
        """Train one SVC per member setting and return ``self``."""
        samples_all = X.shape[0]
        features_all = X.shape[1]
        feature_list = list(range(features_all))

        # Start from a clean state so that refitting does not stack members.
        self.classifiers = []
        self.cut_features = []
        self.predictions = []

        for member, (samp_frac, feat_frac) in enumerate(self._member_settings()):
            # NOTE: random.sample draws from the global `random` state, which
            # this class does not seed.
            f = sample(feature_list, int(features_all * feat_frac))
            x_train_f = X[:, f]

            # A distinct seed per member gives every member its own (still
            # reproducible) row subset; a shared seed would hand all members
            # the very same rows.
            x_train_s, y_train_s = resample(
                x_train_f, Y,
                n_samples=int(samp_frac * samples_all),
                replace=False,
                random_state=member,
            )

            svm_clf = SVC(probability=True)
            svm_clf.fit(x_train_s, y_train_s)

            self.cut_features.append(f)
            self.classifiers.append(svm_clf)

        return self

In [77]:
# Fractions of training rows and of feature columns to evaluate, and the list
# that will hold one ensemble per setting.
n_samples = [0.1, 0.35, 0.7]
n_features = [0.25, 0.5, 0.75]
minors = []

In [78]:
# Rebuild the list in a single assignment so that re-running this cell does not
# append duplicate ensembles: first the reduced-sample settings (all features),
# then the reduced-feature settings (all samples).
minors = (
    [MinorClassifiers(fraction, 1) for fraction in n_samples]
    + [MinorClassifiers(1, fraction) for fraction in n_features]
)

In [None]:
# Cross-validate every configured ensemble and print its settings next to its scores.
for minor in minors:
    scores = cross_validate(
        minor, X, Y,
        cv=5,
        scoring=(
            'accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr',
            'f1_weighted', 'precision_weighted', 'recall_weighted',
        ),
    )
    print("Samples: " + str(minor.samp) + ", features: " + str(minor.feat))
    print(scores)

3a.

In [None]:
# Fractions of the dataset used in the experiment below.
parts = [0.1, 0.25, 0.5, 0.75] # 1.0 (the full dataset) was already evaluated above

In [None]:
# Fractions of training rows / feature columns handed to the ensemble constructor.
# NOTE(review): the cells above construct MinorClassifiers with scalar fractions,
# while lists are passed here — confirm that the class definition in effect
# when this cell runs accepts lists.
n_samples = [0.1, 0.25, 0.35, 0.5]
n_features = [0.25, 0.5, 0.75]
minor = MinorClassifiers(n_samples, n_features)

In [None]:
# Compare the single SVC with the ensemble on the first `p` fraction of the data.
for p in parts:
    new_X = X[:int(p*X.shape[0])]
    new_Y = Y[:int(p*Y.shape[0])]
    clf = SVC(probability=True)
    scores = cross_validate(
        clf, new_X, new_Y,
        cv=5,
        scoring=(
            'accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr',
            'f1_weighted', 'precision_weighted', 'recall_weighted',
        ),
    )
    # `p` is a fraction; format it as a percentage so the label reads
    # "10%" rather than "0.1%".
    print("Full {:.0%}".format(p))
    print(scores)
    scores = cross_validate(
        minor, new_X, new_Y,
        cv=5,
        scoring=(
            'accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr',
            'f1_weighted', 'precision_weighted', 'recall_weighted',
        ),
    )
    print("Minor {:.0%}".format(p))
    print(scores)

3b.

In [None]:
class MinorClassifiers2:
    """Iteration-limited SVC ensemble over a grid of row/column fractions.

    One SVC is trained for every (sample fraction, feature fraction)
    combination; member outputs are combined by summing class probabilities
    (``average_pred`` / ``average_pred_proba``).

    Parameters
    ----------
    n_samples : sequence of float
        Fractions of the training rows to draw for the members.
    n_features : sequence of float
        Fractions of the feature columns to draw for the members.
    iterations : int
        Passed to every member as ``SVC(max_iter=...)``.
    """

    def __init__(self, n_samples, n_features, iterations):
        # Parameters are stored unmodified so that get_params() hands back
        # exactly what the constructor received.
        self.n_samples = n_samples
        self.n_features = n_features
        self.classifiers = []
        self.predictions = []
        self.cut_features = []
        self.iterations = iterations

    def get_params(self, deep = False):
        """Return the constructor arguments (used to rebuild the estimator)."""
        return {
            'n_samples': self.n_samples,
            'n_features': self.n_features,
            'iterations': self.iterations
        }

    def _collect_predictions(self, X):
        """Query every member on its own feature subset of ``X``.

        The result replaces ``self.predictions`` instead of being appended to
        it, so repeated calls never mix outputs of different inputs.
        """
        collected = []
        for classifier, f in zip(self.classifiers, self.cut_features):
            x_test = X[:, f]
            collected.append((classifier.predict(x_test), classifier.predict_proba(x_test)))
        self.predictions = collected
        return collected

    def predict(self, X):
        """Return the soft-voting class index for every row of ``X``."""
        return average_pred(self._collect_predictions(X))

    def predict_proba(self, X):
        """Return the averaged class probabilities for every row of ``X``."""
        # Computed from X directly; does not require predict() to run first.
        return average_pred_proba(self._collect_predictions(X), len(self.classifiers))

    def fit(self, X, Y):
        """Train one SVC per (sample fraction, feature fraction) pair and return ``self``."""
        samples_all = X.shape[0]
        features_all = X.shape[1]
        feature_list = list(range(features_all))

        # Start from a clean state so that refitting does not stack members.
        self.classifiers = []
        self.cut_features = []
        self.predictions = []

        settings = [(s, f) for s in self.n_samples for f in self.n_features]
        for member, (samples, features) in enumerate(settings):
            # NOTE: random.sample draws from the global `random` state, which
            # this class does not seed.
            f = sample(feature_list, int(features_all * features))
            x_train_f = X[:, f]

            # A distinct seed per member gives every member its own (still
            # reproducible) row subset instead of nested copies of one draw.
            x_train_s, y_train_s = resample(
                x_train_f, Y,
                n_samples=int(samples * samples_all),
                replace=False,
                random_state=member,
            )

            svm_clf = SVC(probability=True, max_iter=self.iterations)
            svm_clf.fit(x_train_s, y_train_s)

            self.cut_features.append(f)
            self.classifiers.append(svm_clf)

        return self

In [None]:
# Grid of row fractions and feature fractions used by the ensembles below
# (one member per combination).
n_samples = [0.1, 0.25, 0.35, 0.5]
n_features = [0.25, 0.5, 0.75]

In [None]:
# Single SVC limited to 800 solver iterations, scored with 5-fold cross-validation.
max_iter = 800
clf = SVC(max_iter=max_iter, probability=True)
scores = cross_validate(
    clf, X, Y,
    cv=5,
    scoring=(
        'accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr',
        'f1_weighted', 'precision_weighted', 'recall_weighted',
    ),
)
scores

In [None]:
# Single SVC limited to 750 solver iterations, scored with 5-fold cross-validation.
max_iter = 750
clf = SVC(max_iter=max_iter, probability=True)
scores = cross_validate(
    clf, X, Y,
    cv=5,
    scoring=(
        'accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr',
        'f1_weighted', 'precision_weighted', 'recall_weighted',
    ),
)
scores

In [None]:
# Single SVC limited to 700 solver iterations, scored with 5-fold cross-validation.
max_iter = 700
clf = SVC(max_iter=max_iter, probability=True)
scores = cross_validate(
    clf, X, Y,
    cv=5,
    scoring=(
        'accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr',
        'f1_weighted', 'precision_weighted', 'recall_weighted',
    ),
)
scores

In [None]:
# Ensemble whose member SVCs are limited to 600 solver iterations each.
max_iter = 600
minor = MinorClassifiers2(n_samples=n_samples, n_features=n_features, iterations=max_iter)
scores = cross_validate(
    minor, X, Y,
    cv=5,
    scoring=(
        'accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr',
        'f1_weighted', 'precision_weighted', 'recall_weighted',
    ),
)
scores

In [None]:
# Ensemble whose member SVCs are limited to 650 solver iterations each.
max_iter = 650
minor = MinorClassifiers2(n_samples=n_samples, n_features=n_features, iterations=max_iter)
scores = cross_validate(
    minor, X, Y,
    cv=5,
    scoring=(
        'accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr',
        'f1_weighted', 'precision_weighted', 'recall_weighted',
    ),
)
scores

In [None]:
# Ensemble whose member SVCs are limited to 550 solver iterations each.
max_iter = 550
minor = MinorClassifiers2(n_samples=n_samples, n_features=n_features, iterations=max_iter)
scores = cross_validate(
    minor, X, Y,
    cv=5,
    scoring=(
        'accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr',
        'f1_weighted', 'precision_weighted', 'recall_weighted',
    ),
)
scores

In [None]:
# Single SVC limited to 400 solver iterations, scored with 5-fold cross-validation.
max_iter = 400
clf = SVC(max_iter=max_iter, probability=True)
scores = cross_validate(
    clf, X, Y,
    cv=5,
    scoring=(
        'accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr',
        'f1_weighted', 'precision_weighted', 'recall_weighted',
    ),
)
scores

In [None]:
# Ensemble whose member SVCs are limited to 350 solver iterations each.
max_iter = 350
minor = MinorClassifiers2(n_samples=n_samples, n_features=n_features, iterations=max_iter)
scores = cross_validate(
    minor, X, Y,
    cv=5,
    scoring=(
        'accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr',
        'f1_weighted', 'precision_weighted', 'recall_weighted',
    ),
)
scores

In [None]:
# Ensemble whose member SVCs are limited to 200 solver iterations each.
max_iter = 200
minor = MinorClassifiers2(n_samples=n_samples, n_features=n_features, iterations=max_iter)
scores = cross_validate(
    minor, X, Y,
    cv=5,
    scoring=(
        'accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr',
        'f1_weighted', 'precision_weighted', 'recall_weighted',
    ),
)
scores

In [None]:
# Ensemble whose member SVCs are limited to 250 solver iterations each.
max_iter = 250
minor = MinorClassifiers2(n_samples=n_samples, n_features=n_features, iterations=max_iter)
scores = cross_validate(
    minor, X, Y,
    cv=5,
    scoring=(
        'accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr',
        'f1_weighted', 'precision_weighted', 'recall_weighted',
    ),
)
scores

In [None]:
# Single SVC limited to 150 solver iterations, scored with 5-fold cross-validation.
max_iter = 150
clf = SVC(max_iter=max_iter, probability=True)
scores = cross_validate(
    clf, X, Y,
    cv=5,
    scoring=(
        'accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr',
        'f1_weighted', 'precision_weighted', 'recall_weighted',
    ),
)
scores

In [None]:
# Ensemble whose member SVCs are limited to 100 solver iterations each.
max_iter = 100
minor = MinorClassifiers2(n_samples=n_samples, n_features=n_features, iterations=max_iter)
scores = cross_validate(
    minor, X, Y,
    cv=5,
    scoring=(
        'accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr',
        'f1_weighted', 'precision_weighted', 'recall_weighted',
    ),
)
scores

In [None]:
# Ensemble whose member SVCs are limited to 85 solver iterations each.
max_iter = 85
minor = MinorClassifiers2(n_samples=n_samples, n_features=n_features, iterations=max_iter)
scores = cross_validate(
    minor, X, Y,
    cv=5,
    scoring=(
        'accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr',
        'f1_weighted', 'precision_weighted', 'recall_weighted',
    ),
)
scores

In [None]:
# Ensemble whose member SVCs are limited to 75 solver iterations each.
max_iter = 75
minor = MinorClassifiers2(n_samples=n_samples, n_features=n_features, iterations=max_iter)
scores = cross_validate(
    minor, X, Y,
    cv=5,
    scoring=(
        'accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr',
        'f1_weighted', 'precision_weighted', 'recall_weighted',
    ),
)
scores

4. 

In [None]:
class MinorClassifiers:
    """SVC ensemble over a grid of row/column fractions, combined by Borda count.

    One SVC is trained for every (sample fraction, feature fraction)
    combination; member outputs are combined with ``borda_pred`` /
    ``borda_pred_proba``.

    NOTE: this definition reuses the name of the averaging ensemble defined
    earlier in the notebook and replaces it from this cell onwards.

    Parameters
    ----------
    n_samples : sequence of float
        Fractions of the training rows to draw for the members.
    n_features : sequence of float
        Fractions of the feature columns to draw for the members.
    """

    def __init__(self, n_samples, n_features):
        # Parameters are stored unmodified so that get_params() hands back
        # exactly what the constructor received.
        self.n_samples = n_samples
        self.n_features = n_features
        self.classifiers = []
        self.predictions = []
        self.cut_features = []

    def get_params(self, deep = False):
        """Return the constructor arguments (used to rebuild the estimator)."""
        return {
            'n_samples': self.n_samples,
            'n_features': self.n_features,
        }

    def _collect_predictions(self, X):
        """Query every member on its own feature subset of ``X``.

        The result replaces ``self.predictions`` instead of being appended to
        it, so repeated calls never mix outputs of different inputs.
        """
        collected = []
        for classifier, f in zip(self.classifiers, self.cut_features):
            x_test = X[:, f]
            collected.append((classifier.predict(x_test), classifier.predict_proba(x_test)))
        self.predictions = collected
        return collected

    def predict(self, X):
        """Return the Borda-count winner for every row of ``X``."""
        return borda_pred(self._collect_predictions(X))

    def predict_proba(self, X):
        """Return the normalised Borda scores for every row of ``X``."""
        # Computed from X directly; does not require predict() to run first.
        return borda_pred_proba(self._collect_predictions(X), len(self.classifiers))

    def fit(self, X, Y):
        """Train one SVC per (sample fraction, feature fraction) pair and return ``self``."""
        samples_all = X.shape[0]
        features_all = X.shape[1]
        feature_list = list(range(features_all))

        # Start from a clean state so that refitting does not stack members.
        self.classifiers = []
        self.cut_features = []
        self.predictions = []

        settings = [(s, f) for s in self.n_samples for f in self.n_features]
        for member, (samples, features) in enumerate(settings):
            # NOTE: random.sample draws from the global `random` state, which
            # this class does not seed.
            f = sample(feature_list, int(features_all * features))
            x_train_f = X[:, f]

            # A distinct seed per member gives every member its own (still
            # reproducible) row subset instead of nested copies of one draw.
            x_train_s, y_train_s = resample(
                x_train_f, Y,
                n_samples=int(samples * samples_all),
                replace=False,
                random_state=member,
            )

            svm_clf = SVC(probability=True)
            svm_clf.fit(x_train_s, y_train_s)

            self.cut_features.append(f)
            self.classifiers.append(svm_clf)

        return self

In [None]:
def make_noise_data(data, percent):
    """Return a copy of ``data`` with every entry scaled up or down by ``percent``.

    Each entry is multiplied by either ``1 + percent`` or ``1 - percent``; the
    direction is drawn independently per (row, column) position with
    ``random.randrange``, in row-major order.

    Parameters
    ----------
    data : numpy.ndarray
        Array with at least two dimensions. It is not modified.
    percent : float
        Relative size of the perturbation (e.g. ``0.1`` for +/-10 %).

    Returns
    -------
    numpy.ndarray
        Perturbed copy with the shape and dtype of ``data``.
    """
    m = data.shape[0]
    n = data.shape[1]

    # One draw from {-1, 1} per (row, column) position.
    signs = np.array(
        [randrange(-1, 2, 2) for _ in range(m * n)], dtype=float
    ).reshape((m, n) + (1,) * (len(data.shape) - 2))

    new_data = deepcopy(data)
    # A single vectorised assignment replaces the per-element chained indexing.
    new_data[...] = (1 + signs * percent) * data
    return new_data

In [None]:
def make_noise_label(labels, percent):
    """Return a copy of ``labels`` in which a ``percent`` fraction is replaced.

    ``int(len(labels) * percent)`` positions are chosen without replacement;
    each of them receives a random class in ``[0, classes_count - 1]`` that is
    guaranteed to differ from the original label. ``labels`` is not modified.
    """
    total = labels.shape[0]
    chosen = sample(list(range(total)), int(total * percent))

    noisy = deepcopy(labels)
    for position in chosen:
        original = labels[position]
        # Redraw until the replacement differs from the original label.
        while True:
            candidate = randint(0, classes_count - 1)
            if not (original == candidate):
                break
        noisy[position] = candidate

    return noisy

In [None]:
# Fractions of labels to replace in the label-noise experiment below.
noises = [0.01, 0.05, 0.1]

In [None]:
# Label-noise experiment: compare the single SVC with the ensemble when a
# fraction `n` of the labels has been replaced by a different class.
# NOTE(review): the noisy labels are used for both fitting and scoring inside
# cross_validate — confirm that scoring against noisy labels is intended.
for n in noises:
    new_y_train = make_noise_label(Y, n)
    clf = SVC(probability=True)
    scores = cross_validate(
        clf, X, new_y_train,
        cv=5,
        scoring=(
            'accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr',
            'f1_weighted', 'precision_weighted', 'recall_weighted',
        ),
    )
    # `n` is a fraction; format it as a percentage so the label reads
    # "10%" rather than "0.1%".
    print("Full {:.0%}".format(n))
    print(scores)
    # Build the ensemble defined in this section explicitly instead of reusing
    # whatever `minor` object an earlier cell happened to leave behind.
    minor = MinorClassifiers(n_samples, n_features)
    scores = cross_validate(
        minor, X, new_y_train,
        cv=5,
        scoring=(
            'accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr',
            'f1_weighted', 'precision_weighted', 'recall_weighted',
        ),
    )
    print("Minor {:.0%}".format(n))
    print(scores)

In [None]:
# Relative perturbation sizes for the feature-noise experiment below
# (this rebinds the `noises` list used by the label-noise experiment).
noises = [0.1, 0.2, 0.3]

In [None]:
# Feature-noise experiment: compare the single SVC with the ensemble when every
# feature value has been scaled up or down by the fraction `n`.
for n in noises:
    new_x_train = make_noise_data(X, n)
    clf = SVC(probability=True)
    scores = cross_validate(
        clf, new_x_train, Y,
        cv=5,
        scoring=(
            'accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr',
            'f1_weighted', 'precision_weighted', 'recall_weighted',
        ),
    )
    # `n` is a fraction; format it as a percentage so the label reads
    # "10%" rather than "0.1%".
    print("Full {:.0%}".format(n))
    print(scores)
    # Build the ensemble defined in this section explicitly instead of reusing
    # whatever `minor` object an earlier cell happened to leave behind.
    minor = MinorClassifiers(n_samples, n_features)
    scores = cross_validate(
        minor, new_x_train, Y,
        cv=5,
        scoring=(
            'accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr',
            'f1_weighted', 'precision_weighted', 'recall_weighted',
        ),
    )
    print("Minor {:.0%}".format(n))
    print(scores)