1.

In [1]:
from sklearn.metrics import (accuracy_score, f1_score)
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.utils import resample
import tensorflow as tf
import numpy as np
from random import sample 
from copy import deepcopy
from random import sample, randint, randrange
import imgaug.augmenters as iaa
from contextlib import contextmanager
from timeit import default_timer
from sklearn.model_selection import cross_validate

In [2]:
@contextmanager
def elapsed_timer():
    start_time = default_timer()

    class _Timer():
      start = start_time
      end = default_timer()
      duration = end - start

    yield _Timer

    end_time = default_timer()
    _Timer.end = end_time
    _Timer.duration = end_time - start_time

In [3]:
(x_train_f, y_train_f), (x_test_f, y_test_f) = tf.keras.datasets.fashion_mnist.load_data()

In [4]:
x_train = x_train_f.reshape((x_train_f.shape[0],-1))
x_test = x_test_f.reshape((x_test_f.shape[0],-1))
y_train = y_train_f.reshape((y_train_f.shape[0],))
y_test = y_test_f

X_concat = np.concatenate([x_train, x_test])
Y = np.concatenate([y_train, y_test])

scaler = StandardScaler()

scaler.fit(X_concat)

X_transform = scaler.transform(X_concat)

pca = PCA(n_components=50)
pca.fit(X_transform)

X = pca.transform(X_transform)

classes_count = 10

In [5]:
X = X[:5000]
Y = Y[:5000]

2.

Pełne dane

In [6]:
clf = SVC(probability=True)

In [8]:
scores = cross_validate(clf, X, Y, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))

In [9]:
scores

{'fit_time': array([2.37255764, 2.33812141, 2.48247051, 2.51903653, 2.48422027]),
 'score_time': array([0.31694555, 0.30446696, 0.30755639, 0.31117487, 0.31472564]),
 'test_accuracy': array([0.843, 0.826, 0.832, 0.833, 0.871]),
 'test_neg_log_loss': array([-0.43356853, -0.47976882, -0.47008644, -0.458468  , -0.38718756]),
 'test_roc_auc_ovr': array([0.98515158, 0.98275645, 0.98229374, 0.98334632, 0.98752266])}

In [6]:
def get_full_prediction(x_tr, y_tr, x_te):
    clf = SVC(probability=True)
    clf.fit(x_tr, y_tr)
    y_pred = clf.predict(x_te)
    pred = clf.predict_proba(x_te)
    return (y_pred, pred)

In [None]:
predictions = get_full_prediction(x_train, y_train, x_test)

In [None]:
(y_pred, _) = predictions

In [None]:
print(accuracy_score(y_test, y_pred))
print(f1_score(y_test, y_pred, average='weighted'))

Niepełne dane i cechy

In [211]:
def average_pred(predictions):
    m = len(predictions[0][0])
    all_results = [[0 for x in range(classes_count)] for y in range(m)] 
    results = [0] * m
    for (_, pred_proba) in predictions:
        for i in range(m):
            for j in range(classes_count):
                all_results[i][j] += pred_proba[i][j]
    for i in range(m):
        results[i] = all_results[i].index(max(all_results[i]))
    return results

def average_pred_proba(predictions, n_classifiers):
    m = len(predictions[0][0])
    results = [[0 for x in range(classes_count)] for y in range(m)]
    for (_, pred_proba) in predictions:
        for i in range(m):
            for j in range(classes_count):
                results[i][j] += pred_proba[i][j]

    for i in range(m):
        for j in range(classes_count):
            results[i][j] /= n_classifiers            
    return results

def majority_pred(predictions):
    m = len(predictions[0][0])
    results = [0] * m
    for i in range(m):
        all_results = [0 for x in range(classes_count)] 
        for (pred, _) in predictions:
            all_results[pred[i]] += 1
        results[i] = all_results.index(max(all_results))
        
    return results

def borda_pred(predictions):
    m = len(predictions[0][0])
    all_results = [[0 for x in range(classes_count)] for y in range(m)] 
    results = [0] * m
    
    def get_final_borda_points(predictions):
        return np.argsort(np.argsort(predictions)).tolist()

    for (_, pred_proba) in predictions:
        for i in range(m):
            pred_proba[i] = get_final_borda_points(pred_proba[i])
    for (_, pred_proba) in predictions:
        for i in range(m):
            for j in range(classes_count):
                all_results[i][j] += pred_proba[i][j]
    for i in range(m):
        results[i] = all_results[i].index(max(all_results[i]))
    return results

def borda_pred_proba(predictions, n_classifiers):
    m = len(predictions[0][0])
    results = [[0 for x in range(classes_count)] for y in range(m)] 
    
    def get_final_borda_points(predictions):
        return np.argsort(np.argsort(predictions)).tolist()

    def get_points():
        sum = 0
        for i in range(classes_count):
            sum += i
        return sum * n_classifiers
    
    for (_, pred_proba) in predictions:
        for i in range(m):
            pred_proba[i] = get_final_borda_points(pred_proba[i])
    for (_, pred_proba) in predictions:
        for i in range(m):
            for j in range(classes_count):
                results[i][j] += pred_proba[i][j]
                
    for i in range(m):
        for j in range(classes_count):
            results[i][j] /= get_points()
        
    return results

In [212]:
class MinorClassifiers:
    def __init__(self, n_samples, n_features):
        self.n_samples = n_samples
        self.n_features = n_features
        self.classifiers = []
        self.predictions = []
        self.cut_features = []
    
    def get_params(self, deep = False):
        return {
            'n_samples': self.n_samples,
            'n_features': self.n_features,   
        }
    
    def predict(self, X):
        for i in range(len(self.classifiers)):
            classifier = self.classifiers[i]
            f = self.cut_features[i]
            x_test = X[:,f]
            y_pred = classifier.predict(x_test)
            pred = classifier.predict_proba(x_test)
            self.predictions.append((y_pred, pred))
            
        return majority_pred(self.predictions)
    
    def predict_proba(self, X):
        return borda_pred_proba(self.predictions, len(self.classifiers))
        
    def fit(self, X, Y):
        feature_list = [n for n in range(50)]
        samples_all = X.shape[0]
        features_all = X.shape[1]

        for samples in self.n_samples:
            for features in self.n_features:
                f = sample(feature_list, int(features_all * features))
                self.cut_features.append(f)
                x_train_f = X[:,f]
                
                x_train_s, y_train_s = resample(x_train_f, Y, n_samples=int(samples * samples_all), replace=False, random_state=0)

                svm_clf = SVC(probability=True)
                svm_clf.fit(x_train_s, y_train_s)
                
                self.classifiers.append(svm_clf)

In [213]:
n_samples = [0.1, 0.25, 0.35, 0.5]
n_features = [0.25, 0.5, 0.75]

In [214]:
n_samples = [0.1, 0.25]
n_features = [0.25, 0.5]

In [215]:
minor = MinorClassifiers(n_samples, n_features)

In [216]:
scores = cross_validate(minor, X, Y, cv=5, scoring=('accuracy', 'neg_log_loss', 'neg_mean_squared_error', 'roc_auc_ovr', 'f1_weighted', 'precision_weighted', 'recall_weighted'))

In [217]:
scores

{'fit_time': array([0.50507832, 0.46871352, 0.40498924, 0.42737055, 0.44281554]),
 'score_time': array([0.28923082, 0.28775692, 0.30119681, 0.28623629, 0.26807547]),
 'test_accuracy': array([0.763, 0.748, 0.742, 0.769, 0.792]),
 'test_neg_log_loss': array([-1.71655802, -1.6858606 , -1.69580007, -1.69538118, -1.67441495]),
 'test_neg_mean_squared_error': array([-3.281, -3.034, -2.865, -3.08 , -2.482]),
 'test_roc_auc_ovr': array([0.94467398, 0.95568265, 0.95562878, 0.96352094, 0.96559779]),
 'test_f1_weighted': array([0.76157004, 0.74447995, 0.73515268, 0.76898455, 0.78159475]),
 'test_precision_weighted': array([0.76617381, 0.74807137, 0.74419921, 0.77633688, 0.80040467]),
 'test_recall_weighted': array([0.763, 0.748, 0.742, 0.769, 0.792])}

In [None]:
def get_minor_predictions(x_tr, y_tr, x_te, samples_perc, features_perc):
    feature_list = [n for n in range(50)]
    predictions = []
    samples_all = x_tr.shape[0]
    features_all = x_tr.shape[1]
    
    for samples in samples_perc:
        for features in features_perc:
            f = sample(feature_list, int(features_all * features))
            x_train_f = x_tr[:,f]
            x_test_f = x_te[:,f]
            
            x_train_s, y_train_s = resample(x_train_f, y_tr, n_samples=int(samples * samples_all), replace=False, random_state=0)
            
            svm_clf = SVC(probability=True)
            svm_clf.fit(x_train_s, y_train_s)
            
            y_pred = svm_clf.predict(x_test_f)
            pred = svm_clf.predict_proba(x_test_f)
            
            predictions.append((y_pred, pred))
            
    return predictions

In [88]:
pred = get_minor_predictions(x_train, y_train, x_test, n_samples, n_features)

ValueError: Sample larger than population or is negative

In [89]:
print(accuracy_score(y_test, majority_pred(pred)))
print(f1_score(y_test, majority_pred(pred), average='weighted'))

NameError: name 'pred' is not defined

In [90]:
print(accuracy_score(y_test, average_pred(pred)))
print(f1_score(y_test, average_pred(pred), average='weighted'))

NameError: name 'pred' is not defined

In [None]:
print(accuracy_score(y_test, borda_pred(pred)))
print(f1_score(y_test, borda_pred(pred), average='weighted'))

3a.

In [None]:
clf_100iter = SVC(probability=True, max_iter=100)

In [None]:
with elapsed_timer() as t:
    clf_100iter.fit(x_train, y_train)

print("duration: " + str(t.duration))

In [None]:
clf_7iter.fit(x_train, y_train)
y_pred_7ter = clf_7iter.predict(x_test)
pred_7iter = clf_7iter.predict_proba(x_test)

In [None]:
pred_7iter

In [None]:
from sklearn.metrics import accuracy_score
accuracy_7iter = accuracy_score(y_test, y_pred_7ter)
accuracy = accuracy_score(y_test, y_pred)

In [None]:
accuracy_7iter

In [None]:
accuracy

4. 

In [7]:
def make_noise_data(data, percent):
    m = data.shape[0]
    n = data.shape[1]
    new_data = deepcopy(data)
    for i in range(m):
        for j in range(n):
            rand = randrange(-1, 2, 2) #random integer from {-1, 1}
            new_data[i][j] = (1 + rand * percent) * data[i][j]
    return new_data

In [None]:
def make_noise_label(labels, percent):
    labels_with_noise = deepcopy(labels)
    arr_size = labels.shape[0]
    indexes = [n for n in range(arr_size)]
    indexes_to_change = sample(indexes, int(arr_size * percent))
    
    for i in indexes_to_change:
        old_val = labels[i]
        new_val = randint(0, classes_count)
        while old_val == new_val:
            new_val = randint(0, classes_count)
        labels_with_noise[i] = new_val
        
    return labels_with_noise

In [None]:
new_y_train = make_noise_label(Y, 0.2)

In [None]:
pred = get_minor_predictions(x_train, new_y_train, x_test, n_samples, n_features)

In [None]:
print(accuracy_score(y_test, majority_pred(pred)))
print(f1_score(y_test, majority_pred(pred), average='weighted'))

In [11]:
new_x_train = make_noise_data(X, 0.2)

In [None]:
pred = get_minor_predictions(new_x_train, y_train, x_test, n_samples, n_features)

In [None]:
print(accuracy_score(y_test, majority_pred(pred)))
print(f1_score(y_test, majority_pred(pred), average='weighted'))