In [1]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import resample
from sklearn import preprocessing
import statistics
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, log_loss, average_precision_score
from random import sample, randint, randrange
from copy import deepcopy
from sklearn.model_selection import cross_validate

In [2]:
from contextlib import contextmanager
from timeit import default_timer

@contextmanager
def elapsed_timer():
    start_time = default_timer()

    class _Timer():
      start = start_time
      end = default_timer()
      duration = end - start

    yield _Timer

    end_time = default_timer()
    _Timer.end = end_time
    _Timer.duration = end_time - start_time

In [3]:
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

In [4]:
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(newsgroups_train.data)
y_train = newsgroups_train.target
x_test = vectorizer.transform(newsgroups_test.data)
y_test = newsgroups_test.target
#x_train = np.concatenate((x_train, x_test))
#y_train = np.concatenate((y_train, y_test))
classes_count = 20

Change dimensions to 500

In [5]:
svd = TruncatedSVD(n_components=500)
svd.fit(x_train)
svd_x_train = svd.transform(x_train)
svd_x_test = svd.transform(x_test)

In [6]:
svd_x_train = np.concatenate((svd_x_train, svd_x_test))
y_train = np.concatenate((y_train, y_test))

In [6]:
#x_train_res, y_train_res = resample(svd_x_train, y_train, n_samples=5000, replace=False, random_state=0)
#x_test_res, y_test_res = resample(svd_x_test, y_test, n_samples=500, replace=False, random_state=0)
#x_train_res = x_train_res.reshape((x_train_res.shape[0],-1))
#x_test_res = x_test_res.reshape((x_test_res.shape[0],-1))
#y_train_res = y_train_res.reshape((y_train_res.shape[0],))

In [7]:
#scaler = StandardScaler(with_mean=False).fit(x_train)
#x_train_scaled = scaler.transform(x_train) # scaling data
#x_test_scaled = scaler.transform(x_test)
#print(x_train_scaled.shape)

SVM on full data and full features

In [38]:
clf = SVC(probability=True, max_iter=200)
#clf.fit(svd_x_train, y_train)
#clf.fit(x_train_res, y_train_res)

In [16]:
m = svd_x_train.shape[0]

In [45]:
m3 = int(m*0.1)
svd_x_train_10 = svd_x_train[:m3]
y_train_10 = y_train[:m3]

In [14]:
y_train_noise_40 = make_noise_label(y_train, 0.4)

In [39]:
np.unique(y_train)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [15]:
print(y_train_noise_40.shape)
print(svd_x_train.shape)

(18846,)
(18846, 500)


In [39]:
scores = cross_validate(clf, svd_x_train, y_train, cv=5, scoring=('accuracy', 
                                                    'neg_log_loss', 
                                                    'neg_mean_squared_error', 
                                                    'roc_auc_ovr', 
                                                    'f1_weighted', 
                                                    'precision_weighted', 
                                                    'recall_weighted'))



In [40]:
scores

{'fit_time': array([453.452003  , 452.28610563, 452.48716164, 447.85763693,
        450.32840657]),
 'score_time': array([46.16600728, 46.40932512, 46.56393313, 45.55130672, 45.94004512]),
 'test_accuracy': array([0.86949602, 0.88007429, 0.88166622, 0.83443884, 0.83762271]),
 'test_neg_log_loss': array([-0.47028495, -0.45748055, -0.45810768, -0.56722823, -0.54911391]),
 'test_neg_mean_squared_error': array([-6.30371353, -5.90501459, -5.07084107, -9.62111966, -9.44229239]),
 'test_roc_auc_ovr': array([0.99272923, 0.99265828, 0.99277632, 0.98831115, 0.98951535]),
 'test_f1_weighted': array([0.87051746, 0.88002696, 0.88256631, 0.83389696, 0.83774838]),
 'test_precision_weighted': array([0.8735847 , 0.88092778, 0.8848363 , 0.83623437, 0.84251736]),
 'test_recall_weighted': array([0.86949602, 0.88007429, 0.88166622, 0.83443884, 0.83762271])}

In [16]:
scores = cross_validate(clf, svd_x_train, y_train_noise_40, cv=5, scoring=('accuracy', 
                                                    'neg_log_loss', 
                                                    'neg_mean_squared_error', 
                                                    'roc_auc_ovr', 
                                                    'f1_weighted', 
                                                    'precision_weighted', 
                                                    'recall_weighted'))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
scores

{'fit_time': array([1244.50412655, 1570.05026317, 1476.82827353, 1522.17273259,
        1421.41617584]),
 'score_time': array([72.63192344, 82.25924134, 82.61139894, 88.18289876, 73.61998105]),
 'test_accuracy': array([0.5       , 0.50464314, 0.4887238 , 0.49376492, 0.47545768]),
 'test_neg_log_loss': array([-2.27421874, -2.24522513, -2.30234006, -2.23431492, -2.29031023]),
 'test_neg_mean_squared_error': array([-34.75251989, -34.11488458, -34.23985142, -35.36004245,
        -34.06924914]),
 'test_roc_auc_ovr': array([0.77234373, 0.77954066, 0.75769786, 0.7747673 , 0.76451044]),
 'test_f1_weighted': array([0.49584589, 0.49774046, 0.48334969, 0.48867876, 0.46880907]),
 'test_precision_weighted': array([0.49512082, 0.49420359, 0.48109027, 0.48748494, 0.46745597]),
 'test_recall_weighted': array([0.5       , 0.50464314, 0.4887238 , 0.49376492, 0.47545768])}

In [9]:
y_pred = clf.predict(svd_x_test)
#y_pred = clf.predict(x_test_res)

In [10]:
accuracy = accuracy_score(y_test, y_pred)
#accuracy = accuracy_score(y_test_res, y_pred)

In [11]:
accuracy #0.78956

0.7873074880509825

In [12]:
f_score = f1_score(y_test, y_pred, average='weighted')
#f_score = f1_score(y_test_res, y_pred, average='weighted')
f_score

0.7888587724079459

10 sub-classifiers

In [41]:
n_samples = [0.1, 0.25, 0.35, 0.5]
n_features = [0.25, 0.5, 0.75]
n_classifiers = 12

samples_all = svd_x_train.shape[0]
samples_all_test = svd_x_test.shape[0]
features_all = svd_x_train.shape[1]

In [43]:
class MinorClassifiers:
    def __init__(self, n_samples, n_features):
        self.n_samples = n_samples
        self.n_features = n_features
        self.classifiers = []
        self.predictions = []
        self.cut_features = []
    
    def get_params(self, deep = False):
        return {
            'n_samples': self.n_samples,
            'n_features': self.n_features,   
        }
    
    def predict(self, X):
        if self.predictions == []:
            for i in range(len(self.classifiers)):
                classifier = self.classifiers[i]
                x_test = X[:,self.cut_features[i]]
                y_pred = classifier.predict(x_test)
                pred = classifier.predict_proba(x_test)
                self.predictions.append((y_pred, pred))
            
        return average_pred(self.predictions)
    
    def predict_proba(self, X):
        if self.predictions == []:
            for i in range(len(self.classifiers)):
                classifier = self.classifiers[i]
                x_test = X[:,self.cut_features[i]]
                y_pred = classifier.predict(x_test)
                pred = classifier.predict_proba(x_test)
                self.predictions.append((y_pred, pred))
        return average_pred_proba(self.predictions, len(self.classifiers))
        
    def fit(self, X, Y):
        feature_list = [n for n in range(500)]
        samples_all = X.shape[0]
        features_all = X.shape[1]

        for samples in self.n_samples:
            for features in self.n_features:
                f = sample(feature_list, int(features_all * features))
                self.cut_features.append(f)
                x_train_f = X[:,f]
                
                x_train_s, y_train_s = resample(x_train_f, Y, n_samples=int(samples * samples_all), replace=False, random_state=0)

                svm_clf = SVC(probability=True, max_iter=max_iters)
                svm_clf.fit(x_train_s, y_train_s)
                
                self.classifiers.append(svm_clf)

In [30]:
def get_minor_classifiers(samples_perc, features_perc):
    feature_list = [n for n in range(500)]
    classifiers = []
    
    for samples in samples_perc:
        for features in features_perc:
            f = sample(feature_list, int(features_all * features))
            x_train_f = svd_x_train[:,f]
            x_test_f = svd_x_test[:,f]
            
            x_train_s, y_train_s = resample(x_train_f, y_train, n_samples=int(samples * samples_all), replace=False, random_state=0)
            #x_test_s, y_test_s = resample(svd_x_test, y_test, n_samples=int(samples * samples_all_test), replace=False, random_state=0)
            
            svm_clf = SVC(probability=True)
            svm_clf.fit(x_train_s, y_train_s)
            
            y_pred = svm_clf.predict(x_test_f)
            pred = svm_clf.predict_proba(x_test_f)
            
            classifiers.append((y_pred, pred))
            
    return classifiers

In [31]:
#clas = get_minor_classifiers(n_samples, n_features)

In [44]:
def average_pred(predictions):
    #predictions = list of tuples 
    m = len(predictions[0][0])
    all_results = [[0 for x in range(classes_count)] for y in range(m)] 
    results = [0] * m
    for (_, pred_proba) in predictions:
        for i in range(m):
            for j in range(classes_count):
                all_results[i][j] += pred_proba[i][j]
    for i in range(m):
        results[i] = all_results[i].index(max(all_results[i]))
    return results

def average_pred_proba(predictions, n_classifiers):
    m = len(predictions[0][0])
    results = [[0 for x in range(classes_count)] for y in range(m)]
    for (_, pred_proba) in predictions:
        for i in range(m):
            for j in range(classes_count):
                results[i][j] += pred_proba[i][j]
    for i in range(m):
        for j in range(classes_count):
            results[i][j] /= n_classifiers            
    return results

def majority_pred(predictions):
    m = len(predictions[0][0])
    results = [0] * m
    for i in range(m):
        all_results = [0 for x in range(classes_count)] 
        for (pred, _) in predictions:
            all_results[pred[i]] += 1
        results[i] = all_results.index(max(all_results))        
    return results

def majority_pred_proba(predictions, n_classifiers):
    m = len(predictions[0][0])
    results = [[0 for x in range(classes_count)] for y in range(m)]
    classifiers_votes_count = [0] * m
    majority_results = majority_pred(predictions)
    for classifier in range(0, n_classifiers):
        for i in range(m):
            voted_class = majority_results[i] 
            (pred, pred_proba) = predictions[classifier]
            if(pred[i] == voted_class):
                classifiers_votes_count[i] += 1
                for j in range(classes_count):
                    results[i][j] += pred_proba[i][j]
    for i in range(m):
        for j in range(classes_count):
            results[i][j] /= classifiers_votes_count[i]
    return results      
    

def borda_pred(predictions):
    m = len(predictions[0][0])
    all_results = [[0 for x in range(classes_count)] for y in range(m)] 
    results = [0] * m
    
    def get_final_borda_points(predictions):
        return np.argsort(np.argsort(predictions)).tolist()

    for (_, pred_proba) in predictions:
        for i in range(m):
            pred_proba[i] = get_final_borda_points(pred_proba[i])
    for (_, pred_proba) in predictions:
        for i in range(m):
            for j in range(classes_count):
                all_results[i][j] += pred_proba[i][j]
    for i in range(m):
        results[i] = all_results[i].index(max(all_results[i]))
    return results

def borda_pred_proba(predictions, n_classifiers):
    m = len(predictions[0][0])
    results = [[0 for x in range(classes_count)] for y in range(m)] 
    
    def get_final_borda_points(predictions):
        return np.argsort(np.argsort(predictions)).tolist()

    def get_points():
        sum = 0
        for i in range(classes_count):
            sum += i
        return sum * n_classifiers
    
    for (_, pred_proba) in predictions:
        for i in range(m):
            pred_proba[i] = get_final_borda_points(pred_proba[i])
    for (_, pred_proba) in predictions:
        for i in range(m):
            for j in range(classes_count):
                results[i][j] += pred_proba[i][j]
    for i in range(m):
        for j in range(classes_count):
            results[i][j] /= get_points()
        
    return results

In [33]:
res = majority_pred(clas)

NameError: name 'clas' is not defined

In [19]:
accuracy = accuracy_score(y_test, res)

NameError: name 'res' is not defined

In [20]:
accuracy

0.7873074880509825

In [10]:
def make_noise_label(labels, percent):
    labels_with_noise = deepcopy(labels)
    arr_size = labels.shape[0]
    indexes = [n for n in range(arr_size)]
    indexes_to_change = sample(indexes, int(arr_size * percent))
    
    for i in indexes_to_change:
        old_val = labels[i]
        new_val = randint(0, classes_count-1)
        while old_val == new_val:
            new_val = randint(0, classes_count-1)
        labels_with_noise[i] = new_val
        
    return labels_with_noise

In [11]:
def make_noise_data(data, percent):
    m = data.shape[0]
    n = data.shape[1]
    new_data = deepcopy(data)
    for i in range(m):
        for j in range(n):
            rand = randrange(-1, 2, 2) #random integer from {-1, 1}
            new_data[i][j] = (1 + rand * percent) * data[i][j]
    return new_data

In [114]:
clf_100iter = SVC(probability=True, max_iter=50) #221 dla 100, 24 dla 10, 46 dla 20, 112 dla 50

with elapsed_timer() as t:
    clf_100iter.fit(svd_x_train, y_train)

print("duration: " + str(t.duration))

duration: 111.54180649999944




Wyniki jakości klasyfikatorów:

In [77]:
max_iters = 121
clfs = MinorClassifiers(n_samples, n_features)

In [28]:
m = svd_x_train.shape[0]
m2 = int(m*0.1)
svd_x_train_10 = svd_x_train[:m2]
y_train_10 = y_train[:m2]

In [30]:
svd_x_train_10.shape

(1884, 500)

In [17]:
y_train_40 = make_noise_label(y_train, 0.4)

In [18]:
print(y_train_40.shape)
print(svd_x_train.shape)

(18846,)
(18846, 500)


In [78]:
#UWAGA NIE PUSZCZAĆ BEZ POWODU - linijka niezakomentowana żeby uniknąć restartowania kernela
#cross-validation -> wybór metody fuzji decyzji?
scores = cross_validate(clfs, svd_x_train, y_train, cv=5, scoring=('accuracy', 
                                                    'neg_log_loss', 
                                                    'neg_mean_squared_error', 
                                                    'roc_auc_ovr', 
                                                    'f1_weighted', 
                                                    'precision_weighted', 
                                                    'recall_weighted'))
#print(scores['test_accuracy'])
#print(scores['test_neg_log_loss'])
#print(scores['test_precision_weighted'])
#print(scores['test_recall_weighted'])
#print(scores['test_f1_weighted'])
#print(scores['test_roc_auc_ovr'])
#print(scores['test_average_precision'])






In [79]:
scores

{'fit_time': array([456.73319173, 458.01375389, 459.7722764 , 461.8099308 ,
        466.41780066]),
 'score_time': array([117.90738559, 119.20071459, 120.67032194, 118.62991047,
        119.7404778 ]),
 'test_accuracy': array([0.83262599, 0.83709207, 0.83894932, 0.79994694, 0.80100822]),
 'test_neg_log_loss': array([-0.87120483, -0.85184481, -0.90018811, -0.87375549, -0.89624805]),
 'test_neg_mean_squared_error': array([ -8.53899204,  -8.32342796,  -7.82515256, -11.96975325,
        -11.44866012]),
 'test_roc_auc_ovr': array([0.98636978, 0.98694682, 0.98709906, 0.98243303, 0.98239627]),
 'test_f1_weighted': array([0.83334755, 0.83653121, 0.8395049 , 0.7999577 , 0.80079059]),
 'test_precision_weighted': array([0.83628244, 0.8376113 , 0.84178961, 0.80360644, 0.80461577]),
 'test_recall_weighted': array([0.83262599, 0.83709207, 0.83894932, 0.79994694, 0.80100822])}

In [123]:
y_pred = clfs.predict(svd_x_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy score: {0:0.2f}'.format(accuracy))

Accuracy score: 0.79


In [121]:
y_pred_proba = clfs.predict_proba(svd_x_test)
loss = log_loss(y_test, y_pred_proba)
print('Loss score: {0:0.2f}'.format(loss))

Loss score: 0.72


In [129]:
average_precision = average_precision_score(y_test, y_pred, pos_label=1)
print('Precision-recall score: {0:0.2f}'.format(average_precision))

ValueError: multiclass format is not supported

In [15]:
x_train

<11314x130107 sparse matrix of type '<class 'numpy.float64'>'
	with 1787565 stored elements in Compressed Sparse Row format>

In [7]:
a = x_train.toarray()

In [6]:
b = np.array([[2.0,3.0], [2.0,4.], [4.,5.]])

In [17]:
c = make_noise_data(svd_x_train, 0.1)
print(c)

[[ 0.22475435 -0.06248808 -0.01441603 ... -0.0168442   0.02101432
  -0.0191214 ]
 [ 0.12599262 -0.08438557 -0.0357827  ... -0.01513339  0.00517915
   0.02194321]
 [ 0.3346583  -0.04556982 -0.07380376 ...  0.00602186  0.01392154
  -0.02985655]
 ...
 [ 0.1662913  -0.00550221 -0.0884303  ...  0.01776318 -0.00348805
   0.01692393]
 [ 0.20675388 -0.07267247  0.03741476 ...  0.03050179  0.02278233
  -0.02552414]
 [ 0.09054867 -0.09988643  0.00410151 ...  0.00048173 -0.02367268
   0.00355212]]
