In [1]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import resample
from sklearn import preprocessing
import statistics
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, log_loss
from random import sample

In [2]:
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

In [3]:
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(newsgroups_train.data)
y_train = newsgroups_train.target
x_test = vectorizer.transform(newsgroups_test.data)
y_test = newsgroups_test.target

Change dimensions to 50

In [4]:
svd = TruncatedSVD(n_components=500)
svd.fit(x_train)
svd_x_train = svd.transform(x_train)
svd_x_test = svd.transform(x_test)

In [5]:
x_train_res, y_train_res = resample(svd_x_train, y_train, n_samples=5000, replace=False, random_state=0)
x_test_res, y_test_res = resample(svd_x_test, y_test, n_samples=500, replace=False, random_state=0)
x_train_res = x_train_res.reshape((x_train_res.shape[0],-1))
x_test_res = x_test_res.reshape((x_test_res.shape[0],-1))
y_train_res = y_train_res.reshape((y_train_res.shape[0],))

In [20]:
#scaler = StandardScaler(with_mean=False).fit(x_train)
#x_train_scaled = scaler.transform(x_train) # scaling data
#x_test_scaled = scaler.transform(x_test)
#print(x_train_scaled.shape)

(5000, 130107)


Changing dimensions to 50

In [61]:
#svd = TruncatedSVD(n_components=50)
#svd.fit(x_train_res)
#svd_x_train = svd.transform(x_train_res)
#svd_x_test = svd.transform(x_test_res)

SVM on full data and full features

In [103]:
clf = SVC(probability=True)
clf.fit(x_train_res, y_train_res)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [104]:
y_pred = clf.predict(x_test_res)

In [105]:
accuracy = accuracy_score(y_test_res, y_pred)

In [106]:
accuracy #0.758 na całości

0.758

In [107]:
f_score = f1_score(y_test_res, y_pred, average='weighted')
f_score

0.7598208885405394

10 sub-classifiers

In [6]:
n_samples = [0.1, 0.25]
n_features = [0.1, 0.2]

samples_all = svd_x_train.shape[0]
samples_all_test = svd_x_test.shape[0]
features_all = svd_x_train.shape[1]

In [7]:
def get_minor_classifiers(samples_perc, features_perc):
    feature_list = [n for n in range(500)]
    classifiers = []
    
    for samples in samples_perc:
        for features in features_perc:
            f = sample(feature_list, int(features_all * features))
            x_train_f = svd_x_train[:,f]
            
            x_train_s, y_train_s = resample(svd_x_train, y_train, n_samples=int(samples * samples_all), replace=False, random_state=0)
            #x_test_s, y_test_s = resample(svd_x_test, y_test, n_samples=int(samples * samples_all_test), replace=False, random_state=0)
            
            svm_clf = SVC(probability=True)
            svm_clf.fit(x_train_s, y_train_s)
            
            y_pred = svm_clf.predict(svd_x_test)
            pred = svm_clf.predict_proba(svd_x_test)
            
            classifiers.append((y_pred, pred))
            
    return classifiers

In [8]:
clas = get_minor_classifiers(n_samples, n_features)

In [20]:
def average_pred(predictions):
    m = len(predictions[0][0])
    all_results = [[0 for x in range(20)] for y in range(m)] 
    results = [0] * m
    for (_, pred_proba) in predictions:
        for i in range(m):
            for j in range(20):
                all_results[i][j] += pred_proba[i][j]
    for i in range(m):
        results[i] = all_results[i].index(max(all_results[i]))
    return results

def majority_pred(predictions):
    m = len(predictions[0][0])
    results = [0] * m
    for i in range(m):
        all_results = [0 for x in range(20)] 
        for (pred, _) in predictions:
            all_results[pred[i]] += 1
        results[i] = all_results.index(max(all_results))        
    return results

def borda_pred(predictions):
    m = len(predictions[0][0])
    all_results = [[0 for x in range(10)] for y in range(m)] 
    results = [0] * m
    for (_, pred_proba) in predictions:
        for i in range(m):
                all_results[i] += pred_proba[i][j]
    for (_, pred_proba) in predictions:
        for i in range(m):
            for j in range(10):
                all_results[i][j] += pred_proba[i][j]
    for i in range(m):
        results[i] = all_results[i].index(max(all_results[i]))
    return results

In [21]:
res = majority_pred(clas)

In [22]:
accuracy = accuracy_score(y_test, res)

In [23]:
accuracy

0.6736590546999469