# Подготовка данных для обучения

In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import os
import re

from tqdm.notebook import tqdm

import ssl
ssl._create_default_https_context = ssl._create_stdlib_context

import nltk
#nltk.download('wordnet')
#nltk.download('stopwords')
#nltk.download('twitter_samples')
#nltk.download('gutenberg')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
#from nltk.corpus import gutenberg?
from nltk.corpus import twitter_samples
import gensim

from multiprocessing.dummy import Pool, Queue

import itertools
from scipy.special import softmax
from scipy.stats import median_abs_deviation as mad
from sklearn.cluster import DBSCAN
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor

from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.base import BaseEstimator
from sklearn.neighbors import KNeighborsClassifier, KDTree
from scipy.spatial.distance import euclidean, cosine
from scipy.spatial.distance import cdist
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.svm import OneClassSVM

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, recall_score, precision_score

### Данные

Для ускорения исследования, скачаем весь датасет, и будем собирать батчи из него.

In [3]:
df = pd.read_csv('twitter_dataset_4_9000_2000.csv', low_memory=False)
df.drop(columns=['Unnamed: 0'], inplace=True)
df.dropna(inplace=True)

def stream_generator(batch_size=10):
    for i in range(0, df.shape[0], batch_size):
        stream_batch = df.iloc[i : min(i + batch_size, df.shape[0])]
        yield stream_batch['content'].tolist(), stream_batch['novel'].tolist()

### Предобработка

Некоторые стоп-слова удалим. Применим в одном варианте лемматизацию, в другом - стемминг.

In [4]:
class Preprocessor:
    
    def __init__(self):
        self.methods = {
            'lemm' : self.lemmatization,
            'stem' : self.stemming,
            'both' : self.both_norm
        }
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()
        self.swords = set( stopwords.words("english") )
     
    def preprocess1(self, content_batch, standard='both'):
        # stream_batch: ([content], [novel]), content = list(string), novel = list
    
        preprocessed_batch = []
        for doc in content_batch:
            doc = doc.lower()
            #doc = self.delete_tags(doc)
            doc = self.delete_links(doc)
            doc = self.delete_garbage(doc)
            tokens = self.get_tokens(doc)
            tokens = self.methods[standard](tokens)
            tokens = self.delete_stop_words(tokens)
            preprocessed_batch.append( ' '.join(tokens) )
            
        return preprocessed_batch
    
    
    def delete_tags(self, doc):
        # doc = re.sub(r'^@[\w]*', ' ', doc) 
        # doc = re.sub(r'\s@[\w]*', ' ', doc)
        return doc
    
    
    def delete_links(self, doc):
        doc = re.sub(r'http\:\/\/[\w\-&\./?=\+;@#%]*', ' ', doc)
        doc = re.sub(r'https\:\/\/[\w\-&\./?=\+;@#%]*', ' ', doc)
        doc = re.sub(r'ftp\:\/\/[\w\-&\./?=\+;@#%]*', ' ', doc)
        doc = re.sub(r'www\.[\w\-&\./?=\+;@#%]*', ' ', doc)
        return doc
    
    
    def delete_garbage(self, doc):
        doc = re.sub(r'&amp;', ' ', doc)
        doc = re.sub(r'\s+', ' ', doc)
        # doc = re.sub(r"\b\w*\W+\w*\b", ' ', doc)
        doc = re.sub(r"[^a-zA-Z0-9\s]*", '', doc)
        # doc = re.sub(r"\sRT\s ", '', doc)
        return doc
    
    
    def delete_stop_words(self, tokens): # TODO: create own sw list
        return list( filter(lambda sword: sword not in self.swords, tokens) )
    
    
    def get_tokens(self, doc):
        return list(map(lambda token: token.lower(), doc.split()))
    
    
    def lemmatization(self, tokens):
        return list(map(lambda token: self.lemmatizer.lemmatize(token), tokens))
    
    
    def stemming(self, tokens):
        return list( map(lambda token: self.stemmer.stem(token), tokens) )
    
    
    def both_norm(self, tokens):
        tokens = list(map(lambda token: self.lemmatizer.lemmatize(token), tokens))
        return list( map(lambda token: self.stemmer.stem(token), tokens) )

### Словарь

Скачаем корпус из классических произведений, и создадим из него фиксированный словарь для векторизации.

In [5]:
def create_vocab():
    twitter_samples.fileids()
    vocab_corpus = ([(t, "pos") for t in twitter_samples.strings("positive_tweets.json")] + 
             [(t, "neg") for t in twitter_samples.strings("negative_tweets.json")] +
             [(t, "neg") for t in twitter_samples.strings("tweets.20150430-223406.json")]
            )
    vocab_corpus = list( map( lambda pair: pair[0], vocab_corpus ) )
    return vocab_corpus

vocab_corpus = create_vocab()

pp = Preprocessor()
vocab_corpus = pp.preprocess1(vocab_corpus)

print( len(vocab_corpus) )
print( vocab_corpus[0] )

30000
followfriday franceint pkuchly57 milipolpari top engag member commun thi week


### Векторизация

Класс, в котором будут реализованы основные методы векторизации.

In [32]:
class Vectorizer():
    
    def __init__(self, method, train_corpus):
        self.methods = {
            'one-hot' : self.one_hot_vectorizer,
            'count' : self.count_vectorizer,
            'tf-idf' : self.tfidf_vectorizer,
            #'n-gramms' : self.n_gramms_vectorizer,
            'doc-to-vec' : self.doc_to_vec_vectorizer
        }
        if method not in self.methods:
            raise Exception('Wrong method: {}'.format(method))
        
        self.method = method
        self.model = None
        self.train_corpus = Preprocessor().preprocess1( train_corpus )
        
    
    def vectorize(self, batch, **args):
        return self.methods[self.method](batch, **args)
        
    
    def one_hot_vectorizer(self, batch, **args):
        
        if self.model is None:
            self.model = CountVectorizer(binary=True, **args)
            self.model.fit(self.train_corpus)
            
        return self.model.transform(batch)
    
    
    def count_vectorizer(self, batch, **args):
        
        if self.model is None:
            self.model = CountVectorizer(**args)
            self.model.fit(self.train_corpus)
            
        return self.model.transform(batch)
    
    
    def tfidf_vectorizer(self, batch, **args):
        
        if self.model is None:
            self.model = TfidfVectorizer(**args)
            self.model.fit(self.train_corpus)
            
        return self.model.transform(batch)
        
        
    def doc_to_vec_vectorizer(self, batch, **args):
        
        def extract_tokens(train = False):
            if train:
                for i, doc in enumerate(self.train_corpus):
                    tokens = Preprocessor().get_tokens(doc)
                    yield gensim.models.doc2vec.TaggedDocument(tokens, [i])    
                    
            else:
                for i, doc in enumerate(batch):
                    tokens = Preprocessor().get_tokens(doc)
                    yield tokens

        
        if self.model is None:
            # vocab = list( extract_tokens(train=True) )
            vocab = list( extract_tokens(train=True) )
            self.model = gensim.models.doc2vec.Doc2Vec(min_count=1, vector_size=100)
            # self.model.build_vocab(train_corpus, update = True)
            # self.model.build_vocab(self.train_corpus)
            self.model.build_vocab( vocab )
            self.model.train(vocab, total_examples=self.model.corpus_count, epochs=5, **args)
        
        return np.array( list( map( lambda token: self.model.infer_vector(token), extract_tokens() ) ) )

## Алгоритмы

Результаты замеров есть в таблице:

https://docs.google.com/spreadsheets/d/1a6M76eS8L-WTGKVI-xRajut2P-X70j-bv5s_7WZv3ZE/edit?usp=sharing

In [7]:
def run_scores(pipeline, batch_size, n_start, return_target=False):
    """
        pipeline: pp, alg, svd, vectorizer
        return: accuracy, precision, recall, roc_auc, [answers, targets, scores]
    """
    
    pp, alg, svd, vectorizer = pipeline
    
    answers = []
    targets = []

    for batch, target in tqdm( stream_generator(batch_size) ):
        batch = pp.preprocess1(batch)
        vectors = vectorizer.vectorize(batch)
        if svd is not None:
            vectors = svd.fit_transform(vectors)
            
        answers += list( alg.predict(vectors) ) 
        targets += target
    
    answers = answers[n_start:]
    targets = targets[n_start:]
    
    try:
        roc_auc = roc_auc_score(targets, answers)
    except:
        roc_auc = 0.5
        
    result = [roc_auc]
    
    if return_target:
        result += [answers, targets]
    
    return result

def combine_params(*args):
    return list( itertools.product(*args) )


def pool_process(params, run_with_param):
    
    def process_run(arg):
        i, run_with_param = arg

        #with open('data/poems{:05d}.txt'.format(i), mode='w') as txt:
        while not queue.empty():
            ind = queue.get()
            cur_param = params[ind]
        
            try:
                roc_auc = run_with_param(cur_param)
            except Exception as err:
                roc_auc = -1
                with lock:
                    print('process: {}; error: {}; \nparams: {}'.format( i, err, cur_param))

            with lock:
                print(cur_param)
                print('ROC AUC: {}'.format(roc_auc))
                print()
                pbar.update(1)
    
    
    queue = Queue()

    params_ind = np.arange(len(params))
    for i in params_ind:
        queue.put(i)

    with Pool(processes=4) as pool, tqdm(total=queue.qsize()) as pbar:
        lock = pbar.get_lock()
        pool.map(process_run, zip(range(pool._processes), [run_with_param] * len(range(pool._processes)) ))

    pool.join()

### Базовый класс для алгоритмов

In [8]:
class Storage:
    
    def __init__(self, n_start, max_samples):
        self.data = None
        self.pos = 0
        self.initialized = False
        self.max_samples = max_samples
        self.n_for_start = n_start
        
        if self.n_for_start > self.max_samples:
            self.n_for_start = self.max_samples
            print('n_start set to max_samples')
            
    def update_storage(self, X, start_count): # Warning: copying removed 
        
        if self.data.shape[0] == self.max_samples:
            
            if self.pos + X.shape[0] - start_count <= self.max_samples:
                self.data[self.pos : self.pos + X.shape[0] - start_count] = X[start_count : ] # .copy()
                self.pos += X.shape[0] - start_count

            else:
                self.data[self.pos : ] = X[start_count : start_count + self.data.shape[0] - self.pos] # .copy()
                start_count += self.data.shape[0] - self.pos
                self.pos = 0
                self.data[self.pos : self.pos + X.shape[0] - start_count] = X[start_count :] # .copy()
        
        else:
            
            if self.pos + X.shape[0] - start_count <= self.max_samples:
                self.data = np.concatenate((self.data, X[start_count : ]), axis=0)
                self.pos += X.shape[0] - start_count

            else:
                self.data = np.concatenate((self.data,
                                            X[start_count : start_count + self.data.shape[0] - self.pos]), axis=0)
                start_count += self.data.shape[0] - self.pos
                self.pos = 0
                self.data[self.pos : self.pos + X.shape[0] - start_count] = X[start_count :] # .copy()
    
    
    def check_consistency(self, X):
        if X.shape[0] > self.max_samples:
            raise Exception('Too large batch for this model. Fix max_samples')
    
    
    def reset(self):
        self.data = None
        self.pos = 0
        self.initialized = False
        
        
class NovelPredictor(BaseEstimator):
    """
        all algorithms must contain fit and predict methods
        all sons must implement 'decision_function' method
    """
    def __init__(self, alg, n_start=100, max_samples = 1000 ):
        self.storage = Storage(n_start, max_samples)
        self.alg = alg
        self.novelty_label = 1
    
    def predict(self, X):
        
        X = np.asarray(X)
        
        self.storage.check_consistency(X)
        start_count = 0
        
        if not self.storage.initialized:
            X = np.asarray(X, dtype = X.dtype)
            start_count = min( self.storage.n_for_start, X.shape[0])
            self.storage.data = X[:start_count]
            self.storage.pos = start_count % self.storage.max_samples
            self.storage.initialized = True
        
        targets = []
        if start_count < X.shape[0]:
 
            # self.storage.update_storage(X, start_count) # TODO: так после же надо?
            self.alg.fit(self.storage.data)
            # print(len( self.decision_function( self.alg.predict(X[start_count:]) ) ), start_count)
            predict = self.decision_function( self.alg.predict(X[start_count:]) )
            targets = [self.novelty_label] * start_count + predict # SHOULD BE LIST!
            self.storage.update_storage(X, start_count) # TODO: так после же надо?
            
        else:
            targets = [self.novelty_label] * start_count
        
        return targets
    

    def reset(self):
        self.storage.reset()

### IQR

In [9]:
class IQR:
    def __init__(self):
        pass
    def fit(self, X, y=None):
        self.X = X.copy()
    def predict(self, X, y=None):
        q75, q25 = np.percentile(self.X, [75, 25], axis=0)
        iqr = q75 - q25
        scores = np.where( (X > q25 - 1.5 * iqr).astype(int) * (X < q75 + 1.5 * iqr).astype(int) == 1, 0, 1)
        scores = np.sum( scores, axis=1 )
        #return np.where(scores > 0, 1, 0)
        return softmax( scores ) # неверно. Нужно сначала собрать все скоры, потом сделать predict


class IQRModel(NovelPredictor):
    
    def __init__(self, n_start=100, max_samples = 500):
        
        alg = IQR()
        super().__init__(alg, n_start, max_samples)
        
    def decision_function(self, predict):
        if not isinstance(predict, list):
            return predict.tolist()
        
        return predict

### DBSCAN

In [10]:
class DbscanWrapper:
    def __init__(self, eps, min_samples):
        self.alg = DBSCAN(eps=eps, min_samples=min_samples, n_jobs=-1)
        
    def fit(self, X, y=None):
        self.X = X
        
    def predict(self, X, y=None):
        self.alg.fit( np.concatenate( (self.X, X), axis=0) )
        return np.where( self.alg.labels_[self.X.shape[0]:] < 0, 1, 0)


class Dbscan(NovelPredictor):
    
    def __init__(self, n_start=100, max_samples = 500, eps=0.4, min_samples=10):
        
        alg = DbscanWrapper(eps=eps, min_samples=min_samples)
        super().__init__(alg, n_start, max_samples)
        
    def decision_function(self, predict):
        return predict.tolist() 

### Elliptical Envelope

In [11]:
class EllipseWrapper:
    
    def __init__(self, contamination):
        self.alg = EllipticEnvelope(contamination=contamination)
        
    def fit(self, X, y=None):
        self.alg.fit(X)
    
    def predict(self, X):
        return -alg.decision_function(X)
        #return 1 - softmax( alg.decision_function(X) )

class Ellipse(NovelPredictor):
    
    def __init__(self, contamination, n_start=100, max_samples = 500):
        
        alg = EllipticEnvelope(contamination=contamination)
        super().__init__(alg, n_start, max_samples)
        
    def decision_function(self, predict):
        return predict.tolist()
        #return np.where( predict < 0, 1, 0).tolist()

### Z-score

In [12]:
class ZModel:
    def __init__(self, eps=1e-6): # threshold,
        self.eps = eps
        #self.threshold = threshold
    def fit(self, X, y=None):
        self.X = X
    def predict(self, X, y=None):
        nX = (X - np.mean(self.X, axis=0)) / np.std( np.where(X > self.eps, X, self.eps), axis=0 )
        return (nX ** 2).sum(axis=1).astype(int)
        #novel = np.where( np.absolute(nX) > self.threshold, 1, 0)
        #return novel.any(axis=1).astype(int)

    
class ZScore(NovelPredictor):
    
    def __init__(self, n_start=100, max_samples = 500): # threshold,
        
        alg = ZModel()
        super().__init__(alg, n_start, max_samples)
        
    def decision_function(self, predict):
        return predict.tolist()

### Median Z-score

In [13]:
class MZModel:
    def __init__(self, eps=1e-6): #  threshold,
        self.eps = eps
        #self.threshold = threshold
        # assert threshold > 0
        
    def fit(self, X, y=None):
        self.X = X
        
    def predict(self, X, y=None):
        nX = (X - np.median(self.X, axis=0)) / mad( np.where(X > self.eps, X, self.eps), axis=0 )
        return (nX ** 2).sum(axis=1).astype(int)
        #novel = np.where( np.absolute(nX) > self.threshold, 1, 0)
        #return novel.any(axis=1).astype(int)

    
class MZScore(NovelPredictor):
    
    def __init__(self, n_start=100, max_samples=500):
        
        alg = MZModel()
        super().__init__(alg, n_start, max_samples)
        
    def decision_function(self, predict):
        return predict.tolist()

### LOF

In [14]:
class LOFWrapper:
    def __init__(self, n_neighbors, contamination):
        self.alg = LocalOutlierFactor(n_neighbors=n_neighbors, novelty=True,
                                      contamination=contamination, n_jobs=4, algorithm='kd_tree')
        
    def fit(self, X, y=None):
        self.alg.fit(X)
        
    def predict(self, X, y=None):
        #return np.where( self.alg.decision_function(X) > self.threshold, 0, 1)
        return -self.alg.decision_function(X)


class LOF(NovelPredictor):
    
    def __init__(self, n_start=100, max_samples = 500, n_neighbors=20,
                 contamination=0.1):
        
        alg = LOFWrapper(n_neighbors, contamination)
        super().__init__(alg, n_start, max_samples)
        
    def decision_function(self, predict):
        return predict.tolist()

### KNN

In [15]:
class KnnWrapper:
    def __init__(self, algorithm, n_neighbors):
        self.algorithm = algorithm
        self.n_neighbors = n_neighbors
    
    def fit(self, X):
        self.X = X
    
    def predict(self, X):

        res = []
        for obj in X:
            dist, ind = self.get_neighbors([obj])
            int_lof = dist[0, -1] # radius of sphere

            dist, ind = self.get_neighbors( [self.X[i] for i in ind[0]] )
            ext_lof = np.mean( dist[:, -1] )
            
            res.append( -(1 + ext_lof) / (1 + int_lof) )
        
        return res
    
    
    def get_neighbors(self, X):
        
        X = np.asarray(X)
        
        if self.algorithm == 'kd_tree':

            tree = KDTree(self.X)
            dist, ind = tree.query(X, k=self.n_neighbors)
            return dist, ind

        else:
            raise Exception('unknown algorithm')
        


class Knn(NovelPredictor):
    
    def __init__(self, n_start=100, n_neighbors=5, max_samples = 5000,
                    algorithm='kd_tree', metric='euclidean'):
        
        alg = KnnWrapper(algorithm=algorithm, n_neighbors=n_neighbors)
        super().__init__(alg, n_start, max_samples)
        
    def decision_function(self, predict):
        if not isinstance(predict, list):
            return predict.tolist()
        return predict

### SVM

In [16]:
class SVMWrapper:
    def __init__(self, nu):
        self.alg = OneClassSVM(nu=nu)
    def fit(self, X):
        self.alg.fit(X)
    def predict(self, X):
        return -self.alg.decision_function(X)

class SVM(NovelPredictor):
    
    def __init__(self, n_start=100, max_samples = 1000, nu=0.5):
        
        alg = SVMWrapper(nu)
        super().__init__(alg, n_start, max_samples)
        
    def decision_function(self, predict):
        return predict.tolist()
        #return np.where(predict == -1, 1, 0).tolist()

## Ансамблирование

In [17]:
class Vote():
    def __init__(self):
        pass
    
    def average_voting(self, targets):
        return np.mean(targets, axis=0).tolist()
    
    def max_voting(self, targets):
        return np.max(targets, axis=0).tolist()
    
    def weighted_average_voting(self, targets, weights):
        assert abs( sum(weights) - 1 ) < 1e-8
        return np.sum( np.array(targets) * np.asarray(weights)[:, None], axis=0 )
    
    def average_max_voting(self, targets, count):
        return np.mean( np.partition(targets, -np.arange(1, count + 1), axis=0)[-count:] ).tolist()
    
    
def get_roc_auc(y_true, y_pred):
    try:
        res = roc_auc_score(y_true, y_pred)
    except:
        res = 0.5
    
    return res
    
    
def reset_pipeline(pipeline):
    for pl_num, (pp, vectorizer, svd, alg) in enumerate(pipeline):
        alg.reset()
        

def sigmoid(x):
    return (1 / (1 + np.exp(-np.asarray(x)))).tolist()
    
    
def data_ensemble_predict(pipeline, batch_size, voting, n_start, verbose=1):
    """
    pipeline[i]: [pp, vectorizer, svd, algorithm]
    voting: voting function
    
    return: answers, targets
    """
    
    answers = []
    targets = []

    for batch, target in tqdm( stream_generator(batch_size) ):

        y_pred = []
        for pl_num, (pp, vectorizer, svd, alg) in enumerate(pipeline):
            batch = pp.preprocess1(batch)
            vectors = vectorizer.vectorize(batch)
            if svd is not None:
                vectors = svd.fit_transform(vectors) # don't use for doc-to-vec

            y_pred.append( list(alg.predict(vectors)) )

        answers += voting( y_pred )
        targets += target

    reset_pipeline(pipeline)
        
    return answers, targets
        
        

def data_ensemble(pipeline, batch_size, vote, n_start, weights=None, repeat_count=1, verbose=1):
    """
    pipeline[i]: [pp, vectorizer, svd, algorithm]
    """

    if weights is None:
        weights = [1 / len(pipeline)] * len(pipeline)
    
    roc_auc = []
    answers = []
    ap, am, aw, amp  = [], [], [], []
    
    for iter_num in tqdm( range(repeat_count) ):
        
        targets = []
        y_pred = [[] for i in range(len(pipeline))]
        
        for batch, target in tqdm( stream_generator(batch_size) ):
            for pl_num, (pp, vectorizer, svd, alg) in enumerate(pipeline):
                batch = pp.preprocess1(batch)
                vectors = vectorizer.vectorize(batch)
                if svd is not None:
                    vectors = svd.fit_transform(vectors) # don't use for doc-to-vec
                
                y_pred[pl_num] += list(alg.predict(vectors))
            
            targets += target

        for i in range(len(y_pred)):
            y_pred[i] = sigmoid( y_pred[i][n_start:] )
        
        ap = vote.average_voting( y_pred )
        mp = vote.max_voting( y_pred )
        wap = vote.weighted_average_voting( y_pred, weights )
        amp = vote.average_max_voting( y_pred, count=3 )

        targets = targets[n_start:]
        
        
        roc_auc.append( [get_roc_auc(targets, ap),
                          get_roc_auc(targets, mp),
                          get_roc_auc(targets, wap),
                          get_roc_auc(targets, amp)])
        
        reset_pipeline(pipeline)
        
        if verbose:
            print('iter num: {}'.format(iter_num))
            print( 'roc auc:', roc_auc[-1] )
            print()
    
    if verbose:
        print('ROC AUC: AP | MP | WAP | AMP')
        print('ROC AUC: ', np.mean(roc_auc, axis=0))
        print()
        
    return np.mean(roc_auc, axis=0)

In [50]:
vote = Vote()
batch_size = 500
scores_iter = 3
n_start = 100
vectorization='tf-idf'

weighted_algorithms = [ 
    IQRModel(n_start=n_start, max_samples=500),
    Dbscan(n_start=n_start, max_samples=500, eps=0.1, min_samples=15),
    Ellipse(n_start=n_start, max_samples=500, contamination=0.75), # 
    LOF(n_start=n_start, max_samples=500, contamination=0.01, n_neighbors=5), # 
    Knn(n_start=n_start, max_samples=500, n_neighbors=5),
    SVM(n_start=n_start, max_samples=500, nu=0.0001), # 
    LOF(n_start=n_start, max_samples=500, contamination=0.0001, n_neighbors=5), 
    SVM(n_start=n_start, max_samples=500, nu=0.1), 
]
    
#weights = [0.15, 0.15, 0.4, 0.15, 0.15]
weights = [0.2, 0.4, 0.4]
    
pipeline = [
#     [ Preprocessor(), Vectorizer('tf-idf', vocab_corpus),
#         LDA(n_components=6, random_state=5, max_iter=50), weighted_algorithms[0] ],
#     [ Preprocessor(), Vectorizer('tf-idf', vocab_corpus),
#         NMF(n_components=5, init='random', max_iter=1000, random_state=0), weighted_algorithms[0] ],
#     [ Preprocessor(), Vectorizer('doc-to-vec', vocab_corpus),
#         TruncatedSVD(n_components=5, n_iter=100, random_state=0), weighted_algorithms[7] ],
#     [ Preprocessor(), Vectorizer('tf-idf', vocab_corpus),
#         LDA(n_components=5, n_jobs=-1, random_state=5), weighted_algorithms[1] ],
    [ Preprocessor(), Vectorizer('tf-idf', vocab_corpus),
        NMF(n_components=10, init='random', max_iter=1000, random_state=0), weighted_algorithms[2] ],
#     [ Preprocessor(), Vectorizer('tf-idf', vocab_corpus),
#         LDA(n_components=6, random_state=42), weighted_algorithms[3] ],
#     [ Preprocessor(), Vectorizer('tf-idf', vocab_corpus),
#         LDA(n_components=6, random_state=5, max_iter=500), weighted_algorithms[3] ],
    [ Preprocessor(), Vectorizer('tf-idf', vocab_corpus),
        NMF(n_components=5, init='random', max_iter=1000, random_state=0), weighted_algorithms[3] ],
#     [ Preprocessor(), Vectorizer('tf-idf', vocab_corpus),
#         LDA(n_components=6, n_jobs=-1, random_state=42), weighted_algorithms[3] ],
#     [ Preprocessor(), Vectorizer('tf-idf', vocab_corpus),
#         LDA(n_components=6, n_jobs=-1, random_state=5), weighted_algorithms[4] ],
    [ Preprocessor(), Vectorizer('tf-idf', vocab_corpus),
        NMF(n_components=5, init='random', max_iter=1000, random_state=0), weighted_algorithms[5] ],
    
#     [ Preprocessor(), Vectorizer('tf-idf', vocab_corpus),
#         NMF(n_components=10, init='random', max_iter=1000, random_state=0), weighted_algorithms[6] ],
    
#     [ Preprocessor(), Vectorizer('tf-idf', vocab_corpus),
#         NMF(n_components=5, init='random', max_iter=1000, random_state=0), weighted_algorithms[7] ],
    
#     [ Preprocessor(), Vectorizer('tf-idf', vocab_corpus),
#         LDA(n_components=6, n_jobs=-1, random_state=5), weighted_algorithms[6] ],
#     [ Preprocessor(), Vectorizer('tf-idf', vocab_corpus),
#         LDA(n_components=10, n_jobs=-1, random_state=5), weighted_algorithms[7] ],
]

# weights = [w[1] for w in weighted_algorithms] # [0.25,0.25,0.25,0.25]#

In [51]:
data_ensemble(pipeline=pipeline, batch_size=batch_size, vote=vote,
              n_start=n_start, repeat_count=scores_iter, weights=weights)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…


iter num: 0
roc auc: [0.8179760029225747, 0.645848345777108, 0.8271700116446332, 0.5]



  return (1 / (1 + np.exp(-np.asarray(x)))).tolist()


HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…


iter num: 1
roc auc: [0.8185984199831039, 0.6466198598077494, 0.8276376235815238, 0.5]



  return (1 / (1 + np.exp(-np.asarray(x)))).tolist()


HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…


iter num: 2
roc auc: [0.8180616252254721, 0.6461298719090349, 0.8271976391077014, 0.5]


ROC AUC: AP | MP | WAP | AMP
ROC AUC:  [0.81821202 0.64619936 0.82733509 0.5       ]



  return (1 / (1 + np.exp(-np.asarray(x)))).tolist()


array([0.81821202, 0.64619936, 0.82733509, 0.5       ])