# Construindo um Classificador de Textos

In [1]:
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler

In [None]:
def build_document_vector(text):
    """
    Build a vector for the document: the mean of the vectors of its distinct words.

    Relies on three names that are not parameters and must exist at module
    level when the function is called: ``model`` (indexed by word to obtain
    its vector, and exposing ``syn0``), ``bigram`` (indexed with each phrase)
    and ``_get_phrases`` (called on ``text`` to obtain the phrases).

    :param text: document to be vectorized (tokenized)
    :return: array of shape (1, feature_count); all zeros when none of the
        document's words could be looked up in ``model``
    """
    feature_count = model.syn0.shape[1]
    vec = np.zeros((1, feature_count))
    seen = set()
    frases = [bigram[f] for f in _get_phrases(text)]

    for f in frases:
        for word in f:
            # Each distinct word contributes only once to the mean.
            if word in seen:
                continue
            try:
                word_vec = model[word]
            except KeyError:
                # Words the model cannot look up are skipped.
                continue
            vec += word_vec.reshape((1, feature_count))
            seen.add(word)

    # Divide exactly once, after every phrase has been accumulated; dividing
    # inside the phrase loop would rescale the partial sum again and again.
    if seen:
        vec /= len(seen)
    return vec

In [2]:
def stream_docvec(fname, chunk):
    """
    Lazily read a CSV file piece by piece.

    :param fname: source handed unchanged to ``pandas.read_csv``
    :param chunk: number of rows per piece (passed as ``chunksize``)
    :return: generator yielding one DataFrame per piece, in file order
    """
    yield from pd.read_csv(fname, chunksize=chunk)

In [3]:
# Two tree ensembles sharing the same size, leaf and parallelism settings;
# only the random forest overrides the split criterion.
_forest_params = dict(
    n_estimators=400,
    n_jobs=-1,
    min_samples_leaf=3,
    warm_start=True,
    verbose=0,
)
rfclf = RandomForestClassifier(criterion='entropy', **_forest_params)
etclf = ExtraTreesClassifier(**_forest_params)

In [4]:
# Soft-voting ensemble over the two forests; the random forest is given
# twice the weight of the extra-trees classifier.
vcclf = VotingClassifier(
    estimators=[('rf', rfclf), ('et', etclf)],
    voting='soft',
    weights=[2, 1],
)

In [None]:
# Train and evaluate the classifiers over the document-vector CSV, one chunk
# at a time. Relies on names that are not defined in this cell: batchsize,
# get_Y, get_original_class, sgclf, print_class_report, plot_learning and
# plot_roc.
docstreamer = stream_docvec('docvs.csv', batchsize)
scaler = StandardScaler()

# Accuracy per classifier: one entry per fold of every chunk.
acc_hist = defaultdict(list)
for n, df in enumerate(docstreamer):
    # Any failure while deriving the labels propagates and aborts the run.
    Y = get_Y(get_original_class(df['ids']))
    X = df.values[:, :-2]  # remove ids and labels columns at the end

    # Standardize once per chunk. Re-fitting inside the fold loop would fit
    # the scaler on already-standardized data from the second fold onwards.
    # NOTE(review): the scaler still sees the held-out rows of each fold and
    # is re-fit from scratch on every chunk — confirm that is acceptable.
    X = scaler.fit_transform(X)

    # Per classifier: list of (predicted probabilities, true labels) per fold.
    probas = defaultdict(list)
    skf = StratifiedKFold(2, shuffle=True)
    print(df.ids.head(),Y)
    for train_index, test_index in skf.split(X, Y):
        X_train, Y_train = X[train_index], Y[train_index]
        X_test, Y_test = X[test_index], Y[test_index]

        print("==> Fitting:")
        print("SGDC")
        sgclf.partial_fit(X_train, Y_train, classes=np.array([0, 1]))
        probas['SGDC'].append((sgclf.predict_proba(X_test), Y_test))
        print("Random Forest")
        # NOTE(review): rfclf is created with warm_start=True; scikit-learn
        # documents that re-fitting without raising n_estimators adds no new
        # trees — confirm repeated fit calls here do what is intended.
        rfclf.fit(X_train, Y_train)
        probas['RF'].append((rfclf.predict_proba(X_test), Y_test))
        print("Voting")
        vcclf.fit(X_train, Y_train)
        # Stored as (probabilities, labels) like the other two classifiers.
        probas['Voting'].append((vcclf.predict_proba(X_test), Y_test))

        print("==> Scoring:")
        # NOTE(review): cross_val_score clones the estimator and refits it on
        # splits of the held-out rows, so these two figures do not measure the
        # sgclf/rfclf instances fitted above — confirm this is intended.
        acc_hist['SGD'].append(cross_val_score(sgclf, X_test, Y_test, cv=2, n_jobs=-1).mean())
        acc_hist['RF'].append(cross_val_score(rfclf, X_test, Y_test, cv=2, n_jobs=-1).mean())
        acc_hist['Voting'].append(vcclf.score(X_test, Y_test))
        print_class_report(X_test, Y_test, sgclf, 'SGDC')
        print_class_report(X_test, Y_test, rfclf, 'RF')
        print_class_report(X_test, Y_test, vcclf, 'Voting')

        plot_learning(acc_hist)
    plot_roc(probas)

    print('trained {} documents.'.format((n+1)*batchsize))
df_acc = pd.DataFrame(acc_hist)