### Bibliotecas extras utilizadas
#### nltk
Para instalar o RSLPStemmer, o punkt e as stopwords, execute "import nltk" e "nltk.download_shell()" em um interpretador Python; em seguida, escolha a opção "d" e digite "rslp". Repita o mesmo procedimento para "stopwords" e "punkt".
#### lxml
#### pandas
#### seaborn
#### matplotlib

In [None]:
from sklearn.feature_extraction.text import (TfidfTransformer,
                                             CountVectorizer)

from sklearn.metrics import accuracy_score

from sklearn.model_selection import (cross_val_score,
                                      KFold)

from sklearn.linear_model import (SGDClassifier,
                                  RidgeClassifier,
                                  Perceptron,
                                  PassiveAggressiveClassifier)

from sklearn.neighbors import (KNeighborsClassifier,
                               NearestCentroid)

from sklearn.tree import DecisionTreeClassifier

from sklearn.naive_bayes import (BernoulliNB,
                                 MultinomialNB)

from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import LinearSVC

from sklearn.pipeline import Pipeline

from nltk.stem import RSLPStemmer

from nltk.tokenize import word_tokenize

from nltk.data import load

from nltk.corpus import stopwords as stpw

from numpy import (zeros,
                   array)

from re import sub
from time import time
from lxml import etree as ET

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
def clean(dirty_file, header_lines=14):
    """
    Parse a raw news dump into a category index and a list of labelled texts.

    The first `header_lines` lines are discarded, the remainder is wrapped in
    a synthetic <items> root element and parsed with lxml in recovery mode.

    dirty_file -> readable text file object holding the raw dump
    header_lines -> number of leading lines to skip before the XML items
                    (default 14; presumably a non-XML header - confirm
                    against the data file)

    Returns a tuple (classes, lista):
    classes -> dict mapping every category name found to an integer id; ids
               follow the sorted order of the names, so they are stable
               between runs
    lista -> list of single-entry dicts {category: text}, one per item whose
             first <text> child has non-empty content
    """
    for _ in range(header_lines):
        dirty_file.readline()

    # read() keeps the file's own line breaks; joining readlines() with "\n"
    # would double every newline because each line already ends with one.
    xml_file = "<items>\n" + dirty_file.read() + "</items>\n"

    parser = ET.XMLParser(recover=True)
    tree = ET.fromstring(xml_file, parser=parser)
    lista = []
    categories = set()

    # Iterate the element directly: getchildren() is deprecated in lxml.
    for item in tree:
        category = item.get('category')
        if category is None:
            # Comments and nodes without a category cannot be labelled.
            continue
        categories.add(category)

        # Text of the first <text> child, or None when the item has none.
        text = next((child.text for child in item if child.tag == 'text'), None)
        if text:
            lista.append({category: text})

    # Sorting makes the name -> id mapping deterministic (set order is not).
    return {name: idx for idx, name in enumerate(sorted(categories))}, lista


def _preprocess_text(text, stop, stemmer, sent_tokenizer):
    """Tokenize `text`, drop punctuation and stop words, stem and re-join it."""
    punctuation = ',.-"!;:?'
    stems = []
    # Split into sentences first, then into words, so the word tokenizer
    # always works on one sentence at a time.
    for sent in sent_tokenizer.tokenize(text):
        for token in word_tokenize(sent, language='portuguese'):
            # Substring test: drops single punctuation marks as well as runs
            # such as '.-' that occur contiguously in `punctuation`.
            if token in punctuation:
                continue
            if token.lower() in stop:
                continue
            stems.append(stemmer.stem(token))

    # Raw string: "\w" and "\s" are invalid escapes in a plain literal.
    # The pattern strips every character that is not a word character,
    # whitespace or a hyphen.
    return sub(r'[^\w\s-]', '', " ".join(stems))


def transform(ldocs, stop, stemmer, sent_tokenizer):
    """
    Load, clean and label the texts contained in a list of news files.

    Every file is parsed with clean(); each text is split into sentences and
    words, punctuation tokens and stop words are removed, the remaining words
    are stemmed and joined back into a single string.

    ldocs -> python list with the paths of the files to read
    stop -> iterable with all stop words (compared against lower-cased tokens)
    stemmer -> object with a stem(word) method, like nltk's RSLPStemmer()
    sent_tokenizer -> object with a tokenize(text) method that returns the
                      sentences of a text

    Returns two numpy arrays of the same length: the processed texts and the
    integer label of each text. Label ids are assigned from the sorted
    category names of all files together, so the same category always gets
    the same id, whichever file it came from.
    """
    # Set membership is O(1); the stop list is consulted once per token.
    stop_words = frozenset(stop)

    docs_cleared = []
    label_names = []
    categories = set()

    for doc in ldocs:
        with open(doc, "r") as dirty_file:
            classes, file = clean(dirty_file)

        # Keep every category clean() reported, even those without texts.
        categories.update(classes)

        for definition in file:
            # Each entry is a single-item dict {category: text}.
            category, text = next(iter(definition.items()))
            categories.add(category)
            label_names.append(category)
            docs_cleared.append(
                _preprocess_text(text, stop_words, stemmer, sent_tokenizer))

    # One mapping for all files: per-file mappings could give the same
    # category different ids.
    class_ids = {name: idx for idx, name in enumerate(sorted(categories))}

    return array(docs_cleared), array([class_ids[name] for name in label_names])

def evaluate(docs, labels, classifier, k_fold):
    """
    Function to perform the k-fold train/test of one classifier.

    The classifier is wrapped in a pipeline (CountVectorizer ->
    TfidfTransformer -> classifier) that is fitted on the training part of
    every fold and scored, by accuracy, on the test part.

    docs -> numpy array with the texts (it is indexed with the index arrays
            produced by k_fold.split, so a plain python list does not work)
    labels -> numpy array with the label of every text in docs
    classifier -> scikit classifier
    k_fold -> a k fold object from scikit

    Returns a tuple (total, score, clf, elapsed):
    total -> number of texts in docs
    score -> mean accuracy over all folds
    clf -> the pipeline, as fitted on the last fold's training split
    elapsed -> seconds spent in the whole train/test loop
    """
    clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('classifier', classifier)])

    scores = []
    start = time()
    for train_indices, test_indices in k_fold.split(docs):
        clf.fit(docs[train_indices], labels[train_indices])
        predictions = clf.predict(docs[test_indices])
        scores.append(accuracy_score(labels[test_indices], predictions))
    end = time()

    return len(docs), sum(scores)/len(scores), clf, end - start

def present_info(total, score, pipe, t):
    """
    Print a short report about one train/test run.

    total -> total number of classified texts
    score -> score obtained in the train/test process
    pipe -> pipeline object whose "classifier" parameter is the estimator used
    t -> time spent with train/test, in seconds

    Returns a tuple with the class name of the classifier and the score.
    """
    estimator = pipe.get_params()["classifier"]
    cls = str(estimator.__class__.__name__)

    # One tuple of print() arguments per report line.
    report = (
        ("Using the classifier: ", cls),
        ('Total classified:', total),
        ('Score:', score),
        ("Time spent:", t, "s", "\n"),
    )
    for fields in report:
        print(*fields)

    return cls, score

def potting(VectorizerList):
    """
    Plot the score of every classifier.

    VectorizerList -> python list of (classifier name, score) pairs

    The scores are drawn twice on the same axes: as a dashed matplotlib line
    and as a seaborn point plot. The figure is then shown.
    """
    names = [name for name, _ in VectorizerList]
    scores = [score for _, score in VectorizerList]
    positions = array(range(len(VectorizerList)))

    sns.set(style="whitegrid", color_codes=True)
    frame = pd.DataFrame({'Classificadores': names, 'Scores': scores})

    # Plain matplotlib line, one tick per classifier.
    plt.xticks(positions, names, rotation=30)
    plt.plot(positions, scores, marker='o', linestyle='--')
    plt.grid(True)

    # Seaborn point plot over the same data, with its own tick labels.
    axes = sns.pointplot(palette="Set2", dodge=True, markers=["x"], x='Classificadores', y='Scores', data=frame)
    axes.set_xticklabels(frame['Classificadores'], rotation=35)

    plt.show()

In [None]:
# Input files and the nltk resources used to preprocess them.
ldocs = ['news_data.xml']
VectorizerList = []
stemmer = RSLPStemmer()
stopwords = stpw.words('portuguese')
tokenizer = load('tokenizers/punkt/portuguese.pickle')

# Classifiers to compare. `n_iter` was removed from the linear models in
# scikit-learn 0.21; `max_iter` together with `tol=None` (no early stopping)
# runs the same fixed number of passes over the data.
algorithms = {
    'LinearSVC': LinearSVC(),
    'MultinomialNB': MultinomialNB(alpha=0.01),
    'SGDClassifier': SGDClassifier(max_iter=500, tol=None),
    'RidgeClassifier': RidgeClassifier(tol=1e-2, solver="sag"),
    'Perceptron': Perceptron(max_iter=300, tol=None),
    'PassiveAggressiveClassifier': PassiveAggressiveClassifier(max_iter=300, tol=None),
    'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=50),
    'RandomForestClassifier': RandomForestClassifier(n_estimators=30),
    'BernoulliNB': BernoulliNB(alpha=.01),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=0),
    'NearestCentroid': NearestCentroid(),
}

# The same 10-fold splitter is used for every classifier; it shuffles
# without a fixed seed, so the folds differ between calls.
k_fold = KFold(n_splits=10, shuffle=True)

docs, labels = transform(ldocs, stopwords, stemmer, tokenizer)

# Evaluate every classifier, keeping its (name, score) pair for plotting.
i = time()
for clf in algorithms.values():
    VectorizerList.append(present_info(*evaluate(docs, labels, clf, k_fold)))
f = time()
print("Classification took {} second(s)".format(f-i))

## Abaixo temos a comparação entre os classificadores

In [None]:
# Plot the scores collected in VectorizerList, one point per classifier.
potting(VectorizerList)