# Normalizador de texto

In [11]:
import nltk
import re
import unicodedata
from bs4 import BeautifulSoup
from nltk.tokenize.toktok import ToktokTokenizer

tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')


#Limpar tags html do texto se tiver
def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text()
    return stripped_text

#Remover palavraas com acento
def remove_accent(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

#Remover caracter especial
def remove_special_char(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

#Remove stopwords e aplica steeming
def remove_stopwords_stemming(text):
    tokens = tokenizer.tokenize(text)
    filtered_text = ' '.join([token for token in tokens if token.lower() not in stopword_list])
    return filtered_text


def normalizator(text, remove_html=True, remove_sw=True):

    text = text.lower()
    
    if(remove_html==True):
        #remove html
        text = strip_html_tags(text)

    if (remove_sw==True):
        # remove 'stopword'
        text = remove_stopwords_stemming(text)

    #remove acento
    text = remove_accent(text)

    #remove new line extra
    text = re.sub(r'[\r|\n|\r\n]+', ' ', text)

    #inserir espaco entre caracter especial
    special_char_pattern = re.compile(r'[\}\}\\\(\)\./!-]')
    text = special_char_pattern.sub(" ", text)

    #remove caracter especial
    text = remove_special_char(text)

    # remove extra whitespace
    text = re.sub(' +', ' ', text)

    return text



In [14]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
import numpy as np

In [15]:
# Carrega a data do 20 newsgroups
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)
twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [16]:
# Vetoriza a os dados
vectorize = TfidfVectorizer()
X_train = vectorize.fit_transform(twenty_train.data)
X_train.shape

(11314, 130107)

In [17]:
# treinando o NB
# clf = MultinomialNB().fit(X_train, twenty_train.target)
text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...rue,
        vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [18]:
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)

In [19]:
# Dado de treino
twenty_test.data[0].split('\n')[:3]

['From: v064mb9k@ubvmsd.cc.buffalo.edu (NEIL B. GANDLER)',
 'Subject: Need info on 88-89 Bonneville',
 'Organization: University at Buffalo']

In [20]:
# Dado de testo
twenty_train.data[0].split('\n')[:3]

["From: lerxst@wam.umd.edu (where's my thing)",
 'Subject: WHAT car is this!?',
 'Nntp-Posting-Host: rac3.wam.umd.edu']

In [21]:
# Predict com native bayners
predict = text_clf.predict(twenty_test.data)
np.mean(predict == twenty_test.target)

0.7738980350504514

In [22]:
clf = MultinomialNB().fit(X_train, twenty_train.target)

In [23]:
clf.fit(X_train, twenty_train.target)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [24]:
clf.predict(vectorize.transform(twenty_test.data))

array([ 7, 11,  0, ...,  9,  3, 15])

In [27]:
from sklearn.linear_model import SGDClassifier
import numpy as np
clf = SGDClassifier().fit(X_train, twenty_train.target)



In [28]:
clf

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [29]:
np.mean(clf.predict(vectorize.transform(twenty_test.data)) == twenty_test.target)

0.852761550716941

In [41]:
from xgboost import XGBClassifier

In [38]:
dtrain = xgb.DMatrix(X_train, label=twenty_train.target)
dtest = xgb.DMatrix(vectorize.transform(twenty_test.data))

In [43]:
clf = XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.1)

In [44]:
clf

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=300,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [45]:
clf.fit(X_train, twenty_train.target)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=300,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [48]:
preds = clf.predict(vectorize.transform(twenty_test.data))

  if diff:


In [49]:
preds

array([ 4, 12,  0, ...,  9, 12, 15])

In [50]:
np.mean(preds == twenty_test.target)

0.7753584705257568