In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
import nltk
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [2]:
# Load the dataset from CSV; the first CSV column is used as the DataFrame index.
df = pd.read_csv('../Data/final.csv', index_col=0)

In [3]:
# Shuffle all rows (frac=1). The fixed seed makes the shuffled order repeatable
# across kernel restarts, which matters because later cells slice the data by
# position to build train/validation/test partitions.
df = df.sample(frac=1, random_state=42)

In [4]:
# Build the model inputs and targets as plain Python lists, selecting columns
# by position: column 0 holds the documents, column 1 the labels.
# `.iloc` makes the positional access explicit; integer keys on a label-indexed
# row (`row[0]`, `row[1]`) rely on a positional fallback that pandas has
# deprecated, and the column-wise form also avoids a Python-level loop over
# `iterrows()`.
# NOTE(review): assumes the CSV column labels are not the integers 0/1, i.e.
# the original integer keys were positional -- confirm against the data file.
X = df.iloc[:, 0].tolist()
y = df.iloc[:, 1].tolist()

In [5]:
# Shared resources, built once when this cell runs rather than on every call:
# `tokenize` is handed to the vectorizer as its tokenizer, so it is invoked
# repeatedly and per-call construction would be repeated work.
# A frozenset gives constant-time stop-word membership tests.
_PT_STOPWORDS = frozenset(stopwords.words('portuguese'))
_PT_STEMMER = RSLPStemmer()
# Translation table that deletes ',', '?', '!' and '.' in a single pass.
_PUNCT_TABLE = str.maketrans('', '', ',?!.')


def tokenize(text):
    """Convert a raw document into a list of stemmed tokens.

    Processing order:
      1. newlines are replaced by spaces;
      2. one pass of double-space -> single-space replacement;
      3. lowercasing;
      4. the characters ``, ? ! .`` are removed;
      5. NLTK ``word_tokenize`` splits the text into tokens;
      6. tokens found in NLTK's Portuguese stop-word list are dropped;
      7. remaining tokens are stemmed with the RSLP stemmer.

    Parameters
    ----------
    text : str
        The document to tokenize.

    Returns
    -------
    list of str
        Stemmed tokens, in their original order.
    """
    text = text.replace('\n', ' ')
    text = text.replace('  ', ' ')
    text = text.lower()
    text = text.translate(_PUNCT_TABLE)
    tokens = nltk.word_tokenize(text)
    return [
        _PT_STEMMER.stem(token)
        for token in tokens
        if token not in _PT_STOPWORDS
    ]

In [6]:
# TF-IDF vectorizer that uses the custom `tokenize` function and sublinear
# term-frequency scaling.
# token_pattern=None: the default token regex is not used once a tokenizer is
# supplied; passing None states that explicitly and avoids scikit-learn's
# "token_pattern will not be used" warning.
countVec = TfidfVectorizer(tokenizer=tokenize, sublinear_tf=True, token_pattern=None)

In [7]:
# Learn the vocabulary and IDF weights from every document in X.
# NOTE(review): this fit runs before the train/test split performed further
# down, so documents that end up in the held-out sets also contribute to the
# vocabulary/IDF statistics -- confirm this is intended.
countVec.fit(X)

TfidfVectorizer(sublinear_tf=True,
                tokenizer=<function tokenize at 0x000001E20EF859D0>)

In [8]:
# Persist the fitted vectorizer to disk. The context manager guarantees the
# file handle is flushed and closed even if pickling raises.
# The tokenizer is a plain function, which pickle stores by reference, so
# `tokenize` must be defined wherever this file is later loaded.
with open('./TfidfVectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(countVec, vectorizer_file)

In [9]:
# Replace the raw documents in X with their TF-IDF representation.
# NOTE(review): X is rebound to the transformed value, so re-running this cell
# on its own would pass the already-transformed data back into transform();
# re-run from the cell that builds X instead.
X = countVec.transform(X)

In [10]:
# Hold out 20% of the rows for testing. The fixed seed makes the partition
# repeatable for a given row order, so the scores computed from it can be
# reproduced instead of changing on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [11]:
# Logistic regression with C=5 and an iteration cap of 500, trained on the
# training partition. The fit call is the last expression so the fitted
# estimator is displayed as the cell output.
lr = LogisticRegression(
    C=5,
    max_iter=500,
)
lr.fit(X_train, y_train)

LogisticRegression(C=5, max_iter=500)

In [12]:
# Display the full hyper-parameter dictionary of the fitted `lr` model.
lr.get_params()

{'C': 5,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 500,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [13]:
# Score `lr` on the held-out test partition and report its accuracy.
y_pred = lr.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("acurácia da regressão lógica: %s" % accuracy)

acurácia da regressão lógica: 0.8708661417322835


In [14]:
# Mean cross-validated accuracy of `lr` over the whole (X, y) set, expressed
# as a percentage.
cross_val_score(lr, X, y, scoring='accuracy').mean() * 100

86.64144663305098

In [15]:
def evaluation_preds(y_true, y_preds):
    """Print and return the accuracy of a set of predictions.

    Parameters
    ----------
    y_true : sequence
        Reference labels.
    y_preds : sequence
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    The value produced by ``accuracy_score(y_true, y_preds)``.
    """
    score = accuracy_score(y_true, y_preds)
    print('acurácia: %s' % score)
    return score

In [16]:
# Position-based 70% / 15% / 15% partition boundaries, derived from the number
# of rows in `df`.
train_split = round(len(df) * 0.7)
valid_split = round(train_split + len(df) * 0.15)

# Training partition: everything before the first boundary.
X_train = X[:train_split]
y_train = y[:train_split]

# Validation partition: rows between the two boundaries.
X_valid = X[train_split:valid_split]
y_valid = y[train_split:valid_split]

# Test partition: everything from the second boundary onward.
X_test = X[valid_split:]
y_test = y[valid_split:]

In [17]:
# Baseline logistic regression with all default hyper-parameters, trained on
# the training partition. The fit call is the last expression so the fitted
# estimator is displayed as the cell output.
logReg = LogisticRegression()
logReg.fit(X=X_train, y=y_train)

LogisticRegression()

In [18]:
# Display the full hyper-parameter dictionary of the default `logReg` model.
logReg.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [19]:
# Predict labels for the validation partition with the default model.
# Note: this rebinds `y_pred`, which previously held the test-set predictions of `lr`.
y_pred = logReg.predict(X_valid)

In [20]:
# Print the validation accuracy; the returned value is also shown as the cell output.
evaluation_preds(y_valid, y_pred)

acurácia: 0.8508403361344538


0.8508403361344538