In [None]:
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score, average_precision_score

## Carregando os Dados

In [None]:
# Load the books dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='livros.csv')

## Dividindo as variáveis nos conjuntos de teste e treino

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Labels, taken as an independent copy of the 'y' column.
y = df['y'].copy()
# Text feature: the 'description' column (vectorized with TF-IDF further down).
description = df['description']

In [None]:
# Split descriptions and labels together; the fixed seed makes the split repeatable.
# NOTE(review): test_size=0.6 sends 60% of the rows to the test set and leaves
# only 40% for training — confirm this ratio is intended.
X_train, X_test, y_train, y_test = train_test_split(
    description,
    y,
    test_size=0.6,
    random_state=42,
)

## Aplicando o Tfidf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

In [None]:
# Portuguese stop-word list from NLTK. On a fresh environment the 'stopwords'
# corpus is not installed and the lookup raises LookupError, so download it once
# and retry; when the corpus is already present nothing changes.
try:
    stopwords = nltk.corpus.stopwords.words('portuguese')
except LookupError:
    nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('portuguese')

# TF-IDF over unigrams, bigrams and trigrams; min_df=1 keeps terms that occur
# in a single document.
vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 3), stop_words=stopwords)

# Fit the vocabulary/IDF weights on the training texts only, then apply the
# same fitted transformation to the test texts.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [None]:
# Show (rows, n_features) for the train and test TF-IDF matrices.
tuple(matrix.shape for matrix in (X_train_vectorized, X_test_vectorized))

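Exemplos de como o parâmetro `ngram_range` divide o texto "intro to machine learning":  
intro to machine learning -> intro, to, machine, learning  -> ngram_range=(1,1)  
intro to machine learning -> intro, to, machine, learning, intro to, to machine, machine learning -> ngram_range=(1,2)  
intro to machine learning -> intro to, to machine, machine learning -> ngram_range=(2,2)  
intro to machine learning -> intro, to, machine, learning, intro to, to machine, machine learning, intro to machine, to machine learning -> ngram_range=(1,3) (valor usado no vetorizador acima)  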


# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random Forest with 1000 trees, trained on the TF-IDF features using 6 parallel
# jobs; the fixed random_state keeps the fitted forest repeatable.
rfc = RandomForestClassifier(
    n_estimators=1000,
    min_samples_leaf=1,
    n_jobs=6,
    random_state=42,
)
rfc.fit(X_train_vectorized, y_train)

In [None]:
# predict_proba returns one column per class; keep column 1, the probability of
# rfc.classes_[1] (presumably the positive label — confirm against the data).
rfc_proba = rfc.predict_proba(X_test_vectorized)
p = rfc_proba[:, 1]

In [None]:
# Held-out scores for the current probabilities in p: (average precision, ROC AUC).
(
    average_precision_score(y_test, p),
    roc_auc_score(y_test, p),
)

# Logistic Regression

In [None]:
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import LogisticRegression

In [None]:
scaler = MaxAbsScaler()

In [None]:
X_train_scaled = scaler.fit_transform(X_train_vectorized)
X_test_scaled = scaler.transform(X_train_vectorized)

In [None]:
lr = LogisticRegression(C=0.5,n_jobs=6, random_state=0)
lr.fit(X_train_scaled, y_train)

In [None]:
p = lr.predict_proba(X_test_vectorized)[:, 1]

In [None]:
# Held-out scores for the current probabilities in p: (average precision, ROC AUC).
(
    average_precision_score(y_test, p),
    roc_auc_score(y_test, p),
)

## Testando na Prática

In [None]:
# Interactive demo: score a user-typed synopsis with the fitted Random Forest.
print('Insira um resumo de um livro de aventura')
text = [input()]
# Reuse the already-fitted vectorizer (transform only) so the features line up
# with the ones the model was trained on.
text_vectorized = vectorizer.transform(text)

# Probability of the second class (rfc.classes_[1]) for the single input row,
# expressed as a percentage.
interest_pct = rfc.predict_proba(text_vectorized)[0, 1] * 100
# One decimal place avoids float noise such as 37.199999999999996 in the output;
# the message also fixes the misspelled "interesante" and the missing "chance".
print('A turma que fez esse algoritmo tem {:.1f}% de chance de achar o livro interessante'.format(interest_pct))