In [86]:
import pandas as pd
import numpy as np



from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler,StandardScaler
from scipy.sparse import hstack,csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer 
from yellowbrick.classifier import ConfusionMatrix
from sklearn.metrics import average_precision_score , roc_auc_score

In [87]:
df = pd.read_csv('raw_data_wlabels.csv').drop('Unnamed: 0',axis=1).dropna()

y = df['y']
xdata = df.drop('y',axis=1)

data_train,data_val,ytrain,yval = train_test_split(xdata,
                                                    y,
                                                    test_size = 0.5,
                                                    random_state=0)

In [88]:
# Vetorizando as features de texto
title_train = data_train['titles']
title_val = data_val['titles']

title_vec = TfidfVectorizer(min_df = 2,ngram_range=(1,1))

title_vec3 = TfidfVectorizer(min_df = 4,ngram_range=(1,5))

title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

title_bow_train3 = title_vec3.fit_transform(title_train)
title_bow_val3 = title_vec3.transform(title_val)


autor_train = data_train['author']
autor_val = data_val['author']

autor_vec = TfidfVectorizer(min_df = 2,ngram_range=(1,1))
autor_vec = TfidfVectorizer(min_df = 4,ngram_range=(1,5))

autor_bow_train = autor_vec.fit_transform(autor_train)
autor_bow_val = autor_vec.transform(autor_val)

autor_bow_train3 = autor_vec3.fit_transform(autor_train)
autor_bow_val3 = autor_vec3.transform(autor_val)

In [89]:
mask_train = data_train.drop(['titles','author'],axis=1)
mask_val = data_val.drop(['titles','author'],axis=1)

xtrain_wvec = hstack([title_bow_train,autor_bow_train,mask_train])
xval_wvec = hstack([title_bow_val,autor_bow_val,mask_val])

xtrain_wvec3 = hstack([title_bow_train3,autor_bow_train3,mask_train])
xval_wvec3 = hstack([title_bow_val3,autor_bow_val3,mask_val])


967      66.22
833     159.00
109      30.00
379      43.92
171      48.90
         ...  
1035     39.90
765      72.80
837      41.25
559      62.95
686     249.90
Name: price, Length: 569, dtype: float64

# Regressão Logistica

In [90]:
xtrain_wvec2 = csr_matrix(xtrain_wvec)
xval_wvec2 = csr_matrix(xval_wvec)

In [91]:
from sklearn.linear_model import LogisticRegression
lr = make_pipeline(MaxAbsScaler(),LogisticRegression(C=30,n_jobs=6,random_state=0))
lr.fit(xtrain_wvec2,ytrain)

Pipeline(steps=[('maxabsscaler', MaxAbsScaler()),
                ('logisticregression',
                 LogisticRegression(C=30, n_jobs=6, random_state=0))])

In [92]:
p_lr = lr.predict_proba(xval_wvec2)[:,1]
print(f'AVP: {average_precision_score(yval,p_lr)}')    
print(f'ROC: {roc_auc_score(yval,p_lr)}')

AVP: 0.524067608236109
ROC: 0.7158684241918167


# Random Forest


In [93]:
from sklearn.ensemble import RandomForestClassifier
rdf = RandomForestClassifier(n_estimators = 1000,random_state=0,min_samples_leaf=3,class_weight = 'balanced', n_jobs=6)
rdf = rdf.fit(csr_matrix(xtrain_wvec),ytrain)

In [94]:
p_rd = rdf.predict_proba(csr_matrix(xval_wvec))[:,1]
print(f'AVP: {average_precision_score(yval,p_rd)}')    
print(f'ROC: {roc_auc_score(yval,p_rd)}')

AVP: 0.48127168867962844
ROC: 0.6562239055230824


# Árvore de Decisão

In [95]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state = 0,max_depth = 4,class_weight = "balanced")
dt.fit(csr_matrix(xtrain_wvec),ytrain)

DecisionTreeClassifier(class_weight='balanced', max_depth=4, random_state=0)

In [96]:
p_dt = dt.predict_proba(csr_matrix(xval_wvec))[:,1]
print(f'AVP: {average_precision_score(yval,p_dt)}')    
print(f'ROC: {roc_auc_score(yval,p_dt)}')

AVP: 0.3203137553091555
ROC: 0.563357389955863


# LGBM

In [97]:
best = [0.0017606961005459585,10,2,0.7976031342753723,0.23056440691282926,906,4,5]

In [98]:
from lightgbm import LGBMClassifier
LGBM =  LGBMClassifier(learning_rate=best[0], num_leaves= 2 ** best[1], max_depth=best[1],
                         min_child_samples=best[2], subsample=best[3],
                         colsample_bytree=best[4], bagging_freq = 1, n_estimators=best[5], 
                         random_state = 0, class_weight ='balanced',n_jobs=6)
LGBM.fit(csr_matrix(xtrain_wvec3),ytrain)



LGBMClassifier(bagging_freq=1, class_weight='balanced',
               colsample_bytree=0.23056440691282926,
               learning_rate=0.0017606961005459585, max_depth=10,
               min_child_samples=2, n_estimators=906, n_jobs=6, num_leaves=1024,
               random_state=0, subsample=0.7976031342753723)

In [99]:
p_lgb = LGBM.predict_proba(csr_matrix(xval_wvec3))[:,1]

In [104]:
p_lgb = LGBM.predict_proba(csr_matrix(xval_wvec3))[:,1]
print(f'AVP: {average_precision_score(yval,p_lgb)}')    
print(f'ROC: {roc_auc_score(yval,p_lgb)}')

AVP: 0.5296939830564515
ROC: 0.6951121316950972


# Montando o Ensamble

In [105]:
pd.DataFrame({'LR': p_lr,'RDF':p_rd,'DCT':p_dt,'LGBM':p_lgb}).corr()

Unnamed: 0,LR,RDF,DCT,LGBM
LR,1.0,0.569184,0.279887,0.56467
RDF,0.569184,1.0,0.630142,0.872227
DCT,0.279887,0.630142,1.0,0.434967
LGBM,0.56467,0.872227,0.434967,1.0


In [106]:
p = (0.25*p_rd + 0.25*p_lr + 0.5*p_lgb)

In [107]:
print(f'AVG : {average_precision_score(yval,p)} \nROC : {roc_auc_score(yval,p)}')

AVG : 0.5512737816445318 
ROC : 0.7172849815101994


# Salvando os Modelos

In [108]:
import joblib as jb
jb.dump(lr,'logistic_reg_2021_05_20.pkl')
jb.dump(rdf,'random_for_2021_05_20.pkl')
jb.dump(LGBM,'LGBM_2021_05_20.pkl')

jb.dump(title_vec,'title_vectorizer_2021_05_20.pkl')
jb.dump(title_vec3,'title_vectorizer3_2021_05_20.pkl')
jb.dump(autor_vec,'autor_vectorizer_2021_05_20.pkl')



['autor_vectorizer_2021_05_20.pkl']