### imports

In [1]:
import os
import re

import pandas as pd
import numpy as np

labels_arq = './base_treino.csv'
data_dir = './base_treino/'

### lendo labels

In [2]:
labels = pd.read_csv(labels_arq, sep=';', header=None)
labels.columns = ['id', 'url', '_label_']
labels.set_index('id', inplace = True)

In [3]:
# total de labels
len(labels)

101674

In [4]:
labels.sample()

Unnamed: 0_level_0,url,_label_
id,Unnamed: 1_level_1,Unnamed: 2_level_1
74019,ecosense.com.br,Food and Nutrition


In [5]:
Dominios_Ecommerce = './Dominios_Ecommerce.csv'
labels_ecommerce = pd.read_csv(Dominios_Ecommerce
                               , error_bad_lines = False
                               , sep=';'
                               , encoding='latin-1'
                               , header = 0)

labels_ecommerce.rename(columns= {'Dominio': 'url'}, inplace= True)
labels_ecommerce.rename(columns= {'flag_ecommerce': '_label_ecommerce'}, inplace= True)

In [6]:
# total de labels
len(labels_ecommerce)

175287

In [7]:
labels_ecommerce.sample()

Unnamed: 0,url,_label_ecommerce,ecommerce,nota
156785,revisandotextos.com.br,0,No,51.0


In [8]:
# transforma os labels para int
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
labels['_label_'] = le.fit_transform(labels._label_)

In [9]:
le.classes_

array(['Adult', 'Art - Entertainment ', 'Beauty and Fashion - Health',
       'Electronics',
       'Finance - IT-services and Telecom - Multimedia - Work - Government and Society ',
       'Food and Nutrition', 'Home and Garden - Nature',
       'Leisure - Sports', 'Science and Education', 'Travel and Transport'], dtype=object)

In [10]:
labels_ecommerce = pd.merge(labels_ecommerce, labels, how='inner', on=['url'])

In [11]:
# total de labels
len(labels_ecommerce)

101674

In [12]:
labels_ecommerce.head()

Unnamed: 0,url,_label_ecommerce,ecommerce,nota,_label_
0,mygconsultoria.com.br,0,No,25.0,4
1,m4web.com.br,0,No,20.0,4
2,ccsciadalimpeza.com.br,0,No,17.0,4
3,centroirai.com.br,0,No,22.0,7
4,gugli.com.br,0,No,20.0,4


### lendo homepages

In [13]:
# quantidade de homepages
len(os.listdir(data_dir))

101677

In [14]:
os.chdir(data_dir)
lista_arquivos = os.listdir('.')

In [15]:
# carregando as homes 

df_features = []
for idx, home in enumerate(lista_arquivos):
    with open(home, 'rb') as fp:
        df_features.append((home, fp.read()))
    if not idx % 1000: print(idx, 'homes lidas \r', end="\r", flush=True)
else:
    print('FIM:', idx, 'homes lidas', end="\r", flush=True)

df_features = pd.DataFrame(df_features, )
df_features.columns = ['url', 'conteudo']

df_features = pd.merge(df_features, labels_ecommerce)
#df_features.drop(['url', 'ecommerce', 'nota', '_label_'], axis = 1, inplace = True)
df_features.drop(['ecommerce', 'nota', '_label_'], axis = 1, inplace = True)



In [16]:
df_features.rename(columns= {'_label_ecommerce': '_label_'}, inplace= True)

In [17]:
df_features.sample()

Unnamed: 0,url,conteudo,_label_
16587,cardiocurso.com.br,b'cardiocurso.com.br\n\n<!DOCTYPE HTML PUBLIC ...,0


### Filtrar sites não encontrados

In [18]:
#Filtrar html não achados
init_size = len(df_features['_label_'])
print('Num. total sites: '+str(init_size))
df_features = df_features[df_features.apply(lambda x: (not 'Internal Server Error' in x.conteudo.decode('utf-8')) & (not '404' in x.conteudo.decode('utf-8')), axis = 1)]
print('Num. sites não achados: '+str(init_size-len(df_features['_label_'])))

Num. total sites: 101674
Num. sites não achados: 15903


In [19]:
# para debug, e
# testar rapidamente diversos algoritmos
df_features = df_features.sample(frac = 0.2)

In [20]:
df_features.sample()

Unnamed: 0,url,conteudo,_label_
47812,imobiliariaorleans.com.br,b'imobiliariaorleans.com.br\n\n<!DOCTYPE html>...,0


### Limpando dataset

In [21]:
RE_D = re.compile('\d')
def nao_tem_numero(string):
    return not bool(RE_D.search(string))

In [22]:
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('portuguese')

In [23]:
# limpeza simples de tags html, etc...
# stemming
from bs4 import BeautifulSoup

#exclusao = set('!#$%&*+,-./:;<=>?@[\]^_`{|}~\n\t.1234567890')

def limpeza_numeros(txt):
    exclusao = set('1234567890')
    lista_retorno = [value for value in  txt.split(' ') if (len(set(value) & exclusao) is 0)]
    return ' '.join(lista_retorno)
    
def limpeza_beatiful_soup(html):
    soup = BeautifulSoup(html, 'html.parser')

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()    # rip it out

    # get text
    text = soup.get_text()

    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)
    # drop numbers, '.', left '\n' and left '\t'
    text = limpeza_numeros(text.replace('\n',' ').replace('\t', ' ').replace('.','').replace('-','').replace('/',''))
    #Stem algorithim
    #text = ' '.join([stemmer.stem(t) for t in text.split(' ')])
    
    return text.lower()

In [24]:
%%time
df_features['conteudo'] = df_features.conteudo.apply(lambda x: limpeza_beatiful_soup(x))

Wall time: 5min 49s


In [25]:
# ver amostra e checar oportunidades de melhorar a limpeza
df_features.conteudo.sample().values[0]

'justintimecargocombr just in time: logística login senha home empresa serviços notícias links downloads cotação vídeos trabalhe conosco contato notícias china passa ser a principal importadora da bolivia importações de minério da china devem subir embarques de carne de frango recuam em atendimento online índices econômicos fale com o presidente © copyright just in time'

### Extraindo as features do texto

In [26]:
stop_words_list = ['de', 'a', 'o', 'que', 'e', 'do', 'da', 'em', 'Em', 'um', 'para', 'é', 'com', 
                   'não', 'uma', 'os', 'no', 'se', 'na', 'por', 'mais', 'as', 'dos', 'como', 'mas', 'foi', 
                   'ao', 'ele', 'das', 'tem', 'à', 'seu', 'sua', 'ou', 'ser', 'quando', 'muito', 'há', 
                   'nos', 'já', 'está', 'eu', 'também', 'só', 'pelo', 'pela', 'até', 'isso', 'ela', 'entre', 
                   'era', 'depois', 'sem', 'mesmo', 'aos', 'ter', 'seus', 'quem', 'nas', 'me', 'esse', 'eles', 
                   'estão', 'você', 'tinha', 'foram', 'essa', 'num', 'nem', 'suas', 'meu', 'às', 'minha', 
                   'têm', 'numa', 'pelos', 'elas', 'havia', 'seja', 'qual', 'será', 'nós', 'tenho', 'lhe', 
                   'deles', 'essas', 'esses', 'pelas', 'este', 'fosse', 'dele', 'tu', 'te', 'vocês', 'vos', 
                   'lhes', 'meus', 'minhas', 'teu', 'tua', 'teus', 'tuas', 'nosso', 'nossa', 'nossos', 
                   'nossas', 'dela', 'delas', 'esta', 'estes', 'estas', 'aquele', 'aquela', 'aqueles', 
                   'aquelas', 'isto', 'aquilo', 'estou', 'está', 'estamos', 'estão', 'estive', 'esteve', 
                   'estivemos', 'estiveram', 'estava', 'estávamos', 'estavam', 'estivera', 'estivéramos', 
                   'esteja', 'estejamos', 'estejam', 'estivesse', 'estivéssemos', 'estivessemos', 'estivessem', 'estiver', 
                   'estivermos', 'estiverem', 'hei', 'há', 'havemos', 'hão', 'houve', 'houvemos', 'houveram', 
                   'houvera', 'houvéramos', 'haja', 'hajamos', 'hajam', 'houvesse', 'houvéssemos', 'houvessem', 
                   'houver', 'houvermos', 'houverem', 'houverei', 'houverá', 'houveremos', 'houverão', 
                   'houveria', 'houveríamos', 'houveriam', 'sou', 'somos', 'são', 'era', 'éramos', 'eram', 
                   'fui', 'foi', 'fomos', 'foram', 'fora', 'fôramos', 'seja', 'sejamos', 'sejam', 'fosse', 
                   'fôssemos', 'fossem', 'for', 'formos', 'forem', 'serei', 'será', 'seremos', 'serão', 'seria', 
                   'seríamos', 'seriam', 'tenho', 'tem', 'temos', 'tém', 'tinha', 'tínhamos', 'tinham', 'tive', 
                   'teve', 'tivemos', 'tiveram', 'tivera', 'tivéramos', 'tenha', 'tenhamos', 'tenham', 'tivesse', 
                   'tivéssemos', 'tivessem', 'tiver', 'tivermos', 'tiverem', 'terei', 'terá', 'teremos', 'terão', 
                   'teria', 'teríamos', 'teriam', 'a', 'about', 'above', 'after', 'again', 'against', 'all', 
                   'am', 'an', 'and', 'any', 'are', 'arent', 'as', 'at', 'be', 'because', 'been', 'before', 
                   'being', 'below', 'between', 'both', 'but', 'by', 'cant', 'cannot', 'could', 'couldnt', 
                   'did', 'didnt', 'do', 'does', 'doesnt', 'doing', 'dont', 'down', 'during', 'each', 'few', 
                   'for', 'from', 'further', 'had', 'hadnt', 'has', 'hasnt', 'have', 'havent', 'having', 'he', 
                   'hed', 'hes', 'her', 'here', 'heres', 'hers', 'herself', 'him', 'himself', 'his', 'how', 
                   'hows', 'i', 'id', 'im', 'ive', 'if', 'in', 'into', 'is', 'isnt', 'it', 'its', 'its', 
                   'itself', 'lets', 'me', 'more', 'most', 'mustnt', 'my', 'myself', 'no', 'nor', 'not', 'of', 
                   'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 
                   'own', 'same', 'shant', 'she', 'shed', 'shell', 'shes', 'should', 'shouldnt', 'so', 'some', 
                   'such', 'than', 'that', 'thats', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 
                   'theres', 'these', 'they', 'theyd', 'theyll', 'theyre', 'theyve', 'this', 'those', 'through', 
                   'to', 'too', 'under', 'until', 'up', 'very', 'was', 'wasnt', 'we', 'wed', 'well', 'were', 
                   'weve', 'were', 'werent', 'what', 'whats', 'when', 'whens', 'where', 'wheres', 'which', 'while', 
                   'who', 'whos', 'whom', 'why', 'whys', 'with', 'wont', 'would', 'wouldnt', 'you', 'youd', 'youll', 
                   'youre', 'youve', 'your', 'yours', 'yourself', 'yourselves']

In [27]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

cv = CountVectorizer(strip_accents='ascii', min_df = 40, max_df = 0.5, stop_words = stop_words_list)
vec_cv = cv
df_tokens_cv = pd.DataFrame(vec_cv.fit_transform(df_features.conteudo).toarray(),
                         index = df_features.index).rename(columns = {v:k for k,v in vec_cv.vocabulary_.items()})

# MODEL`S INPUTS
df_features_cv = pd.concat([df_features, df_tokens_cv], axis = 1).drop('conteudo', axis = 1)

In [28]:
del df_features
del df_tokens_cv

In [29]:
lista_aux = list(df_features_cv.columns)

In [30]:
len(lista_aux)

7752

In [31]:
df_features_cv._label_.mean()

0.04022385449457853

In [32]:
df_features_cv['_label_'] = [1 if carrinho != 0 else label for carrinho, label in zip
                             (df_features_cv.carrinho, df_features_cv._label_)]
print(df_features_cv._label_.mean())

df_features_cv['_label_'] = [1 if (loja>=1)&(online>=1) else label for label, loja, online in zip 
                             (df_features_cv._label_, df_features_cv.loja, df_features_cv.online)]
print(df_features_cv._label_.mean())

df_features_cv['_label_'] = [1 if (loja>=1)&(virtual>=1) else label for label, loja, virtual in zip
                             (df_features_cv._label_, df_features_cv.loja, df_features_cv.virtual)]
print(df_features_cv._label_.mean())

df_features_cv['_label_'] = [1 if (comprar>=1)&(online>=1) else label for label, comprar, online in zip
                             (df_features_cv._label_, df_features_cv.comprar, df_features_cv.online)]
print(df_features_cv._label_.mean())

df_features_cv['_label_'] = [1 if (outlet>=1)&(virtual>=1) else label for label, outlet, virtual in zip
                             (df_features_cv._label_, df_features_cv.outlet, df_features_cv.virtual)]
print(df_features_cv._label_.mean())

df_features_cv['_label_'] = [1 if (outlet>=1)&(online>=1) else label for label, outlet, online in zip
                             (df_features_cv._label_, df_features_cv.outlet, df_features_cv.online)]
print(df_features_cv._label_.mean())

df_features_cv['_label_'] = [1 if (prazo>=1)&(entrega>=1) else label for label, prazo, entrega in zip
                             (df_features_cv._label_, df_features_cv.prazo, df_features_cv.entrega)]
print(df_features_cv._label_.mean())

df_features_cv['_label_'] = [1 if frete>=1 else label for label, frete in zip
                             (df_features_cv._label_, df_features_cv.frete)]
print(df_features_cv._label_.mean())


df_features_cv['_label_'] = [1 if (vendas>=1)&(online>=1) else label for label, vendas, online in zip
                             (df_features_cv._label_, df_features_cv.vendas, df_features_cv.online)]
print(df_features_cv._label_.mean())

df_features_cv['_label_'] = [1 if (finalizar>=1)&(pedido>=1) else label for label, finalizar, pedido in zip
                             (df_features_cv._label_, df_features_cv.finalizar, df_features_cv.pedido)]
print(df_features_cv._label_.mean())

df_features_cv['_label_'] = [1 if (formas>=1)&(pagamento>=1) else label for label, formas, pagamento in zip
                             (df_features_cv._label_, df_features_cv.formas, df_features_cv.pagamento)]
print(df_features_cv._label_.mean())

df_features_cv['_label_'] = [1 if checkout>=1 else label for label, checkout in zip
                             (df_features_cv._label_, df_features_cv.checkout)]
print(df_features_cv._label_.mean())

df_features_cv['_label_'] = [1 if devolucoes>=1 else label for label, devolucoes in zip
                             (df_features_cv._label_, df_features_cv.devolucoes)]
print(df_features_cv._label_.mean())

0.06569896234114492
0.08126384516730792
0.09274804710271657
0.09799463681940072
0.09799463681940072
0.09851929579106913
0.10726361198554273
0.11198554273055847
0.12084644980762504
0.12119622245540398
0.13023201585636004
0.1316894018887723
0.1322723563017372


In [42]:
df_features_cv['_label_'] = [1 if (acido>=1)&(online>=1) else label for label, acido, online in zip (df_features_cv._label_, df_features_cv.acido, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (acupuntura>=1)&(online>=1) else label for label, acupuntura, online in zip (df_features_cv._label_, df_features_cv.acupuntura, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (adesivo>=1)&(online>=1) else label for label, adesivo, online in zip (df_features_cv._label_, df_features_cv.adesivo, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (adesivos>=1)&(online>=1) else label for label, adesivos, online in zip (df_features_cv._label_, df_features_cv.adesivos, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (agua>=1)&(online>=1) else label for label, agua, online in zip (df_features_cv._label_, df_features_cv.agua, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (alcool>=1)&(online>=1) else label for label, alcool, online in zip (df_features_cv._label_, df_features_cv.alcool, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (algodao>=1)&(online>=1) else label for label, algodao, online in zip (df_features_cv._label_, df_features_cv.algodao, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (alimentacao>=1)&(online>=1) else label for label, alimentacao, online in zip (df_features_cv._label_, df_features_cv.alimentacao, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (alimenticio>=1)&(online>=1) else label for label, alimenticio, online in zip (df_features_cv._label_, df_features_cv.alimenticio, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (alimenticios>=1)&(online>=1) else label for label, alimenticios, online in zip (df_features_cv._label_, df_features_cv.alimenticios, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (alimento>=1)&(online>=1) else label for label, alimento, online in zip (df_features_cv._label_, df_features_cv.alimento, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (alimentos>=1)&(online>=1) else label for label, alimentos, online in zip (df_features_cv._label_, df_features_cv.alimentos, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (aluminio>=1)&(online>=1) else label for label, aluminio, online in zip (df_features_cv._label_, df_features_cv.aluminio, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (aparelho>=1)&(online>=1) else label for label, aparelho, online in zip (df_features_cv._label_, df_features_cv.aparelho, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (aparelhos>=1)&(online>=1) else label for label, aparelhos, online in zip (df_features_cv._label_, df_features_cv.aparelhos, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (arcondicionado>=1)&(online>=1) else label for label, arcondicionado, online in zip (df_features_cv._label_, df_features_cv.arcondicionado, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (armarios>=1)&(online>=1) else label for label, armarios, online in zip (df_features_cv._label_, df_features_cv.armarios, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (armas>=1)&(online>=1) else label for label, armas, online in zip (df_features_cv._label_, df_features_cv.armas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (aromas>=1)&(online>=1) else label for label, aromas, online in zip (df_features_cv._label_, df_features_cv.aromas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (artefatos>=1)&(online>=1) else label for label, artefatos, online in zip (df_features_cv._label_, df_features_cv.artefatos, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (artes>=1)&(online>=1) else label for label, artes, online in zip (df_features_cv._label_, df_features_cv.artes, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (atrativos>=1)&(online>=1) else label for label, atrativos, online in zip (df_features_cv._label_, df_features_cv.atrativos, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (bandejas>=1)&(online>=1) else label for label, bandejas, online in zip (df_features_cv._label_, df_features_cv.bandejas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (baterias>=1)&(online>=1) else label for label, baterias, online in zip (df_features_cv._label_, df_features_cv.baterias, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (bebidas>=1)&(online>=1) else label for label, bebidas, online in zip (df_features_cv._label_, df_features_cv.bebidas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (bermudas>=1)&(online>=1) else label for label, bermudas, online in zip (df_features_cv._label_, df_features_cv.bermudas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (bike>=1)&(online>=1) else label for label, bike, online in zip (df_features_cv._label_, df_features_cv.bike, df_features_cv.online)]
print(df_features_cv._label_.mean())
#df_features_cv['_label_'] = [1 if (blusas>=1)&(online>=1) else label for label, blusas, online in zip (df_features_cv._label_, df_features_cv.blusas, df_features_cv.online)]
#print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (bolos>=1)&(online>=1) else label for label, bolos, online in zip (df_features_cv._label_, df_features_cv.bolos, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (bolsas>=1)&(online>=1) else label for label, bolsas, online in zip (df_features_cv._label_, df_features_cv.bolsas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (bombas>=1)&(online>=1) else label for label, bombas, online in zip (df_features_cv._label_, df_features_cv.bombas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (bones>=1)&(online>=1) else label for label, bones, online in zip (df_features_cv._label_, df_features_cv.bones, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (botas>=1)&(online>=1) else label for label, botas, online in zip (df_features_cv._label_, df_features_cv.botas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (brincadeiras>=1)&(online>=1) else label for label, brincadeiras, online in zip (df_features_cv._label_, df_features_cv.brincadeiras, df_features_cv.online)]
print(df_features_cv._label_.mean())
#df_features_cv['_label_'] = [1 if (brincos>=1)&(online>=1) else label for label, brincos, online in zip (df_features_cv._label_, df_features_cv.brincos, df_features_cv.online)]
#print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (brindes>=1)&(online>=1) else label for label, brindes, online in zip (df_features_cv._label_, df_features_cv.brindes, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (brinquedos>=1)&(online>=1) else label for label, brinquedos, online in zip (df_features_cv._label_, df_features_cv.brinquedos, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (cabos>=1)&(online>=1) else label for label, cabos, online in zip (df_features_cv._label_, df_features_cv.cabos, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (cadeiras>=1)&(online>=1) else label for label, cadeiras, online in zip (df_features_cv._label_, df_features_cv.cadeiras, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (cadernos>=1)&(online>=1) else label for label, cadernos, online in zip (df_features_cv._label_, df_features_cv.cadernos, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (caixas>=1)&(online>=1) else label for label, caixas, online in zip (df_features_cv._label_, df_features_cv.caixas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (calcados>=1)&(online>=1) else label for label, calcados, online in zip (df_features_cv._label_, df_features_cv.calcados, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (calcas>=1)&(online>=1) else label for label, calcas, online in zip (df_features_cv._label_, df_features_cv.calcas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (caldas>=1)&(online>=1) else label for label, caldas, online in zip (df_features_cv._label_, df_features_cv.caldas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (camaras>=1)&(online>=1) else label for label, camaras, online in zip (df_features_cv._label_, df_features_cv.camaras, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (cameras>=1)&(online>=1) else label for label, cameras, online in zip (df_features_cv._label_, df_features_cv.cameras, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (caminhoes>=1)&(online>=1) else label for label, caminhoes, online in zip (df_features_cv._label_, df_features_cv.caminhoes, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (camisas>=1)&(online>=1) else label for label, camisas, online in zip (df_features_cv._label_, df_features_cv.camisas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (camisetas>=1)&(online>=1) else label for label, camisetas, online in zip (df_features_cv._label_, df_features_cv.camisetas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (canecas>=1)&(online>=1) else label for label, canecas, online in zip (df_features_cv._label_, df_features_cv.canecas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (canetas>=1)&(online>=1) else label for label, canetas, online in zip (df_features_cv._label_, df_features_cv.canetas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (canoas>=1)&(online>=1) else label for label, canoas, online in zip (df_features_cv._label_, df_features_cv.canoas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (capas>=1)&(online>=1) else label for label, capas, online in zip (df_features_cv._label_, df_features_cv.capas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (carros>=1)&(online>=1) else label for label, carros, online in zip (df_features_cv._label_, df_features_cv.carros, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (carteiras>=1)&(online>=1) else label for label, carteiras, online in zip (df_features_cv._label_, df_features_cv.carteiras, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (cartuchos>=1)&(online>=1) else label for label, cartuchos, online in zip (df_features_cv._label_, df_features_cv.cartuchos, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (celulares>=1)&(online>=1) else label for label, celulares, online in zip (df_features_cv._label_, df_features_cv.celulares, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (cervejas>=1)&(online>=1) else label for label, cervejas, online in zip (df_features_cv._label_, df_features_cv.cervejas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (cestas>=1)&(online>=1) else label for label, cestas, online in zip (df_features_cv._label_, df_features_cv.cestas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (chapas>=1)&(online>=1) else label for label, chapas, online in zip (df_features_cv._label_, df_features_cv.chapas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (chaveiros>=1)&(online>=1) else label for label, chaveiros, online in zip (df_features_cv._label_, df_features_cv.chaveiros, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (chaves>=1)&(online>=1) else label for label, chaves, online in zip (df_features_cv._label_, df_features_cv.chaves, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (chocolates>=1)&(online>=1) else label for label, chocolates, online in zip (df_features_cv._label_, df_features_cv.chocolates, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (cilindros>=1)&(online>=1) else label for label, cilindros, online in zip (df_features_cv._label_, df_features_cv.cilindros, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (circuitos>=1)&(online>=1) else label for label, circuitos, online in zip (df_features_cv._label_, df_features_cv.circuitos, df_features_cv.online)]
print(df_features_cv._label_.mean())
#df_features_cv['_label_'] = [1 if (colares>=1)&(online>=1) else label for label, colares, online in zip (df_features_cv._label_, df_features_cv.colares, df_features_cv.online)]
#print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (colchoes>=1)&(online>=1) else label for label, colchoes, online in zip (df_features_cv._label_, df_features_cv.colchoes, df_features_cv.online)]
print(df_features_cv._label_.mean())
#df_features_cv['_label_'] = [1 if (comidas>=1)&(online>=1) else label for label, comidas, online in zip (df_features_cv._label_, df_features_cv.comidas, df_features_cv.online)]
#print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (copos>=1)&(online>=1) else label for label, copos, online in zip (df_features_cv._label_, df_features_cv.copos, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (cortinas>=1)&(online>=1) else label for label, cortinas, online in zip (df_features_cv._label_, df_features_cv.cortinas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (cosmeticos>=1)&(online>=1) else label for label, cosmeticos, online in zip (df_features_cv._label_, df_features_cv.cosmeticos, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (cristais>=1)&(online>=1) else label for label, cristais, online in zip (df_features_cv._label_, df_features_cv.cristais, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (decorativos>=1)&(online>=1) else label for label, decorativos, online in zip (df_features_cv._label_, df_features_cv.decorativos, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (descartaveis>=1)&(online>=1) else label for label, descartaveis, online in zip (df_features_cv._label_, df_features_cv.descartaveis, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (eletrodomesticos>=1)&(online>=1) else label for label, eletrodomesticos, online in zip (df_features_cv._label_, df_features_cv.eletrodomesticos, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (eletronicos>=1)&(online>=1) else label for label, eletronicos, online in zip (df_features_cv._label_, df_features_cv.eletronicos, df_features_cv.online)]
print(df_features_cv._label_.mean())
#df_features_cv['_label_'] = [1 if (embutidos>=1)&(online>=1) else label for label, embutidos, online in zip (df_features_cv._label_, df_features_cv.embutidos, df_features_cv.online)]
print(df_features_cv._label_.mean())
#df_features_cv['_label_'] = [1 if (empilhadeiras>=1)&(online>=1) else label for label, empilhadeiras, online in zip (df_features_cv._label_, df_features_cv.empilhadeiras, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (ervas>=1)&(online>=1) else label for label, ervas, online in zip (df_features_cv._label_, df_features_cv.ervas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (espelhos>=1)&(online>=1) else label for label, espelhos, online in zip (df_features_cv._label_, df_features_cv.espelhos, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (facas>=1)&(online>=1) else label for label, facas, online in zip (df_features_cv._label_, df_features_cv.facas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (fantasias>=1)&(online>=1) else label for label, fantasias, online in zip (df_features_cv._label_, df_features_cv.fantasias, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (farmacia>=1)&(online>=1) else label for label, farmacia, online in zip (df_features_cv._label_, df_features_cv.farmacia, df_features_cv.online)]
print(df_features_cv._label_.mean())
#df_features_cv['_label_'] = [1 if (fechaduras>=1)&(online>=1) else label for label, fechaduras, online in zip (df_features_cv._label_, df_features_cv.fechaduras, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (ferragens>=1)&(online>=1) else label for label, ferragens, online in zip (df_features_cv._label_, df_features_cv.ferragens, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (ferramentas>=1)&(online>=1) else label for label, ferramentas, online in zip (df_features_cv._label_, df_features_cv.ferramentas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (fibras>=1)&(online>=1) else label for label, fibras, online in zip (df_features_cv._label_, df_features_cv.fibras, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (florais>=1)&(online>=1) else label for label, florais, online in zip (df_features_cv._label_, df_features_cv.florais, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (flores>=1)&(online>=1) else label for label, flores, online in zip (df_features_cv._label_, df_features_cv.flores, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (laminados>=1)&(online>=1) else label for label, laminados, online in zip (df_features_cv._label_, df_features_cv.laminados, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (laminas>=1)&(online>=1) else label for label, laminas, online in zip (df_features_cv._label_, df_features_cv.laminas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (lampadas>=1)&(online>=1) else label for label, lampadas, online in zip (df_features_cv._label_, df_features_cv.lampadas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (lubrificantes>=1)&(online>=1) else label for label, lubrificantes, online in zip (df_features_cv._label_, df_features_cv.lubrificantes, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (luvas>=1)&(online>=1) else label for label, luvas, online in zip (df_features_cv._label_, df_features_cv.luvas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (madeiras>=1)&(online>=1) else label for label, madeiras, online in zip (df_features_cv._label_, df_features_cv.madeiras, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (mascaras>=1)&(online>=1) else label for label, mascaras, online in zip (df_features_cv._label_, df_features_cv.mascaras, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (medicamentos>=1)&(online>=1) else label for label, medicamentos, online in zip (df_features_cv._label_, df_features_cv.medicamentos, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (notebooks>=1)&(online>=1) else label for label, notebooks, online in zip (df_features_cv._label_, df_features_cv.notebooks, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (pedras>=1)&(online>=1) else label for label, pedras, online in zip (df_features_cv._label_, df_features_cv.pedras, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (perfumes>=1)&(online>=1) else label for label, perfumes, online in zip (df_features_cv._label_, df_features_cv.perfumes, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (persianas>=1)&(online>=1) else label for label, persianas, online in zip (df_features_cv._label_, df_features_cv.persianas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (plantas>=1)&(online>=1) else label for label, plantas, online in zip (df_features_cv._label_, df_features_cv.plantas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (pneus>=1)&(online>=1) else label for label, pneus, online in zip (df_features_cv._label_, df_features_cv.pneus, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (portas>=1)&(online>=1) else label for label, portas, online in zip (df_features_cv._label_, df_features_cv.portas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (pratos>=1)&(online>=1) else label for label, pratos, online in zip (df_features_cv._label_, df_features_cv.pratos, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (proteses>=1)&(online>=1) else label for label, proteses, online in zip (df_features_cv._label_, df_features_cv.proteses, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (protetores>=1)&(online>=1) else label for label, protetores, online in zip (df_features_cv._label_, df_features_cv.protetores, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (pulseiras>=1)&(online>=1) else label for label, pulseiras, online in zip (df_features_cv._label_, df_features_cv.pulseiras, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (quadros>=1)&(online>=1) else label for label, quadros, online in zip (df_features_cv._label_, df_features_cv.quadros, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (queijos>=1)&(online>=1) else label for label, queijos, online in zip (df_features_cv._label_, df_features_cv.queijos, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (roupas>=1)&(online>=1) else label for label, roupas, online in zip (df_features_cv._label_, df_features_cv.roupas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (smartphones>=1)&(online>=1) else label for label, smartphones, online in zip (df_features_cv._label_, df_features_cv.smartphones, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (suplementos>=1)&(online>=1) else label for label, suplementos, online in zip (df_features_cv._label_, df_features_cv.suplementos, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (tapetes>=1)&(online>=1) else label for label, tapetes, online in zip (df_features_cv._label_, df_features_cv.tapetes, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (telhas>=1)&(online>=1) else label for label, telhas, online in zip (df_features_cv._label_, df_features_cv.telhas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (tenis>=1)&(online>=1) else label for label, tenis, online in zip (df_features_cv._label_, df_features_cv.tenis, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (tintas>=1)&(online>=1) else label for label, tintas, online in zip (df_features_cv._label_, df_features_cv.tintas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (toalhas>=1)&(online>=1) else label for label, toalhas, online in zip (df_features_cv._label_, df_features_cv.toalhas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (vasos>=1)&(online>=1) else label for label, vasos, online in zip (df_features_cv._label_, df_features_cv.vasos, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (veiculos>=1)&(online>=1) else label for label, veiculos, online in zip (df_features_cv._label_, df_features_cv.veiculos, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (velas>=1)&(online>=1) else label for label, velas, online in zip (df_features_cv._label_, df_features_cv.velas, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (vidros>=1)&(online>=1) else label for label, vidros, online in zip (df_features_cv._label_, df_features_cv.vidros, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (vinhos>=1)&(online>=1) else label for label, vinhos, online in zip (df_features_cv._label_, df_features_cv.vinhos, df_features_cv.online)]
print(df_features_cv._label_.mean())
df_features_cv['_label_'] = [1 if (vitaminas>=1)&(online>=1) else label for label, vitaminas, online in zip (df_features_cv._label_, df_features_cv.vitaminas, df_features_cv.online)]


0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387
0.14993587501457387


In [43]:
df_features_cv.reset_index(inplace= True)

In [44]:
#df_features_cv[df_features_cv.americanas!=0][['url','netshoes']]

In [60]:
df_features_cv._label_.mean()

0.15932144106330884

### Keras - redes neurais

In [61]:
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout #, Conv2D, LSTM
from keras.optimizers import RMSprop

In [62]:
# tamanho do batch: numero de amostras em cada vez (stochastic gradient descend) 
# qtd de epochs: numero de varreduras completas no dataset
batch_size = 512
epochs = 5

X_cv = df_features_cv.drop(['_label_', 'url'], axis = 1).values
y = df_features_cv._label_.values

#Train and test data
train_size = 0.8
x_train_cv, x_test_cv = X_cv[:int(len(X_cv)*train_size)+1],X_cv[int(len(X_cv)*train_size)+1: len(X_cv)]
y_train, y_test = y[:int(len(y)*train_size)+1],y[int(len(y)*train_size)+1: len(y)]

# saida:
num_classes = 2
y_nn_train = keras.utils.to_categorical(y_train, num_classes)
y_nn_test = keras.utils.to_categorical(y_test, num_classes)


In [63]:
#Topologia rede
def _get_model_():
    num_features = len(x_train_cv[0])

    model = Sequential()
    model.add(Dense(int(num_features/4),       # Dense: tot. conectado (FC) a camada anterior
              activation='relu', 
              input_shape=(num_features,),    # na 1a. vez incluir input_shape, input_dim
              name = 'layer_1'))   

    model.add(Dropout(0.01))
    model.add(Dense(int(num_features/8), activation='sigmoid', name = 'layer_2'))
    model.add(Dropout(0.01))
    model.add(Dense(int(num_features/64), activation='sigmoid', name = 'layer_3'))
    model.add(Dropout(0.01))
    model.add(Dense(2, activation='softmax', name = 'output_layer'), )

    #Resumo modelo
    #model.summary()
    
    return model

model = _get_model_()

In [64]:
# Compile model
model.compile(loss = 'categorical_crossentropy',
              optimizer = 'adam',
              metrics = ['accuracy'])

Treinamento

In [65]:
# Etapa .fit()
# valores X e y são submetidos a rede para treinamento

history = model.fit(x_train_cv, y_nn_train,
                    batch_size = batch_size,
                    epochs = epochs,   # baixo aprende pouco, alto pode causar overfitting
                    verbose = 1,
                    validation_data = (x_test_cv, y_nn_test))

Train on 13724 samples, validate on 3430 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [66]:
score = model.evaluate(x_test_cv, y_nn_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 0.446280553239
Test accuracy: 0.894752186589


In [67]:
df_features_cv.shape

(17154, 7752)

In [68]:
#df_features_cv.reset_index(inplace=True)

In [69]:
#df_features_cv.drop('level_0', axis = 1, inplace=True)

In [70]:
df_features_cv.head() 

Unnamed: 0,url,_label_,aa,aba,abaixo,abastecimento,abc,aberta,abertas,aberto,...,years,yes,yet,yoga,york,youtube,ze,zero,zona,zoom
0,funprint.com.br,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,mauaplaza.com.br,0,0,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,labscs.com.br,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,caisep.com,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,cursoscooperativistas.com.br,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [71]:
X_cv = df_features_cv.drop(['_label_','url'], axis = 1).values
x_base = X_cv[:int(len(X_cv))+1]

In [72]:
y_pred = model.predict(x_base)
y_pred = pd.DataFrame(data=y_pred)
#y_pred.rename(columns= {0: 'Result_Keras'}, inplace= True)

In [73]:
y_pred.rename(columns= {1: 'ProbEcomm'}, inplace= True)
y_pred.drop(0, axis= 1, inplace= True)

In [74]:
y_pred.sample()

Unnamed: 0,ProbEcomm
11903,0.002523


In [77]:
Result = pd.concat([df_features_cv.reset_index(), y_pred], axis = 1)

In [78]:
Result.shape

(17154, 7754)

In [79]:
Result.ProbEcomm.describe()

count    17154.000000
mean         0.150853
std          0.340827
min          0.000210
25%          0.001558
50%          0.003929
75%          0.015368
max          0.997656
Name: ProbEcomm, dtype: float64

In [80]:
Result[['_label_', 'ProbEcomm']].head()

Unnamed: 0,_label_,ProbEcomm
0,1,0.885301
1,0,0.000737
2,0,0.005276
3,0,0.002267
4,0,0.001944


In [88]:
def cria_curva(percentiles, variavel):
    Perc = list()
    for i in range(len(percentiles)):
        Perc.append(np.percentile(variavel, percentiles[i]))
    return Perc

perc = cria_curva([80,85,86,87,88,89,90,99],Result.ProbEcomm)
perc

[0.028416633605957031,
 0.318212693929672,
 0.79992550015449493,
 0.94253732085227993,
 0.96833503246307373,
 0.97740084946155559,
 0.98409538269042973,
 0.99753229737281801]

In [89]:
def marca_base(Perc, x):
    if x >= Perc[(len(Perc)-1)]:
        return len(Perc) +1
    else:
        for i in range(len(Perc)):
            if x < Perc[i]:
                return i + 1

Result['ProbEcomm_']= [marca_base(perc,s) for s in Result.ProbEcomm]

In [90]:
Result[Result._label_ == 1].ProbEcomm_.value_counts()

8    1510
1     210
9     171
7     167
6     165
5     156
4     142
2     121
3      91
Name: ProbEcomm_, dtype: int64

In [91]:
Result[Result._label_ == 0].ProbEcomm_.value_counts()

1    13502
2      748
3       80
8       34
4       30
5       15
6        7
7        4
9        1
Name: ProbEcomm_, dtype: int64

In [98]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

agg_dict_heatmap = {'level_0' : 'count'}
heatmap = Result.groupby(['_label_','ProbEcomm_']).agg(agg_dict_heatmap).copy()
heatmap.reset_index(inplace= True)
heatmap = heatmap.pivot('_label_','ProbEcomm_','level_0').copy()
heatmap.fillna(0, inplace= True)
#sns.heatmap(heatmap, annot=False, linewidths=.5) 
heatmap['TOTAL'] = heatmap[heatmap.columns].sum(axis = 1)    
heatmap

ProbEcomm_,1,2,3,4,5,6,7,8,9,TOTAL
_label_,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,13502,748,80,30,15,7,4,34,1,14421
1,210,121,91,142,156,165,167,1510,171,2733


In [109]:
Result[(Result.ProbEcomm_ == 6)&(Result._label_== 0)].sample()

Unnamed: 0,level_0,url,_label_,aa,aba,abaixo,abastecimento,abc,aberta,abertas,...,yet,yoga,york,youtube,ze,zero,zona,zoom,ProbEcomm,ProbEcomm_
8559,8559,emefarmaonline.com.br,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0.96902,6


In [114]:
Result[(Result.ProbEcomm_ == 9)&(Result._label_== 1)].sample(4)

Unnamed: 0,level_0,url,_label_,aa,aba,abaixo,abastecimento,abc,aberta,abertas,...,yet,yoga,york,youtube,ze,zero,zona,zoom,ProbEcomm,ProbEcomm_
3968,3968,clickviagens.com,1,0,0,2,0,0,0,0,...,0,0,2,0,0,0,0,0,0.997586,9
8244,8244,mservicedistribuidora.com.br,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0.997535,9
8418,8418,titanium.autobrasil24h.com.br,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.997556,9
12395,12395,crg2rastrear.autobrasil24h.com.br,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.997556,9


In [105]:
#Result[(Result.loja>=1)&(Result.online>=1)].shape
Result[(Result.loja>=1)&(Result.virtual>=1)].shape


(399, 7755)

In [67]:
palavras = list(Result.drop(['level_0', 'url', '_label_', 'ProbEcomm', 'ProbEcomm_'], axis=1).columns)

In [73]:
auxiliar = Result[Result.level_0 == 10][palavras].values
auxiliar = auxiliar.tolist()
auxiliar = auxiliar[0]

In [74]:
aux = range(len(auxiliar))

In [75]:
palavras_utilizadas = list()

In [76]:
for a in aux:
    if auxiliar[a] != 0:
        palavras_utilizadas.append(palavras[a])

In [77]:
palavras_utilizadas

['acesse',
 'acessorios',
 'acima',
 'antiga',
 'aproveitar',
 'aqui',
 'assinar',
 'ate',
 'baby',
 'bater',
 'beach',
 'bem',
 'bermudas',
 'bones',
 'browser',
 'buscar',
 'cadastrese',
 'cadastro',
 'calcados',
 'camisas',
 'camiseta',
 'camisetas',
 'campos',
 'carrinho',
 'cartao',
 'categorias',
 'ce',
 'cep',
 'chat',
 'cnpj',
 'compra',
 'comprar',
 'compras',
 'confeccoes',
 'copos',
 'desconto',
 'devolucoes',
 'disabled',
 'duvidas',
 'email',
 'entrar',
 'enviar',
 'fale',
 'faq',
 'feedback',
 'feminino',
 'formas',
 'frete',
 'funcionalidades',
 'ganhe',
 'gente',
 'gratis',
 'habilitar',
 'industria',
 'informacoes',
 'inicio',
 'item',
 'javascript',
 'jeans',
 'juros',
 'la',
 'ligue',
 'live',
 'login',
 'loja',
 'look',
 'ltda',
 'masculina',
 'masculino',
 'meias',
 'moda',
 'navegador',
 'nenhum',
 'newsletter',
 'obrigatorios',
 'oculos',
 'online',
 'pagamento',
 'pague',
 'papo',
 'peixoto',
 'plus',
 'politica',
 'polos',
 'powered',
 'precisa',
 'preco',
 'pr

In [None]:
carrinho


In [128]:
#Perc.append(np.percentile(variavel, percentiles[i]))
valores = list()

In [129]:
aa = Result[Result.level_0 == 14906].aa
valores.append(aa)
abaixo = Result[Result.level_0 == 14906].abaixo
valores.append(abaixo)

In [136]:
Result[Result.level_0 == 14906].abaixo*1

14906    0
Name: abaixo, dtype: int64

In [None]:
def reune_palavras(url):
    return

In [90]:
Result['palavras'] =' '

In [96]:
Result[Result.level_0== 14906]

Unnamed: 0,level_0,url,_label_,aa,abaixo,abastecimento,abc,abdominal,aberta,abertas,...,yet,yoga,york,youtube,ze,zero,zona,zoom,ProbEcomm,ProbEcomm_
14906,14906,pilotocivil.com.br,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.200188,2


In [112]:
len(palavras)

7579

In [114]:
teste = Result[Result.level_0 == 14906].copy()
for i in range(len(palavras)):
     if teste[palavras[i]] == 0:
        print('não')
    

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
def cria_curva(percentiles, variavel):
    Perc = list()
    for i in range(len(percentiles)):
        Perc.append(np.percentile(variavel, percentiles[i]))
    return Perc