# Leitura da base de dados original

In [1]:
import pandas as pd

dados = pd.read_csv('../data/raw/train.tsv', sep='\t')

dados.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


Quantidade de registros

In [2]:
print('Quantidade de registros = ', len(dados))
print('Quantidade de registros únicos = ', dados.name.nunique())
print('Porcentagem de registros únicos = ', 100 * dados.name.nunique()/len(dados))

Quantidade de registros =  1482535
Quantidade de registros únicos =  1225273
Porcentagem de registros únicos =  82.64715504187085


Quantidade de categorias do dataset

In [3]:
print('Números de categorias:', dados.category_name.unique().size)

Números de categorias: 1288


Descrição da coluna `price`

Quantidade de produtos com preço de anúncio igual a zero.

In [4]:
dados[dados.price == 0].size

6992

Quantidad de produtos com preço de anúncio nulo.

In [5]:
dados.price.isna().sum()

0

# Separação das categorias em diversas colunas

In [6]:
colunas = dados.category_name.str.split('/', expand=True)

In [7]:
colunas

Unnamed: 0,0,1,2,3,4
0,Men,Tops,T-shirts,,
1,Electronics,Computers & Tablets,Components & Parts,,
2,Women,Tops & Blouses,Blouse,,
3,Home,Home Décor,Home Décor Accents,,
4,Women,Jewelry,Necklaces,,
...,...,...,...,...,...
1482530,Women,Dresses,Mid-Calf,,
1482531,Kids,Girls 2T-5T,Dresses,,
1482532,Sports & Outdoors,Exercise,Fitness accessories,,
1482533,Home,Home Décor,Home Décor Accents,,


Quantidade de valores nulos na coluna de índice 3

In [8]:
colunas[3].isnull().sum() / len(colunas)

0.997039530264041

Quantidade de valores nulos na coluna de índice 4

In [9]:
colunas[4].isnull().sum() / len(colunas)

0.9979366423052407

# Tratamento de inconsistências

Remoção de preços com valor igual a zero.

Padronização do nome dos anúncios para apenas minúsculas.

In [10]:
dados2 = dados.query('price > 0')
dados2.loc[:, 'name'] = dados2.name.str.lower()
dados2.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dados2.loc[:, 'name'] = dados2.name.str.lower()


(1481661, 8)

Função para remover caracteres especiais dos nomes.

In [11]:
from nltk.tokenize import RegexpTokenizer
import numpy as np

def clean_names(name):
    tokenizer = RegexpTokenizer(r'\w+')
    
    token = tokenizer.tokenize(name)
    name = ''
    
    for n in token:
        if not n.isdigit() and not n == 'rm':
            name += n
            name += ' '

    name = name.strip()
    
    return name

Remoção de caracters especiais da coluna nome.

In [12]:
dados2.loc[:, 'name'] = [clean_names(name) for name in dados2.name]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dados2.loc[:, 'name'] = [clean_names(name) for name in dados2.name]


Contagem das marcas com valores nulos.

In [13]:
dados2.brand_name.isna().sum()/len(dados2)

0.426775085529011

Função para preencher valores nulos.

In [14]:
def preecher_nans(dado, fill=''):
    dado[dado.isna()] = fill
    return dado 

Substituição dos valores nulos em `brand_name` por "No brand"

In [15]:
dados2.loc[:,'brand_name'] = preecher_nans(dados2['brand_name'], fill='No Brand')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dado[dado.isna()] = fill
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dado[dado.isna()] = fill
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dados2.loc[:,'brand_name'] = preecher_nans(dados2['brand_name'], fill='No Brand')


Separação da coluna `category` em sub-categorias.

In [16]:
dados2.loc[:,'category_1'] = colunas[0]
dados2.loc[:,'category_2'] = colunas[1]
dados2.loc[:,'category_3'] = colunas[2]
dados2.drop('category_name', axis=1, inplace=True)
dados2 = dados2[['name', 'category_1', 'category_2',
       'category_3', 'item_condition_id', 'brand_name', 'price',
       'shipping', 'item_description']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dados2.loc[:,'category_1'] = colunas[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dados2.loc[:,'category_2'] = colunas[1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dados2.loc[:,'category_3'] = colunas[2]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in

Contagem de valores nulos em `category_1`, `category_2` e `category_3`

In [17]:
dados2.category_1.isna().sum()/len(dados2)

0.0042614336207810015

In [18]:
dados2.category_2.isna().sum()/len(dados2)

0.0042614336207810015

In [19]:
dados2.category_3.isna().sum()/len(dados2)

0.0042614336207810015

In [20]:
dados2.loc[:, 'category_1'] = preecher_nans(dados2['category_1'], fill='No category')
dados2.loc[:,'category_2'] = preecher_nans(dados2['category_2'], fill='No category')
dados2.loc[:,'category_3'] = preecher_nans(dados2['category_3'], fill='No category')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dado[dado.isna()] = fill
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dado[dado.isna()] = fill
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dados2.loc[:, 'category_1'] = preecher_nans(dados2['category_1'], fill='No category')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returnin

Função para geração de datas e estoques.

In [21]:
import random
import numpy as np


def data(n, seed):

    datas = []
    
    random.seed(seed)
    for i in range(n):
        dia_maximo = 32
        dia_minimo = 1
        mes_maximo = 13
        mes_minimo = 1
        
        dia = int(random.random() * (dia_maximo - dia_minimo) + dia_minimo)
        mes = int(random.random() * (mes_maximo - mes_minimo) + mes_minimo)
        
        if mes == 2:
            while dia > 28:
                dia = int(random.random() * (dia_maximo - dia_minimo) + dia_minimo)
        
        if mes in [4, 6, 9, 11]:
            while dia > 30:
                dia = int(random.random() * (dia_maximo - dia_minimo) + dia_minimo)
        
        datas.append(str(dia)+'-'+str(mes)+'-2018')
        
    return datas

def estoque(n, seed):

    np.random.seed(seed)
    mu, sigma = 1, 20
    s = np.random.normal(mu, sigma, n)
    s[s < 0] = s[s < 0] * -0.5
    s = s.astype(int)
    s[s < 1] = 1
    
    return s

dados2['date']  = data(n = dados2.shape[0], seed = 10)
dados2['stock'] = estoque(n = dados2.shape[0], seed = 10)

Padronização das descrições dos itens

Nomes para apenas minúsculas.

Remoção de caracteres especiais.

Preenchimento de nulos por "No comment yet."

In [22]:
import numpy as np
descriptions = []
for name in dados.item_description:
    if type(name) == float:
        if np.isnan(name):
            descriptions.append('no description yet')
    else:
        descriptions.append(clean_names(name.lower()))


In [23]:
dados.drop('item_description', axis=1, inplace=True)
dados.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0


In [24]:
descriptions = pd.DataFrame(descriptions)
descriptions.head()

Unnamed: 0,0
0,no description yet
1,this keyboard is in great condition and works ...
2,adorable top with a hint of lace and a key hol...
3,new with tags leather horses retail for each s...
4,complete with certificate of authenticity


In [25]:
dados

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0
...,...,...,...,...,...,...,...
1482530,1482530,Free People Inspired Dress,2,Women/Dresses/Mid-Calf,Free People,20.0,1
1482531,1482531,Little mermaid handmade dress,2,Kids/Girls 2T-5T/Dresses,Disney,14.0,0
1482532,1482532,21 day fix containers and eating plan,2,Sports & Outdoors/Exercise/Fitness accessories,,12.0,0
1482533,1482533,World markets lanterns,3,Home/Home Décor/Home Décor Accents,,45.0,1


In [26]:
len(descriptions)

1482535

In [27]:
len(dados2)

1481661

In [28]:
dados['item_description'] = descriptions[0]
dados.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,no description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,this keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,new with tags leather horses retail for each s...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,complete with certificate of authenticity


Salva arquivo com dados tratatados.

In [29]:
dados2.to_csv('train_data_prep02.csv', index=False)

In [31]:
dados2.head()

Unnamed: 0,name,category_1,category_2,category_3,item_condition_id,brand_name,price,shipping,item_description,date,stock
0,mlb cincinnati reds t shirt size xl,Men,Tops,T-shirts,3,No Brand,10.0,1,No description yet,18-6-2018,27
1,razer blackwidow chroma keyboard,Electronics,Computers & Tablets,Components & Parts,3,Razer,52.0,0,This keyboard is in great condition and works ...,18-3-2018,15
2,ava viv blouse,Women,Tops & Blouses,Blouse,1,Target,10.0,1,Adorable top with a hint of lace and a key hol...,26-10-2018,14
3,leather horse statues,Home,Home Décor,Home Décor Accents,1,No Brand,35.0,1,New with tags. Leather horses. Retail for [rm]...,21-2-2018,1
4,24k gold plated rose,Women,Jewelry,Necklaces,1,No Brand,44.0,0,Complete with certificate of authenticity,17-4-2018,13


In [33]:
from sklearn.model_selection import train_test_split

base, validacao = train_test_split(dados2, test_size=0.1, stratify=dados2.category_1)

validacao.to_csv('dados_validacao.csv', index=False)



In [34]:
base.head()

Unnamed: 0,name,category_1,category_2,category_3,item_condition_id,brand_name,price,shipping,item_description,date,stock
1395518,nwt nike power dri fit leggings m,Women,Athletic Apparel,"Pants, Tights, Leggings",1,Nike,53.0,0,brand new with tag size M black/white comes fr...,14-3-2018,8
924011,merrell backpack,Men,Men's Accessories,"Backpacks, Bags & Briefcases",2,No Brand,19.0,0,Small merrell daypack,8-5-2018,10
978246,adidas boy athletic pants size,Kids,Boys (4+),Bottoms,3,No Brand,8.0,1,"Good condition, no tears or obvious spots Blac...",9-1-2018,13
337101,love pink blanket,Home,Bedding,Blankets & Throws,1,No Brand,33.0,0,"51x62"" Super soft and comfy blanket. Brand new...",8-2-2018,20
811087,42d pk lace bras,Women,Underwear,Bras,1,No Brand,14.0,0,Size: 42D Brand: Chiara Vicci Color: black and...,3-5-2018,5


In [35]:
treino, teste = train_test_split(base, test_size=0.3, stratify=base.category_1)

treino.to_csv('dados_treino.csv', index=False)
teste.to_csv('dados_teste.csv', index=False)

## Limpeza dos datasets de treino, teste e validação

Ações realizadas para remoção de stopwords e lemmatização

In [36]:
import pandas as pd
import nltk
from nltk import tokenize
from nltk import word_tokenize
from nltk.tokenize import regexp_tokenize
from nltk.corpus import stopwords
stemmer = nltk.stem.RSLPStemmer()


In [147]:
dados_treino = pd.read_csv('../data/dados_teste.csv')
dados_treino.head()

Unnamed: 0,name,category_1,category_2,category_3,item_condition_id,brand_name,price,shipping,item_description,date,stock
0,two headed dragon figure,Kids,Toys,Action Figures & Statues,3,Imperial,14.0,0,Vintage Imperial 1983 double headed green drag...,16-10-2018,18
1,strapless backless push up lace up cupd,Women,Underwear,Bras,1,No Brand,11.0,1,"❤️All sizes are Available. Color(Beige,Black):...",9-7-2018,3
2,nike starter yankee romper,Kids,Boys 0-24 Mos,One-Pieces,2,Nike,7.0,1,Nike- New York Yankee romper. Size 6/9 months ...,16-4-2018,37
3,dermalogica gift set,Beauty,Makeup,Makeup Sets,1,No Brand,19.0,1,Brand new Authentic Includes: special cleansin...,9-9-2018,8
4,reserved for monica only,Women,Shoes,Athletic,2,Nike,12.0,0,DO NOT PURCHASE THIS LISTING UNLESS YOU ARE MO...,3-8-2018,14


In [136]:
cachedStopWords = stopwords.words("english")
def prep_data(name, no_name='no name'):
    try:
        name = name.lower()
        token = regexp_tokenize(name, r'\b[^\W\d_]+\b')
        token = [word for word in token if not word in cachedStopWords]
        return ' '.join(token)
    except:
        print(name)
        return no_name


In [148]:
results = dados_treino.item_description.apply(prep_data, no_name='no description')

nan


In [149]:
dados_treino.item_description = results

In [150]:
results = dados_treino.name.apply(prep_data)
dados_treino.name = results

nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan


In [151]:
dados_treino.to_csv('dados_teste_stem.csv', index=False)