In [49]:
import pandas as pd
import numpy as np
import re

In [51]:
def transform_training_data(name):
    # IMPLEMENT
    # lower case
    name = name.lower()
    # remove punctuation
    name = re.sub(r'[^\w\s]','', name)
    # remove extra white space
    name = " ".join(name.split())
    return name.replace('\n', ' ')

In [3]:
df_train = pd.read_csv('/workspace/datasets/train.csv')

In [35]:
path_product = '/workspace/datasets/product_data/categories/categories_0001_abcat0010000_to_pcmcat99300050000.xml'
df_product = pd.read_xml(path_or_buffer=path_product)

In [38]:
df_product.head()

Unnamed: 0,id,name,path,subCategories
0,abcat0010000,Gift Center,,
1,abcat0011000,Her,,
2,abcat0011001,Leisure Gifts,,
3,abcat0011002,Kitchen Essentials,,
4,abcat0011003,Electronics,,


In [24]:
df_train.head()

Unnamed: 0,user,sku,category,query,click_time,query_time
0,000000df17cd56a5df4a94074e133c9d4739fae3,2125233,abcat0101001,Televisiones Panasonic 50 pulgadas,2011-09-01 23:44:52.533,2011-09-01 23:43:59.752
1,000001928162247ffaf63185cd8b2a244c78e7c6,2009324,abcat0101001,Sharp,2011-09-05 12:25:37.42,2011-09-05 12:25:01.187
2,000017f79c2b5da56721f22f9fdd726b13daf8e8,1517163,pcmcat193100050014,nook,2011-08-24 12:56:58.91,2011-08-24 12:55:13.012
3,000017f79c2b5da56721f22f9fdd726b13daf8e8,2877125,abcat0101001,rca,2011-10-25 07:18:14.722,2011-10-25 07:16:51.759
4,000017f79c2b5da56721f22f9fdd726b13daf8e8,2877134,abcat0101005,rca,2011-10-25 07:19:51.697,2011-10-25 07:16:51.759


In [11]:
# check on the total number of categories
df_train['category'].nunique()

1540

In [25]:
# check on the how many queries are associated to a category.
df_train.groupby(['category'])['query'].nunique().describe()

count     1540.000000
mean       274.951299
std       1122.017776
min          1.000000
25%         12.000000
50%         56.000000
75%        208.000000
max      31924.000000
Name: query, dtype: float64

In [40]:
# check on the number of categories that has more than 100 queries
(df_train.groupby(['category'])['query'].nunique() >= 100).value_counts()

False    940
True     600
Name: query, dtype: int64

In [44]:
# trim the data
df_train_trim = \
    df_train[df_train['category']\
                 .isin(df_train.groupby(['category'])['query'].nunique()
                       [df_train.groupby(['category'])['query'].nunique() >= 100].index)]

In [52]:
df_train_trim['query'] = df_train_trim['query'].apply(lambda x: transform_training_data(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_trim['query'] = df_train_trim['query'].apply(lambda x: transform_training_data(x))


In [58]:
# training
# remove old data
!rm -f /workspace/datasets/fasttext/output.fasttext
!rm -f /workspace/datasets/fasttext/train.fasttext
!rm -f /workspace/datasets/fasttext/test.fasttext

In [61]:
# prep data
df_train_trim['label'] = '__label__' + df_train_trim['category']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_trim['label'] = '__label__' + df_train_trim['category']


In [76]:
# train and test split
df_train_fastext = df_train_trim.sample(frac=0.8)
df_test_fastext = df_train_trim.sample(frac=0.01)

In [77]:
# dimension check
print(df_train_fastext.shape)
print(df_test_fastext.shape)

(1432171, 7)
(17902, 7)


In [78]:
# produce data to disk
df_train_fastext[['label', 'query']].to_csv('/workspace/datasets/fasttext/train.fasttext', 
                                         header=None, 
                                         index=None, 
                                         sep=' ')

df_test_fastext[['label', 'query']].to_csv('/workspace/datasets/fasttext/test.fasttext', 
                                         header=None, 
                                         index=None, 
                                         sep=' ')

In [73]:
# check on the training data 
! head -10 /workspace/datasets/fasttext/train.fasttext

__label__cat02015 "star wars"
__label__pcmcat243000050004 "red dead redemption"
__label__pcmcat214000050002 panasinuc
__label__cat02015 "game of thrones"
__label__pcmcat247400050000 "gateway laptop"
__label__pcmcat209000050008 "toshiba tablet"
__label__abcat0101001 aquos
__label__abcat0910003 washer
__label__pcmcat164200050013 netbooks
__label__pcmcat152400050000 tablets


In [72]:
# check on the test data 
! head -10 /workspace/datasets/fasttext/test.fasttext

__label__cat02015 newsies
__label__abcat0205008 subwoofers
__label__pcmcat186100050006 "portable hard drive"
__label__abcat0504010 "flash drive"
__label__cat02719 "kid cudi"
__label__pcmcat243000050004 summersale_movietv_20110826
__label__pcmcat144700050004 beats
__label__pcmcat209000050008 tablet
__label__abcat0401004 "canon sd1000"
__label__abcat0503013 "comcast modem"


In [69]:
# create modeling data
import fasttext
model = \
    fasttext\
        .train_supervised(input="/workspace/datasets/fasttext/train.fasttext",
                          lr=0.5, 
                          epoch=1, 
                          wordNgrams=2)

Read 7M words
Number of words:  80417
Number of labels: 600
Progress: 100.0% words/sec/thread:    2864 lr:  0.000000 avg.loss:  3.154597 ETA:   0h 0m 0s


In [80]:
# validation
print(model.test("/workspace/datasets/fasttext/test.fasttext", k=1))
print(model.test("/workspace/datasets/fasttext/test.fasttext", k=5))
print(model.test("/workspace/datasets/fasttext/test.fasttext", k=10))

(17902, 0.5954641939448106, 0.5954641939448106)
(17902, 0.1714668752094738, 0.8573343760473691)
(17902, 0.09154284437493017, 0.9154284437493018)
