In [10]:
import pandas as pd

# Read the CSV without treating any row as a header; the two column names
# are supplied explicitly instead.
df = pd.read_csv(
    'ecommerce_dataset.csv',
    header=None,
    names=['categories', 'description'],
)
df.head()

Unnamed: 0,categories,description
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [11]:
# Remove every row that contains a missing value. inplace=True mutates df
# directly rather than returning a new frame.
df.dropna(inplace=True)

In [12]:
# Count how many rows fall under each category.
df['categories'].value_counts()

Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8670
Name: categories, dtype: int64

In [13]:
# Rename the category so the resulting label contains no spaces or '&'.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the `df.categories` accessor: that form is a
# chained in-place update, which newer pandas versions warn about and which
# does not modify `df` when Copy-on-Write is active.
df['categories'] = df['categories'].replace('Clothing & Accessories', 'Clothing_Accessories')

In [14]:
# Re-count rows per category to inspect the category names after the rename.
df['categories'].value_counts()

Household               19313
Books                   11820
Electronics             10621
Clothing_Accessories     8670
Name: categories, dtype: int64

In [15]:
# Prefix each category with '__label__' so that the text written out later
# carries the label marker at the start of every example.
# NOTE(review): this cell reads and overwrites the same column, so running it
# a second time would add the prefix twice.
df['categories'] = '__label__' + df['categories'].astype(str)
df.head()

Unnamed: 0,categories,description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...


In [16]:
# Build one string per row: the prefixed category, a single space, then the
# description.
df['category_description'] = df['categories'].str.cat(df['description'], sep=' ')
df.head()

Unnamed: 0,categories,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__Household Paper Plane Design Framed W...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__Household SAF 'Floral' Framed Paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__Household SAF 'UV Textured Modern Art...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1...",__label__Household SAF Flower Print Framed Pai...
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...,__label__Household Incredible Gifts India Wood...


In [17]:
import re

# Try the cleaning steps on one sample string before wrapping them in a function.
text = "  VIKI's | Bookcase/Bookshelf (3-Shelf/Shelve, White) | ? . hi"
# Inner call: replace everything that is not a word character, whitespace or
# an apostrophe with a space. Outer call: squeeze runs of spaces into one.
text = re.sub(' +', ' ', re.sub(r'[^\w\s\']', ' ', text))
text.strip().lower()

"viki's bookcase bookshelf 3 shelf shelve white hi"

In [18]:
def preprocess(text):
    """Normalise a piece of text into a single lower-case line.

    Steps:
      1. Replace every character that is not a word character, whitespace or
         an apostrophe with a space.
      2. Collapse every run of whitespace (spaces, tabs, newlines) into one
         space, so the result never spans more than one line.
      3. Strip leading/trailing whitespace and lower-case the result.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lower-cased, single-line string.
    """
    text = re.sub(r'[^\w\s\']', ' ', text)
    # Use \s+ rather than ' +': collapsing only spaces leaves embedded
    # newlines/tabs in place, which would break a one-example-per-line file.
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()

In [19]:
# Run preprocess() over every combined label + description string and store
# the cleaned values back in the same column.
df['category_description'] = [preprocess(raw) for raw in df['category_description']]
df.head()

Unnamed: 0,categories,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__household paper plane design framed w...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__household saf 'floral' framed paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__household saf 'uv textured modern art...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1...",__label__household saf flower print framed pai...
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...,__label__household incredible gifts india wood...


In [20]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. random_state pins the shuffle so
# that re-running the notebook reproduces the same split.
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [21]:
# (rows, columns) of the training split followed by the test split.
tuple(part.shape for part in (train, test))

((40339, 3), (10085, 3))

In [22]:
# Write only the combined text column of each split to its own file, with no
# index column and no header row.
train[['category_description']].to_csv('ecommerce.train', index=False, header=False)
test[['category_description']].to_csv('ecommerce.test', index=False, header=False)

In [23]:
import fasttext

# Train a supervised fastText classifier on the training file; no
# hyperparameters are passed, so the library defaults apply.
model = fasttext.train_supervised(input='ecommerce.train')
# Evaluate on the held-out file.
# NOTE(review): the returned tuple is presumably
# (number of examples, precision@1, recall@1) — confirm against the fastText docs.
model.test('ecommerce.test')

Read 4M words
Number of words:  79063
Number of labels: 4
Progress: 100.0% words/sec/thread: 1954765 lr:  0.000000 avg.loss:  0.186855 ETA:   0h 0m 0s


(10085, 0.9701536936043629, 0.9701536936043629)

In [24]:
# Spot-check a computer-hardware description. The string is typed in the same
# lower-case, punctuation-free form that preprocess() produces.
model.predict("wintech assemble desktop pc cpu 500 gb sata hdd 4 gb ram intel c2d processor 3")

(('__label__electronics',), array([0.99802291]))

In [25]:
# Spot-check a garment description (already lower-case with punctuation removed).
model.predict("ockey men's cotton t shirt fabric details 80 cotton 20 polyester super combed cotton rich fabric")

(('__label__clothing_accessories',), array([1.00001001]))

In [26]:
# Spot-check a short, title-like input.
model.predict("think and grow rich deluxe edition")

(('__label__books',), array([1.00000978]))

In [27]:
# Ask the model for the words it considers closest to "painting".
# NOTE(review): this model was created with train_supervised, so its word
# vectors were presumably fitted for classification rather than for word
# similarity — treat the returned neighbours with caution.
model.get_nearest_neighbors("painting")

[(0.9987702369689941, 'street27'),
 (0.9987303614616394, 'toua'),
 (0.9986990094184875, 'escalators'),
 (0.9986922144889832, 'braceleti'),
 (0.9986922144889832, 'carat'),
 (0.9986922144889832, '0394'),
 (0.9986900091171265, '0052_steelplainco'),
 (0.9986491799354553, 'kraftmaniatransform'),
 (0.9986491799354553, 'cubesthese'),
 (0.9986491799354553, 'melter')]