In [2]:
#importing libraries
#dataset you can download from https://www.kaggle.com/datasets/saurabhshahane/ecommerce-text-classification
import fasttext
import pandas as pd

In [3]:
#Lets load our dataset.
df = pd.read_csv('ecommerceDataset.csv', names =['category','description'], header=None)
print(df.shape)
df.head(3)

(50425, 2)


Unnamed: 0,category,description
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...


In [4]:
#Drop null values
df.dropna(inplace=True)
df.shape

(50424, 2)

In [5]:
#Lets check our categories
df.category.unique()

array(['Household', 'Books', 'Clothing & Accessories', 'Electronics'],
      dtype=object)

In [6]:
#Clothing & Accessories could be problem for syntax so lets replace category's name
df.category.replace('Clothing & Accessories', 'Clothing_Accessories', inplace =True)

In [7]:
df.category.unique()

array(['Household', 'Books', 'Clothing_Accessories', 'Electronics'],
      dtype=object)

When you train a fasttext model, it expects labels to be specified with label prefix.

In [8]:
#When you train a fasttext model, it expects labels to be specified with label prefix.
#So we Created 
df['category'] = '__label__' + df['category'].astype(str)
df.head(5)

Unnamed: 0,category,description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...


In [9]:
import re
# regEX funciton
def preprocess(text):
    text = re.sub(r'[^\w\s\']',' ', text)
    text = re.sub(' +', ' ', text)
    return text.strip().lower()

In [10]:
text = "  VIKI's |//// Bookcase/Bookshelf (3-Shelf/Shelve, White) | ? . hi(/&+%)"

In [11]:
preprocess(text)

"viki's bookcase bookshelf 3 shelf shelve white hi"

In [12]:
df['category_description'] = df['category']+ ' ' + df['description']
df.head()

Unnamed: 0,category,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__Household Paper Plane Design Framed W...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__Household SAF 'Floral' Framed Paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__Household SAF 'UV Textured Modern Art...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1...",__label__Household SAF Flower Print Framed Pai...
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...,__label__Household Incredible Gifts India Wood...


In [13]:
df['category_description'] = df['category_description'].map(preprocess)

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
train, test = train_test_split(df, test_size=0.2)

In [16]:
train.shape , test.shape

((40339, 3), (10085, 3))

In [18]:
#We will save train and test datasets as csv and train model on csvfile also tests on csv file.
train.to_csv('ecom.train', columns=['category_description'], index=False, header=False)
test.to_csv('ecom.test', columns = ['category_description'], index=False, header=False)

In [19]:
#train on csv
model = fasttext.train_supervised(input='ecom.train')

In [23]:
print('Test size, precision score , recall'+str(model.test('ecom.test')))

Test size, precision score , recall(10084, 0.9688615628718762, 0.9688615628718762)


In [24]:
model.predict("generic retro style camera neck strap for nikon models multicolour fasvintage hippie letter camera strap for dslr camera this camera strap weaved by various colors yarn very soft and comfortable you can see the pictures it is very good design and complicated work for this camera strap vintage color and hippie style make you looks different and fashionable why the camera strap always black let's choose our colorful strap match our personality length 102cm material fabric")

(('__label__electronics',), array([0.98664415]))

In [25]:
model.predict("cateye women's men's boy's girl's spectacle frame clear cateye selfy cmbo 002 clear")

(('__label__clothing_accessories',), array([1.00000978]))

In [26]:
model.predict("computer , keyboard, mouse, cable")

(('__label__books',), array([0.92451859]))

In [27]:
model.get_nearest_neighbors('computer')

[(0.9948907494544983, 'electronics'),
 (0.9946401715278625, 'games'),
 (0.9934792518615723, 'my'),
 (0.9928529262542725, 'writing'),
 (0.9926823377609253, 'volume'),
 (0.9880863428115845, 'examination'),
 (0.9880666136741638, 'specific'),
 (0.9869000315666199, 'sticker'),
 (0.9852550625801086, 'dome'),
 (0.9851141571998596, 'tin')]

In [30]:
model.get_nearest_neighbors('games')

[(0.9951537847518921, 'electronics'),
 (0.9946399331092834, 'computer'),
 (0.9904879927635193, 'my'),
 (0.9901248216629028, 'cpu'),
 (0.9899181723594666, 'dome'),
 (0.9896119236946106, 'writing'),
 (0.988883376121521, 'specific'),
 (0.9887799024581909, 'content'),
 (0.988288402557373, 'software'),
 (0.9877792000770569, 'volume')]

In [35]:
model.get_nearest_neighbors('mac')

[(0.9974715709686279, 'tripod'),
 (0.996778130531311, 'colour'),
 (0.9966684579849243, '4gb'),
 (0.9966057538986206, 'dvd'),
 (0.9964436292648315, 'smartphones'),
 (0.9962490200996399, 'ceiling'),
 (0.9960573315620422, 'ram'),
 (0.995823323726654, 'binocular'),
 (0.9957888722419739, 'calculator'),
 (0.9955744743347168, 'glossy')]