In [37]:
import pandas as pd
import numpy as np
import re
import spacy
# Load the `en_core_web_sm` spaCy pipeline once at module level;
# basic_cleanse() passes every text through it.
nlp = spacy.load("en_core_web_sm")

In [2]:
# Download the e-commerce product descriptions and preview the first rows
ECOMMERCE_CSV_URL = "https://raw.githubusercontent.com/codebasics/nlp-tutorials/main/12_tf_idf/Ecommerce_data.csv"
news_df = pd.read_csv(ECOMMERCE_CSV_URL)
news_df.head()

Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [3]:
# Count the rows per category to check for class imbalance
news_df["label"].value_counts()

label
Household                 6000
Electronics               6000
Clothing & Accessories    6000
Books                     6000
Name: count, dtype: int64

In [35]:
# Creating a function for preprocessing
def preprocess(text):
    """Return `text` as space-separated alphanumeric tokens longer than two characters.

    The text is first passed through `basic_cleanse`. Every run of characters
    other than ASCII letters and digits is then replaced by a single space,
    and tokens of two characters or fewer are dropped.
    """
    lemmatized = basic_cleanse(text)
    alnum_only = re.sub('[^A-Za-z0-9]+', ' ', lemmatized)
    # Splitting on a single space can yield empty strings; the length filter removes them.
    kept_tokens = (token for token in alnum_only.split(' ') if len(token) > 2)
    return " ".join(kept_tokens)
    
def basic_cleanse(text):
    """Return the space-joined lemmas of `text`, skipping stop words and punctuation.

    `text` is parsed with the module-level spaCy pipeline `nlp`.
    """
    lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
    ]
    return ' '.join(lemmas)

In [7]:
# (rows, columns) of the loaded dataset
news_df.shape

(24000, 2)

In [9]:
# Assign each category name an integer class id, in the order listed
category_names = "Household,Electronics,Clothing & Accessories,Books".split(",")
target_map = dict(zip(category_names, range(len(category_names))))
target_map

{'Household': 0, 'Electronics': 1, 'Clothing & Accessories': 2, 'Books': 3}

In [11]:
# Encode the category names as integer class ids.
# The dtype guard keeps this cell idempotent: once the column is numeric,
# mapping it again would turn every value into NaN, because the integer ids
# are not keys of target_map.
if not pd.api.types.is_numeric_dtype(news_df["label"]):
    news_df["label"] = news_df["label"].map(target_map)

In [29]:
# Keep the demo fast: draw a fixed-size random sample from each category.
# The seed makes the sampled subset, and every metric computed from it,
# reproducible across kernel restarts.
SAMPLES_PER_CLASS = 200
SAMPLE_SEED = 42
df0, df1, df2, df3 = (
    news_df[news_df['label'] == class_id].sample(SAMPLES_PER_CLASS, random_state=SAMPLE_SEED)
    for class_id in range(4)  # class ids 0-3, as assigned by target_map
)
final_df = pd.concat([df0, df1, df2, df3], axis=0)
final_df['label'].value_counts()

label
0    200
1    200
2    200
3    200
Name: count, dtype: int64

In [30]:
# Stratified 80/20 train/test split of the sampled data
from sklearn.model_selection import train_test_split

features = final_df.iloc[:, 0]  # first column: product text
targets = final_df.iloc[:, 1]   # second column: integer class id
Xtrain, Xtest, ytrain, ytest = train_test_split(
    features,
    targets,
    test_size=0.2,
    stratify=targets,
    random_state=42,
)


In [31]:
# Per-class counts in the train and test splits (the split above was stratified on the label)
ytrain.value_counts(), ytest.value_counts()

(label
 0    160
 1    160
 2    160
 3    160
 Name: count, dtype: int64,
 label
 2    40
 1    40
 3    40
 0    40
 Name: count, dtype: int64)

In [38]:
# Clean both splits with preprocess(); the raw Series are left untouched
Xtrain_cleaned, Xtest_cleaned = (
    split.apply(preprocess) for split in (Xtrain, Xtest)
)

In [39]:
# First training text before cleaning
Xtrain.iloc[0]

"Indian Art Villa Silver, Gold and Copper Plated Lotus Design Akhand Diya Deepak, Arti Poojan Temple (Width 8 Inch) Item Package Quantity:1                                                                                \xa0|\xa0                           Color Name:Multicolour   IndianArtVilla Presents Silver Plated Lotus Design Diya. The item is highly durable, elegant and a wonderful addition to your Home Decor. It can be readily turned into a wonderful Anniversary, Wedding, Diwali, Valentine gift item. Care Tips: Dusting - First, use a soft brush or clean cotton cloth to dust.. Washing and Drying - Once all dust are removed, wash the item by hand with warm water don't soak the silver in water for any length of time. Rinse the piece well with clean water, distilled is best, and dry immediately with a soft, non-lint cloth. A hair-dryer set on warm helps to dry hard-to-reach places. After your pieces are clean and completely dry, wrap each of them individually with no acid buffered tis

In [40]:
# The same training text after preprocess()
Xtrain_cleaned.iloc[0]

'indian Art Villa Silver Gold Copper Plated Lotus Design Akhand Diya Deepak Arti Poojan Temple Width Inch Item Package Quantity Color multicolour indianartvilla Presents Silver Plated Lotus Design Diya item highly durable elegant wonderful addition Home Decor readily turn wonderful anniversary Wedding Diwali Valentine gift item Care Tips Dusting use soft brush clean cotton cloth dust washing drying dust remove wash item hand warm water soak silver water length time rinse piece clean water distil good dry immediately soft non lint cloth hair dryer set warm help dry hard reach place piece clean completely dry wrap individually acid buffer tissue butter paper wash cotton linen polyester store use woolor newspaper cause excessive tarnishing difficult clean bad remove silver plating safely display Silver plate item home like display silver plate item store glass enclose cabinet make good choice avoid unvarnished wood shelf omit harmful vapor cause tarnishing Glass shelf prefer wooden shelf 

In [42]:
# Using Tf-IDF vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Using Naive-bayes model
from sklearn.naive_bayes import MultinomialNB

In [43]:
from sklearn.pipeline import Pipeline

# TF-IDF features feeding a multinomial Naive Bayes classifier
pipeline_steps = [
    ("vectorization", TfidfVectorizer()),
    ("model", MultinomialNB()),
]
clf = Pipeline(steps=pipeline_steps)

In [44]:
# Train the pipeline on the cleaned training texts and their class ids
clf.fit(Xtrain_cleaned, ytrain)

In [45]:
# Predict class ids for the cleaned test texts
ypred = clf.predict(Xtest_cleaned)

In [47]:
# Importing metrics for evaluation of model output
from sklearn.metrics import classification_report, confusion_matrix

In [48]:
# Print the classification report comparing test labels with the predictions
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.92      0.90      0.91        40
           1       0.93      1.00      0.96        40
           2       0.95      1.00      0.98        40
           3       0.94      0.85      0.89        40

    accuracy                           0.94       160
   macro avg       0.94      0.94      0.94       160
weighted avg       0.94      0.94      0.94       160



In [54]:
# Inverse of target_map: integer class id -> category name
label_map = dict(zip(target_map.values(), target_map.keys()))
label_map

{0: 'Household', 1: 'Electronics', 2: 'Clothing & Accessories', 3: 'Books'}

In [67]:
# Spot-check one test example: show its raw text, then the true vs. predicted category
text_loc=34
sample_text = Xtest.iloc[text_loc]
print(sample_text)
print("")
actual_label = label_map.get(ytest.iloc[text_loc])
# The pipeline was trained on cleaned text, so the raw text is cleaned the same way first
predicted_id = clf.predict([preprocess(sample_text)])[0]
predicted_label = label_map.get(predicted_id)
print(f"Actual: {actual_label}, Predicted: {predicted_label}")

Robinbosky Premium Girls leggings (52 Colours and 13 sizes) |LIGHT WEIGHT| OPAQUE/NON SEE THROUGH| PREMIUM 200-210 GSM stretchable leggings made from premium quality combed cotton ,versatile for any occasion featuring| Long Lasting elastic waistband | Tapered | slim fit|and |full length| made from 95% COTTON AND 5% SPANDEX offering more 4 way stretch than any other leggings in the market. | BIOWASHED | making the fabric soft on the skin and comfortable to wear . | BIGGEST COLLECTION|We have the most popular and biggest selection of colorful leggings ; choose from 50 colours and 13 sizes (16,18,20,22,24,26,28,30,32,34,36,38,40 ) |EXPRESS SHIPPING| shipping within 1-2 days on all orders

Actual: Clothing & Accessories, Predicted: Clothing & Accessories
