In [1]:
# import all the required libraries
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [2]:
# Lift pandas' display limits: None means "no cap", so every column and every
# row of a frame is rendered when it is displayed.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [3]:
# Labelled training export (raw string so the Windows backslashes stay literal).
# NOTE(review): absolute local path - the notebook only runs on a machine with this layout.
training_csv = r'E:\Shyplite\Categorical\output 28-Apr-2021.csv'
data = pd.read_csv(training_csv)

In [4]:
# Discard every row that has a missing value in any column
# (row-wise, returning a new frame - both are dropna's defaults).
data = data.dropna()

In [5]:
# Display the frame after the rows with missing values were dropped.
# The display limits were lifted earlier, so the whole frame is rendered.
data

Unnamed: 0,item_description,category,occurance
0,Savlon Surface Disinfectant Spray Sanitizer Ge...,Health & Beauty,565.0
1,Dettol Disinfectant Sanitizer Spray for Germ P...,Health & Beauty,271.0
2,Medicine,medicine,256.0
3,Dettol Disinfectant Sanitizer Spray for Germ P...,Health & Beauty,256.0
4,KYC,generic/vague items,175.0
5,GIFT ARTICAL,Art and craft,165.0
6,K Ayurvedic Product,"Health, Household & Personal Care",152.0
8,Himalayan Organics Calcium Magnesium Zinc Vita...,medicine,125.0
9,Himalayan Organics Multivitamin for Men Women...,medicine,119.0
10,20000 mAh Large-Capacity Fashion Mini Power Bank,Power Bank,117.0


In [6]:
# Features: the free-text item description. Target: its category label.
X, y = data.loc[:, 'item_description'], data.loc[:, 'category']

In [7]:
# Preview the first few item descriptions
X.head()

0    Savlon Surface Disinfectant Spray Sanitizer Ge...
1    Dettol Disinfectant Sanitizer Spray for Germ P...
2                                             Medicine
3    Dettol Disinfectant Sanitizer Spray for Germ P...
4                                                  KYC
Name: item_description, dtype: object

In [8]:
# Hold out 30% of the samples for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=77,
)

# Report how many samples ended up in each partition
print(f"Training Data Shape: {X_train.shape}")
print(f"Testing Data Shape: {X_test.shape}")

Training Data Shape: (4274,)
Testing Data Shape: (1833,)


In [9]:
# Let's first try with Count Vectorizer from scikit learn
cv = CountVectorizer()

X_train_cv = cv.fit_transform(X_train)
X_train_cv.shape

(4274, 7813)

In [10]:
# Show the repr of the encoded training matrix (storage format, dtype, stored elements)
X_train_cv

<4274x7813 sparse matrix of type '<class 'numpy.int64'>'
	with 35336 stored elements in Compressed Sparse Row format>

In [11]:
# Linear support-vector classifier trained on the count-vectorized training split.
# LinearSVC is imported in the notebook's import cell, so the later pipeline cells
# no longer depend on this cell having been run first.
clf = LinearSVC()
clf.fit(X_train_cv, y_train)

LinearSVC()

In [12]:
# Spot-check: take the first two descriptions of the held-out split.
# .iloc makes it explicit that the slice is positional; the split keeps the
# original, shuffled index labels.
X_test1 = X_test.iloc[:2]
print(X_test1)

5500    CUSTOM Naturale 8 Inch - 709 x 344 x 8
4593           Pirate Ship - Printed Shirt - L
Name: item_description, dtype: object


In [13]:
# Encode the two sample descriptions with the vocabulary fitted on the training
# split (transform only, no re-fit), then classify them with the LinearSVC model.
X_test1_cv = cv.transform(X_test1)
clf.predict(X_test1_cv)

array(['Gifts', 'Shirt'], dtype=object)

In [14]:
# Encode the whole held-out split with the vocabulary fitted on the training split
X_test_cv = cv.transform(X_test)

In [15]:
# Predict a category for every held-out description
predictions = clf.predict(X_test_cv)

In [16]:
# Evaluate the count-vectorizer + LinearSVC predictions against the held-out labels.
# Report the confusion matrix
print(metrics.confusion_matrix(y_test, predictions))
# Print a classification report. zero_division=0 scores a class with no predicted
# samples as 0 explicitly, instead of emitting an UndefinedMetricWarning for it.
print(metrics.classification_report(y_test, predictions, zero_division=0))
# Print the overall accuracy
print(metrics.accuracy_score(y_test, predictions))

[[ 5  0  0 ...  0  0  0]
 [ 0 27  0 ...  0  0  0]
 [ 1  0 62 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0 10  0]
 [ 0  0  0 ...  0  0  3]]
                                   precision    recall  f1-score   support

                          Adapter       0.83      0.83      0.83         6
                          Apparel       0.87      0.87      0.87        31
             Art & Crafting Tools       0.89      0.82      0.85        76
                    Art and craft       0.73      0.63      0.68        38
              Audio & Accessories       1.00      1.00      1.00         2
                     Baby carrier       1.00      1.00      1.00         2
                    Baby products       0.84      0.91      0.87        23
                              Bag       1.00      1.00      1.00         1
                      Bath & Body       0.78      0.70      0.74        10
                  Beauty Products       0.81      0.79      0.80        91
                  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
# Chain the vectorizer and the classifier into one estimator, so raw text can be
# passed straight to fit/predict.
clf_cvec_lsvc = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('clf', LinearSVC()),
])

# Fit both steps on the raw training descriptions
clf_cvec_lsvc.fit(X_train, y_train)

Pipeline(steps=[('cvec', CountVectorizer()), ('clf', LinearSVC())])

In [18]:
# NOTE(review): this looks like a pasted repr. It builds a new, unfitted pipeline
# only to display it and assigns it to nothing; the fitted one is clf_cvec_lsvc.
Pipeline(steps=[('cvec', CountVectorizer()), ('clf', LinearSVC())])

Pipeline(steps=[('cvec', CountVectorizer()), ('clf', LinearSVC())])

In [19]:
# Form a prediction set
# No need to vectorize the test data by hand: the pipeline applies its fitted
# vectorizer before classifying.
predictions = clf_cvec_lsvc.predict(X_test)
# Report the confusion matrix
print(metrics.confusion_matrix(y_test, predictions))
# Print a classification report. zero_division=0 scores a class with no predicted
# samples as 0 explicitly, instead of emitting an UndefinedMetricWarning for it.
print(metrics.classification_report(y_test, predictions, zero_division=0))
# Print the overall accuracy
print(metrics.accuracy_score(y_test, predictions))

[[ 5  0  0 ...  0  0  0]
 [ 0 27  0 ...  0  0  0]
 [ 1  0 62 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0 10  0]
 [ 0  0  0 ...  0  0  3]]
                                   precision    recall  f1-score   support

                          Adapter       0.83      0.83      0.83         6
                          Apparel       0.87      0.87      0.87        31
             Art & Crafting Tools       0.89      0.82      0.85        76
                    Art and craft       0.73      0.63      0.68        38
              Audio & Accessories       1.00      1.00      1.00         2
                     Baby carrier       1.00      1.00      1.00         2
                    Baby products       0.84      0.91      0.87        23
                              Bag       1.00      1.00      1.00         1
                      Bath & Body       0.78      0.70      0.74        10
                  Beauty Products       0.81      0.79      0.80        91
                  

In [20]:
# Same classifier as before, but with TF-IDF weighted features instead of raw counts
clf_tfidf_lsvc = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

# Fit both steps on the raw training descriptions
clf_tfidf_lsvc.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [21]:
# Score the TF-IDF + LinearSVC pipeline on the held-out split
predictions = clf_tfidf_lsvc.predict(X_test)
# Overall accuracy
print(metrics.accuracy_score(y_true=y_test, y_pred=predictions))

0.8385160938352427


In [22]:
# TF-IDF features + multinomial naive Bayes
clf_tfidf_mnb = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# Fit both steps on the raw training descriptions
clf_tfidf_mnb.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [23]:
# Score the TF-IDF + MultinomialNB pipeline on the held-out split
predictions = clf_tfidf_mnb.predict(X_test)
# Overall accuracy
print(metrics.accuracy_score(y_true=y_test, y_pred=predictions))

0.408619749045281


In [24]:
# TF-IDF features + logistic regression (default settings)
clf_tfidf_lr = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression()),
])

# Fit both steps on the raw training descriptions
clf_tfidf_lr.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LogisticRegression())])

In [25]:
# Score the TF-IDF + LogisticRegression pipeline on the held-out split
predictions = clf_tfidf_lr.predict(X_test)
# Overall accuracy
print(metrics.accuracy_score(y_true=y_test, y_pred=predictions))

0.6813966175668303


In [26]:
# TF-IDF features + k-nearest-neighbours (default settings)
clf_tfidf_knc = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', KNeighborsClassifier()),
])

# Fit both steps on the raw training descriptions
clf_tfidf_knc.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', KNeighborsClassifier())])

In [27]:
# Score the TF-IDF + KNeighborsClassifier pipeline on the held-out split.
# NOTE(review): the accuracy recorded for this cell equals the logistic-regression
# one digit for digit - re-run to confirm the saved output is not stale.
predictions = clf_tfidf_knc.predict(X_test)
# Overall accuracy
print(metrics.accuracy_score(y_true=y_test, y_pred=predictions))

0.6813966175668303


In [28]:
# random forest
# A random forest is stochastic. Seed it with the same value the train/test split
# uses, so the fitted model and its reported accuracy are reproducible across runs.
clf_tfidf_rfc = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', RandomForestClassifier(random_state=77))])

# Feed the training data through the pipeline
clf_tfidf_rfc.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf', RandomForestClassifier())])

In [29]:
# Score the TF-IDF + RandomForestClassifier pipeline on the held-out split
predictions = clf_tfidf_rfc.predict(X_test)
# Overall accuracy
print(metrics.accuracy_score(y_true=y_test, y_pred=predictions))

0.8046917621385706


In [30]:
# Create list of StopWords
import nltk
from nltk.corpus import stopwords as nltk_stopwords

# The stop-word corpus is a separate download. Fetch it when it is not installed,
# so the cell also runs on a fresh environment instead of raising LookupError.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# The corpus reader keeps its own alias, so the `stopwords` list used by the
# following cells does not shadow the imported object.
stopwords = nltk_stopwords.words('english')
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [31]:
# TF-IDF + LinearSVC again, this time dropping the NLTK English stop words
clf_tfidf_lsvc2 = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words=stopwords)),
    ('clf', LinearSVC()),
])

# Fit both steps on the raw training descriptions
clf_tfidf_lsvc2.fit(X_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('clf', LinearSVC())])

In [46]:
# Score the stop-word-filtered TF-IDF + LinearSVC pipeline on the held-out split.
# The bare `predictions` expression that used to sit mid-cell was a no-op, because
# only a cell's last expression is displayed; it has been dropped.
predictions = clf_tfidf_lsvc2.predict(X_test)
# Print the overall accuracy
print(metrics.accuracy_score(y_test,predictions))

0.8385160938352427


In [47]:
# Display the predicted labels held by `predictions`, i.e. those of whichever
# evaluation cell assigned it last.
predictions

array(['Gifts', 'Shirt', 'Apparel', ..., 'LED Light Bulbs',
       'Grocery items', 'Medicine'], dtype=object)

In [33]:
# Spot-check twelve more held-out descriptions (positions 10-21) with the
# count-vectorizer + LinearSVC model. .iloc makes the positional slice explicit.
X_test1 = X_test.iloc[10:22]
print(X_test1)
X_test1_cv = cv.transform(raw_documents=X_test1)
clf.predict(X_test1_cv)

3277    Faucet Sprayer head Attachment Flexible Tap Ex...
5313                      Chair Mechanismbolt Part - C104
6423                Oxidised Silver Flower Motif Toe Ring
378                            Empire Basmati Rice - 5 Kg
7029       Custom couple name wall hanging with led light
2017    Anchor by Panasonic 6A Modular Angle lamp Hold...
1417      Kaempferia Rotunda Bhumi Champa Pack of 5 bulbs
4725    Astaberry Shakambhri Skin Whitening Hair Remov...
129                  Vishal Paper Stump White Set of 6 Pc
3317              Turquoise Bracelet For Self Development
1590                        Customized Oppo F7 Back Cover
1832    Rahashya Sandhani Damayanti Samagra vol-2 Hard...
Name: item_description, dtype: object


array(['Jewellery', 'Art and craft', 'Jewellery', 'Grocery items',
       'Wall decor and hangings', 'Lighting Accessories ', 'bulbs',
       'Skin Care Cream', 'Stationery', 'Jewellery', 'Phone Cover',
       'Books'], dtype=object)

In [34]:
# Load sheet 'Sheet3' of the workbook holding further item descriptions.
# NOTE(review): absolute local path - the notebook only runs on a machine with this layout.
descriptions_xlsx = 'E:/shyplite/Categorical/abc.xlsx'
tx = pd.read_excel(descriptions_xlsx, sheet_name='Sheet3')

In [35]:
# Display the loaded sheet
tx

Unnamed: 0,Item_description,Unnamed: 1,Unnamed: 2,Unnamed: 3
0,Superbottoms Freesize UNO - Bummy,,,
1,Superbottoms Freesize UNO - Cherry Blossom ...,,,
2,Superbottoms Freesize UNO - Mommy Talk ...,,,
3,Superbottoms Newborn UNO - Baby Hearts ...,,,
4,Superbottoms Newborn UNO Booster,,,
5,Superbottoms Freesize UNO - Roar,,,
6,Dryfeel Langot - Day Dreamer - Size 1 Pack of ...,,,
7,Supercomfy - Rainbow -Size3,,,
8,Supercomfys - Sunny Bliss - Size4,,,
9,Supercomfys - Beach Bum- Size1,,,


In [36]:
# Keep only the description column of the sheet
tt = tx['Item_description']

In [37]:
# Display the description column taken from the sheet
tt

0       Superbottoms Freesize UNO - Bummy                
1       Superbottoms Freesize UNO - Cherry Blossom    ...
2       Superbottoms Freesize UNO - Mommy Talk        ...
3       Superbottoms Newborn UNO - Baby Hearts        ...
4        Superbottoms Newborn UNO Booster                
5        Superbottoms Freesize UNO - Roar                
6       Dryfeel Langot - Day Dreamer - Size 1 Pack of ...
7             Supercomfy - Rainbow -Size3                
8       Supercomfys - Sunny Bliss - Size4                
9          Supercomfys - Beach Bum- Size1                
10        Supercomfys - Babysaurus- Size1                
11            Supercomfy - Rainbow -Size2                
12            Supercomfy - Rainbow -Size4                
13                                    Black Paper Napkins
14                       0056 Bird Fork Color Box Packing
15      0065 Manual Stainless Steel Compact Extra Shar...
16           R3 realme bluetooth neck band  not original 
17            

In [38]:
# Try the model on the first two descriptions from the Excel sheet
# (positional slice via .iloc).
tt1 = tt.iloc[:2]
print(tt1)

0    Superbottoms Freesize UNO - Bummy                
1    Superbottoms Freesize UNO - Cherry Blossom    ...
Name: Item_description, dtype: object


In [39]:
# Encode the two sheet descriptions with the vocabulary fitted on the training
# split and classify them with the count-vectorizer LinearSVC model.
tt2 = cv.transform(tt1)
clf.predict(tt2)

array(['Diapers', 'Diapers'], dtype=object)

In [49]:
# NOTE(review): this cell repeats the earlier transform/predict on tt1 unchanged.
# Encode the two sheet descriptions and classify them with the LinearSVC model.
tt2 = cv.transform(tt1)
clf.predict(tt2)

array(['Diapers', 'Diapers'], dtype=object)

In [None]:
# Classify the first 100 sheet descriptions (positional slice via .iloc)
tt3 = tt.iloc[:100]
print(tt3)
ttt1 = cv.transform(raw_documents=tt3)
clf.predict(ttt1)

In [51]:
# Classify the first 4885 sheet descriptions (positional slice via .iloc).
# The recorded output reports a series length of 4885, so this is the whole column.
tt4 = tt.iloc[:4885]
print(tt4)
ttt2 = cv.transform(raw_documents=tt4)
clf.predict(ttt2)

0       Superbottoms Freesize UNO - Bummy                
1       Superbottoms Freesize UNO - Cherry Blossom    ...
2       Superbottoms Freesize UNO - Mommy Talk        ...
3       Superbottoms Newborn UNO - Baby Hearts        ...
4        Superbottoms Newborn UNO Booster                
                              ...                        
4880             Blue  White Cut  Sew Active Wear T-Shirt
4881          Boys Red Captain America printed top - 1-2y
4882               Peter pan collar top - Medium - medium
4883                         Brainsmith Swoora Tambourine
4884                 Cadiveu Acai Oil Treatment Oil 110ml
Name: Item_description, Length: 4885, dtype: object


array(['Diapers', 'Diapers', 'Diapers', ..., 'Topwear',
       'generic/vague items', 'Beauty Products'], dtype=object)

In [52]:
# Classify the complete description column of the sheet.
# tt5 is only another name for tt, and ttt2 from the previous cell is overwritten.
tt5 = tt
print(tt5)
ttt2 = cv.transform(tt5)
clf.predict(ttt2)

0       Superbottoms Freesize UNO - Bummy                
1       Superbottoms Freesize UNO - Cherry Blossom    ...
2       Superbottoms Freesize UNO - Mommy Talk        ...
3       Superbottoms Newborn UNO - Baby Hearts        ...
4        Superbottoms Newborn UNO Booster                
                              ...                        
4880             Blue  White Cut  Sew Active Wear T-Shirt
4881          Boys Red Captain America printed top - 1-2y
4882               Peter pan collar top - Medium - medium
4883                         Brainsmith Swoora Tambourine
4884                 Cadiveu Acai Oil Treatment Oil 110ml
Name: Item_description, Length: 4885, dtype: object


array(['Diapers', 'Diapers', 'Diapers', ..., 'Topwear',
       'generic/vague items', 'Beauty Products'], dtype=object)

In [56]:
# Load a second batch of item descriptions to classify. pandas is already
# imported in the notebook's import cell, so it is not re-imported here.
# NOTE(review): absolute local path - the notebook only runs on a machine with this layout.
kk=pd.read_csv('E:/Shyplite/Categorical/kk.csv')
# Bracket access is the explicit way to select a column; attribute access can
# collide with DataFrame attributes and methods.
kkk=kk['item_description']
kkk

0                      Elly Dumbo Solar Powered Bobblehead
1                  Ganga Motu Thick Chunky Yarn - White 21
2        Tummy Roll-On With Hing  Saunf Oil For Instant...
3                                         Babys Travel Kit
4        Plant Powered Natural Liquid Cleanser With Gre...
                               ...                        
21463    Vermicompost Complete Plant Food Effective Org...
21464                                  Whole Wheat Noodles
21465              Wow Confetti Confeito Rainbow Balls 75g
21466                            X-Lava Filter Media 800ml
21467                                        YOYO Leg Rest
Name: item_description, Length: 21468, dtype: object

In [59]:
# tt6 is only another name for the description column of the second batch.
# NOTE(review): the recorded output also shows a prediction array, which this
# code does not produce - the saved output looks stale.
tt6 = kkk
print(tt6)

0                      Elly Dumbo Solar Powered Bobblehead
1                  Ganga Motu Thick Chunky Yarn - White 21
2        Tummy Roll-On With Hing  Saunf Oil For Instant...
3                                         Babys Travel Kit
4        Plant Powered Natural Liquid Cleanser With Gre...
5                                        Ancient Mint Salt
6          Enjoy Every Moment Sea Redmi K20 Pro Back Cover
7        Dashmool Hair Lep Infused With Dashamoola Curr...
8                  Gaia Organics Real Fruit Crunchy Muesli
9                     Limnophila Aromatica Green-1 Net Pot
10       French Marigold Super spry pack of 30 seeds Im...
11              Rainbow Genuine Liquid Vivo X50 Back Cover
12              Birds Feather Apple iPhone 11 Mobile Cover
13                                   Idli Podi   Home Made
14                              Mandala Printed Pop Holder
15                                      Marble Money Plant
16                           Flowers Oneplus 7T Back Cov

array(['Beauty Products', 'Apparel', 'Beauty Products', ...,
       'Art and craft', 'Mobile Phone', 'Bag'], dtype=object)

In [61]:
# Print NumPy arrays in full instead of summarising them with '...'.
# numpy is already imported in the notebook's import cell, so it is not
# re-imported here.
np.set_printoptions(threshold=np.inf)

In [62]:
# Encode every description of the second batch with the training vocabulary and
# display the predicted categories.
ttt3 = cv.transform(tt6)
clf.predict(ttt3)

array(['Beauty Products', 'Apparel', 'Beauty Products', 'Baby products',
       'Bath & Body', 'generic/vague items', 'Phone Cover',
       'Health & Household', 'Organic products', 'Flower pots',
       'Home & Garden', 'Phone Cover', 'Phone Cover', 'Home Decor',
       'Baby products', 'Home & Garden', 'Phone Cover',
       'Health, Household & Personal Care', 'Gifts', 'Flower pots',
       'Flower pots', 'Tooth Paste', 'Beauty Products',
       'Body face skin care', 'Luggage & Bags', 'Grocery items',
       'Beauty Products', 'Innerwear', 'Hair Oil', 'Cups & Mugs',
       'Grocery items', 'Organic products', 'Grocery items',
       'Flower pots', 'Beauty Products', 'Art and craft', 'Flower pots',
       'Flower pots', 'Flower pots', 'Art & Crafting Tools',
       'Art & Crafting Tools', 'Beauty Products', 'Food', 'Dry Fruits',
       'Diapers', 'Food', 'Candy & Chocolate', 'Art & Crafting Tools',
       'Apparel', 'bulbs', 'Led Bulb', 'Grocery items', 'Phone Cover',
       'Phone C

In [63]:
# Keep the predicted categories of the second batch for export
# (the same prediction call as in the previous cell, this time stored).
numpy_array=clf.predict(ttt3)

In [64]:
# Wrap the predictions in a DataFrame. No column name is given, so the single
# column gets the default label 0.
df1 = pd.DataFrame(numpy_array)

In [65]:
# Display the prediction frame
df1

Unnamed: 0,0
0,Beauty Products
1,Apparel
2,Beauty Products
3,Baby products
4,Bath & Body
5,generic/vague items
6,Phone Cover
7,Health & Household
8,Organic products
9,Flower pots


In [None]:
# Put the predicted categories next to the input rows (column-wise concat, rows
# paired by index label) and export the result.
df4 = pd.concat([kk,df1], axis=1)
# Raw string: '\S' and '\C' are not valid escape sequences, so the plain literal
# triggers an invalid-escape warning on current Python versions. The path written
# to is unchanged.
df4.to_csv(r'E:\Shyplite\Set1\Category_output.csv')