In [1]:
# Importing basic libraries
import numpy as np
import pandas as pd
import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including pandas / scikit-learn deprecation and FutureWarning messages that would
# otherwise flag API behaviour changes used further down. Consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# Importing required libraries
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.corpus import stopwords

In [3]:
# Load the women's clothing e-commerce reviews from a CSV that is expected to sit
# next to the notebook (relative path). In the recorded run this produced
# 23,486 rows x 11 columns.
df = pd.read_csv('./Womens_Clothing_E_Commerce_Reviews.csv')

In [4]:
# Dimensions of the raw data as (rows, columns).
df.shape

(23486, 11)

In [5]:
# Preview the first five rows to see the available columns and sample values.
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [6]:
# Summary (count / unique / top / freq) of the object-dtype columns only,
# transposed so that each column becomes one row of the table.
df.describe(include='O').T

Unnamed: 0,count,unique,top,freq
Title,19676,13993,Love it!,136
Review Text,22641,22634,Perfect fit and i've gotten so many compliment...,3
Division Name,23472,3,General,13850
Department Name,23472,6,Tops,10468
Class Name,23472,20,Dresses,6319


In [7]:
# Column dtypes and non-null counts; shows which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Unnamed: 0               23486 non-null  int64 
 1   Clothing ID              23486 non-null  int64 
 2   Age                      23486 non-null  int64 
 3   Title                    19676 non-null  object
 4   Review Text              22641 non-null  object
 5   Rating                   23486 non-null  int64 
 6   Recommended IND          23486 non-null  int64 
 7   Positive Feedback Count  23486 non-null  int64 
 8   Division Name            23472 non-null  object
 9   Department Name          23472 non-null  object
 10  Class Name               23472 non-null  object
dtypes: int64(6), object(5)
memory usage: 2.0+ MB


In [8]:
# Number of missing values per column in the raw data.
df.isnull().sum()

Unnamed: 0                    0
Clothing ID                   0
Age                           0
Title                      3810
Review Text                 845
Rating                        0
Recommended IND               0
Positive Feedback Count       0
Division Name                14
Department Name              14
Class Name                   14
dtype: int64

In [9]:
# Drop the unnecessary 'Unnamed: 0' column (presumably a row index saved into the
# CSV -- its values mirror the index in the preview above).
# `errors='ignore'` keeps this cell safe to re-run: without it a second execution
# raises KeyError because the column is already gone.
df.drop(columns='Unnamed: 0', errors='ignore', inplace=True)

In [10]:
# Remove the rows that have no review text; only a small share of rows
# (845 of 23,486 in the recorded run) is affected.
df.dropna(axis=0, subset=['Review Text'], inplace=True)

In [11]:
# Re-check missing values after the cleanup; 'Review Text' should now report 0.
df.isnull().sum()

Clothing ID                   0
Age                           0
Title                      2966
Review Text                   0
Rating                        0
Recommended IND               0
Positive Feedback Count       0
Division Name                13
Department Name              13
Class Name                   13
dtype: int64

In [12]:
# Normalise the review text: lower-case first, then replace every character that is
# not a lowercase letter or a digit with a space.
# Order matters: stripping before lower-casing would also blank out every capital
# letter ("Love" -> " ove"). `regex=True` is passed explicitly because pandas >= 2.0
# treats the pattern as a literal string by default, which would leave punctuation
# untouched.
df['Clean'] = (
    df['Review Text']
    .str.lower()
    .str.replace(r'[^a-z0-9]', ' ', regex=True)
)

In [13]:
# English stop words from NLTK, used below to filter the cleaned reviews.
# NOTE: needs the NLTK 'stopwords' corpus to be available locally
# (e.g. installed once via nltk.download('stopwords')).
stop_list=stopwords.words('english')

In [14]:
# To remove the stop words
def sx(x, stop_words=None):
    """Remove stop words from a whitespace-separated string.

    Parameters
    ----------
    x : str
        Text to filter. It is split on any run of whitespace.
    stop_words : iterable of str, optional
        Words to drop. When omitted, the notebook-level ``stop_list`` is used,
        which matches the original single-argument behaviour.

    Returns
    -------
    str
        The remaining words, in their original order, joined by single spaces.
        An empty or all-stop-word input yields ``''``.
    """
    if stop_words is None:
        stop_words = stop_list
    # Set membership is O(1); scanning the stop-word list for every word is not.
    if isinstance(stop_words, (set, frozenset)):
        blocked = stop_words
    else:
        blocked = frozenset(stop_words)
    return ' '.join(word for word in x.split() if word not in blocked)

In [15]:
# Apply the stop-word filter to each cleaned review, element by element.
df['final'] = df['Clean'].map(sx)

In [16]:
# Initialize a CountVectorizer whose vocabulary is limited to the 20 terms with
# the highest frequency across the whole corpus (max_features=20).
CV = CountVectorizer(max_features=20)

In [17]:
# Learn the vocabulary from the stop-word-free reviews and build the document-term
# count matrix (one row per review, one column per vocabulary term).
x = CV.fit_transform(df['final'])

In [18]:
# Vocabulary terms kept by the CountVectorizer, in the column order of `x`.
cv_words = CV.get_feature_names_out()
cv_words

array(['color', 'dress', 'fabric', 'fit', 'flattering', 'great', 'like',
       'little', 'look', 'love', 'one', 'ordered', 'perfect', 'really',
       'size', 'small', 'top', 'wear', 'well', 'would'], dtype=object)

In [19]:
# Total number of occurrences of each vocabulary term over all reviews,
# ordered from most to least frequent.
cv_term_totals = pd.DataFrame(x.toarray(), columns=cv_words).sum()
top_20_cv_words = cv_term_totals.sort_values(ascending=False).head(20)
top_20_cv_words

dress         10533
size           8763
love           7868
top            7408
fit            7304
like           7077
wear           6438
great          5791
would          5104
fabric         4787
small          4729
color          4593
look           4037
really         3818
ordered        3775
little         3773
perfect        3714
one            3617
flattering     3505
well           3367
dtype: int64

In [20]:
# Initialize a TfidfVectorizer limited to 20 vocabulary terms. max_features selects
# terms by corpus-wide term frequency, so the vocabulary matches the one chosen by
# the CountVectorizer; only the per-document weights differ.
tfidf = TfidfVectorizer(max_features=20)

In [21]:
# Learn the vocabulary and IDF weights from the stop-word-free reviews and build the
# TF-IDF document-term matrix (one row per review, one column per vocabulary term).
y = tfidf.fit_transform(df['final'])

In [22]:
# Vocabulary terms kept by the TfidfVectorizer, in the column order of `y`.
tfidf_words = tfidf.get_feature_names_out()
tfidf_words

array(['color', 'dress', 'fabric', 'fit', 'flattering', 'great', 'like',
       'little', 'look', 'love', 'one', 'ordered', 'perfect', 'really',
       'size', 'small', 'top', 'wear', 'well', 'would'], dtype=object)

In [23]:
# Sum of the TF-IDF weights of each vocabulary term over all reviews,
# ordered from highest to lowest total weight.
tfidf_term_totals = pd.DataFrame(y.toarray(), columns=tfidf_words).sum()
top_20_tfidf_words = tfidf_term_totals.sort_values(ascending=False).head(20)
top_20_tfidf_words

dress         3365.037040
love          2823.100227
size          2799.131367
top           2659.395989
fit           2507.496998
like          2462.314842
wear          2355.655822
great         2346.597090
would         1896.438128
fabric        1889.400413
color         1874.526396
small         1778.197178
look          1634.644510
perfect       1626.570827
little        1590.434234
flattering    1583.784140
really        1558.517740
ordered       1514.066915
one           1504.897101
well          1468.905021
dtype: float64