# **IMPORT LIBRARIES**

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as pyplot
import seaborn as sns

 This is a Women’s Clothing E-Commerce dataset revolving around the reviews written by customers. Its nine supportive features offer a great environment to parse out the text through its multiple dimensions. Because this is real commercial data, it has been anonymized, and references to the company in the review text and body have been replaced with “retailer”.


In [2]:
# NOTE: absolute, machine-specific location; point csv_path at your own copy of the dataset.
csv_path = r"C:\Users\adithyan s\Downloads\Womens Clothing E-Commerce Reviews.csv.zip"
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


# **BASIC DATA ANALYSIS**

In [3]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(23486, 11)

### **FEATURES**


In [4]:
# Column labels of the loaded frame.
data.columns

Index(['Unnamed: 0', 'Clothing ID', 'Age', 'Title', 'Review Text', 'Rating',
       'Recommended IND', 'Positive Feedback Count', 'Division Name',
       'Department Name', 'Class Name'],
      dtype='object')

In [5]:
# This dataset includes 23486 rows and 11 columns: an unnamed index column ('Unnamed: 0') plus the 10 feature variables listed below. Each row corresponds to a customer review.

# Clothing ID: Integer Categorical variable that refers to the specific piece being reviewed.
# Age: Positive Integer variable of the reviewer's age.
# Title: String variable for the title of the review.
# Review Text: String variable for the review body.
# Rating: Positive Ordinal Integer variable for the product score granted by the customer from 1 Worst, to 5 Best.
# Recommended IND: Binary variable stating whether the customer recommends the product, where 1 is recommended and 0 is not recommended.
# Positive Feedback Count: Positive Integer documenting the number of other customers who found this review positive.
# Division Name: Categorical name of the product high level division.
# Department Name: Categorical name of the product department name.
# Class Name: Categorical name of the product class name

In [6]:
# Per-column non-null counts and dtypes, plus the frame's memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Unnamed: 0               23486 non-null  int64 
 1   Clothing ID              23486 non-null  int64 
 2   Age                      23486 non-null  int64 
 3   Title                    19676 non-null  object
 4   Review Text              22641 non-null  object
 5   Rating                   23486 non-null  int64 
 6   Recommended IND          23486 non-null  int64 
 7   Positive Feedback Count  23486 non-null  int64 
 8   Division Name            23472 non-null  object
 9   Department Name          23472 non-null  object
 10  Class Name               23472 non-null  object
dtypes: int64(6), object(5)
memory usage: 2.0+ MB


# **CHECKING MISSING VALUES**

In [7]:
# Number of missing values in each column.
data.isna().sum()

Unnamed: 0                    0
Clothing ID                   0
Age                           0
Title                      3810
Review Text                 845
Rating                        0
Recommended IND               0
Positive Feedback Count       0
Division Name                14
Department Name              14
Class Name                   14
dtype: int64

# **CATEGORICAL FEATURES**

In [8]:
# Print the name of every object-dtype (text/categorical) column, one per line.
for column_name in data.select_dtypes(include='object'):
    print(column_name)

Title
Review Text
Division Name
Department Name
Class Name


# ***DATA PREPROCESSING***

# ***percentage of missing values***

In [9]:
# Missing values per column, expressed as a percentage of all rows.
missing_counts = data.isna().sum()
missing_counts * 100 / len(data)

Unnamed: 0                  0.000000
Clothing ID                 0.000000
Age                         0.000000
Title                      16.222430
Review Text                 3.597888
Rating                      0.000000
Recommended IND             0.000000
Positive Feedback Count     0.000000
Division Name               0.059610
Department Name             0.059610
Class Name                  0.059610
dtype: float64

In [10]:
# Reviews without a title receive the placeholder title 'Others'.
title_filled = data['Title'].fillna('Others')
data['Title'] = title_filled

In [11]:
# Remove every row that has no review text; inplace=True mutates `data` itself,
# so the row count shown by earlier cells no longer applies after this cell runs.
data.dropna(subset=['Review Text'],inplace=True) # drop rows whose review text is missing

In [12]:

# Impute missing division names with the most frequent division.
division_mode = data['Division Name'].mode()[0]
data['Division Name'] = data['Division Name'].fillna(division_mode)


In [13]:
# Impute missing department names with the most frequent department.
department_mode = data['Department Name'].mode()[0]
data['Department Name'] = data['Department Name'].fillna(department_mode)

In [14]:
# Impute missing class names with the most frequent class.
class_mode = data['Class Name'].mode()[0]
data['Class Name'] = data['Class Name'].fillna(class_mode)

In [15]:
# Re-count missing values per column to confirm the imputation/drop steps left none.
data.isna().sum()

Unnamed: 0                 0
Clothing ID                0
Age                        0
Title                      0
Review Text                0
Rating                     0
Recommended IND            0
Positive Feedback Count    0
Division Name              0
Department Name            0
Class Name                 0
dtype: int64

# ***REMOVE DUPLICATES***

In [16]:
# Count rows flagged as duplicates of an earlier row (this cell only counts; it removes nothing).
data.duplicated().sum()

0

## **REMOVE UNWANTED FEATURE**

In [17]:
# Drop the leftover index column. errors='ignore' keeps this cell safe to re-run:
# without it, a second execution raises KeyError because the column is already gone.
data.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

# **REVIEW TEXT CLEANING NLP**

In [18]:
import re
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer



In [19]:
# Shared NLP resources, created on first use and then reused for every review.
_CLEANING_RESOURCES = {}


def _get_cleaning_resources():
    """Return the (stop-word set, lemmatizer) pair, building it only once."""
    if not _CLEANING_RESOURCES:
        _CLEANING_RESOURCES['stop_words'] = set(stopwords.words('english'))
        _CLEANING_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _CLEANING_RESOURCES['stop_words'], _CLEANING_RESOURCES['lemmatizer']


def cleaned_data(text):
    """Normalize one raw review string into a space-joined string of lemmas.

    Steps, in order: lowercase, strip URLs, strip HTML markup, replace every
    character that is not a letter or whitespace with a space, tokenize, drop
    English stop words, lemmatize each token, and join the tokens with spaces.

    Parameters
    ----------
    text : str
        Raw review text. Must be a string; missing values have to be removed
        before calling, because `.lower()` is invoked on the argument directly.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (empty string if no token
        survives the filtering).
    """
    # The stop-word set and lemmatizer do not depend on `text`, so they are
    # fetched from a one-time cache instead of being rebuilt for every review.
    stop_words, lemmatizer = _get_cleaning_resources()

    # lowercase
    text = text.lower()

    # remove urls
    text = re.sub(r'http\S+|www\S+', ' ', text)

    # remove html tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # remove punctuation & numbers
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # tokenize, drop stop words, then lemmatize what is left
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # join tokens
    return " ".join(tokens)


In [None]:
# Clean every review element-wise and store the result in a new column.
data['cleaned_review'] = data['Review Text'].map(cleaned_data)


In [None]:
# Preview the first cleaned reviews to sanity-check the text pipeline.
data['cleaned_review'].head()

0          absolutely wonderful silky sexy comfortable
1    love dress sooo pretty happened find store gla...
2    high hope dress really wanted work initially o...
3    love love love jumpsuit fun flirty fabulous ev...
4    shirt flattering due adjustable front tie perf...
Name: cleaned_review, dtype: object

# **BALANCING DATA SET**

In [None]:
# Number of reviews for each star rating.
data['Rating'].value_counts()

Rating
5    12540
4     4908
3     2823
2     1549
1      821
Name: count, dtype: int64

### **CONVERT NUMERICAL RATINGS TO SENTIMENT LABELS**

In [None]:
def rating_to_sentiment(rating):
    """Map a numeric rating to a coarse sentiment label.

    A rating of 4 or more is 'positive', a rating of exactly 3 is 'neutral',
    and any other rating is 'negative'.
    """
    if rating == 3:
        return 'neutral'
    return 'positive' if rating >= 4 else 'negative'

# Derive the 'sentiment' label column from the numeric 'Rating' column.
data['sentiment'] = data['Rating'].apply(rating_to_sentiment)


In [None]:
# Class sizes before balancing.
data['sentiment'].value_counts()

sentiment
positive    17448
neutral      2823
negative     2370
Name: count, dtype: int64

### **BALANCING DATA**

In [None]:
# Number of reviews to sample from each sentiment class when balancing.
target = {
    'positive': 3500,
    'neutral': 2800,
    'negative': 2370,
}


In [None]:
# Down-sample each sentiment class to the size requested in `target`.
# The groups are iterated explicitly instead of using groupby(...).apply(...):
# DataFrameGroupBy.apply operating on the grouping column is deprecated, and when
# that column is excluded the sampled frames lose 'sentiment', which the cells
# below rely on. Iterating a groupby always yields the full sub-frame, groups are
# visited in sorted key order, and each sample keeps its original index, so the
# rows and their order match what the apply-based version produced.
balanced_data = pd.concat(
    [
        group.sample(n=target[label], random_state=42)
        for label, group in data.groupby('sentiment')
    ]
)


  .apply(lambda x: x.sample(


In [None]:
# Class sizes after balancing; they should equal the values in `target`.
balanced_data['sentiment'].value_counts()

sentiment
positive    3500
neutral     2800
negative    2370
Name: count, dtype: int64

In [None]:
# (rows, columns) of the balanced sample.
balanced_data.shape

(8670, 12)

In [None]:
# Confirm the balanced sample contains no missing values in any column.
balanced_data.isna().sum()

Clothing ID                0
Age                        0
Title                      0
Review Text                0
Rating                     0
Recommended IND            0
Positive Feedback Count    0
Division Name              0
Department Name            0
Class Name                 0
cleaned_review             0
sentiment                  0
dtype: int64

## **DATA SPLITTING**

In [None]:
# Model input is the cleaned review text; the label is the sentiment class.
x, y = balanced_data['cleaned_review'], balanced_data['sentiment']

# **TRAIN TEST SPLIT**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the balanced data for testing; stratify=y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(x, y, **split_options)

In [None]:
# Number of training reviews (a one-dimensional Series of text).
X_train.shape

(6936,)

# **VECTORIZATION**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, limited to 3000 features, with the
# document-frequency bounds min_df=5 and max_df=0.9.
tfidf_options = {
    'max_features': 3000,
    'ngram_range': (1, 2),
    'min_df': 5,
    'max_df': 0.9,
}
tfidf = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary from the training reviews only, then reuse it for the test reviews.
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

# **MODEL TRAINING WITH LOGISTIC REGRESSION**

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with C=0.01 and class_weight='balanced'.
# NOTE(review): l1_ratio=0 presumably requests a pure L2 penalty (the fitted
# estimator reports penalty='l2'); confirm against the installed scikit-learn version.
logreg_options = {'C': 0.01, 'class_weight': 'balanced', 'l1_ratio': 0}
model = LogisticRegression(**logreg_options)
model.fit(X_train_vec, y_train)





0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.01
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,100


In [None]:
# Predicted sentiment labels for the held-out test reviews.
ypred=model.predict(X_test_vec)

# **EVALUATION  METRICS** 

In [None]:
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score

# Per-class precision, recall and F1 of the logistic-regression predictions on the test set.
logreg_report = classification_report(y_test, ypred)
print(logreg_report)


              precision    recall  f1-score   support

    negative       0.54      0.65      0.59       474
     neutral       0.57      0.41      0.47       560
    positive       0.75      0.82      0.78       700

    accuracy                           0.64      1734
   macro avg       0.62      0.63      0.62      1734
weighted avg       0.64      0.64      0.63      1734



In [None]:
# Overall test accuracy of the logistic-regression model, printed to two decimals.
LogisticRegressionScore = accuracy_score(y_test, ypred)
rounded_score = round(LogisticRegressionScore, 2)
print(rounded_score)

0.64


### **EVALUATION BASED ON TRAINING AND TESTING SCORES**

In [None]:
from sklearn.metrics import f1_score

# Predict on both splits with the logistic-regression model ...
y_train_pred = model.predict(X_train_vec)
y_test_pred = model.predict(X_test_vec)

# ... and compare macro-averaged F1 to see how far training and test scores diverge.
train_f1 = f1_score(y_train, y_train_pred, average='macro')
test_f1 = f1_score(y_test, y_test_pred, average='macro')

print("Train F1:", train_f1)
print("Test F1:", test_f1)


Train F1: 0.6675163321956111
Test F1: 0.6170690654222851


#  **CONCLUSION**

The model learns meaningful patterns

Test F1 > 0.60 → better than random guessing

Works as a baseline NLP classifier

Small gap between Train & Test F1 (≈0.67 vs ≈0.62)

Shows only mild overfitting

Generalization is moderate

# **LABEL ENCODING FOR XGBOOST MODEL**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the string sentiment labels as integers for the XGBoost model trained below.
# The encoder is fitted on the training labels only and reused for the test labels.
le = LabelEncoder()
le.fit(y_train)
y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)

# Class names in encoded order (index in this array == integer label).
print(le.classes_)


['negative' 'neutral' 'positive']


In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# Three-class XGBoost classifier trained on the TF-IDF features and encoded labels.
xgb_options = dict(
    objective='multi:softmax',
    num_class=3,
    eval_metric='mlogloss',
    random_state=42,
)
xgb_model = xgb.XGBClassifier(**xgb_options)
xgb_model.fit(X_train_vec, y_train_enc)

# Evaluate on the held-out test set; class names come from the fitted label encoder.
y_pred = xgb_model.predict(X_test_vec)
xgb_accuracy = accuracy_score(y_test_enc, y_pred)
xgb_report = classification_report(y_test_enc, y_pred, target_names=le.classes_)

print("accuracy Score:", xgb_accuracy)
print(xgb_report)


accuracy Score: 0.6309111880046137
              precision    recall  f1-score   support

    negative       0.57      0.50      0.54       474
     neutral       0.51      0.50      0.51       560
    positive       0.75      0.82      0.78       700

    accuracy                           0.63      1734
   macro avg       0.61      0.61      0.61      1734
weighted avg       0.62      0.63      0.63      1734



In [None]:
from sklearn.metrics import f1_score

# Predict on both splits with the XGBoost model.
# Note: this reuses (overwrites) the names y_train_pred / y_test_pred / train_f1 / test_f1
# that the logistic-regression evaluation cell assigned earlier.
y_train_pred = xgb_model.predict(X_train_vec)
y_test_pred = xgb_model.predict(X_test_vec)

# Macro-averaged F1 against the integer-encoded labels.
train_f1 = f1_score(y_train_enc, y_train_pred, average='macro')
test_f1 = f1_score(y_test_enc, y_test_pred, average='macro')

print("Train F1:", train_f1)
print("Test F1:", test_f1)

Train F1: 0.9513993658165231
Test F1: 0.6079963179730826
