## **Ecommerce Product Categorization**

**Problem Statement:** In the rapidly evolving world of eCommerce, accurate product categorization is crucial for ensuring seamless customer experiences, reducing search friction, and increasing product discoverability. However, the sheer volume of diverse products poses a significant challenge. Current classification systems struggle to handle ambiguities, unconventional naming conventions, and multi-language data. This hackathon aims to address these challenges by inviting participants to create innovative solutions that enhance product categorization efficiency, accuracy, and scalability.
The goal is to develop a text classification model that assigns each product to its category as accurately as possible, using the product's description.


# <p id="1" style="justify-content: center; align-items: center; background-color: #85C1E9; border-radius: 10px; border: 1px solid #3498DB; text-align: center; padding: 12px 0;">**1: Import Libraries**</p>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import plotly.express as px
import nltk
import re

from string import punctuation
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import WordPunctTokenizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Download the 'punkt', 'wordnet' and 'stopwords' NLTK resources, in that order.
for nltk_resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(nltk_resource)

# English stop words, held as a set.
en_stop = set(
    nltk.corpus.stopwords.words('english')
)

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Embedding, Conv1D, MaxPooling1D, LSTM


## <p id="1" style="justify-content: center; align-items: center; background-color: #85C1E9; border-radius: 10px; border: 1px solid #3498DB; text-align: center; padding: 12px 0;">**2: Read dataset**</p>

In [None]:
# Read the test CSV and preview its first two rows.
# NOTE(review): `df` is rebound to the training CSV in the next cell, so this
# frame is only available for this preview.
df = pd.read_csv("/test_data.csv")
df.head(2)

In [None]:
# Read the training CSV into `df` and preview its first two rows.
df = pd.read_csv("/train_product_data.csv")
df.head(2)

In [None]:
# The two columns the text classifier is built on.  They are only previewed
# here: `df` itself is NOT narrowed, because later cells still read
# product_name, retail_price, discounted_price and crawl_timestamp from it and
# would raise KeyError if those columns were dropped at this point.
model_columns = ['description', 'product_category_tree']
df[model_columns].head()

In [None]:
# Preview the first five rows of `df`.
df.head(5)

In [None]:
# Show the full description text of the first row (by position).
df['description'].iloc[0]

In [None]:
# Per-column count of missing cells.
df.isna().sum()

In [None]:
# Extract the main (top-level) category from the product_category_tree column.
# The text before the first '>' is taken as the main category; when a tree has
# no '>', its first whitespace-separated token is used instead.  Surrounding
# '[', '"' and ']' characters are stripped in both cases.

_MAIN_CATEGORY_RE = re.compile(r"(.*?)>")  # lazily match up to the first '>'


def _main_category(tree):
    """Return the top-level category named in one category-tree string.

    Non-string values (e.g. NaN for a missing tree) and blank strings are
    returned unchanged, so a single bad row does not abort the whole pass
    with TypeError / IndexError.
    """
    if not isinstance(tree, str) or not tree.strip():
        return tree
    head = _MAIN_CATEGORY_RE.match(tree)
    if head is None:
        # No '>' separator: fall back to the first token.
        return tree.split()[0].strip('["]')
    # Drop the brackets/quotes and the trailing '>', then any leftover spaces.
    return head.group().strip('["]>').strip()


# Series.map visits the values themselves, so this works with any index; a
# `categories[i]` lookup is label-based and needs the labels to be 0..n-1.
categories = df["product_category_tree"].map(_main_category)

print(categories)

In [None]:

# Frequencies of the 40 most common categories (value_counts sorts descending)

top_category_counts = categories.value_counts().head(40)
print(top_category_counts)

## <p id="3" style="justify-content: center; align-items: center; background-color: #85C1E9; border-radius: 10px; border: 1px solid #3498DB; text-align: center; padding: 12px 0;">**3: Dataset Overview**</p>

In [None]:
# Print the column names, non-null counts and dtypes of `df`.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Summary statistics, transposed so each described column becomes a row.
df.describe().transpose()

In [None]:
# Summary of the object-dtype columns only.
df.describe(include="object")

# <p id="1" style="justify-content: center; align-items: center; background-color: #85C1E9; border-radius: 10px; border: 1px solid #3498DB; text-align: center; padding: 12px 0;">**4: EDA (Exploratory Data Analysis)**</p>

In [None]:
# Explore data: print the summary statistics first, then the number of
# missing values per column.
print(df.describe())
print(df.isna().sum())

In [None]:
# Visualize target distribution: one horizontal bar per category, ordered by
# how often the category occurs.
category_order = df['product_category_tree'].value_counts().index
count_fig, count_ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, y="product_category_tree", order=category_order, ax=count_ax)
count_ax.set_title('Distribution of Product Categories')
count_ax.set_xlabel('Count')
count_ax.set_ylabel('Categories')
plt.show()

In [None]:
# Step 3: Box plot of retail price for each product category.
price_fig, price_ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=df, x='retail_price', y='product_category_tree', ax=price_ax)
price_ax.set_title('Retail Price by Product Category')
price_ax.set_xlabel('Retail Price')
price_ax.set_ylabel('Product Category')
# Log-scaled price axis for better visualization.
price_ax.set_xscale('log')
plt.show()

In [None]:
# Step 6: Interactive scatter plot (Plotly) of retail vs. discounted price,
# coloured by category, with the product name shown on hover.
scatter_options = dict(
    x='retail_price',
    y='discounted_price',
    color='product_category_tree',
    hover_data=['product_name'],
    title='Retail Price vs Discounted Price',
)
fig = px.scatter(df, **scatter_options)
fig.show()

In [None]:
# Step 7: Animated Bar Plot of Category Distribution Over Time (if timestamp is available)
if 'crawl_timestamp' in df.columns:
    # Parse the timestamps and bucket every row into its calendar month.
    df['crawl_timestamp'] = pd.to_datetime(df['crawl_timestamp'])
    df['year_month'] = df['crawl_timestamp'].dt.to_period('M')

    # One row per (month, category) pair with the number of products in it.
    category_month = df.groupby(['year_month', 'product_category_tree']).size().reset_index(name='counts')

    # The animation-frame values end up in the figure's JSON, and pandas
    # Period objects cannot be serialised there, so label the frames with the
    # period's text form instead.  groupby has already sorted the rows by
    # period, so the frames stay in chronological order.
    category_month['year_month'] = category_month['year_month'].astype(str)

    fig = px.bar(category_month, x='product_category_tree', y='counts', color='product_category_tree',
                 animation_frame='year_month', animation_group='product_category_tree', range_y=[0, df['product_category_tree'].value_counts().max()],
                 title='Product Category Distribution Over Time')
    fig.show()

# <p id="1" style="justify-content: center; align-items: center; background-color: #85C1E9; border-radius: 10px; border: 1px solid #3498DB; text-align: center; padding: 12px 0;">**5: Text Normalization**</p>

In [None]:
# Let's have a look at all the unique category names.
# We will talk about their significance and validity later.
possible_labels = categories.unique()

# Print the container type, the labels themselves and how many there are.
print(type(possible_labels))
print("\n")
print("Possible Labels: \n", possible_labels)
print("\n")
print("Number of possible categories:", len(possible_labels))

# <p id="1" style="justify-content: center; align-items: center; background-color: #85C1E9; border-radius: 10px; border: 1px solid #3498DB; text-align: center; padding: 12px 0;">**5: Data Preprocessing**</p>

In [None]:
# Step 4: Data Preprocessing
# Download the 'stopwords' and 'wordnet' NLTK resources, then build the
# English stop-word set and a WordNet lemmatizer instance.
for resource_name in ('stopwords', 'wordnet'):
    nltk.download(resource_name)

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [None]:
# Text clean-up helper applied to the text columns
def preprocess_text(text):
    """Return *text* lower-cased, or "" when *text* is not a string."""
    return text.lower() if isinstance(text, str) else ""

# Run preprocess_text over both text columns, product_name first
for text_column in ('product_name', 'description'):
    df[text_column] = df[text_column].apply(preprocess_text)

In [None]:
# Single text feature per row: the product name, one space, then the description
df['combined_text'] = (
    df['product_name'] + ' '
    + df['description']
)

In [None]:
# Same preprocessing steps once more.
# NOTE(review): the original comment described this as preprocessing for the
# test data, but the frame touched here is still `df`, which holds the
# training CSV at this point — confirm which frame was intended.
for text_column in ('product_name', 'description'):
    df[text_column] = df[text_column].apply(preprocess_text)
df['combined_text'] = (
    df['product_name'] + ' '
    + df['description']
)

# <p id="1" style="justify-content: center; align-items: center; background-color: #85C1E9; border-radius: 10px; border: 1px solid #3498DB; text-align: center; padding: 12px 0;">**6: Feature Engineering and Encoding**</p>

In [None]:
# Step 5: Feature Engineering and Encoding
from sklearn.preprocessing import LabelEncoder

# Learn the label set from product_category_tree, then store each row's
# integer code in a new `category` column.
label_encoder = LabelEncoder()
label_encoder.fit(df['product_category_tree'])
df['category'] = label_encoder.transform(df['product_category_tree'])

In [None]:
# TF-IDF features from the combined text, vocabulary capped at 10,000 terms.
# NOTE(review): X_test_tfidf is the transform of the same `df['combined_text']`
# that the vectorizer was fitted on — confirm whether a separate test frame
# was intended here.
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
tfidf_vectorizer.fit(df['combined_text'])
X_train_tfidf = tfidf_vectorizer.transform(df['combined_text'])
X_test_tfidf = tfidf_vectorizer.transform(df['combined_text'])

# <p id="1" style="justify-content: center; align-items: center; background-color: #85C1E9; border-radius: 10px; border: 1px solid #3498DB; text-align: center; padding: 12px 0;">**7: Train-Test Split**</p>

In [None]:
df = pd.read_csv("test_data.csv")

In [None]:
print("Testing data without label: ",df.shape)

In [None]:
from wordcloud import WordCloud

# `X_test` is not defined in any cell above, so joining over it raised
# NameError.  Build the corpus from the description column of the frame
# currently bound to `df` instead; missing values are skipped and the rest is
# coerced to str because str.join only accepts strings.
text = " ".join(df['description'].dropna().astype(str))

# Create the word cloud object
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

# Display the word cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # Turn off the axis
plt.title('Word Cloud for Normalized Descriptions')
plt.show()

In [None]:
# Step 6: Train-Test Split
# Hold out 20% of the TF-IDF rows (and their category codes) for validation;
# the fixed random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_tfidf,
    df['category'],
    random_state=42,
    test_size=0.2,
)

# <p id="1" style="justify-content: center; align-items: center; background-color: #85C1E9; border-radius: 10px; border: 1px solid #3498DB; text-align: center; padding: 12px 0;">**8: Model Building and Evaluation**</p>
## Naive Bayes

In [None]:
# Naive Bayes: fit on the training split, then score the validation split
nb_model = MultinomialNB().fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_val)
nb_accuracy = accuracy_score(y_val, y_pred_nb)
print('Naive Bayes Accuracy:', nb_accuracy)
nb_f1 = f1_score(y_val, y_pred_nb, average='weighted')
print('Naive Bayes F1 Score:', nb_f1)

## Logistic Regression

In [None]:
# Logistic Regression (up to 1000 solver iterations): fit, predict, report
lr_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_val)
lr_accuracy = accuracy_score(y_val, y_pred_lr)
print('Logistic Regression Accuracy:', lr_accuracy)
lr_f1 = f1_score(y_val, y_pred_lr, average='weighted')
print('Logistic Regression F1 Score:', lr_f1)

## Support Vector Machine

In [None]:
# Support Vector Machine with a linear kernel: fit, predict, report
svm_model = SVC(kernel='linear').fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_val)
svm_accuracy = accuracy_score(y_val, y_pred_svm)
print('Support Vector Machine Accuracy:', svm_accuracy)
svm_f1 = f1_score(y_val, y_pred_svm, average='weighted')
print('Support Vector Machine F1 Score:', svm_f1)

## Random Forest

In [None]:
# Random Forest (100 trees, seeded): fit, predict, report
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_val)
rf_accuracy = accuracy_score(y_val, y_pred_rf)
print('Random Forest Accuracy:', rf_accuracy)
rf_f1 = f1_score(y_val, y_pred_rf, average='weighted')
print('Random Forest F1 Score:', rf_f1)

# <p id="1" style="justify-content: center; align-items: center; background-color: #85C1E9; border-radius: 10px; border: 1px solid #3498DB; text-align: center; padding: 12px 0;">**9: Deep Learning Model**</p>
## Tokenization and Padding

In [None]:
# Tokenization and Padding
# Fit a tokenizer limited to 10,000 words on the combined text, turn each text
# into a sequence of word ids, and pad the sequences to a length of 200.
# NOTE(review): the "test" sequences come from the same `df['combined_text']`
# as the training ones — confirm whether a separate test frame was intended.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df['combined_text'])

X_train_seq = tokenizer.texts_to_sequences(df['combined_text'])
X_train_pad = pad_sequences(X_train_seq, maxlen=200)

X_test_seq = tokenizer.texts_to_sequences(df['combined_text'])
X_test_pad = pad_sequences(X_test_seq, maxlen=200)


In [None]:
# Train/validation split of the padded sequences: 20% held out, fixed seed
X_train_dl, X_val_dl, y_train_dl, y_val_dl = train_test_split(
    X_train_pad,
    df['category'],
    random_state=42,
    test_size=0.2,
)

##  Build LSTM Model

In [None]:
# Build LSTM Model: embedding -> LSTM -> softmax over the encoded classes
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=200),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(len(label_encoder.classes_), activation='softmax'),
])

# Sparse categorical cross-entropy is used because the targets are integer
# class codes.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 5 epochs in batches of 64, scoring the validation split each epoch.
history = model.fit(
    X_train_dl,
    y_train_dl,
    validation_data=(X_val_dl, y_val_dl),
    epochs=5,
    batch_size=64,
)


In [None]:
# Evaluate: the predicted class is the index of the largest softmax output
val_probabilities = model.predict(X_val_dl)
y_pred_dl = np.argmax(val_probabilities, axis=1)
lstm_accuracy = accuracy_score(y_val_dl, y_pred_dl)
print('LSTM Accuracy:', lstm_accuracy)
lstm_f1 = f1_score(y_val_dl, y_pred_dl, average='weighted')
print('LSTM F1 Score:', lstm_f1)

# <p id="1" style="justify-content: center; align-items: center; background-color: #85C1E9; border-radius: 10px; border: 1px solid #3498DB; text-align: center; padding: 12px 0;">**10: Hyperparameter Tuning**</p>

In [None]:

# Example with Logistic Regression using GridSearchCV:
# 3-fold search over C with an l2 penalty, selected by accuracy.
parameters = {
    'C': [0.1, 1, 10],
    'penalty': ['l2'],
}
grid_search = GridSearchCV(
    LogisticRegression(max_iter=1000),
    parameters,
    scoring='accuracy',
    cv=3,
)
grid_search.fit(X_train, y_train)

# Score the best estimator found by the search on the validation split.
best_lr_model = grid_search.best_estimator_
y_pred_best_lr = best_lr_model.predict(X_val)
tuned_accuracy = accuracy_score(y_val, y_pred_best_lr)
print('Tuned Logistic Regression Accuracy:', tuned_accuracy)
tuned_f1 = f1_score(y_val, y_pred_best_lr, average='weighted')
print('Tuned Logistic Regression F1 Score:', tuned_f1)


# <p id="1" style="justify-content: center; align-items: center; background-color: #85C1E9; border-radius: 10px; border: 1px solid #3498DB; text-align: center; padding: 12px 0;">**11: Final Evaluation**</p>

In [None]:
# Use the best model to predict on the test set
df_model = best_lr_model # or the chosen best model after tuning
test_predictions = df_model.predict(X_test_tfidf)
test_predictions_labels = label_encoder.inverse_transform(test_predictions)

In [None]:
# Save the predictions
text_df = pd.DataFrame({'uniq_id': df['uniq_id'], 'predicted_category_tree': test_predictions_labels})
text_df.to_csv('text_df.csv', index=False)