In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import joblib
import os

returns_df = pd.read_csv('../data/processed/returns.csv')
returns_df.dropna(subset=['return_reason_text', 'class_name'], inplace=True)

# Download NLTK data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', str(text), re.I|re.A).lower().strip()
    tokens = nltk.word_tokenize(text)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    return " ".join(lemmatized_tokens)

returns_df['cleaned_text'] = returns_df['return_reason_text'].apply(preprocess_text)

# Train the model
X = returns_df['cleaned_text']
y = returns_df['class_name']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_tfidf, y_train)

# Evaluate and save
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

os.makedirs('../models', exist_ok=True)
joblib.dump(model, '../models/text_classifier_model.pkl')
joblib.dump(vectorizer, '../models/tfidf_vectorizer.pkl')
print("\n--- TEXT MODEL TRAINED AND SAVED ---")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Diya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Diya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Diya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Diya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
  text = re.sub(r'[^a-zA-Z\s]', '', str(text), re.I|re.A).lower().strip()


              precision    recall  f1-score   support

     Dresses       0.72      1.00      0.84       136
     Jackets       0.00      0.00      0.00        14
       Jeans       0.88      0.41      0.56        17
       Pants       0.85      0.44      0.58        25
      Shorts       0.00      0.00      0.00         5
      Skirts       1.00      0.29      0.45        17

    accuracy                           0.74       214
   macro avg       0.57      0.36      0.41       214
weighted avg       0.71      0.74      0.68       214


--- TEXT MODEL TRAINED AND SAVED ---


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [2]:
# Download necessary NLTK data (only needs to be done once)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    text = text.lower()
    text = text.strip()
    
    # Tokenize and remove stopwords
    tokens = nltk.word_tokenize(text)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatize
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    
    return " ".join(lemmatized_tokens)

# Apply the preprocessing to our complaint text
returns_df['cleaned_text'] = returns_df['return_reason_text'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Diya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Diya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Diya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)


In [3]:
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Diya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
# Define our features (X) and target (y)
X = returns_df['cleaned_text']
y = returns_df['class_name']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

Training data shape: (856,)
Testing data shape: (214,)


In [5]:
# TF-IDF turns text into numbers that the model can understand
vectorizer = TfidfVectorizer(max_features=1000) # Use top 1000 words

# Fit and transform the training data
X_train_tfidf = vectorizer.fit_transform(X_train)

# Only transform the test data
X_test_tfidf = vectorizer.transform(X_test)

In [6]:
# We'll use a simple but powerful Logistic Regression model
model = LogisticRegression(max_iter=1000, random_state=42)

# Train the model
print("Training the model...")
model.fit(X_train_tfidf, y_train)
print("Model training complete.")

Training the model...
Model training complete.


In [7]:
# Make predictions on the test set
y_pred = model.predict(X_test_tfidf)

# Print a report showing how well the model performed
print("--- Classification Report ---")
print(classification_report(y_test, y_pred))

--- Classification Report ---
              precision    recall  f1-score   support

     Dresses       0.72      1.00      0.84       136
     Jackets       0.00      0.00      0.00        14
       Jeans       0.88      0.41      0.56        17
       Pants       0.85      0.44      0.58        25
      Shorts       0.00      0.00      0.00         5
      Skirts       1.00      0.29      0.45        17

    accuracy                           0.74       214
   macro avg       0.57      0.36      0.41       214
weighted avg       0.71      0.74      0.68       214



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [8]:
# We need to save both the model and the vectorizer to use them later
os.makedirs('../models', exist_ok=True) # Create models directory if it doesn't exist
joblib.dump(model, '../models/text_classifier_model.pkl')
joblib.dump(vectorizer, '../models/tfidf_vectorizer.pkl')

print("Model and vectorizer saved successfully in the 'models' folder.")

Model and vectorizer saved successfully in the 'models' folder.
