In [2]:
import json
import warnings
warnings.filterwarnings("ignore")

# File paths
file1 = "renttherunway_final_data.json"

# Function to read JSON lines data from a file
def read_json_lines(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line.strip()))  # Load each JSON object per line
    return data

# Load data from each file
renttherunway_data = read_json_lines(file1)

# Display the data (printing just the first entry for readability)
print("Rent the Runway Data Sample:", renttherunway_data[0])

Rent the Runway Data Sample: {'fit': 'fit', 'user_id': '420272', 'bust size': '34d', 'item_id': '2260466', 'weight': '137lbs', 'rating': '10', 'rented for': 'vacation', 'review_text': "An adorable romper! Belt and zipper were a little hard to navigate in a full day of wear/bathroom use, but that's to be expected. Wish it had pockets, but other than that-- absolutely perfect! I got a million compliments.", 'body type': 'hourglass', 'review_summary': 'So many compliments!', 'category': 'romper', 'height': '5\' 8"', 'size': 14, 'age': '28', 'review_date': 'April 20, 2016'}


In [3]:
len(renttherunway_data)

192544

In [4]:
import pandas as pd

renttherunway_df = pd.read_json(file1, lines=True)
renttherunway_df.describe()

Unnamed: 0,user_id,item_id,rating,size,age
count,192544.0,192544.0,192462.0,192544.0,191584.0
mean,499494.100149,1045684.0,9.092371,12.245175,33.871017
std,289059.719328,805314.8,1.430044,8.494877,8.058083
min,9.0,123373.0,2.0,0.0,0.0
25%,250654.25,195076.0,8.0,8.0,29.0
50%,499419.0,948396.0,10.0,12.0,32.0
75%,750974.0,1678888.0,10.0,16.0,37.0
max,999997.0,2966087.0,10.0,58.0,117.0


In [5]:
print(f" Null values in Renttherunway dataset: \n{renttherunway_df.isnull().sum()}")

 Null values in Renttherunway dataset: 
fit                   0
user_id               0
bust size         18411
item_id               0
weight            29982
rating               82
rented for           10
review_text           0
body type         14637
review_summary        0
category              0
height              677
size                  0
age                 960
review_date           0
dtype: int64


In [6]:
# Remove rows with any null values
cleaned_renttherunway_df = renttherunway_df.dropna()

# Display null values after cleaning to verify
cleaned_null_values = cleaned_renttherunway_df.isnull().sum()
print(f"\nNull values in Renttherunway dataset after cleaning:\n{cleaned_null_values}")

# Summary of cleaned dataset
cleaned_summary = cleaned_renttherunway_df.describe()
cleaned_summary


Null values in Renttherunway dataset after cleaning:
fit               0
user_id           0
bust size         0
item_id           0
weight            0
rating            0
rented for        0
review_text       0
body type         0
review_summary    0
category          0
height            0
size              0
age               0
review_date       0
dtype: int64


Unnamed: 0,user_id,item_id,rating,size,age
count,146381.0,146381.0,146381.0,146381.0,146381.0
mean,498991.574528,1052277.0,9.081985,11.437919,34.0898
std,289658.524623,809107.6,1.437853,7.826784,8.113217
min,9.0,123373.0,2.0,0.0,0.0
25%,249294.0,195613.0,8.0,4.0,29.0
50%,499034.0,961819.0,10.0,9.0,32.0
75%,750840.0,1687082.0,10.0,16.0,37.0
max,999997.0,2966087.0,10.0,58.0,117.0


In [7]:
##### Transform the nan-numerical features to the numerical label.

import re
def preprocess_data(df):
    # Convert bust size (e.g., '34D') into numerical features
    def parse_bust_size(bust):
        if isinstance(bust, str) and re.match(r'^\d+[A-Z]$', bust.upper()):  # Validate format like '34D'
            bust = bust.upper()  # Normalize to uppercase
            band, cup = int(bust[:-1]), bust[-1]
            cup_size = ord(cup) - ord('A') + 1  # A=1, B=2, C=3, ...
            return band, cup_size
        return None, None

    df['band_size'], df['cup_size'] = zip(*df['bust size'].apply(parse_bust_size))

    # Convert weight (e.g., '137lbs') to numerical
    df['weight_numeric'] = df['weight'].str.extract(r'(\d+)').astype(float)  # Extract numeric part

    # Convert height (e.g., '5\' 8"') to inches
    def height_to_inches(height):
        if isinstance(height, str) and re.match(r'^\d+\' \d+"$', height):
            height = height.replace('"', '').replace("'", "")  # Remove double quotes and apostrophe
            feet, inches = map(int, height.split())
            return feet * 12 + inches
        return None

    df['height_inches'] = df['height'].apply(height_to_inches)

    # Encode body type
    body_type_mapping = {v: i for i, v in enumerate(df['body type'].dropna().unique())}
    df['body_type_encoded'] = df['body type'].map(body_type_mapping)

    # Encode rented for
    rented_for_mapping = {v: i for i, v in enumerate(df['rented for'].dropna().unique())}
    df['rented_for_encoded'] = df['rented for'].map(rented_for_mapping)

    # Drop or fill missing values
    df = df.dropna(subset=['band_size', 'cup_size', 'weight_numeric', 'height_inches', 
                           'body_type_encoded', 'rented_for_encoded', 'fit'])

    return df

# Apply preprocessing to the dataset
cleaned_df = preprocess_data(cleaned_renttherunway_df)

# Check the updated dataset
print(cleaned_df.head())


   fit  user_id bust size  item_id  weight  rating     rented for  \
0  fit   420272       34d  2260466  137lbs    10.0       vacation   
1  fit   273551       34b   153475  132lbs    10.0          other   
3  fit   909926       34c   126335  135lbs     8.0  formal affair   
4  fit   151944       34b   616682  145lbs    10.0        wedding   
5  fit   734848       32b   364092  138lbs     8.0           date   

                                         review_text          body type  \
0  An adorable romper! Belt and zipper were a lit...          hourglass   
1  I rented this dress for a photo shoot. The the...  straight & narrow   
3  I rented this for my company's black tie award...               pear   
4  I have always been petite in my upper body and...           athletic   
5  Didn't actually wear it. It fit perfectly. The...           athletic   

                                     review_summary  ... height size   age  \
0                              So many compliments!  ...

In [8]:
##### Ablation study about sample the fit label to make fit, small, large labels evenly distibuted.

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Set a random seed for reproducibility
random_seed = 42

# Divide the cleaned dataset into train, validate, and test datasets
train_df, temp_df = train_test_split(cleaned_df, test_size=0.3, random_state=random_seed)  # 70% train
validate_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=random_seed)  # 15% validate, 15% test

# Function to undersample the 'fit' category
def undersample(df):
    # Find the number of samples in the smallest class
    min_class_size = min(df['fit'].value_counts())

    # Sample from each category
    fit_sample = df[df['fit'] == 'fit'].sample(min_class_size, random_state=random_seed)
    small_sample = df[df['fit'] == 'small']
    large_sample = df[df['fit'] == 'large']

    # Combine the samples into a new dataframe
    undersampled_df = pd.concat([fit_sample, small_sample, large_sample]).sample(frac=1, random_state=random_seed)  # Shuffle the dataset
    return undersampled_df

# Apply undersampling to the train, validate, and test datasets
train_df = undersample(train_df)
validate_df = undersample(validate_df)
test_df = undersample(test_df)

# Display the shapes of the resulting datasets
print(f"Train dataset shape: {train_df.shape}")
print(f"Validate dataset shape: {validate_df.shape}")
print(f"Test dataset shape: {test_df.shape}")

# Distribution of labels (fit, small, large) in the datasets
train_label_distribution = train_df['fit'].value_counts(normalize=True) * 100  # Percentage distribution
validate_label_distribution = validate_df['fit'].value_counts(normalize=True) * 100
test_label_distribution = test_df['fit'].value_counts(normalize=True) * 100

# Create a summary table for label distribution
label_distribution_summary = pd.DataFrame({
    'Train (%)': train_label_distribution,
    'Validate (%)': validate_label_distribution,
    'Test (%)': test_label_distribution
}).fillna(0)  # Fill missing labels with 0

# Display the distribution summary
print("Label Distribution Summary:")
print(label_distribution_summary)

# Ensure 'fit' column is encoded for classification tasks
label_encoder = LabelEncoder()
train_df['fit_encoded'] = label_encoder.fit_transform(train_df['fit'])
validate_df['fit_encoded'] = label_encoder.transform(validate_df['fit'])
test_df['fit_encoded'] = label_encoder.transform(test_df['fit'])

Train dataset shape: (37404, 21)
Validate dataset shape: (8020, 21)
Test dataset shape: (8002, 21)
Label Distribution Summary:
       Train (%)  Validate (%)   Test (%)
fit    33.140841      33.19202  32.716821
large  33.140841      33.19202  32.716821
small  33.718319      33.61596  34.566358


In [9]:
## Methond 1.1. 
#  Just randomly guess the labels and test the f2 score on the test dataset.

import numpy as np
from sklearn.metrics import fbeta_score
import pandas as pd



test_y = test_df['fit_encoded']
# Generate random predictions with the same class distribution as the test set
unique_classes = np.unique(test_y)
random_predictions = np.random.choice(unique_classes, size=len(test_y))

# Compute the F2 score for the random predictions
f2_score_random = fbeta_score(test_y, random_predictions, beta=2, average='weighted')

print(f"Baseline F2 Score (Random Guessing): {f2_score_random:.4f}")

Baseline F2 Score (Random Guessing): 0.3375


In [10]:
## Methond 1.2. 
#  Randomly guess the labels based on the appeareance on the training dataset.
#  and test the f2 score on the test dataset.

train_y = train_df['fit_encoded']
test_y = test_df['fit_encoded']
# Calculate the probabilities of each label based on its appearance in the training dataset
label_probabilities = train_y.value_counts(normalize=True)

# Generate random predictions based on the calculated probabilities
weighted_random_predictions = np.random.choice(
    label_probabilities.index, size=len(test_y), p=label_probabilities.values
)

# Compute the F2 score for the weighted random predictions
f2_score_weighted_random = fbeta_score(test_y, weighted_random_predictions, beta=2, average='weighted')

print(f"Baseline F2 Score (Weighted Random Guessing): {f2_score_weighted_random:.4f}")

Baseline F2 Score (Weighted Random Guessing): 0.3414


In [11]:
## Methond 2.  Naïve Bayes
# A Naïve Bayes model is a probabilistic classification algorithm based on Bayes' theorem. It assumes that the features are conditionally independent given the class label, which simplifies the computation.

# Here's how we can build a Naïve Bayes model for the "fit feedback" classification task:

# Steps to Implement Naïve Bayes
# Choose the Variant:

# If the features are continuous (e.g., weight_numeric, height_inches), use Gaussian Naïve Bayes.

from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import fbeta_score

# Define the features and target variable
features = ['weight_numeric', 'height_inches', 'band_size', 'cup_size', 'body_type_encoded']
# features = ['body_type_encoded']
target = 'fit_encoded'

# Prepare training and test datasets
train_X = train_df[features]
train_y = train_df[target]
test_X = test_df[features]
test_y = test_df[target]

# Standardize the continuous features
scaler = StandardScaler()
train_X_scaled = scaler.fit_transform(train_X)
test_X_scaled = scaler.transform(test_X)

# Initialize and train the Naïve Bayes model
nb_model = GaussianNB()
nb_model.fit(train_X_scaled, train_y)

# Make predictions on the test set
test_predictions = nb_model.predict(test_X_scaled)

# Compute the F2 score
f2_score_nb = fbeta_score(test_y, test_predictions, beta=2, average='weighted')

print(f"F2 Score for Naïve Bayes: {f2_score_nb:.4f}")

F2 Score for Naïve Bayes: 0.3539


In [12]:
## Methond 3.1. LogisticRegression model using all eight features
# features = ['weight_numeric', 'height_inches', 'size', 'age', 'band_size', 'cup_size', 'body_type_encoded', 'rented_for_encoded']

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Ensure 'fit' column is encoded for classification tasks
label_encoder = LabelEncoder()
train_df['fit_encoded'] = label_encoder.fit_transform(train_df['fit'])
validate_df['fit_encoded'] = label_encoder.transform(validate_df['fit'])
test_df['fit_encoded'] = label_encoder.transform(test_df['fit'])

# Select relevant features (including preprocessed numerical and encoded features)
features = ['weight_numeric', 'height_inches', 'size', 'age', 
            'band_size', 'cup_size', 'body_type_encoded', 'rented_for_encoded']

# Prepare training and testing data
train_X = train_df[features]
train_y = train_df['fit_encoded']
test_X = test_df[features]
test_y = test_df['fit_encoded']

# Fill missing values in features (if any) with the mean
train_X = train_X.fillna(train_X.mean())
test_X = test_X.fillna(test_X.mean())

# Train a Logistic Regression model
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(train_X, train_y)

# Make predictions on the test data
test_predictions = model.predict(test_X)

# Compute the F2 score
f2_score_test = fbeta_score(test_y, test_predictions, beta=2, average='weighted')

print(f"F2 Score on the test data: {f2_score_test:.4f}")

# Identify most important factors based on coefficients
feature_importance = pd.DataFrame({
    'Feature': features,
    'Coefficient': model.coef_[0]
}).sort_values(by='Coefficient', ascending=False)

print("\nMost Connected Factors to Fit Feedback:")
print(feature_importance)

F2 Score on the test data: 0.4464

Most Connected Factors to Fit Feedback:
              Feature  Coefficient
0      weight_numeric     0.007689
3                 age     0.003980
6   body_type_encoded    -0.002026
1       height_inches    -0.007493
4           band_size    -0.011333
2                size    -0.025217
5            cup_size    -0.041787
7  rented_for_encoded    -0.041875


In [13]:
## Methond 3.2. The LogisticRegression model using the best 5 features.

# Purpose:

    # Identify the most important features (using RFE).
    # Build a simpler model using only those features.
    # Evaluate its performance using the F2 score.

# Outcome:

    # A logistic regression model trained on the top 5 features, achieving a specific F2 score on the test set.
    # Insights into which features most strongly influence the "Fit" prediction.

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd


# Select relevant features (including preprocessed numerical and encoded features)
features = ['weight_numeric', 'height_inches', 'size', 'age', 
            'band_size', 'cup_size', 'body_type_encoded', 'rented_for_encoded']

# Standardize the features
scaler = StandardScaler()
train_X_scaled = scaler.fit_transform(train_df[features])
test_X_scaled = scaler.transform(test_df[features])
train_y = train_df['fit_encoded']
test_y = test_df['fit_encoded']

# Initialize the Logistic Regression model
model = LogisticRegression(random_state=42, max_iter=1000)

# Recursive Feature Elimination (RFE)
rfe = RFE(model, n_features_to_select=5)  # Select the top 5 features
rfe.fit(train_X_scaled, train_y)

# Display selected features
selected_features = [features[i] for i in range(len(features)) if rfe.support_[i]]
print(f"Selected features by RFE: {selected_features}")

# Standardize the features
scaler = StandardScaler()
train_X_scaled = scaler.fit_transform(train_df[selected_features])
test_X_scaled = scaler.transform(test_df[selected_features])
train_y = train_df['fit_encoded']
test_y = test_df['fit_encoded']

# Initialize the Logistic Regression model
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(train_X_scaled, train_y)

# Make predictions on the test data
test_predictions = model.predict(test_X_scaled)

# Compute the F2 score
f2_score_test = fbeta_score(test_y, test_predictions, beta=2, average='weighted')

print(f"F2 Score on the test data: {f2_score_test:.4f}")

# Identify most important factors based on coefficients
feature_importance = pd.DataFrame({
    'Feature': selected_features,
    'Coefficient': model.coef_[0]
}).sort_values(by='Coefficient', ascending=False)

print("\nMost Connected Factors to Fit Feedback:")
print(feature_importance)


Selected features by RFE: ['weight_numeric', 'height_inches', 'size', 'band_size', 'rented_for_encoded']
F2 Score on the test data: 0.4408

Most Connected Factors to Fit Feedback:
              Feature  Coefficient
0      weight_numeric     0.122287
3           band_size     0.038993
1       height_inches     0.020976
4  rented_for_encoded    -0.077131
2                size    -0.215325


In [14]:
## Methond 3.3. The LogisticRegression model using all body-related features
# features = ['weight_numeric', 'height_inches', 'size', 'age', 'band_size', 'cup_size', 'body_type_encoded', 'rented_for_encoded']

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Select relevant features (including preprocessed numerical and encoded features)
features = ['weight_numeric', 'height_inches', 'size',
            'band_size', 'cup_size', 'body_type_encoded']

# Standardize the features
scaler = StandardScaler()
train_X_scaled = scaler.fit_transform(train_df[features])
test_X_scaled = scaler.transform(test_df[features])
train_y = train_df['fit_encoded']
test_y = test_df['fit_encoded']

# Train a Logistic Regression model
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(train_X_scaled, train_y)

# Make predictions on the test data
test_predictions = model.predict(test_X_scaled)

# Compute the F2 score
f2_score_test = fbeta_score(test_y, test_predictions, beta=2, average='weighted')

print(f"F2 Score on the test data: {f2_score_test:.4f}")

# Identify most important factors based on coefficients
feature_importance = pd.DataFrame({
    'Feature': features,
    'Coefficient': model.coef_[0]
}).sort_values(by='Coefficient', ascending=False)

print("\nMost Connected Factors to Fit Feedback:")
print(feature_importance)


F2 Score on the test data: 0.4301

Most Connected Factors to Fit Feedback:
             Feature  Coefficient
0     weight_numeric     0.134428
3          band_size     0.032943
1      height_inches     0.017679
5  body_type_encoded     0.006472
4           cup_size    -0.032655
2               size    -0.208988


In [15]:
## Methond 4.1.  Distance-Based Model (Similarity Matching) Euclidean Distance - KNN 
# this model using all features.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import fbeta_score

# Define the features and target variable
features = ['weight_numeric', 'height_inches', 'size', 'age', 
            'band_size', 'cup_size', 'body_type_encoded', 'rented_for_encoded']
target = 'fit_encoded'

# Prepare training and testing data
train_X = train_df[features]
train_y = train_df[target]
test_X = test_df[features]
test_y = test_df[target]

# Standardize the features to ensure proper scaling for distance calculations
scaler = StandardScaler()
train_X_scaled = scaler.fit_transform(train_X)
test_X_scaled = scaler.transform(test_X)

# Initialize and train the k-NN model
knn_model = KNeighborsClassifier(n_neighbors=5, metric='euclidean')  # Use Euclidean distance as default
knn_model.fit(train_X_scaled, train_y)

# Make predictions on the test set
test_predictions = knn_model.predict(test_X_scaled)

# Compute the F2 score
f2_score_knn = fbeta_score(test_y, test_predictions, beta=2, average='weighted')

print(f"F2 Score for k-NN Model: {f2_score_knn:.4f}")


F2 Score for k-NN Model: 0.3915


In [16]:
## Methond 4.2.  Distance-Based Model (Similarity Matching) Euclidean Distance - KNN
# this model using the body related features and the size.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import fbeta_score

# Define the features and target variable
features = ['weight_numeric', 'height_inches', 'band_size', 'cup_size', 'body_type_encoded', 'size']
target = 'fit_encoded'

# Prepare training and testing data
train_X = train_df[features]
train_y = train_df[target]
test_X = test_df[features]
test_y = test_df[target]

# Standardize the features to ensure proper scaling for distance calculations
scaler = StandardScaler()
train_X_scaled = scaler.fit_transform(train_X)
test_X_scaled = scaler.transform(test_X)

# Initialize and train the k-NN model
knn_model = KNeighborsClassifier(n_neighbors=5, metric='euclidean')  # Use Euclidean distance as default
knn_model.fit(train_X_scaled, train_y)

# Make predictions on the test set
test_predictions = knn_model.predict(test_X_scaled)

# Compute the F2 score
f2_score_knn = fbeta_score(test_y, test_predictions, beta=2, average='weighted')

print(f"F2 Score for k-NN Model: {f2_score_knn:.4f}")



F2 Score for k-NN Model: 0.4056


In [17]:
## Methond 5. SGD based SVM
# this model using body related features.

from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import fbeta_score

# Define the features and target variable
features = ['weight_numeric', 'height_inches', 'band_size', 'cup_size', 'body_type_encoded', 'size']
target = 'fit_encoded'

# Prepare training and testing data
train_X = train_df[features]
train_y = train_df[target]
test_X = test_df[features]
test_y = test_df[target]

# Standardize the features
scaler = StandardScaler()
train_X_scaled = scaler.fit_transform(train_X)
test_X_scaled = scaler.transform(test_X)

# Initialize and train the SGDClassifier
sgd_svm = SGDClassifier(loss="hinge", random_state=42, max_iter=1000, tol=1e-3)
sgd_svm.fit(train_X_scaled, train_y)

# Make predictions on the test set
test_predictions = sgd_svm.predict(test_X_scaled)

# Compute the F2 score
f2_score_sgd = fbeta_score(test_y, test_predictions, beta=2, average='weighted')

print(f"F2 Score for SGD-based SVM Model: {f2_score_sgd:.4f}")


F2 Score for SGD-based SVM Model: 0.4206


In [18]:
## Methond 6. For your classification problem (predicting fit feedback: "Small," "Fit," or "Large"), 
# we can use a Matrix Factorization approach to create latent representations and then classify based on these representations.

from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score
from sklearn.preprocessing import StandardScaler
import numpy as np

# Define the features and target variable
features = ['weight_numeric', 'height_inches', 'band_size', 'cup_size', 'body_type_encoded', 'size']
target = 'fit_encoded'

# Prepare training and testing data
train_X = train_df[features]
train_y = train_df[target]
test_X = test_df[features]
test_y = test_df[target]

# Standardize the features
scaler = StandardScaler()
train_X_scaled = scaler.fit_transform(train_X)
test_X_scaled = scaler.transform(test_X)

# Apply Latent Factor Decomposition
n_components = 5  # Number of latent factors
svd = TruncatedSVD(n_components=n_components, random_state=42)
train_X_latent = svd.fit_transform(train_X_scaled)
test_X_latent = svd.transform(test_X_scaled)

# Train a Logistic Regression model on the latent features
clf = LogisticRegression(random_state=42, max_iter=1000)
clf.fit(train_X_latent, train_y)

# Make predictions on the test set
test_predictions = clf.predict(test_X_latent)

# Compute the F2 score
f2_score_latent = fbeta_score(test_y, test_predictions, beta=2, average='weighted')

print(f"F2 Score for Latent-Factor Model: {f2_score_latent:.4f}")


F2 Score for Latent-Factor Model: 0.3922


In [19]:
## Methond 7. Gradient Boosting Models (e.g., XGBoost, LightGBM, CatBoost)
# Why it’s a good fit:
# Handles both numerical and categorical data well.
# Captures complex interactions between features without requiring much preprocessing.
# Works well for classification problems with structured data.
# How it works:
# Combines weak learners (decision trees) sequentially to improve predictions by minimizing errors from previous iterations.

from xgboost import XGBClassifier
from sklearn.metrics import fbeta_score

# Initialize and train the XGBoost model
xgb_model = XGBClassifier(random_state=42, n_estimators=100, max_depth=5, learning_rate=0.1)
xgb_model.fit(train_X, train_y)

# Make predictions on the test set
test_predictions = xgb_model.predict(test_X)

# Compute the F2 score
f2_score_xgb = fbeta_score(test_y, test_predictions, beta=2, average='weighted')

print(f"F2 Score for XGBoost Model: {f2_score_xgb:.4f}")


F2 Score for XGBoost Model: 0.4650


In [20]:
## Method 8
# Random Forest
# Why it’s a good fit:
# Handles non-linear relationships well.
# Robust to noisy features and less prone to overfitting compared to single decision trees.
# How it works:
# Creates multiple decision trees on random subsets of data and features, then aggregates their predictions.

from sklearn.ensemble import RandomForestClassifier

# Initialize and train the Random Forest model
rf_model = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=10)
rf_model.fit(train_X, train_y)

# Make predictions on the test set
test_predictions = rf_model.predict(test_X)

# Compute the F2 score
f2_score_rf = fbeta_score(test_y, test_predictions, beta=2, average='weighted')

print(f"F2 Score for Random Forest Model: {f2_score_rf:.4f}")


F2 Score for Random Forest Model: 0.4585


In [21]:
## method 9.1 Single Decision Tree

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import fbeta_score

# Define the features and target variable
features = ['weight_numeric', 'height_inches', 'band_size', 'cup_size', 'body_type_encoded', 'size']
target = 'fit_encoded'

# Prepare training and testing data
train_X = train_df[features]
train_y = train_df[target]
test_X = test_df[features]
test_y = test_df[target]

# Initialize and train the Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42, max_depth=10)  # Limit depth to avoid overfitting
dt_model.fit(train_X, train_y)

# Make predictions on the test set
test_predictions = dt_model.predict(test_X)

# Compute the F2 score
f2_score_dt = fbeta_score(test_y, test_predictions, beta=2, average='weighted')

print(f"F2 Score for Decision Tree Model: {f2_score_dt:.4f}")


F2 Score for Decision Tree Model: 0.4549


In [22]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import fbeta_score
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

# Define a custom Dataset for text data
class TextDataset(Dataset):
    def __init__(self, reviews, labels, tokenizer, max_len):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        review = str(self.reviews[idx])
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Parameters
MAX_LEN = 128
BATCH_SIZE = 32
EPOCHS = 3
LEARNING_RATE = 1e-3

# Load the BERT tokenizer (used only for tokenization)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Split data into train and test sets
train_reviews, test_reviews, train_labels, test_labels = train_test_split(
    train_df['review_text'], train_df['fit_encoded'], test_size=0.2, random_state=42
)

# Create DataLoader
train_dataset = TextDataset(train_reviews.tolist(), train_labels.tolist(), tokenizer, MAX_LEN)
test_dataset = TextDataset(test_reviews.tolist(), test_labels.tolist(), tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Define the CNN model
class TextCNN(nn.Module):
    def __init__(self, num_classes, vocab_size, embedding_dim=100, num_filters=100, filter_sizes=[3, 4, 5]):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (fs, embedding_dim)) for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * num_filters, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, input_ids):
        embedded = self.embedding(input_ids).unsqueeze(1)  # Add a channel dimension
        conved = [torch.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        pooled = [torch.max(c, dim=2)[0] for c in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)

# Initialize the CNN model
vocab_size = tokenizer.vocab_size
num_classes = len(set(train_labels))
cnn_model = TextCNN(num_classes=num_classes, vocab_size=vocab_size).to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cnn_model.parameters(), lr=LEARNING_RATE)

# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
for epoch in range(EPOCHS):
    cnn_model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)

        outputs = cnn_model(input_ids)
        loss = criterion(outputs, labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

# Evaluation loop
cnn_model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)

        outputs = cnn_model(input_ids)
        preds = torch.argmax(outputs, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Compute the F2 score
f2_score_cnn = fbeta_score(all_labels, all_preds, beta=2, average='weighted')
print(f"F2 Score for CNN Model: {f2_score_cnn:.4f}")


Epoch 1, Loss: 0.8944502981682109
Epoch 2, Loss: 0.7625614121619962
Epoch 3, Loss: 0.7138131280931143
F2 Score for CNN Model: 0.7056
