## Note: This notebook is ran on Google Colab with GPU enabled.

In [1]:
!pip install catboost
!pip install timm
!pip install transformers datasets evaluate
!pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp39-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting timm
  Downloading timm-0.6.13-py3-none-any.whl (549 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m549.1/549.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: huggingface-hub, timm
Successfully installed huggingface-hub-0.13.4 timm-0.6.13
Lo

In [2]:
from tqdm import tqdm
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, classification_report
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR
from torchvision import transforms
import torch
from matplotlib import pyplot as plt
import timm
from transformers import AutoTokenizer
import datasets

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


**Load Training data**

In [64]:
# Preprocess the data
data = pd.read_csv("train.csv")

X_tabular = data.drop(['id', 'category'], axis=1)

categorical_cols = ['gender', 'baseColour', 'season', 'usage']
categorical_cols_idx = [X_tabular.columns.get_loc(col) for col in categorical_cols]
y_tabular = data['category']

X_img = data['id']
y_img = data['category']

# load test data
test_data = pd.read_csv("test.csv")
ids = test_data['id']
test_tabular_data = test_data.drop(['id'], axis=1)

# split the data into train and validation sets
train_idx, val_idx = train_test_split(data.index, test_size=0.2, random_state=42, stratify=data['category'])
X_train_tabular, X_val_tabular = X_tabular.iloc[train_idx], X_tabular.iloc[val_idx]
y_train_tabular, y_val_tabular = y_tabular.iloc[train_idx], y_tabular.iloc[val_idx]

# use idx to load images from img_folder for training into a list
def load_images(img_folder, train_idx):
    img_list = []
    for i in tqdm(train_idx):
        img = plt.imread(img_folder + str(i) + '.jpg')
        img_list.append(img)
    return img_list

X_train_img = load_images('', train_idx)
X_val_img = load_images('', val_idx)
X_test_img = load_images('', test_data['id'])
y_train_img = y_img.iloc[train_idx]
y_val_img = y_img.iloc[val_idx]

**Random Forest Classifier**


In [98]:
# Column transformer for handling both categorical and text features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('text', TfidfVectorizer(), 'noisyTextDescription')
    ])

# Create the Random Forest classifier
rand_forest_clf = RandomForestClassifier(n_estimators=100, random_state=42, verbose=1)

# Create a pipeline with the preprocessor and the classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', rand_forest_clf)
])

# Train the pipeline on the training data
pipeline.fit(X_train_tabular, y_train_tabular)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   19.6s finished


**CatBoost Model**

In [4]:
# Train catboost model
catboost_model = CatBoostClassifier(iterations=2000, depth=8, l2_leaf_reg=3, loss_function="MultiClass", random_seed=42, task_type='GPU', verbose=1)
catboost_model.fit(X_train_tabular, y_train_tabular, cat_features=categorical_cols_idx, text_features=[4], eval_set=(X_val_tabular, y_val_tabular))

0:	learn: 2.9397897	test: 2.9214553	best: 2.9214553 (0)	total: 125ms	remaining: 4m 10s
1:	learn: 2.7389913	test: 2.7119626	best: 2.7119626 (1)	total: 223ms	remaining: 3m 42s
2:	learn: 2.5951871	test: 2.5620700	best: 2.5620700 (2)	total: 312ms	remaining: 3m 27s
3:	learn: 2.4822285	test: 2.4447730	best: 2.4447730 (3)	total: 376ms	remaining: 3m 7s
4:	learn: 2.3905458	test: 2.3494376	best: 2.3494376 (4)	total: 449ms	remaining: 2m 59s
5:	learn: 2.3122250	test: 2.2678894	best: 2.2678894 (5)	total: 510ms	remaining: 2m 49s
6:	learn: 2.2437675	test: 2.1973029	best: 2.1973029 (6)	total: 572ms	remaining: 2m 42s
7:	learn: 2.1793809	test: 2.1306519	best: 2.1306519 (7)	total: 634ms	remaining: 2m 37s
8:	learn: 2.1254179	test: 2.0739463	best: 2.0739463 (8)	total: 694ms	remaining: 2m 33s
9:	learn: 2.0768288	test: 2.0234517	best: 2.0234517 (9)	total: 758ms	remaining: 2m 30s
10:	learn: 2.0299919	test: 1.9736942	best: 1.9736942 (10)	total: 819ms	remaining: 2m 28s
11:	learn: 1.9880963	test: 1.9300281	best:

<catboost.core.CatBoostClassifier at 0x7f7ce7232b50>

In [6]:
# predict on test set with catboost only
catboost_preds = catboost_model.predict(test_tabular_data)
# add id column to the predictions
catboost_preds = pd.DataFrame(catboost_preds, columns=['category'])
catboost_preds['id'] = ids
# reorder the columns
catboost_preds = catboost_preds[['id', 'category']]
# to csv with header
catboost_preds.to_csv("catboost_preds.csv", index=False)

**EfficientNet Model**

In [66]:
X_train_img = pd.read_pickle('X_train_img.pkl')
X_test_img = pd.read_pickle('X_test_img.pkl')

In [67]:
X_train_img.columns

Index(['id', 'category', 'img'], dtype='object')

In [68]:
# save one-hot encoded category column names for later
dummy_columns = pd.get_dummies(X_train_img[['category']], columns=['category']).columns

In [69]:
y_img = torch.tensor(pd.get_dummies(X_train_img[['category']], columns=['category']).values, dtype=torch.float32)
X_img = X_train_img.drop(columns=['id', 'category'])

In [70]:
y_img

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.]])

In [71]:
# use the same train_idx, val_idx as catBoost
X_train_img, X_val_img = X_img.iloc[train_idx], X_img.iloc[val_idx]
y_train_img, y_val_img = y_img[train_idx], y_img[val_idx]

In [73]:
# create the dataloaders
class ImageDataset(Dataset):
    def __init__(self, X, y=None, train=True):
        self.X = X
        self.y = y
        self.train = train
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        img = self.X[idx]
        img = Image.fromarray(img)
        if self.train:
            img = transforms.RandomHorizontalFlip()(img)
        if self.y is not None:
          return img, self.y[idx]
        else:
          return img

batch_size = 128
train_dataset = ImageDataset(X_train_img, y_train_img, train=True)
val_dataset = ImageDataset(X_val_img, y_val_img, train=False)
test_dataset = ImageDataset(X_test_img, train=False)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [74]:
efficientnet = timm.create_model('tf_efficientnetv2_s', pretrained=True, num_classes=27).to(device)

In [75]:
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(efficientnet.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=1, verbose=True)
epochs = 10

for epoch in tqdm(range(epochs)):
    efficientnet.train()
    for i, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = efficientnet(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f"Epoch: {epoch}, Training Loss: {loss.item():4f}")

    efficientnet.eval()
    val_loss = 0.0
    for i, (inputs, labels) in enumerate(val_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)
        with torch.no_grad():
            outputs = efficientnet(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
    val_loss /= len(val_loader)
    print(f"Epoch: {epoch}, Validation Loss: {val_loss:4f}")

    # Update the scheduler
    scheduler.step(val_loss)

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch: 0, Training Loss: 0.049896


 10%|█         | 1/10 [00:29<04:23, 29.33s/it]

Epoch: 0, Validation Loss: 0.056056
Epoch: 1, Training Loss: 0.064641


 20%|██        | 2/10 [00:57<03:50, 28.80s/it]

Epoch: 1, Validation Loss: 0.053133
Epoch: 2, Training Loss: 0.030953


 30%|███       | 3/10 [01:26<03:20, 28.68s/it]

Epoch: 2, Validation Loss: 0.052570
Epoch: 3, Training Loss: 0.056901


 40%|████      | 4/10 [01:54<02:51, 28.65s/it]

Epoch: 3, Validation Loss: 0.054959
Epoch: 4, Training Loss: 0.040695


 50%|█████     | 5/10 [02:23<02:22, 28.60s/it]

Epoch: 4, Validation Loss: 0.055184
Epoch 00005: reducing learning rate of group 0 to 1.0000e-04.
Epoch: 5, Training Loss: 0.038068


 60%|██████    | 6/10 [02:51<01:54, 28.52s/it]

Epoch: 5, Validation Loss: 0.053042
Epoch: 6, Training Loss: 0.028325


 70%|███████   | 7/10 [03:20<01:25, 28.49s/it]

Epoch: 6, Validation Loss: 0.056297
Epoch 00007: reducing learning rate of group 0 to 1.0000e-05.
Epoch: 7, Training Loss: 0.038694


 80%|████████  | 8/10 [03:48<00:56, 28.48s/it]

Epoch: 7, Validation Loss: 0.056876
Epoch: 8, Training Loss: 0.040646


 90%|█████████ | 9/10 [04:17<00:28, 28.49s/it]

Epoch: 8, Validation Loss: 0.057710
Epoch 00009: reducing learning rate of group 0 to 1.0000e-06.
Epoch: 9, Training Loss: 0.014923


100%|██████████| 10/10 [04:45<00:00, 28.57s/it]

Epoch: 9, Validation Loss: 0.057688





**MobileBERT model**

In [24]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import pipeline
from sklearn.preprocessing import LabelEncoder

In [25]:
tokenizer = AutoTokenizer.from_pretrained("google/mobilebert-uncased")
max_length = 76 # the longest text in the dataset has 76 tokens

X_text = "The item is "  + data['noisyTextDescription'] + ". It is used by gender " + data['gender'] + ". Its colour is " + data['baseColour'] + ". It is suitable for " + data['season'] + ". Its usage is " + data['usage'] + "."
y_text = data["category"]
X_train_text, X_val_text = X_text.iloc[train_idx], X_text.iloc[val_idx]
y_train_text, y_val_text = y_text.iloc[train_idx], y_text.iloc[val_idx]
X_test_text = "The item is "  + test_data['noisyTextDescription'] + ". It is used by gender " + test_data['gender'] + ". Its colour is " + test_data['baseColour'] + ". It is suitable for " + test_data['season'] + ". Its usage is " + test_data['usage'] + "."

# Encode category strings as numerical labels
label_encoder = LabelEncoder()
y_train_text_encoded = label_encoder.fit_transform(y_train_text)
y_val_text_encoded = label_encoder.transform(y_val_text)

train_data = {"text": X_train_text, "label": y_train_text_encoded}
val_data = {"text": X_val_text, "label": y_val_text_encoded}
test__data = {"text": X_test_text}

train_dataset = datasets.Dataset.from_dict(train_data)
val_dataset = datasets.Dataset.from_dict(val_data)
test_dataset = datasets.Dataset.from_dict(test__data)

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=max_length)

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/847 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/17301 [00:00<?, ? examples/s]

Map:   0%|          | 0/4326 [00:00<?, ? examples/s]

Map:   0%|          | 0/21628 [00:00<?, ? examples/s]

In [26]:
# Create the mobilebert model
mobilebert = AutoModelForSequenceClassification.from_pretrained("google/mobilebert-uncased", num_labels=27).to(device)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="my_product_classifier",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Define the compute_metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": accuracy_score(labels, predictions)}

# Create the Trainer
trainer = Trainer(
    model=mobilebert,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

Downloading pytorch_model.bin:   0%|          | 0.00/147M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/mobilebert-uncased were not used when initializing MobileBertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing MobileBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MobileBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some 

Epoch,Training Loss,Validation Loss,Accuracy
1,248839.392,0.872535,0.754739
2,0.8267,0.720798,0.79681
3,0.6998,0.660796,0.816921
4,0.6323,0.702248,0.820157
5,0.6994,0.646826,0.82478
6,0.5336,0.634528,0.825012
7,0.497,0.632806,0.826399
8,0.4751,0.647289,0.827323
9,0.4319,0.643935,0.828941
10,0.6969,0.650416,0.827785


TrainOutput(global_step=5410, training_loss=22998.633973084623, metrics={'train_runtime': 1553.1708, 'train_samples_per_second': 111.391, 'train_steps_per_second': 3.483, 'total_flos': 1696252738824000.0, 'train_loss': 22998.633973084623, 'epoch': 10.0})

In [None]:
# Create the mobilebert model
mobilebert = AutoModelForSequenceClassification.from_pretrained("google/mobilebert-uncased", num_labels=27).to(device)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="my_product_classifier",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Define the compute_metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": accuracy_score(labels, predictions)}

# Create the Trainer
trainer = Trainer(
    model=mobilebert,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

Some weights of the model checkpoint at google/mobilebert-uncased were not used when initializing MobileBertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing MobileBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MobileBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some 

Epoch,Training Loss,Validation Loss,Accuracy
1,142005.696,0.828167,0.774387
2,0.8183,0.70895,0.801433
3,0.6899,0.655438,0.820388
4,0.7433,0.625558,0.825474
5,0.7781,0.631772,0.828941
6,0.5363,0.62457,0.824087
7,0.497,0.605467,0.829172
8,0.4807,0.620988,0.830328
9,0.647,0.623475,0.832178
10,0.4298,0.625483,0.833102


TrainOutput(global_step=5410, training_loss=13124.922032489882, metrics={'train_runtime': 1419.5769, 'train_samples_per_second': 121.874, 'train_steps_per_second': 3.811, 'total_flos': 1484221146471000.0, 'train_loss': 13124.922032489882, 'epoch': 10.0})

**Validation Set Predictions**

In [56]:
def proba_to_labels(proba):
  # convert to one-hot
  numerical = np.argmax(proba, axis=1)
  onehot = np.eye(27)[numerical]
  # convert to original labels
  category_df = pd.DataFrame(onehot, columns=dummy_columns).idxmax(axis=1)
  category_df = category_df.str.replace("category_", "")
  return category_df

In [76]:
# Obtain Random Forest predictions on the validation set
X_val_tabular_preprocessed = pipeline.named_steps['preprocessor'].transform(X_val_tabular)
val_preds_rand_forest = rand_forest_clf.predict_proba(X_val_tabular_preprocessed)

# Obtain CatBoost predictions on the validation set
val_preds_catboost = catboost_model.predict_proba(X_val_tabular)

# Obtain EfficientNet predictions on the validation set
val_preds_efficientnet = []
efficientnet.eval()
for i, (inputs, labels) in enumerate(val_loader):
    inputs = inputs.to(device)
    with torch.no_grad():
        outputs = efficientnet(inputs)
        preds = torch.softmax(outputs, dim=1).cpu().numpy()
        val_preds_efficientnet.append(preds)
val_preds_efficientnet = np.vstack(val_preds_efficientnet)

# Obtain mobilebert predictions on the validation set
val_predictions = trainer.predict(tokenized_val_dataset)
val_preds_mobilebert = torch.softmax(torch.tensor(val_predictions.predictions), dim=1).numpy()

In [100]:
# Random Forest validation set evaluation
val_preds_rand_forest_labels = proba_to_labels(val_preds_rand_forest)
print(f'accuracy: {accuracy_score(y_val_tabular, val_preds_rand_forest_labels)}')
print(classification_report(y_val_tabular, val_preds_rand_forest_labels))

accuracy: 0.8291724456773001
                          precision    recall  f1-score   support

             Accessories       1.00      0.58      0.74        12
             Apparel Set       0.00      0.00      0.00        10
                    Bags       0.81      0.84      0.82       307
                   Belts       0.88      0.56      0.69        80
              Bottomwear       0.91      0.52      0.66       262
               Cufflinks       1.00      0.91      0.95        11
                   Dress       0.96      0.52      0.68        48
                 Eyewear       0.95      0.76      0.84       104
              Flip Flops       0.88      0.67      0.76        90
               Fragrance       0.87      0.93      0.90       100
              Free Gifts       0.50      0.11      0.18         9
                Headwear       0.93      0.50      0.65        26
               Innerwear       0.94      0.73      0.82       181
               Jewellery       0.93      0.89 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [92]:
# CatBoost validation set evaluation
val_preds_catboost_labels = proba_to_labels(val_preds_catboost)
print(f'accuracy: {accuracy_score(y_val_tabular, val_preds_catboost_labels)}')
print(classification_report(y_val_tabular, val_preds_catboost_labels))

accuracy: 0.8421174294960703
                          precision    recall  f1-score   support

             Accessories       0.83      0.42      0.56        12
             Apparel Set       1.00      0.20      0.33        10
                    Bags       0.83      0.82      0.82       307
                   Belts       0.80      0.61      0.70        80
              Bottomwear       0.84      0.71      0.77       262
               Cufflinks       1.00      0.91      0.95        11
                   Dress       0.90      0.56      0.69        48
                 Eyewear       0.92      0.81      0.86       104
              Flip Flops       0.83      0.66      0.73        90
               Fragrance       0.96      0.91      0.93       100
              Free Gifts       0.00      0.00      0.00         9
                Headwear       0.88      0.58      0.70        26
               Innerwear       0.84      0.80      0.82       181
               Jewellery       0.90      0.90 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [91]:
# EfficientNet validation set evaluation
val_preds_efficientnet_labels = proba_to_labels(val_preds_efficientnet)
print(f'accuracy: {accuracy_score(y_val_tabular, val_preds_efficientnet_labels)}')
print(classification_report(y_val_tabular, val_preds_efficientnet_labels))

accuracy: 0.816458622283865
                          precision    recall  f1-score   support

             Accessories       0.82      0.75      0.78        12
             Apparel Set       0.70      0.70      0.70        10
                    Bags       0.81      0.82      0.82       307
                   Belts       0.80      0.85      0.82        80
              Bottomwear       0.79      0.77      0.78       262
               Cufflinks       0.62      0.73      0.67        11
                   Dress       0.44      0.33      0.38        48
                 Eyewear       0.82      0.81      0.82       104
              Flip Flops       0.72      0.68      0.70        90
               Fragrance       0.71      0.73      0.72       100
              Free Gifts       0.00      0.00      0.00         9
                Headwear       0.70      0.73      0.72        26
               Innerwear       0.82      0.81      0.81       181
               Jewellery       0.66      0.65  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [90]:
# MobileBERT validation set evaluation
val_preds_mobilebert_labels = proba_to_labels(val_preds_mobilebert)
print(f'accuracy: {accuracy_score(y_val_tabular, val_preds_mobilebert_labels)}')
print(classification_report(y_val_tabular, val_preds_mobilebert_labels))

accuracy: 0.8263985205732779
                          precision    recall  f1-score   support

             Accessories       0.70      0.58      0.64        12
             Apparel Set       1.00      0.50      0.67        10
                    Bags       0.81      0.78      0.79       307
                   Belts       0.74      0.60      0.66        80
              Bottomwear       0.84      0.71      0.77       262
               Cufflinks       0.91      0.91      0.91        11
                   Dress       0.96      0.54      0.69        48
                 Eyewear       0.83      0.74      0.78       104
              Flip Flops       0.90      0.69      0.78        90
               Fragrance       0.88      0.89      0.89       100
              Free Gifts       0.00      0.00      0.00         9
                Headwear       0.78      0.69      0.73        26
               Innerwear       0.80      0.73      0.76       181
               Jewellery       0.84      0.95 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Build a soft voting classifier using RandomForest Classifier, CatBoost Classifier, EfficientNet, and MobileBERT**

In [101]:
# Determine the optimal weights using cross-validation
kf = KFold(n_splits=10)
weights = np.linspace(0, 1, 21)
best_weights = (0, 0, 0, 0)
best_accuracy = 0

for w1 in weights:
    for w2 in weights:
        for w3 in weights:
            w4 = 1 - w1 - w2 - w3
            if w4 < 0:  # skip if the sum of weights exceeds 1
                continue
            accuracies = []
            for train_idx, val_idx in kf.split(X_val_tabular):
                val_preds_weighted = w1 * val_preds_catboost[val_idx] + w2 * val_preds_efficientnet[val_idx] + w3 * val_preds_mobilebert[val_idx] + w4 * val_preds_rand_forest[val_idx]
                val_preds_weighted = torch.softmax(torch.tensor(val_preds_weighted, dtype=torch.float), dim=-1)
                val_preds_weighted_labels = np.argmax(val_preds_weighted.numpy(), axis=1)
                val_preds_weighted_onehot = np.eye(27)[val_preds_weighted_labels]
                accuracy = accuracy_score(y_val_img[val_idx], val_preds_weighted_onehot)
                accuracies.append(accuracy)
            avg_accuracy = np.mean(accuracies)
            # print(f"weights: {w1:.2f}, {w2:.2f}, {w3:.2f}, {w4:.2f} accuracy: {avg_accuracy:.3f}")
            if avg_accuracy > best_accuracy:
                best_accuracy = avg_accuracy
                best_weights = (w1, w2, w3, w4)

best_weights = np.round(best_weights, 3)
print(f"Optimal weights: {best_weights}, best accuracy: {best_accuracy}")

Optimal weights: [0.05 0.45 0.45 0.05], best accuracy: 0.9269507527157643


In [102]:
# ensemble validation set accuracy
val_preds_weighted = best_weights[0] * val_preds_catboost + best_weights[1] * val_preds_efficientnet + best_weights[2] * val_preds_mobilebert + best_weights[3] * val_preds_rand_forest
val_preds_weighted_labels = proba_to_labels(val_preds_weighted)
print(f"Accuracy: {accuracy_score(y_val_tabular, val_preds_weighted_labels)}")
print(classification_report(y_val_tabular, val_preds_weighted_labels))

Accuracy: 0.9269533055940823
                          precision    recall  f1-score   support

             Accessories       0.80      0.67      0.73        12
             Apparel Set       1.00      0.70      0.82        10
                    Bags       0.93      0.94      0.94       307
                   Belts       0.92      0.91      0.92        80
              Bottomwear       0.94      0.86      0.90       262
               Cufflinks       0.91      0.91      0.91        11
                   Dress       0.88      0.60      0.72        48
                 Eyewear       0.94      0.90      0.92       104
              Flip Flops       0.89      0.86      0.87        90
               Fragrance       0.92      0.98      0.95       100
              Free Gifts       0.00      0.00      0.00         9
                Headwear       0.92      0.85      0.88        26
               Innerwear       0.97      0.94      0.96       181
               Jewellery       0.94      0.93 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Retrain all models using training + validation set before test set predictions**

In [103]:
# Retrain the Random Forest classifier
rand_forest_clf_retrained = RandomForestClassifier(n_estimators=100, random_state=42, verbose=1)
pipeline_retrained = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', rand_forest_clf_retrained)
])
pipeline_retrained.fit(X_tabular, y_tabular)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   26.6s finished


In [83]:
# Retrain the CatBoostClassifier
catboost_model_retrained = CatBoostClassifier(iterations=2000, depth=8, l2_leaf_reg=3, loss_function="MultiClass", random_seed=42, task_type='GPU', verbose=0)
catboost_model_retrained.fit(X_tabular, y_tabular, cat_features=categorical_cols_idx, text_features=[4])



<catboost.core.CatBoostClassifier at 0x7f7cac9c98e0>

In [84]:
X_train_val_img = pd.concat([X_train_img, X_val_img], axis=0)
y_train_val_img = torch.cat([y_train_img, y_val_img], dim=0)

In [86]:
# Retrain the EfficientNet model

# Combine train and validation image data
X_train_val_img = pd.concat([X_train_img, X_val_img], axis=0)
y_train_val_img = torch.cat([y_train_img, y_val_img], dim=0)

# Create train_val_dataset and train_val_loader
train_val_dataset = ImageDataset(X_train_val_img, y_train_val_img, train=True)
train_val_loader = DataLoader(train_val_dataset, batch_size=batch_size, shuffle=True)

efficientnet_retrained = timm.create_model('tf_efficientnetv2_s', pretrained=True, num_classes=27).to(device)
optimizer_retrained = torch.optim.Adam(efficientnet_retrained.parameters(), lr=0.001)
scheduler_retrained = StepLR(optimizer_retrained, step_size=4, gamma=0.1)
epochs = 10

for epoch in tqdm(range(epochs)):
    efficientnet_retrained.train()
    epoch_loss = 0.0
    num_batches = 0
    for i, (inputs, labels) in enumerate(train_val_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer_retrained.zero_grad()
        outputs = efficientnet_retrained(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer_retrained.step()
        epoch_loss += loss.item()
        num_batches += 1
    avg_epoch_loss = epoch_loss / num_batches
    print(f"Epoch: {epoch}, Training Loss: {avg_epoch_loss:.4f}")

    scheduler_retrained.step()

 10%|█         | 1/10 [00:36<05:24, 36.02s/it]

Epoch: 0, Training Loss: 0.0842


 20%|██        | 2/10 [01:10<04:43, 35.38s/it]

Epoch: 1, Training Loss: 0.0500


 30%|███       | 3/10 [01:46<04:06, 35.26s/it]

Epoch: 2, Training Loss: 0.0457


 40%|████      | 4/10 [02:21<03:31, 35.33s/it]

Epoch: 3, Training Loss: 0.0426


 50%|█████     | 5/10 [02:56<02:56, 35.24s/it]

Epoch: 4, Training Loss: 0.0335


 60%|██████    | 6/10 [03:31<02:20, 35.24s/it]

Epoch: 5, Training Loss: 0.0282


 70%|███████   | 7/10 [04:07<01:45, 35.23s/it]

Epoch: 6, Training Loss: 0.0239


 80%|████████  | 8/10 [04:42<01:10, 35.22s/it]

Epoch: 7, Training Loss: 0.0189


 90%|█████████ | 9/10 [05:17<00:35, 35.32s/it]

Epoch: 8, Training Loss: 0.0139


100%|██████████| 10/10 [05:52<00:00, 35.30s/it]

Epoch: 9, Training Loss: 0.0129





In [93]:
# Combine train and validation text data
X_train_val_text = pd.concat([X_train_text, X_val_text], axis=0)
y_train_val_text_encoded = np.concatenate([y_train_text_encoded, y_val_text_encoded], axis=0)

train_val_data = {"text": X_train_val_text, "label": y_train_val_text_encoded}
train_val_dataset = datasets.Dataset.from_dict(train_val_data)
tokenized_train_val_dataset = train_val_dataset.map(preprocess_function, batched=True)

training_args_no_eval = TrainingArguments(
    output_dir="my_product_classifier",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="no",
    save_strategy="epoch",
    load_best_model_at_end=False
)

# Retrain the MobileBERT model with combined train and validation data
mobilebert_retrained = AutoModelForSequenceClassification.from_pretrained("google/mobilebert-uncased", num_labels=27).to(device)
trainer_retrained = Trainer(
    model=mobilebert_retrained,
    args=training_args_no_eval,
    train_dataset=tokenized_train_val_dataset,
    compute_metrics=compute_metrics,
)
trainer_retrained.train()

Map:   0%|          | 0/21627 [00:00<?, ? examples/s]

Some weights of the model checkpoint at google/mobilebert-uncased were not used when initializing MobileBertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing MobileBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MobileBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some 

Step,Training Loss
500,124869.384
1000,0.8702
1500,0.7056
2000,0.6287
2500,0.6234
3000,0.5488
3500,0.5216
4000,0.4902
4500,0.4838
5000,0.446


TrainOutput(global_step=6760, training_loss=9236.400953870694, metrics={'train_runtime': 1813.6808, 'train_samples_per_second': 119.244, 'train_steps_per_second': 3.727, 'total_flos': 2120389456248000.0, 'train_loss': 9236.400953870694, 'epoch': 10.0})

**Make predictions on the test set**

In [94]:
# Obtain Random Forest predictions on the test set
X_test_tabular_preprocessed = pipeline.named_steps['preprocessor'].transform(test_tabular_data)
test_preds_rand_forest = rand_forest_clf_retrained.predict_proba(X_test_tabular_preprocessed)

# Obtain CatBoost predictions on the test set
test_preds_catboost = catboost_model_retrained.predict_proba(test_tabular_data)

# Obtain EfficientNet predictions on the test set
test_preds_efficientnet = []
efficientnet.eval()
for i, inputs in enumerate(test_loader):
    inputs = inputs.to(device)
    with torch.no_grad():
        outputs = efficientnet_retrained(inputs)
        preds = torch.softmax(outputs, dim=1).cpu().numpy()
        test_preds_efficientnet.append(preds)
test_preds_efficientnet = np.vstack(test_preds_efficientnet)

# Obtain mobilebert predictions on the test set
test_predictions = trainer_retrained.predict(tokenized_test_dataset)
test_preds_mobilebert = torch.softmax(torch.tensor(test_predictions.predictions), dim=1).numpy()

In [106]:
test_preds_weighted = best_weights[0] * test_preds_catboost + best_weights[1] * test_preds_efficientnet + best_weights[2] * test_preds_mobilebert + best_weights[3] * test_preds_rand_forest
test_preds_weighted_labels = proba_to_labels(test_preds_weighted)
test_preds_category_df = pd.concat([ids, test_preds_weighted_labels], axis=1)
test_preds_category_df.rename(columns={0: "category"}, inplace=True)
# reorder the columns
test_preds_category_df = test_preds_category_df[['id', 'category']]
# to csv with header
test_preds_category_df.to_csv("test_pred_weighted.csv", index=False)