In [77]:
!pip install torcheval

Collecting torcheval
  Downloading torcheval-0.0.7-py3-none-any.whl (179 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.2/179.2 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torcheval
Successfully installed torcheval-0.0.7


In [78]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader

from tqdm.auto import tqdm

# import for NLP
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from keras.utils import to_categorical
from torcheval.metrics.functional import multiclass_f1_score



In [79]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [80]:
# mount the drive where your dataset is available
from google.colab import drive
drive.mount('/content/drive')
filepath='/content/drive/MyDrive/datasets/multimodal_product_classification/' # add your own path. Where to save the dataset

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [81]:
# Load data
X_train = pd.read_csv(filepath+'X_train.csv')
y_train = pd.read_csv(filepath+'Y_train.csv')
X_train=X_train.drop(columns="Unnamed: 0")
y_train=y_train.drop(columns="Unnamed: 0")

In [82]:
# Cleaning and Preprocessing Text
def clean_text(text):
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-ZäöüßÄÖÜ ]', '', text)
    # Convert text to lowercase
    text = text.lower()
    return text

In [83]:
# Apply cleaning function to the 'designation' column
X_train['designation'] = X_train['designation'].fillna('').apply(clean_text)

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train['designation'])
sequences = tokenizer.texts_to_sequences(X_train['designation'])

# Padding to max length of text
data = pad_sequences(sequences, maxlen=34)

# Assuming the number of unique words in the tokenizer plus 1 is vocab_size
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)# nearly 70 000 of vocab size, it seems too much

69189


In [84]:
# Split data into training and validation set (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, y_train, test_size=0.2,shuffle=False)
y_train=y_train["prdtypecode"].tolist()
y_val=y_val["prdtypecode"].tolist()

In [85]:
# Convert labels to categorical
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

# Convert to one-hot encoding
y_train_categorical = to_categorical(y_train_encoded)
y_val_categorical = to_categorical(y_val_encoded)


In [86]:
class TextDataset(Dataset):
    def __init__(self, X, Y):
        self.inputs = X
        self.labels = Y

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        x = torch.tensor(self.inputs[idx]).to(device)
        y = torch.tensor(self.labels[idx], dtype=torch.float).to(device)
        return x, y

In [87]:
embedding_dim = 300
batch_size=64

In [88]:
train_dataset=TextDataset(X_train,y_train_categorical)
train_loader=DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset=TextDataset(X_val,y_val_categorical)
val_loader=DataLoader(val_dataset, batch_size=batch_size, shuffle=True)


# Model definition


In [89]:

# Define the model
class CNN_classifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_classes):
        super(CNN_classifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.conv_blocks = nn.ModuleList([
            nn.Conv2d(1, 512, (i, embedding_dim), padding=(0, 0))
            for i in range(1,7)
        ])
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(512 * 6, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x = x.unsqueeze(1)  # Add channel dimension for Conv2d
        conv_outputs = [nn.functional.relu(conv_block(x)).max(dim=3)[0].max(dim=2)[0] for conv_block in self.conv_blocks]# [0] to get only the values and not the indices ( in pos 1 )
        x = torch.cat(conv_outputs, dim=1)
        # Dense Layer

        # Flatten Layer
        x = x.view(x.size(0), -1)
        # Dropout Layer
        x = self.dropout(x)

        x = self.fc(x)
        return x


In [90]:
# Initialize the model
num_classes = 27
model = CNN_classifier(vocab_size, embedding_dim, num_classes)

# Convert the model to CUDA if available
model.to(device)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())
from sklearn.metrics import f1_score
f1_metric = lambda preds, labels: f1_score(labels, torch.argmax(preds, dim=1), average='macro')


# Print the model summary
print(model)


CNN_classifier(
  (embedding): Embedding(69189, 300)
  (conv_blocks): ModuleList(
    (0): Conv2d(1, 512, kernel_size=(1, 300), stride=(1, 1))
    (1): Conv2d(1, 512, kernel_size=(2, 300), stride=(1, 1))
    (2): Conv2d(1, 512, kernel_size=(3, 300), stride=(1, 1))
    (3): Conv2d(1, 512, kernel_size=(4, 300), stride=(1, 1))
    (4): Conv2d(1, 512, kernel_size=(5, 300), stride=(1, 1))
    (5): Conv2d(1, 512, kernel_size=(6, 300), stride=(1, 1))
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=3072, out_features=27, bias=True)
)


In [91]:
# print(dummy_vec.shape)
# model(dummy_vec)

In [92]:
# Train the model
num_epochs = 10  # Adjust as needed
for epoch in tqdm(range(num_epochs)):
    model.train()
    for inputs, labels in tqdm(train_loader):
        optimizer.zero_grad()
        outputs = model(inputs)
        outputs=outputs.squeeze(0)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        val_preds = []
        val_labels = []
        for val_inputs, val_labels_batch in val_loader:
            val_outputs = model(val_inputs)
            val_preds.append(val_outputs.cpu())
            val_labels.append(val_labels_batch.cpu())

    val_preds = torch.cat(val_preds, dim=0)
    val_labels = torch.cat(val_labels, dim=0)
    print(f'{val_preds=}')
    print(f'{val_labels=}')
    print(f'{val_preds.shape=}')
    print(f'{val_labels.shape=}')

    val_f1 =f1_metric(val_preds, val_labels)
    #'samples'

    print(f"Epoch {epoch + 1}/{num_epochs}, Validation F1 Score: {val_f1:.4f}")


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1062 [00:00<?, ?it/s]

val_preds=tensor([[ -4.0427,  -4.0959,  -5.2553,  ...,   2.9180,  -5.9637,  -4.3642],
        [ -7.3676,   2.5120,   2.6863,  ...,  -4.9910, -10.5937,   2.5285],
        [ -4.2794,  -7.8291,  -4.4304,  ...,  -1.9325,  -6.4721,  -5.6992],
        ...,
        [ -5.2567,  -9.0309,  -5.4933,  ...,   0.2080,  -7.2515,  -1.6414],
        [ -5.5837,  -4.0576,  -6.4347,  ...,  -2.6642,  -7.6614,  -2.3570],
        [  1.9537,   0.8935,  -2.6909,  ...,  -1.0201,   0.6385,  -2.7060]])
val_labels=tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
val_preds.shape=torch.Size([16984, 27])
val_labels.shape=torch.Size([16984, 27])


ValueError: ignored

In [None]:
val_labels[2]