<a href="https://colab.research.google.com/github/gatienc/multimodal_product_data_classification/blob/main/gatien_text_model_3_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

*diff* : trying training on description too

In [59]:
CLIP_FEATURE_SIZE=768
google_colab=False

In [60]:
%pip install transformers pandas tqdm scikit-learn imageio matplotlib wget plotly dash unidecode tensorflow

Note: you may need to restart the kernel to use updated packages.


In [61]:
import pandas as pd
import os
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import copy

from torch.utils.data import Dataset, DataLoader

from tqdm.auto import tqdm

# import for NLP
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from keras.utils import to_categorical

import zipfile

from sklearn.metrics import f1_score
import unidecode



In [62]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [63]:
if google_colab:
# mount the drive where your dataset is availabledevice = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  from google.colab import drive
  drive.mount('/content/drive')
  filepath='/content/drive/MyDrive/datasets/multimodal_product_classification/' # add your own path. Where to save the dataset

  if not os.path.exists('datasets'):
    os.makedirs('datasets')
    with zipfile.ZipFile(filepath+images_name+'.zip', 'r') as zip_ref:
        zip_ref.extractall('datasets')

  datasets_path="/content/datasets/"
  save_directory="/content/drive/MyDrive/Lessons/Models/multimodal_classification/"
else:
    import wget
    if not os.path.exists('datasets'):
        os.makedirs('datasets')
        output_directory="datasets"
        csv_zip = wget.download("https://nextcloud.its-tps.fr/s/BTpB4SC93NreZxg/download/csv_data.zip",out=output_directory)
        with zipfile.ZipFile(output_directory+'/csv_data.zip', 'r') as zip_ref:
            zip_ref.extractall('datasets')
    filepath=os.getcwd()+'/datasets/'
    save_directory='../models/'
    datasets_path=filepath


In [64]:
# Load data
X_train = pd.read_csv(filepath+'X_train.csv')
y_train = pd.read_csv(filepath+'Y_train.csv')
X_train=X_train.drop(columns="Unnamed: 0")
y_train=y_train.drop(columns="Unnamed: 0")

In [65]:
# Cleaning and Preprocessing Text
CLEANR = re.compile('<.*?>') # delete html tag

def clean_html(raw_html):
  cleantext = re.sub(CLEANR, '', raw_html)
  return cleantext

def clean_text(text):
    text=clean_html(text)
    # Remove special characters and numbers
    text = unidecode.unidecode(text)
    text = re.sub(r'[^a-zA-ZäöüßÄÖÜ ]', '', text)
    # Convert text to lowercase
    text = text.lower()
    return text

In [66]:
# Apply cleaning function to the 'designation' column
X_train['designation'] = X_train['designation'].fillna('').apply(clean_text)
X_train['description'] = X_train['description'].fillna('').apply(clean_text)
description_list=[]
description_index=[]
for index,element in enumerate(X_train['description']):
    if len(element)>10:
        # print("element",element)
        description_list.append(element)
        description_index.append(index)
y_train_description=[]
for index in description_index:
    y_train_description.append(y_train["prdtypecode"][int(index)])


# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train['designation'])
sequences = tokenizer.texts_to_sequences(pd.concat([X_train['designation'], pd.Series(description_list)], axis=0))

# Padding to max length of text
data = pad_sequences(sequences, maxlen=34)

# Assuming the number of unique words in the tokenizer plus 1 is vocab_size
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)# nearly 70 000 of vocab size, it seems too much

67465


In [67]:
# Split data into training and validation set (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data,pd.concat([y_train, pd.DataFrame(y_train_description,columns=['prdtypecode'])], axis=0) , test_size=0.2,shuffle=True)
y_train=y_train["prdtypecode"].tolist()
y_val=y_val["prdtypecode"].tolist()

In [68]:
# Convert labels to categorical
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

# Convert to one-hot encoding
y_train_categorical = to_categorical(y_train_encoded)
y_val_categorical = to_categorical(y_val_encoded)


In [69]:
class TextDataset(Dataset):
    def __init__(self, X, Y):
        self.inputs = X
        self.labels = Y

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        x = torch.tensor(self.inputs[idx]).to(device)
        y = torch.tensor(self.labels[idx], dtype=torch.float).to(device)
        return x, y

In [70]:
embedding_dim = 300
batch_size=128

In [71]:
train_dataset=TextDataset(X_train,y_train_categorical)
train_loader=DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset=TextDataset(X_val,y_val_categorical)
val_loader=DataLoader(val_dataset, batch_size=batch_size, shuffle=True)


# Model definition


In [72]:

# Define the model
class CNN_classifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_classes):
        super(CNN_classifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.conv_blocks = nn.ModuleList([
            nn.Conv2d(1, 512, (i, embedding_dim), padding=(0, 0))
            for i in range(1,7)
        ])
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(512 * 6, CLIP_FEATURE_SIZE)
        self.classif=nn.Linear(CLIP_FEATURE_SIZE,num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x = x.unsqueeze(1)  # Add channel dimension for Conv2d
        conv_outputs = [nn.functional.relu(conv_block(x)).max(dim=3)[0].max(dim=2)[0] for conv_block in self.conv_blocks]# [0] to get only the values and not the indices ( in pos 1 )
        x = torch.cat(conv_outputs, dim=1)
        # Dense Layer

        # Flatten Layer
        x = x.view(x.size(0), -1)
        # Dropout Layer
        x = self.dropout(x)

        x = self.fc(x)
        x = self.classif(x)
        return x


In [73]:
# Initialize the model
num_classes = 27
model = CNN_classifier(vocab_size, embedding_dim, num_classes)

# Convert the model to CUDA if available
model.to(device)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())
# Print the model summary
print(model)


CNN_classifier(
  (embedding): Embedding(67465, 300)
  (conv_blocks): ModuleList(
    (0): Conv2d(1, 512, kernel_size=(1, 300), stride=(1, 1))
    (1): Conv2d(1, 512, kernel_size=(2, 300), stride=(1, 1))
    (2): Conv2d(1, 512, kernel_size=(3, 300), stride=(1, 1))
    (3): Conv2d(1, 512, kernel_size=(4, 300), stride=(1, 1))
    (4): Conv2d(1, 512, kernel_size=(5, 300), stride=(1, 1))
    (5): Conv2d(1, 512, kernel_size=(6, 300), stride=(1, 1))
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=3072, out_features=768, bias=True)
  (classif): Linear(in_features=768, out_features=27, bias=True)
)


In [74]:
def train_model(model,train_loader,val_loader,num_epochs=10):  # Train the model
  val_f1=0
  max_val_f1=0
  for epoch in tqdm(range(num_epochs)):
      model.train()
      for inputs, labels in tqdm(train_loader,desc=f"Epoch {epoch + 1}/{num_epochs}, Validation F1 Score: {val_f1:.4f}"):
          optimizer.zero_grad()
          outputs = model(inputs)
          outputs=outputs.squeeze(0)
          loss = criterion(outputs, labels)
          loss.backward()
          optimizer.step()

      model.eval()
      with torch.no_grad():
          val_preds = []
          val_labels = []
          for val_inputs, val_labels_batch in val_loader:
              val_outputs = model(val_inputs)
              val_preds.append(val_outputs.cpu())
              val_labels.append(val_labels_batch.cpu())

      val_preds = torch.cat(val_preds, dim=0)
      val_labels = torch.cat(val_labels, dim=0)

    #   print(f'{val_labels=}')
    #   print(f'{torch.argmax(val_preds, dim=1)=}')

      val_f1 = f1_score(torch.argmax(val_labels,dim=1), torch.argmax(val_preds, dim=1), average='macro')
      if val_f1>max_val_f1:
        max_val_f1=val_f1
        torch.save(model.state_dict(), save_directory + 'Text_model_val_f1_{:.3f}_epoch{}.ckpt'.format(val_f1,epoch))
        best_model_wts = copy.deepcopy(model.state_dict())

  model.load_state_dict(best_model_wts)
  return(model,max_val_f1)


In [75]:
model,max_val_f1=train_model(model,train_loader,val_loader,num_epochs=10)

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1/10, Validation F1 Score: 0.0000:   0%|          | 0/874 [00:00<?, ?it/s]

Epoch 2/10, Validation F1 Score: 0.6893:   0%|          | 0/874 [00:00<?, ?it/s]

Epoch 3/10, Validation F1 Score: 0.7113:   0%|          | 0/874 [00:00<?, ?it/s]

Epoch 4/10, Validation F1 Score: 0.7221:   0%|          | 0/874 [00:00<?, ?it/s]

Epoch 5/10, Validation F1 Score: 0.7346:   0%|          | 0/874 [00:00<?, ?it/s]

Epoch 6/10, Validation F1 Score: 0.7309:   0%|          | 0/874 [00:00<?, ?it/s]

Epoch 7/10, Validation F1 Score: 0.7314:   0%|          | 0/874 [00:00<?, ?it/s]

Epoch 8/10, Validation F1 Score: 0.7469:   0%|          | 0/874 [00:00<?, ?it/s]

Epoch 9/10, Validation F1 Score: 0.7450:   0%|          | 0/874 [00:00<?, ?it/s]

# Test part

In [None]:
X_test = pd.read_csv(filepath+'X_test.csv')
X_test=X_test.drop(columns="Unnamed: 0")
X_test['designation'] = X_test['designation'].fillna('').apply(clean_text)
sequences = tokenizer.texts_to_sequences(X_test['designation'])

# Padding to max length of text
data = pad_sequences(sequences, maxlen=34)


In [None]:
print(data)

[[    0     0     0 ...  1681    47  5871]
 [    0     0     0 ...    45    29  1062]
 [    0     0     0 ...   181  3681  5153]
 ...
 [    0     0     0 ...  1436    75    12]
 [    0     0     0 ...  1785  3159  2372]
 [    0     0     0 ... 12215     4    63]]


In [None]:
model.to(device)
model.eval()

CNN_classifier(
  (embedding): Embedding(67465, 300)
  (conv_blocks): ModuleList(
    (0): Conv2d(1, 512, kernel_size=(1, 300), stride=(1, 1))
    (1): Conv2d(1, 512, kernel_size=(2, 300), stride=(1, 1))
    (2): Conv2d(1, 512, kernel_size=(3, 300), stride=(1, 1))
    (3): Conv2d(1, 512, kernel_size=(4, 300), stride=(1, 1))
    (4): Conv2d(1, 512, kernel_size=(5, 300), stride=(1, 1))
    (5): Conv2d(1, 512, kernel_size=(6, 300), stride=(1, 1))
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=3072, out_features=768, bias=True)
  (classif): Linear(in_features=768, out_features=27, bias=True)
)

In [None]:
val_preds = []
with torch.no_grad():
  for input in tqdm(data):
      input=torch.from_numpy(input)
      input=input.to(device)
      input=input.unsqueeze(0)
      val_outputs = model(input)[0]
      val_outputs=val_outputs.detach().cpu().numpy()
      val_preds.append(val_outputs)

  0%|          | 0/13812 [00:00<?, ?it/s]

In [None]:
val_preds

[array([ -5.272253 ,   7.8971553,   6.38516  ,   0.607543 ,   2.2213042,
         -3.0202463,  -4.7830267,  11.351688 ,   7.903993 ,   8.845875 ,
         -3.5132868,   6.8123507,   3.1651266,  -6.5462103,  -5.1938534,
         -8.5658   ,  -3.4536624,  -1.5530404,  -9.089815 , -11.972496 ,
         -0.1779795,  -4.3356156,  -4.2121844,  -1.0013313,  -3.520455 ,
         -1.7471671,  -4.5462885], dtype=float32),
 array([ 4.218072  ,  5.261774  , -2.1438437 , -3.6450455 , 11.511696  ,
        12.132251  ,  4.987509  , -1.099399  ,  2.6333835 ,  0.46840933,
        -3.1776762 ,  0.9336338 ,  2.27367   ,  0.2583831 ,  0.987429  ,
        -5.654638  ,  0.4368955 ,  0.05490318, -2.8246634 , -2.248065  ,
        -4.6703014 , -7.192701  , -0.27931604,  1.2513499 , -2.8757114 ,
        -3.037928  , -2.568433  ], dtype=float32),
 array([ -8.656684 ,  -5.4484053,  -7.248537 , -11.697184 ,  -5.5103602,
         -4.8637557, -16.87762  ,   1.7211697, -10.378589 ,   6.3936744,
         -7.453518 ,  

In [None]:
val_preds=np.argmax(val_preds,axis=1)

In [None]:
val_preds=label_encoder.inverse_transform(val_preds)

In [None]:
df_preds=pd.DataFrame(val_preds)
df_preds=df_preds.set_index(df_preds.index+84916)

In [None]:
df_preds

Unnamed: 0,0
84916,1280
84917,1160
84918,2583
84919,2583
84920,2522
...,...
98723,1560
98724,2705
98725,2583
98726,2582


In [None]:
df_preds.to_csv("eval_text_designation.csv")