Process the text

In [57]:
import pandas as pd
import re
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (
    Embedding, Conv2D, MaxPooling2D, Flatten, Dropout, Dense, Reshape, Concatenate, Input
)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import metrics
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [58]:
# mount the drive where your dataset is available
from google.colab import drive
drive.mount('/content/drive')
filepath='/content/drive/MyDrive/datasets/multimodal_product_classification/' # add your own path. Where to save the dataset

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [59]:
# Load data
X_train = pd.read_csv(filepath+'X_train.csv')
y_train = pd.read_csv(filepath+'Y_train.csv')

In [60]:
X_train=X_train.drop(columns="Unnamed: 0")

y_train=y_train.drop(columns="Unnamed: 0")

In [61]:
# Cleaning and Preprocessing Text
def clean_text(text):
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-ZäöüßÄÖÜ ]', '', text)
    # Convert text to lowercase
    text = text.lower()
    return text

In [62]:
# Apply cleaning function to the 'designation' column
X_train['designation'] = X_train['designation'].fillna('').apply(clean_text)

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train['designation'])
sequences = tokenizer.texts_to_sequences(X_train['designation'])

# Padding to max length of text
data = pad_sequences(sequences, maxlen=34)

In [63]:
# Assuming the number of unique words in the tokenizer plus 1 is vocab_size
vocab_size = len(tokenizer.word_index) + 1

# Input Layer
input_layer = Input(shape=(34,))

# Embedding Layer
embedding_layer = Embedding(input_dim=vocab_size, output_dim=300, input_length=34)(input_layer)

# Reshape Layer
reshape_layer = Reshape((34, 300, 1))(embedding_layer)

# Conv2D and MaxPooling2D Blocks
conv_blocks = []
for i in range(1, 7):
    # Conv2D Block
    conv = Conv2D(512, (i, 300), activation='relu', padding='valid')(reshape_layer)
    # MaxPooling2D Block
    # Adjusting the pooling size to (1, 1) as the output of Conv2D will have a height of 1
    pool = MaxPooling2D(pool_size=(1, 1))(conv)
    conv_blocks.append(pool)

# Concatenate Layer
concatenate_layer = Concatenate(axis=1)(conv_blocks)

# Flatten Layer
flatten_layer = Flatten()(concatenate_layer)

# Dropout Layer
dropout_layer = Dropout(0.5)(flatten_layer)

# Dense Layer
output_layer = Dense(27, activation='softmax')(dropout_layer)

# Create the model
model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Model Summary
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_3 (InputLayer)        [(None, 34)]                 0         []                            
                                                                                                  
 embedding_2 (Embedding)     (None, 34, 300)              2075670   ['input_3[0][0]']             
                                                          0                                       
                                                                                                  
 reshape_2 (Reshape)         (None, 34, 300, 1)           0         ['embedding_2[0][0]']         
                                                                                                  
 conv2d_12 (Conv2D)          (None, 34, 1, 512)           154112    ['reshape_2[0][0]']     

In [64]:
# Split data into training and validation set (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, y_train, test_size=0.2, random_state=42)


In [76]:
batch_size=32
epochs=10

In [77]:
# Convert labels to categorical
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

# Convert to one-hot encoding
y_train_categorical = to_categorical(y_train_encoded)
y_val_categorical = to_categorical(y_val_encoded)

# Adjust the final layer of the model to have as many units as there are unique classes
num_classes = y_train_categorical.shape[1]
model.layers[-1].units = num_classes

# Compile the model again
f1_score = metrics.F1Score(average='macro')
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=[f1_score])
# Train the model
history = model.fit(X_train, y_train_categorical, epochs=epochs, batch_size=batch_size, validation_data=(X_val, y_val_categorical))


Epoch 1/10


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [78]:
# Make predictions
predictions = model.predict(X_val)

predicted_labels = label_encoder.inverse_transform(np.argmax(predictions, axis=1))



In [79]:
# One-hot encode the validation labels
y_val_categorical = to_categorical(y_val_encoded, num_classes=num_classes)

# Evaluate the model using the one-hot encoded labels
loss, accuracy = model.evaluate(X_val, y_val_categorical)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

Test Loss: 4.129349231719971, Test Accuracy: 0.7483856678009033


In [80]:
# Selecting the first 10 samples from the validation set
X_val_sample = X_val[0:10]
y_val_sample = y_val[0:10]

# Because the model uses one-hot encoding, convert y_val_sample to categorical
y_val_sample_categorical = to_categorical(label_encoder.transform(y_val_sample))

# Making predictions
predictions = model.predict(X_val_sample)

# Because the model uses one-hot encoding, convert predictions back to label encoding
predicted_labels = label_encoder.inverse_transform(np.argmax(predictions, axis=1))

# Comparing actual and predicted labels
for i in range(len(X_val_sample)):
    print(f"Sample {i+1}:")
    t=y_val_sample["prdtypecode"].iloc[i]
    print(f"Actual Label: {t}")
    print(f"Predicted Label: {predicted_labels[i]}\n")


Sample 1:
Actual Label: 2905
Predicted Label: 2905

Sample 2:
Actual Label: 1281
Predicted Label: 2403

Sample 3:
Actual Label: 2060
Predicted Label: 2060

Sample 4:
Actual Label: 1280
Predicted Label: 1280

Sample 5:
Actual Label: 2280
Predicted Label: 2403

Sample 6:
Actual Label: 1300
Predicted Label: 1300

Sample 7:
Actual Label: 2280
Predicted Label: 2280

Sample 8:
Actual Label: 2583
Predicted Label: 2583

Sample 9:
Actual Label: 2060
Predicted Label: 2060

Sample 10:
Actual Label: 2705
Predicted Label: 2403



  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [81]:
X_test = pd.read_csv(filepath+'X_test.csv')
X_test=X_test.drop(columns="Unnamed: 0")

In [82]:
X_test.head()

Unnamed: 0,designation,description,productid,imageid
0,Folkmanis Puppets - 2732 - Marionnette Et Théâ...,,516376098,1019294171
1,Porte Flamme Gaxix - Flamebringer Gaxix - 136/...,,133389013,1274228667
2,Pompe de filtration Speck Badu 95,,4128438366,1295960357
3,Robot de piscine électrique,<p>Ce robot de piscine d&#39;un design innovan...,3929899732,1265224052
4,Hsm Destructeur Securio C16 Coupe Crois¿E: 4 X...,,152993898,940543690


In [83]:

X_test['designation'] = X_test['designation'].fillna('').apply(clean_text)
tokenizer.fit_on_texts(X_test['designation'])
sequences = tokenizer.texts_to_sequences(X_test['designation'])

# Padding to max length of text
X_test = pad_sequences(sequences, maxlen=34)
predictions = model.predict(X_test)
predicted_labels = label_encoder.inverse_transform(np.argmax(predictions, axis=1))




In [84]:
df = pd.DataFrame(predicted_labels, columns=['prdtypecode'])


In [85]:
df.index+=84916
df.head()

Unnamed: 0,prdtypecode
84916,2522
84917,2905
84918,2522
84919,2583
84920,1281


In [86]:
df.to_csv('output.csv')
