Process the text 

In [10]:
import pandas as pd
import re
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (
    Embedding, Conv2D, MaxPooling2D, Flatten, Dropout, Dense, Reshape, Concatenate, Input
)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load data
X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('Y_train.csv')

In [3]:
# Cleaning and Preprocessing Text
def clean_text(text):
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-ZäöüßÄÖÜ ]', '', text)
    # Convert text to lowercase
    text = text.lower()
    return text

In [6]:
# Apply cleaning function to the 'designation' column
X_train['designation'] = X_train['designation'].fillna('').apply(clean_text)

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train['designation'])
sequences = tokenizer.texts_to_sequences(X_train['designation'])

# Padding to max length of text
data = pad_sequences(sequences, maxlen=34)

In [14]:
# Assuming the number of unique words in the tokenizer plus 1 is vocab_size
vocab_size = len(tokenizer.word_index) + 1  

# Input Layer
input_layer = Input(shape=(34,))

# Embedding Layer
embedding_layer = Embedding(input_dim=vocab_size, output_dim=300, input_length=34)(input_layer)

# Reshape Layer
reshape_layer = Reshape((34, 300, 1))(embedding_layer)

# Conv2D and MaxPooling2D Blocks
conv_blocks = []
for i in range(1, 7):
    # Conv2D Block
    conv = Conv2D(512, (i, 300), activation='relu', padding='valid')(reshape_layer)
    # MaxPooling2D Block
    # Adjusting the pooling size to (1, 1) as the output of Conv2D will have a height of 1
    pool = MaxPooling2D(pool_size=(1, 1))(conv)
    conv_blocks.append(pool)

# Concatenate Layer
concatenate_layer = Concatenate(axis=1)(conv_blocks)

# Flatten Layer
flatten_layer = Flatten()(concatenate_layer)

# Dropout Layer
dropout_layer = Dropout(0.5)(flatten_layer)

# Dense Layer
output_layer = Dense(27, activation='softmax')(dropout_layer)

# Create the model
model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Model Summary
model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_4 (InputLayer)        [(None, 34)]                 0         []                            
                                                                                                  
 embedding_5 (Embedding)     (None, 34, 300)              2075670   ['input_4[0][0]']             
                                                          0                                       
                                                                                                  
 reshape_5 (Reshape)         (None, 34, 300, 1)           0         ['embedding_5[0][0]']         
                                                                                                  
 conv2d_6 (Conv2D)           (None, 34, 1, 512)           154112    ['reshape_5[0][0]']      

In [15]:
# Split data into training and validation set (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(data, y_train, test_size=0.2, random_state=42)


In [21]:
# Convert labels to categorical
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

# Convert to one-hot encoding
y_train_categorical = to_categorical(y_train_encoded)
y_val_categorical = to_categorical(y_val_encoded)

# Adjust the final layer of the model to have as many units as there are unique classes
num_classes = y_train_categorical.shape[1]
model.layers[-1].units = num_classes

# Compile the model again
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train_categorical, epochs=epochs, batch_size=batch_size, validation_data=(X_val, y_val_categorical))


Epoch 1/10


 172/2123 [=>............................] - ETA: 1:43:51 - loss: 2.2242 - accuracy: 0.3681

KeyboardInterrupt: 

In [23]:
import numpy as np

In [24]:
# Make predictions
predictions = model.predict(X_val)

predicted_labels = label_encoder.inverse_transform(np.argmax(predictions, axis=1))

In [28]:
# One-hot encode the validation labels
y_val_categorical = to_categorical(y_val_encoded, num_classes=num_classes)

# Evaluate the model using the one-hot encoded labels
loss, accuracy = model.evaluate(X_val, y_val_categorical)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

Test Loss: 1.4924018383026123, Test Accuracy: 0.5754827857017517


In [32]:
# Selecting the first 10 samples from the validation set
X_val_sample = X_val[30:50]
y_val_sample = y_val[30:50]

# Because the model uses one-hot encoding, convert y_val_sample to categorical
y_val_sample_categorical = to_categorical(label_encoder.transform(y_val_sample))

# Making predictions
predictions = model.predict(X_val_sample)

# Because the model uses one-hot encoding, convert predictions back to label encoding
predicted_labels = label_encoder.inverse_transform(np.argmax(predictions, axis=1))

# Comparing actual and predicted labels
for i in range(len(X_val_sample)):
    print(f"Sample {i+1}:")
    print(f"Actual Label: {y_val_sample[i]}")
    print(f"Predicted Label: {predicted_labels[i]}\n")


Sample 1:
Actual Label: 1160
Predicted Label: 1160

Sample 2:
Actual Label: 2583
Predicted Label: 2583

Sample 3:
Actual Label: 1920
Predicted Label: 1920

Sample 4:
Actual Label: 1140
Predicted Label: 1140

Sample 5:
Actual Label: 1920
Predicted Label: 1920

Sample 6:
Actual Label: 50
Predicted Label: 50

Sample 7:
Actual Label: 2905
Predicted Label: 2905

Sample 8:
Actual Label: 2060
Predicted Label: 2060

Sample 9:
Actual Label: 40
Predicted Label: 2583

Sample 10:
Actual Label: 2522
Predicted Label: 2522

Sample 11:
Actual Label: 2705
Predicted Label: 2705

Sample 12:
Actual Label: 1302
Predicted Label: 1300

Sample 13:
Actual Label: 2522
Predicted Label: 2522

Sample 14:
Actual Label: 2403
Predicted Label: 2403

Sample 15:
Actual Label: 1281
Predicted Label: 2280

Sample 16:
Actual Label: 2403
Predicted Label: 2403

Sample 17:
Actual Label: 2582
Predicted Label: 1560

Sample 18:
Actual Label: 2583
Predicted Label: 2583

Sample 19:
Actual Label: 10
Predicted Label: 1160

Sample 20: