In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Concatenate, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import matplotlib.pyplot as plt
import time

In [3]:

# Read the CSV file into a pandas DataFrame
df = pd.read_csv('../datasets/train_set.csv')

# Drop any columns that are not relevant for classification
df = df.drop(columns=['Assembly Code', 'Assembly Description', 'Type Comments'])

EMPTYCONST = "*empty*"

# Define input sequences (example: Family, SubFamily, ObjectGroup)
sequences = ['Family', 'SubFamily', 'Description', 'ObjectGroup', 'ObjectName', 'Type Name', 'Structural Material', 'Material']

# Tokenize and pad each sequence separately
max_words = 200  # Maximum number of words to consider in tokenizer
max_len = 50  # Maximum length of sequences
tokenizer = Tokenizer(num_words=max_words)

X_seqs = []
for seq in sequences:
    X_seq = df[seq].fillna(EMPTYCONST).apply(lambda x: x + "_*" + seq.lower() + "*" if x != EMPTYCONST else x)
    tokenizer.fit_on_texts(X_seq)
    X_seq = tokenizer.texts_to_sequences(X_seq)
    X_seq = pad_sequences(X_seq, maxlen=max_len)
    X_seqs.append(X_seq)

# Split data into features (X) and target variable (y)
y = df["Category"]
y_encoder = LabelEncoder()
y = y_encoder.fit_transform(y)

# Split the dataset into training and testing sets
X_train_seqs = [X_seq[:int(len(X_seq)*0.9)] for X_seq in X_seqs]
X_test_seqs = [X_seq[int(len(X_seq)*0.9):] for X_seq in X_seqs]
y_train = y[:int(len(y)*0.9)]
y_test = y[int(len(y)*0.9):]

In [4]:

# Start timing
start_time = time.time()
# Define the CNN model using Functional API
input_layers = []
embedding_layers = []
for seq_index, seq in enumerate(sequences):
    input_layer = Input(shape=(max_len,), name=f"input_{seq}")
    embedding_layer = Embedding(input_dim=max_words, output_dim=10, input_length=max_len)(input_layer)
    conv_layer = Conv1D(128, 5, activation='relu')(embedding_layer)
    pool_layer = GlobalMaxPooling1D()(conv_layer)
    input_layers.append(input_layer)
    embedding_layers.append(pool_layer)

merged = Concatenate()(embedding_layers)
dense_layer = Dense(128, activation='relu', kernel_regularizer=l2(0.001))(merged)
dropout_layer = Dropout(0.5)(dense_layer)
output_layer = Dense(len(df['Category'].unique()), activation='softmax')(dropout_layer)

model = Model(inputs=input_layers, outputs=output_layer)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train_seqs, y_train, epochs=50, batch_size=32, validation_split=0.1)

# End timing
end_time = time.time()
# Calculate total runtime
total_time = end_time - start_time
print(f"Total runtime: {total_time} seconds")
# Evaluate the model on the testing set
loss, accuracy = model.evaluate(X_test_seqs, y_test)
print("Test Accuracy:", accuracy)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Total runtime: 80.46228861808777 seconds
Test Accuracy: 0.5181518197059631


In [6]:
# Plot training & validation accuracy values
plt.figure(figsize=(10, 5))
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# Plot training & validation loss values
plt.figure(figsize=(10, 5))
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

AttributeError: module 'matplotlib' has no attribute 'colors'

Error in callback <function _enable_matplotlib_integration.<locals>.configure_once at 0x0000025BD05D4C10> (for post_run_cell), with arguments args (<ExecutionResult object at 25d5b857ca0, execution_count=6 error_before_exec=None error_in_exec=module 'matplotlib' has no attribute 'colors' info=<ExecutionInfo object at 25d5b857790, raw_cell="# Plot training & validation accuracy values
plt.f.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/v%3A/F-Project/mcr/BIMVet/Machine%20Learning/FNN/FNN_train_seq_based.ipynb#W3sZmlsZQ%3D%3D> result=None>,),kwargs {}:


AttributeError: module 'matplotlib' has no attribute 'backends'

In [None]:

# Save the model to disk
model.save('FNN_model_py_seq_based.h5')
print("Model saved to disk.")

# Save the label_encoder to a file
with open('label_encoder_seq.pkl', 'wb') as le_file:
    pickle.dump(y_encoder, le_file)
print("LabelEncoder saved to disk.")

Model saved to disk.
