In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Flatten, Dropout, Conv1D, MaxPooling1D
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tensorflow.keras.models import load_model
import tensorflow as tf
import joblib
import time
import numpy as np

In [28]:
# df = pd.DataFrame(data)
file_path = "../datasets/train_set2.csv"
df = pd.read_csv(file_path)

EMPTYCONST = "*empty*"

# Concatenate relevant text features
catFamily = df["Family"].fillna(EMPTYCONST)
catSubFamily = df["SubFamily"].fillna(EMPTYCONST)
catObjectGroup = df["ObjectGroup"].fillna(EMPTYCONST)
catObjectName = df["ObjectName"].fillna(EMPTYCONST)
catDescription = df["Description"].fillna(EMPTYCONST)
catTypeComments = df["Type Comments"].fillna(EMPTYCONST)
catStructuralMaterial = df["Structural Material"].fillna(EMPTYCONST)
catMaterial = df["Material"].fillna(EMPTYCONST)


df["Features"] = (
    catFamily
    + "|"
    + catSubFamily
    + "|"
    + catObjectGroup
    + "|"
    + catObjectName
    + "|"
    + catDescription
    + "|"
    + catTypeComments
    + "|"
    + catStructuralMaterial
    + "|"
    + catMaterial
)

# Start timing
start_time = time.time()

# Encode categorical variable 'Category'
label_encoder = LabelEncoder()
df['Category'] = label_encoder.fit_transform(df['Category'])

# Split data into features (X) and target variable (y)
X = df["Features"].fillna(EMPTYCONST)
y = df["Category"]

# Vectorize text data using TF-IDF
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(X)

# Convert X_tfidf to a dense numpy array
X_tfidf_dense = X_tfidf.toarray()

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf_dense, y, test_size=0.2, random_state=42
)

timesteps = 1  # Each sample is treated as a single time step
features = X_train.shape[1]  # The number of features is the dimensionality of the TF-IDF vector

X_train_reshaped = np.expand_dims(X_train, axis=1)  # Shape (batch_size, timesteps, features)
X_test_reshaped = np.expand_dims(X_test, axis=1)    # Shape (batch_size, timesteps, features)

# Define the neural network model
model_nn = Sequential([
    Conv1D(filters=128, kernel_size=5, activation='relu', input_shape=(timesteps, features)),
    MaxPooling1D(pool_size=2),
    Dropout(0.5),
    Conv1D(filters=64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(0.5),
    Flatten(),
    Dense(units=256, activation='relu'),
    Dropout(0.5),
    Dense(len(df['Category'].unique()), activation='softmax')
])


# Compile the model
model_nn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model_nn.fit(X_train_reshaped, y_train, epochs=50, batch_size=32, validation_split=0.2)

# End timing
end_time = time.time()

# Calculate total runtime
total_time = end_time - start_time
print(f"Total runtime: {total_time} seconds")

# Evaluate the model on the testing set
loss, accuracy = model_nn.evaluate(X_test_reshaped, y_test)
print("Test Accuracy:", accuracy)


# Save the model to disk
model_nn.save('CNN_model_tif.h5')
print("Model saved to disk.")
# print("\nConfusion Matrix:")
# print(confusion_matrix(y_test, predictions))

[[[0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]]

 ...

 [[0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]]]


ValueError: One of the dimensions in the output is <= 0 due to downsampling in conv1d_20. Consider increasing the input size. Received input shape [None, 1, 3034] which would produce output shape with a zero or negative value in a dimension.

In [None]:
model = load_model('CNN_model_tif.h5')
# Load new data (example: assume `new_data.csv` has the same structure as `train_set.csv`)
new_data_file = "../datasets/test_set.csv"
new_df = pd.read_csv(new_data_file)

model.compile(
    optimizer=tf.keras.optimizers.Adam(),  # or any other optimizer you want to use
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# Concatenate relevant text features
catFamily = new_df["Family"].fillna("*empty*")
catSubFamily = new_df["SubFamily"].fillna("*empty*")
catObjectGroup = new_df["ObjectGroup"].fillna("*empty*")
catObjectName = new_df["ObjectName"].fillna("*empty*")
catDescription = new_df["Description"].fillna("*empty*")
catTypeComments = new_df["Type Comments"].fillna("*empty*")
catStructuralMaterial = new_df["Structural Material"].fillna("*empty*")
catMaterial = new_df["Material"].fillna("*empty*")

new_df["Features"] = (
    catFamily
    + "|"
    + catSubFamily
    + "|"
    + catObjectGroup
    + "|"
    + catObjectName
    + "|"
    + catDescription
    + "|"
    + catTypeComments
    + "|"
    + catStructuralMaterial
    + "|"
    + catMaterial
)

test2343 = label_encoder.transform(new_df['Category'])


# Vectorize text data using the same CountVectorizer instance
X_new_tfidf = vectorizer.transform(new_df["Features"].fillna(EMPTYCONST))
# Convert X_tfidf to a dense numpy array
X_tfidf_new_dense = X_new_tfidf.toarray()
X_test_new_reshaped = np.expand_dims(X_tfidf_new_dense, axis=1)    # Shape (batch_size, timesteps, features)

# Predict using the loaded model
predictions_new = model_nn.predict(X_test_new_reshaped)
predicted_classes = predictions_new.argmax(axis=1)  # Assuming predictions_new is one-hot encoded

# Calculate accuracy
accuracy = accuracy_score(test2343, predicted_classes)
print("Accuracy:", accuracy)

# Print classification report
print(classification_report(test2343, predicted_classes))

Accuracy: 0.0
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       1.0
           1       0.00      0.00      0.00       1.0
           4       0.00      0.00      0.00      22.0
           7       0.00      0.00      0.00      14.0
           9       0.00      0.00      0.00      10.0
          11       0.00      0.00      0.00       1.0
          12       0.00      0.00      0.00       5.0
          13       0.00      0.00      0.00       9.0
          16       0.00      0.00      0.00      21.0
          17       0.00      0.00      0.00      12.0
          18       0.00      0.00      0.00       1.0
          21       0.00      0.00      0.00       4.0
          23       0.00      0.00      0.00       0.0
          31       0.00      0.00      0.00       3.0
          32       0.00      0.00      0.00       2.0
          35       0.00      0.00      0.00      11.0
          37       0.00      0.00      0.00       2.0
          38 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
