In [None]:
# Print the Python and Java versions available in this runtime (shell escapes, not Python code).
!python -V
!java -version

Python 3.10.12
openjdk version "11.0.24" 2024-07-16
OpenJDK Runtime Environment (build 11.0.24+8-post-Ubuntu-1ubuntu322.04)
OpenJDK 64-Bit Server VM (build 11.0.24+8-post-Ubuntu-1ubuntu322.04, mixed mode, sharing)


In [None]:
import os
# Show the working directory; the recorded output shows uploaded files being saved there.
print(os.getcwd())
from google.colab import files
# Colab upload helper; the recorded run saved diagnoses.csv, summaries.csv and symptoms.csv.
uploaded = files.upload()
# NOTE(review): the recorded listing contains 'symptoms (1).csv'-style copies, which suggests
# repeated uploads are stored under new names -- confirm the un-suffixed files read later
# are the intended versions.
# Last expression of the cell, so the directory listing becomes the cell output.
os.listdir()


/content


Saving diagnoses.csv to diagnoses.csv
Saving summaries.csv to summaries.csv
Saving symptoms.csv to symptoms.csv


['.config',
 'summaries.csv',
 '.ipynb_checkpoints',
 'summaries (2).csv',
 'diagnoses.csv',
 'summaries (1).csv',
 'symptoms (1).csv',
 'data',
 'diags_varied.csv',
 'diags (1).csv',
 'diagnoses (1).csv',
 'symptoms.csv',
 'symptoms (2).csv',
 'diags.csv',
 'sample_data']

Roadmap:

1. Get the MIMIC-III data

2. Preprocess the data as it is done in the paper "A disease inference method based on symptom extraction and bidirectional Long Short Term Memory networks":

  a. Filter out the non-symptom entities from the medical texts according to the structural characteristics of MIMIC-III

  b. Use the existing natural language processing tool MetaMap [20] to identify the symptom entities, which are extracted from the full clinical texts

  c. Then the vector representation of symptoms is obtained based on two representations:

      i. TF-IDF obtains the strength of the association of each symptom with all the diseases and uses this as an element of the symptom vector

      ii. At the same time, the preprocessed text is used to train Word2Vec to obtain the word vectors, which are used to generate the symptom vector

3. Construct the model

4. Run the model

5. Evaluate the results

MIMIC-III data structure for the paper:

https://mimic.mit.edu/docs/iii/tables/noteevents/



In [None]:
import re
import numpy as np
import nltk
import pandas as pd

In [None]:
#Feature Extraction and data cleaning
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')

# read diagnoses and symptoms data (symptoms was generated from MetaMap)
diag_df = pd.read_csv('diagnoses.csv')
diagnosis_counts = diag_df['diagnosis'].value_counts()
print(diagnosis_counts)
print()
symptoms_df = pd.read_csv('symptoms.csv')
# Default (inner) merge: only patients present in both files survive.
merged_df = pd.merge(diag_df, symptoms_df, on='patientID')
print(merged_df)

# Collect, per patient, the list of symptoms AND the diagnosis from the same group.
# Taking the diagnosis from the group (rather than from diag_df in file order) guarantees
# that symptoms, patient ids and labels share one ordering and one length, even when
# diag_df is not sorted by patientID or lists patients that have no symptoms.
symptoms_dict = {}
diagnoses_dict = {}
for patient_id, group in merged_df.groupby('patientID'):
    symptoms_dict[patient_id] = group['preferred_name'].tolist()
    # NOTE(review): assumes one diagnosis per patient; if diag_df can hold several rows
    # per patient, only the first is kept here -- confirm against the data.
    diagnoses_dict[patient_id] = group['diagnosis'].iloc[0]

# create list of lists of symptoms
extracted_symptoms = list(symptoms_dict.values())

# create list of correlated patients and diagnoses (index i refers to the same patient in all three lists)
extracted_patients = list(symptoms_dict.keys())
extracted_diags = [diagnoses_dict[patient_id] for patient_id in extracted_patients]
print()
print(f'Patient {extracted_patients[0]}:')
print(extracted_symptoms[0])
print(extracted_diags[0])
print()

# extracted_symptoms = [['back pain', 'chest pain'], ['dyspnea', 'tiredness']]


diagnosis
Heart Disease      26
Infection          21
General Checkup    20
Vertigo            11
Muscle Strain       9
Arthritis           7
Pneumonia           6
Name: count, dtype: int64

     patientID        diagnosis preferred_name
0        12197    Heart Disease      Back Pain
1        12197    Heart Disease        Dyspnea
2        12197    Heart Disease     Chest Pain
3        12197    Heart Disease     Arthralgia
4        12328          Vertigo         Nausea
..         ...              ...            ...
293      99307  General Checkup      Dizziness
294      99694    Muscle Strain       Headache
295      99694    Muscle Strain       Coughing
296      99694    Muscle Strain         Nausea
297      99694    Muscle Strain      Back Pain

[298 rows x 3 columns]

Patient 12197:
['Back Pain', 'Dyspnea', 'Chest Pain', 'Arthralgia']
Heart Disease



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Word2Vec

In [None]:
#Preprocessing: Tokenization and cleaning

from sklearn.feature_extraction.text import TfidfVectorizer

def clean_symptoms(symptoms):
    """Lowercase every symptom and drop repeats, keeping first-seen order.

    The result is fed to sequence models as ordered timesteps, so the order must be
    stable. Deduplicating through ``set`` would return the symptoms in an arbitrary
    order that can change between interpreter runs; ``dict.fromkeys`` removes
    duplicates while preserving insertion order.

    Args:
        symptoms: iterable of symptom strings for one patient.

    Returns:
        list of unique, lowercased symptom strings in order of first appearance.
    """
    return list(dict.fromkeys(symptom.lower() for symptom in symptoms))

# Tokenize symptoms in each group (one token list per symptom, grouped per patient)
tokenized_symptoms = [
    [word_tokenize(symptom.lower()) for symptom in group] for group in extracted_symptoms
]

# Define stopwords
stop_words = set(stopwords.words('english'))

# Remove stopwords from tokenized symptoms, while preserving the grouping.
# A symptom made up only of stopwords would collapse to '' here; such empty strings are
# dropped because they carry no information and an empty symptom has no words to embed.
cleaned_symptoms = []
for group in tokenized_symptoms:
    cleaned_group = []
    for tokens in group:
        symptom_text = ' '.join(word for word in tokens if word not in stop_words)
        if symptom_text:
            cleaned_group.append(symptom_text)
    cleaned_symptoms.append(cleaned_group)

# Print cleaned symptoms
print(f"Cleaned Symptoms: {cleaned_symptoms}")

# Apply the cleaning function to the extracted symptoms (lowercase + de-duplicate per patient)
cleaned_extracted_symptoms = [clean_symptoms(symptoms) for symptoms in cleaned_symptoms]

# Show cleaned symptoms
print("Cleaned Extracted Symptoms:", cleaned_extracted_symptoms)

Cleaned Symptoms: [['back pain', 'dyspnea', 'chest pain', 'arthralgia'], ['nausea', 'dizziness'], ['chest pain', 'fever'], ['nausea', 'chest pain', 'back pain', 'arthralgia'], ['dizziness', 'back pain'], ['dyspnea', 'chest pain', 'nausea'], ['dyspnea', 'arthralgia', 'fatigue'], ['dyspnea', 'arthralgia', 'chest pain', 'fatigue'], ['back pain', 'chest pain', 'headache'], ['dizziness', 'nausea', 'chest pain'], ['fatigue', 'chest pain', 'back pain'], ['arthralgia', 'chest pain', 'dizziness', 'fatigue'], ['nausea', 'back pain'], ['coughing', 'headache'], ['dyspnea', 'fever'], ['fatigue', 'fever', 'back pain'], ['back pain', 'arthralgia', 'headache'], ['dizziness', 'headache', 'nausea'], ['headache', 'dyspnea', 'coughing', 'back pain'], ['back pain', 'chest pain', 'coughing', 'nausea'], ['chest pain', 'arthralgia', 'fever'], ['fatigue', 'coughing'], ['fever', 'arthralgia'], ['arthralgia', 'back pain', 'fatigue', 'nausea'], ['nausea', 'fatigue'], ['headache', 'fever'], ['headache', 'fever', '

In [None]:
#TF-IDF

# Build one flat list of symptom strings across all patients;
# every symptom string is treated as its own TF-IDF "document".
flattened_symptoms = []
for patient_symptoms in cleaned_extracted_symptoms:
    flattened_symptoms.extend(patient_symptoms)

# TF-IDF Vectorization (max_features caps the vocabulary size; adjust as needed)
vectorizer = TfidfVectorizer(max_features=100)
tfidf_matrix = vectorizer.fit_transform(flattened_symptoms)
tfidf_matrix = tfidf_matrix.toarray()

# Cut the flat matrix back into per-patient blocks: the running total of the
# group sizes (minus the final total) gives the row indices to split at.
num_groups = len(cleaned_extracted_symptoms)
group_sizes = list(map(len, cleaned_extracted_symptoms))
tfidf_grouped = np.split(tfidf_matrix, np.cumsum(group_sizes)[:-1])

# Zero-pad at the end so every patient has the same number of timesteps
from tensorflow.keras.preprocessing.sequence import pad_sequences
X_tfidf = pad_sequences(tfidf_grouped, dtype='float32', padding='post')



In [None]:
import nltk
# Repeats the 'punkt' download done in the feature-extraction cell; the recorded output
# reports the package as already up-to-date, so this is effectively a no-op check.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
#Word2Vec

import gensim
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize

# Download NLTK tokenizer (if needed)
nltk.download('punkt')

# Example of cleaned symptoms (extracted from MetaMap)
# cleaned_symptoms = [['pain of breast', 'shortness of breath'], ['head ache', 'dizziness']]

# Tokenize the symptoms into words for Word2Vec: each symptom phrase is one training "sentence".
# Empty symptom strings are skipped because they would only add empty sentences.
tokenized_symptoms = [
    word_tokenize(symptom.lower())
    for symptoms in cleaned_extracted_symptoms
    for symptom in symptoms
    if symptom
]

# Print the tokenized symptoms to verify
print("Tokenized Symptoms:", tokenized_symptoms)

# Train the Word2Vec model (sg=1 selects skip-gram).
# seed + workers=1 make training repeatable; gensim documents that a fixed seed alone is
# not enough when several worker threads are used.
model = Word2Vec(
    sentences=tokenized_symptoms,
    vector_size=100,
    window=5,
    min_count=1,
    sg=1,
    seed=42,
    workers=1,
)

# Check the word vectors
word_vectors = model.wv

# Test the model with some word similarity or vector lookup
# print("Vector for 'pain':", word_vectors['pain'])
# print("Vector for 'breast':", word_vectors['breast'])

# Optionally, check similar words
# print("Most similar words to 'pain':", word_vectors.most_similar('pain'))
# print("Most similar words to 'dizziness':", word_vectors.most_similar('dizziness'))


def _symptom_vector(symptom, keyed_vectors):
    """Return the mean vector of a symptom's in-vocabulary words.

    Falls back to a zero vector when none of the words is in the vocabulary, so every
    symptom yields a vector of the same length (np.mean of an empty list would give a
    NaN scalar and break the padding step that stacks these vectors).
    """
    vectors = [keyed_vectors[word] for word in symptom.split() if word in keyed_vectors]
    if not vectors:
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)


# One vector per symptom, grouped per patient. The de-duplicated symptom lists are used
# here -- the same lists the Word2Vec model was trained on and the TF-IDF features were
# built from -- so both feature sets describe identical symptom sequences per patient.
word2vec_embeddings = [
    [_symptom_vector(symptom, word_vectors) for symptom in group]
    for group in cleaned_extracted_symptoms
]

print(f"Word2Vec Embeddings: {word2vec_embeddings}")

Tokenized Symptoms: [['arthralgia'], ['dyspnea'], ['chest', 'pain'], ['back', 'pain'], ['dizziness'], ['nausea'], ['fever'], ['chest', 'pain'], ['arthralgia'], ['chest', 'pain'], ['back', 'pain'], ['nausea'], ['dizziness'], ['back', 'pain'], ['dyspnea'], ['chest', 'pain'], ['nausea'], ['arthralgia'], ['dyspnea'], ['fatigue'], ['arthralgia'], ['dyspnea'], ['chest', 'pain'], ['fatigue'], ['chest', 'pain'], ['headache'], ['back', 'pain'], ['dizziness'], ['chest', 'pain'], ['nausea'], ['chest', 'pain'], ['fatigue'], ['back', 'pain'], ['arthralgia'], ['dizziness'], ['chest', 'pain'], ['fatigue'], ['back', 'pain'], ['nausea'], ['headache'], ['coughing'], ['dyspnea'], ['fever'], ['fever'], ['fatigue'], ['back', 'pain'], ['arthralgia'], ['headache'], ['back', 'pain'], ['dizziness'], ['headache'], ['nausea'], ['dyspnea'], ['coughing'], ['headache'], ['back', 'pain'], ['chest', 'pain'], ['coughing'], ['back', 'pain'], ['nausea'], ['arthralgia'], ['chest', 'pain'], ['fever'], ['fatigue'], ['cough

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Word2Vec Embeddings: [[array([-4.6318811e-03,  1.1833080e-03,  2.1151507e-03, -1.5447009e-04,
       -9.3659982e-03, -4.2641093e-03,  5.4459544e-03,  6.3389661e-03,
       -5.7573044e-03, -5.3184023e-03,  1.1909050e-03, -1.9102324e-03,
       -5.8936710e-03, -1.5243145e-03, -3.8021312e-03, -5.0895177e-03,
       -1.5810082e-03, -2.3394793e-03, -5.3150794e-03, -5.5783666e-03,
       -8.2226098e-04,  2.1677779e-03,  7.4554281e-03,  4.2266846e-03,
       -4.2730407e-04, -3.5362639e-03,  1.0860293e-03, -1.9007942e-03,
       -3.0229648e-03,  1.2939901e-03, -8.8294642e-04, -4.8465971e-03,
        2.5109511e-03, -7.7300193e-03, -1.1440195e-03,  3.6628521e-03,
        7.0257639e-03, -4.3208222e-04,  2.5531147e-03, -3.9985566e-03,
       -2.6150141e-05, -1.1743223e-03, -8.0154119e-03, -3.3288207e-03,
       -4.0683203e-04, -1.7561568e-03, -4.1272854e-03,  8.5522505e-03,
        2.1425118e-03,  3.8045500e-03, -2.7071559e-03, -1.9314326e-03,
        1.8591627e-03,  4.6803602e-03, -5.4232031e-04,

In [None]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Assumes `extracted_diags` holds a single diagnosis per patient, e.g.
# extracted_diags = ['Hypertension', 'Asthma', 'Diabetes', ...]

# OneHotEncoder wants a 2D column of shape (n_samples, 1)
extracted_diags = np.array(extracted_diags).reshape(-1, 1)

# Learn the set of diagnoses first, then encode each patient's diagnosis as a one-hot row
encoder = OneHotEncoder(sparse_output=False)
encoder.fit(extracted_diags)
labels = encoder.transform(extracted_diags)

# Width of the one-hot matrix = number of unique diagnoses = size of the model's output layer
output_dim = labels.shape[1]
output_dim


7

In [None]:
from sklearn.model_selection import train_test_split

# Split data into training and validation sets
X_train_tfidf, X_val_tfidf, y_train, y_val = train_test_split(X_tfidf, labels, test_size=0.2, random_state=42)

# The BiLSTM consumes (samples, timesteps, features). The padded TF-IDF tensor is expected
# to have that rank already, so check it explicitly instead of reshaping each array to
# its own shape (which changes nothing).
assert X_train_tfidf.ndim == 3 and X_val_tfidf.ndim == 3, (
    f"expected 3D TF-IDF inputs, got {X_train_tfidf.shape} and {X_val_tfidf.shape}"
)

# Print shapes to verify
print(f"Shape of X_train: {X_train_tfidf.shape}, X_val: {X_val_tfidf.shape}")
print(f"Shape of y_train: {y_train.shape}, y_val: {y_val.shape}")


Shape of X_train: (80, 4, 11), X_val: (20, 4, 11)
Shape of y_train: (80, 7), y_val: (20, 7)


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Dropout
import numpy as np

def create_bilstm_model_tfidf(input_dim, embedding_dim, output_dim, lstm_units=64, dropout_rate=0.5):
    """Build and compile a single-label BiLSTM classifier.

    Args:
        input_dim: number of timesteps per sample (padded sequence length).
        embedding_dim: number of features per timestep.
        output_dim: number of classes; size of the softmax output layer.
        lstm_units: units in each direction of the bidirectional LSTM (default 64).
        dropout_rate: dropout applied to the BiLSTM output (default 0.5).

    Returns:
        A compiled Sequential model (categorical cross-entropy loss, Adam optimizer,
        accuracy metric).
    """
    # Local import: `Input` has not been imported yet at the point this function is first used.
    from tensorflow.keras.layers import Input

    model = Sequential()

    # Declare the input shape with an explicit Input layer; passing `input_shape=` to the
    # first layer is what triggers the Keras warning seen in the recorded training output.
    model.add(Input(shape=(input_dim, embedding_dim)))

    # Bidirectional LSTM that returns only the final state (one vector per sample)
    model.add(Bidirectional(LSTM(lstm_units, return_sequences=False)))

    # Add dropout to prevent overfitting
    model.add(Dropout(dropout_rate))

    # Dense layer for output (single-label classification with softmax)
    model.add(Dense(output_dim, activation='softmax'))

    # Compile the model with categorical crossentropy for single-label classification
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [None]:
import time
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score

# Model dimensions come straight from the training tensor, laid out as
# (samples, timesteps, features per timestep)
input_dim, embedding_dim = X_train_tfidf.shape[1:]
output_dim = labels.shape[1]  # one column per class in the one-hot labels

# Build the TF-IDF BiLSTM
model_tfidf = create_bilstm_model_tfidf(input_dim, embedding_dim, output_dim)

# Fit on the one-hot labels and time the training call
st = time.time()
model_tfidf.fit(
    X_train_tfidf,
    y_train,
    validation_data=(X_val_tfidf, y_val),
    epochs=10,
    batch_size=32,
)
elapsed = round(time.time() - st, 2)

model_tfidf.summary()
loss, accuracy = model_tfidf.evaluate(X_val_tfidf, y_val, batch_size=32)
print(f"Loss: {loss}, Accuracy: {accuracy}")

# Turn predicted probabilities and one-hot targets into class indices
y_pred = model_tfidf.predict(X_val_tfidf)
y_pred_classes = y_pred.argmax(axis=1)
y_true_classes = y_val.argmax(axis=1)

# Macro-averaged metrics weight every class equally, which matters with this small, imbalanced dataset
accuracy = accuracy_score(y_true_classes, y_pred_classes)
precision = precision_score(
    y_true_classes, y_pred_classes, average='macro', zero_division=0
)
recall = recall_score(
    y_true_classes, y_pred_classes, average='macro', zero_division=0
)
f1 = f1_score(
    y_true_classes, y_pred_classes, average='macro', zero_division=0
)

print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f}")
print(f'Time to train: {elapsed} sec')

# Check class distribution in predictions
# print("Predicted class distribution:", np.unique(y_pred_classes, return_counts=True))



Epoch 1/10


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 257ms/step - accuracy: 0.2445 - loss: 1.9370 - val_accuracy: 0.6000 - val_loss: 1.9058
Epoch 2/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.4320 - loss: 1.9061 - val_accuracy: 0.5500 - val_loss: 1.8879
Epoch 3/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.4117 - loss: 1.9011 - val_accuracy: 0.5000 - val_loss: 1.8716
Epoch 4/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.4797 - loss: 1.8765 - val_accuracy: 0.5000 - val_loss: 1.8543
Epoch 5/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.4695 - loss: 1.8677 - val_accuracy: 0.5000 - val_loss: 1.8370
Epoch 6/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.4141 - loss: 1.8474 - val_accuracy: 0.5000 - val_loss: 1.8198
Epoch 7/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.5000 - loss: 1.7428
Loss: 1.7428312301635742, Accuracy: 0.5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 309ms/step
Accuracy: 0.500
Precision: 0.344
Recall: 0.417
F1 Score: 0.336
Time to train: 4.77 sec


In [None]:
# Zero-pad at the end so every patient has the same number of symptom vectors
X_word2vec = pad_sequences(word2vec_embeddings, padding='post', dtype='float32')

# Split data into training and validation sets.
# Same test_size and random_state as the TF-IDF split, so both splits assign the
# same rows to train/validation.
X_train_word2vec, X_val_word2vec, y_train_word2vec, y_val_word2vec = train_test_split(
    X_word2vec, labels, test_size=0.2, random_state=42
)

# The padded tensor is expected to be (samples, timesteps, features) already; check the
# rank explicitly instead of reshaping each array to its own shape (which changes nothing).
assert X_train_word2vec.ndim == 3 and X_val_word2vec.ndim == 3, (
    f"expected 3D Word2Vec inputs, got {X_train_word2vec.shape} and {X_val_word2vec.shape}"
)

# Print shapes to verify
print(f"Shape of X_train: {X_train_word2vec.shape}, X_val: {X_val_word2vec.shape}")
print(f"Shape of y_train: {y_train_word2vec.shape}, y_val: {y_val_word2vec.shape}")

Shape of X_train: (80, 4, 100), X_val: (20, 4, 100)
Shape of y_train: (80, 7), y_val: (20, 7)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Dropout

def create_bilstm_model(input_dim, embedding_dim, output_dim, lstm_units=64, dropout_rate=0.5):
    """Build and compile a single-label BiLSTM classifier for the Word2Vec features.

    The architecture is the same as `create_bilstm_model_tfidf`: one bidirectional LSTM,
    dropout, and a softmax output layer.

    Args:
        input_dim: number of timesteps per sample (padded sequence length).
        embedding_dim: number of features per timestep (Word2Vec vector size).
        output_dim: number of classes; size of the softmax output layer.
        lstm_units: units in each direction of the bidirectional LSTM (default 64).
        dropout_rate: dropout applied to the BiLSTM output (default 0.5).

    Returns:
        A compiled Sequential model (categorical cross-entropy loss, Adam optimizer,
        accuracy metric).
    """
    # Local import: `Input` is not among the layers imported alongside this function.
    from tensorflow.keras.layers import Input

    model = Sequential()
    # Explicit Input layer instead of `input_shape=` on the first layer, which is what
    # triggers the Keras warning seen in the recorded training output.
    model.add(Input(shape=(input_dim, embedding_dim)))
    model.add(Bidirectional(LSTM(lstm_units, return_sequences=False)))
    model.add(Dropout(dropout_rate))  # Dropout layer to reduce overfitting
    model.add(Dense(output_dim, activation='softmax'))  # Single-label classification output layer
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
import time
# Define model dimensions based on the processed data
input_dim = X_train_word2vec.shape[1]  # Number of timesteps (padded sequence length)
embedding_dim = X_train_word2vec.shape[2]  # Word2Vec embedding dimension
output_dim = labels.shape[1]  # Number of unique diagnoses

# Initialize and train the model
model_word2vec = create_bilstm_model(input_dim, embedding_dim, output_dim)
st = time.time()
model_word2vec.fit(X_train_word2vec, y_train_word2vec, validation_data=(X_val_word2vec, y_val_word2vec), epochs=10, batch_size=32)
elapsed = round(time.time() - st, 2)
model_word2vec.summary()

# Evaluate the model on the validation data
loss, accuracy = model_word2vec.evaluate(X_val_word2vec, y_val_word2vec, batch_size=32)
print(f"Loss: {loss}, Accuracy: {accuracy}")

y_pred = model_word2vec.predict(X_val_word2vec)
y_pred_classes = np.argmax(y_pred, axis=1)
# Use the labels from the Word2Vec split itself, so the ground truth always belongs to
# the samples that were just predicted (not to the separately produced TF-IDF split).
y_true_classes = np.argmax(y_val_word2vec, axis=1)

# Calculate metrics using sklearn (macro average: every class counts equally)
accuracy = accuracy_score(y_true_classes, y_pred_classes)
precision = precision_score(y_true_classes, y_pred_classes, average='macro', zero_division=0)
recall = recall_score(y_true_classes, y_pred_classes, average='macro', zero_division=0)
f1 = f1_score(y_true_classes, y_pred_classes, average='macro', zero_division=0)

print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f}")
print(f'Time to train: {elapsed} sec')

# Check class distribution in predictions
# print("Predicted class distribution:", np.unique(y_pred_classes, return_counts=True))


Epoch 1/10


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 262ms/step - accuracy: 0.1211 - loss: 1.9460 - val_accuracy: 0.3000 - val_loss: 1.9376
Epoch 2/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.2281 - loss: 1.9410 - val_accuracy: 0.3000 - val_loss: 1.9298
Epoch 3/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.2383 - loss: 1.9351 - val_accuracy: 0.3000 - val_loss: 1.9209
Epoch 4/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.2734 - loss: 1.9297 - val_accuracy: 0.3000 - val_loss: 1.9120
Epoch 5/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.2656 - loss: 1.9214 - val_accuracy: 0.3000 - val_loss: 1.9015
Epoch 6/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.2344 - loss: 1.9201 - val_accuracy: 0.3000 - val_loss: 1.8905
Epoch 7/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.3000 - loss: 1.8359
Loss: 1.835946798324585, Accuracy: 0.30000001192092896
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 335ms/step
Accuracy: 0.300
Precision: 0.075
Recall: 0.250
F1 Score: 0.115
Time to train: 4.67 sec


In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Add
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Precision, Recall, AUC
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np
import time

# The fused model is trained with `y_train` / `y_val` from the TF-IDF split while also
# consuming the Word2Vec split, so both splits must place the same patients in the same
# rows. Comparing the label arrays is a cheap sanity check for that.
assert np.array_equal(y_train, y_train_word2vec) and np.array_equal(y_val, y_val_word2vec), (
    "TF-IDF and Word2Vec splits are not aligned; re-run both split cells with the same random_state"
)

# Define the input layers for both models
input_tfidf = Input(shape=(X_train_tfidf.shape[1], X_train_tfidf.shape[2]))  # Shape (timesteps, features)
input_word2vec = Input(shape=(X_train_word2vec.shape[1], X_train_word2vec.shape[2]))  # Shape (timesteps, features)

# Get the outputs of both previously trained BiLSTM models (each ends in a softmax layer)
output_tfidf = model_tfidf(input_tfidf)
output_word2vec = model_word2vec(input_word2vec)

# Apply weights to each model's output (equal weighting)
weight = 0.5
weighted_output_tfidf = output_tfidf * weight
weighted_output_word2vec = output_word2vec * weight

# Compute the weighted sum of the outputs
combined_output = Add()([weighted_output_tfidf, weighted_output_word2vec])

# Final output layer with softmax activation for single-label classification.
# NOTE(review): this is a new, untrained Dense+softmax applied on top of an average of two
# softmax outputs -- confirm this extra layer is intended rather than using the weighted
# sum directly as the prediction.
final_output = Dense(output_dim, activation='softmax')(combined_output)

# Create the final model
final_model = Model(inputs=[input_tfidf, input_word2vec], outputs=final_output)

# Compile the model with additional metrics
final_model.compile(
    optimizer=Adam(),
    loss='categorical_crossentropy',
    metrics=['accuracy', Precision(name='precision'), Recall(name='recall'), AUC(name='auc')]
)

# Train the model with one-hot encoded labels
# NOTE(review): the recorded run logs `loss: 0.0000e+00` in every epoch, which is not a
# plausible cross-entropy value for the reported accuracies -- investigate before
# trusting the training curves.
st = time.time()
final_model.fit(
    [X_train_tfidf, X_train_word2vec], y_train,
    validation_data=([X_val_tfidf, X_val_word2vec], y_val),
    epochs=10, batch_size=32
)
elapsed = round(time.time() - st, 2)
# Print the model summary
final_model.summary()

# Evaluate the final model
results = final_model.evaluate([X_val_tfidf, X_val_word2vec], y_val, batch_size=32)

# Predict once and convert probabilities / one-hot targets to class indices
y_pred = final_model.predict([X_val_tfidf, X_val_word2vec])
y_pred_classes = np.argmax(y_pred, axis=1)
# print("Predicted class distribution:", np.unique(y_pred_classes, return_counts=True))

y_true_classes = np.argmax(y_val, axis=1)
# print("True class distribution:", np.unique(y_true_classes, return_counts=True))

# Calculate metrics using sklearn (macro average: every class counts equally)
accuracy = accuracy_score(y_true_classes, y_pred_classes)
precision = precision_score(y_true_classes, y_pred_classes, average='macro', zero_division=0)
recall = recall_score(y_true_classes, y_pred_classes, average='macro', zero_division=0)
f1 = f1_score(y_true_classes, y_pred_classes, average='macro', zero_division=0)

print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f}")
print(f'Time to train: {elapsed} sec')



Epoch 1/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 612ms/step - accuracy: 0.1758 - auc: 0.5555 - loss: 0.0000e+00 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_accuracy: 0.3000 - val_auc: 0.6065 - val_loss: 0.0000e+00 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 2/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.1875 - auc: 0.6017 - loss: 0.0000e+00 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_accuracy: 0.3000 - val_auc: 0.6071 - val_loss: 0.0000e+00 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 3/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.1836 - auc: 0.5786 - loss: 0.0000e+00 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_accuracy: 0.3000 - val_auc: 0.6162 - val_loss: 0.0000e+00 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 4/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.2031 - auc

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.4500 - auc: 0.6465 - loss: 0.0000e+00 - precision: 0.0000e+00 - recall: 0.0000e+00
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 640ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Accuracy: 0.450
Precision: 0.309
Recall: 0.300
F1 Score: 0.275
Time to train: 13.0 sec
