In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import os
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score, confusion_matrix
import seaborn as sns

In [2]:
dir_path = os.path.join("database", "text_classification")

with open (os.path.join(dir_path, "ro.json")) as f:
    data = json.load(f)
    df = pd.DataFrame(data)

df.head()

Unnamed: 0,text,label
0,A aparut ceva nou in simptomatologia dumneavoa...,symptoms_changes
1,Alte probleme de sanatate ati avut,diseases_personal
2,Ati avut frisoane de curand,fever_symptoms
3,Ati avut o interventie chirurgicala,surgeries
4,Ati avut tuse cu sputa,cough_symptoms


In [3]:
df["label"].value_counts()

visit_reason              40
symptoms_start            18
diseases_parents          18
fainting_symptoms         16
chronic_treatment         15
symptoms_circumstances    13
symptoms_changes          12
fever_symptoms            12
palpitations_symptoms     12
diseases_personal         10
greetings                  8
cough_symptoms             7
surgeries                  5
chest_pain                 5
Name: label, dtype: int64

In [4]:
dir_path = os.path.join("database", "personas")

with open(os.path.join(dir_path, "en_personas.json")) as f:
    data = json.load(f)
    texts = []
    labels = []
    for persona in data:
        for dialog_entity in persona["dialog"]:
            if not "questions" in dialog_entity:
                print(dialog_entity)
            
            for question in dialog_entity["questions"]:
                texts.append(question)
                labels.append(dialog_entity["label"])
    df_persona = pd.DataFrame(
        {
            "text": texts,
            "label": labels
        }
    )
df_persona.head()

Unnamed: 0,text,label
0,What is the reason for the presentation at the...,visit_reason
1,When did the symptoms start?,symptoms_start
2,What are the circumstances of the onset of sym...,symptoms_circumstances
3,Were there any changes in symptoms from onset ...,symptoms_changes
4,Do you know of any illness that your mother or...,diseases_parents


In [5]:
counts = df_persona["label"].value_counts()
counts

greetings                 90
diseases_personal         18
chronic_treatment         16
visit_reason              15
symptoms_start            15
symptoms_circumstances    15
symptoms_changes          15
diseases_parents          15
surgeries                 15
chest_pain                15
fainting_symptoms         15
palpitations_symptoms     15
cough_symptoms            15
fever_symptoms            15
Name: label, dtype: int64

In [6]:
df_combined = pd.concat([df, df_persona], ignore_index=True)
df_combined = df_combined.drop_duplicates(["text"])
df_combined["label"].value_counts()

visit_reason              41
diseases_parents          21
symptoms_start            19
fainting_symptoms         18
chronic_treatment         18
diseases_personal         15
fever_symptoms            14
greetings                 14
symptoms_circumstances    14
symptoms_changes          13
palpitations_symptoms     13
cough_symptoms             8
surgeries                  7
chest_pain                 6
Name: label, dtype: int64

In [7]:
label_to_idx = {label: idx for idx, label in enumerate(df_combined["label"].unique())}
label_to_idx

{'symptoms_changes': 0,
 'diseases_personal': 1,
 'fever_symptoms': 2,
 'surgeries': 3,
 'cough_symptoms': 4,
 'fainting_symptoms': 5,
 'palpitations_symptoms': 6,
 'greetings': 7,
 'symptoms_start': 8,
 'visit_reason': 9,
 'chronic_treatment': 10,
 'symptoms_circumstances': 11,
 'chest_pain': 12,
 'diseases_parents': 13}

In [8]:
df_combined["label_id"] = df_combined["label"].apply(lambda s: label_to_idx[s])
df_combined.head()

Unnamed: 0,text,label,label_id
0,A aparut ceva nou in simptomatologia dumneavoa...,symptoms_changes,0
1,Alte probleme de sanatate ati avut,diseases_personal,1
2,Ati avut frisoane de curand,fever_symptoms,2
3,Ati avut o interventie chirurgicala,surgeries,3
4,Ati avut tuse cu sputa,cough_symptoms,4


In [9]:
train_df = df_combined.groupby("label").sample(frac=0.8, random_state=42)
valid_df = df_combined.drop(train_df.index)

In [10]:
batch_size = 12

train_dataset = (tf.data.Dataset
                 .from_tensor_slices((train_df["text"], train_df["label_id"]))
                 .batch(batch_size))

valid_dataset = (tf.data.Dataset
                 .from_tensor_slices((valid_df["text"], valid_df["label_id"]))
                 .batch(batch_size))

In [11]:
VOCAB_SIZE = 10_000
encoder = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE
)
encoder.adapt(df_combined["text"])

In [12]:
encoder(df_combined.loc[:3, "text"])

<tf.Tensor: shape=(4, 7), dtype=int64, numpy=
array([[ 21,  65,  20, 256,  11,  94,  52],
       [ 84,  73,   2, 232,  10,   7,   0],
       [ 10,   7,  62,   2, 173,   0,   0],
       [ 10,   7,  28, 159, 179,   0,   0]], dtype=int64)>

In [13]:
output_dim = 64
units = 64

model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(), dtype=tf.string),
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=output_dim,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units)),
    tf.keras.layers.Dense(len(label_to_idx)),
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, None)             0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, None, 64)          22272     
                                                                 
 bidirectional (Bidirectiona  (None, None, 128)        66048     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 14)                1806      
                                                        

In [14]:
print([layer.supports_masking for layer in model.layers])

[False, True, True, True, True]


In [15]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]
)

In [None]:
epochs = 100

history = model.fit(
    train_dataset,
    epochs=epochs,
    validation_data=valid_dataset
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100

In [None]:
fig, ax = plt.subplots(figsize=(10, 6), constrained_layout=True)
ax.plot(history.history["loss"], label="Training loss")
ax.plot(history.history["val_loss"], label="Validation loss")
ax.set_xlabel("Epcohs")
ax.set_ylabel("Loss")
ax.set_xticks(np.arange(0, len(history.history["loss"]) + 1, 10))
ax.legend()
plt.savefig("loss.png")

In [None]:
fig, ax = plt.subplots(figsize=(10, 6), constrained_layout=True)
ax.plot(history.history["sparse_categorical_accuracy"], label="Training accuracy")
ax.plot(history.history["val_sparse_categorical_accuracy"], label="Validation accuracy")
ax.set_xlabel("Epcohs")
ax.set_ylabel("Accuracy")
ax.set_xticks(np.arange(0, len(history.history["sparse_categorical_accuracy"]) + 1, 10))
ax.legend()
fig.savefig("accuracy.png")

In [None]:
metrics = model.evaluate(
    df_combined["text"].to_numpy(),
    df_combined["label_id"].to_numpy()
)

for metric_name, metric in zip(model.metrics_names, metrics):
    print(f"{metric_name:<8s}: {metric:.4f}")

In [None]:
predictions_probabilities = model.predict(df_combined["text"])
predictions = np.argmax(predictions_probabilities, axis=1)

y_true = df_combined["label_id"].to_numpy()
accuracy = accuracy_score(y_true, predictions)
precision = precision_score(y_true, predictions, average="macro")
recall = precision_score(y_true, predictions, average="macro")
f1score = f1_score(y_true, predictions, average="macro")

print(f"accuracy : {accuracy:.4f}")
print(f"precision: {precision:.4f}")
print(f"recall   : {recall:.4f}")
print(f"F1       : {f1score:.4f}")

In [None]:
matrix = confusion_matrix(y_true, predictions)
index = label_to_idx.keys()
df = pd.DataFrame(matrix, index=index, columns=index)
plt.figure(figsize=(10, 7))
sns.heatmap(df, annot=True, fmt="d")
plt.savefig("confusion_matrix.png")

In [None]:
model.save("rnn")

In [None]:
# with open("label_to_idx.json", "w", encoding="utf-8") as f:
#     json.dump(label_to_idx, f)