In [81]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense

In [82]:
data = pd.read_csv('../Dataset/combined_final_dataset.csv')

In [83]:
X = data['description'].values

label_encoder = LabelEncoder()
data['category_encoded'] = label_encoder.fit_transform(data['main_category'])

from tensorflow.keras.utils import to_categorical

# Assuming y_train, y_val, and y_test contain integer class labels
num_classes = 4  # Replace with the actual number of classes in your dataset

num_classes = len(data['category_encoded'].unique())
encoded_labels = to_categorical(data['category_encoded'], num_classes=num_classes)
y = data['category_encoded'].values
#unque values in y
unique, counts = np.unique(y, return_counts=True)
print(dict(zip(unique, counts)))

{0: 33, 1: 349, 2: 324, 3: 61}


In [84]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [85]:
from sklearn.utils import class_weight
import numpy as np

# Assuming 'y_train' contains your integer-encoded class labels
class_labels = np.unique(y_train)

# Calculate class weights
class_weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = class_labels, y = y_train)

# Create a dictionary mapping class indices to their respective weights
class_weight_dict = dict(zip(class_labels, class_weights))

print("Class Weights:", class_weight_dict)

Class Weights: {0: 5.675925925925926, 1: 0.5434397163120568, 2: 0.5916988416988417, 3: 3.4055555555555554}


In [86]:
max_sequence_length = 1000

tokenizer = Tokenizer(num_words=900)  # Define vocab_size
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length)  
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length)

In [87]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=900, output_dim=300, input_length=max_sequence_length),
    tf.keras.layers.LSTM(units=50),
    tf.keras.layers.Dense(4, activation='softmax')
])

(613,)

In [89]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train_padded, y_train, epochs=15,class_weight=class_weight_dict, batch_size=50)



Epoch 1/15
Epoch 2/15


In [39]:
#create classification report
from sklearn.metrics import classification_report
y_pred = model.predict(X_test_padded)
y_pred = np.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred))

 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       0.97      0.94      0.95        67
           2       1.00      1.00      1.00        65
           3       0.78      0.88      0.82        16

    accuracy                           0.96       154
   macro avg       0.94      0.95      0.94       154
weighted avg       0.96      0.96      0.96       154



In [54]:
test_sentence = ["AArrhythmia or irregular heartbeat is a condition in which the heart is unable to pump blood to the body efficiently. "]

sequences = tokenizer.texts_to_sequences(test_sentence)
padded = pad_sequences(sequences, maxlen=max_sequence_length)  
print(model.predict(padded))

#convert the predicted class back to label
print(label_encoder.inverse_transform([np.argmax(model.predict(padded))]))

[[0.00156872 0.00249132 0.9878038  0.00813613]]
['News']
