
### Importing Necessary Libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping
import numpy as np

#### Loading Preprocessed Data

In [3]:
df = pd.read_csv('preprocessed_bbc_news.csv')

#### Label Encoding

In [4]:
label_encoder = LabelEncoder()
df['Category_target'] = label_encoder.fit_transform(df['Category'])


#### Tokenizing the Text Data

In [5]:
tokenizer = Tokenizer(num_words=5000, lower=True)  # num_words=5000 limits the vocab size to 5000 most common words
tokenizer.fit_on_texts(df['ProcessedText'])
X = tokenizer.texts_to_sequences(df['ProcessedText'])


#### Padding the Sequences

In [6]:
X = pad_sequences(X, maxlen=500)

#### One-Hot Encoding the Labels

In [7]:
y = to_categorical(df['Category_target'])

#### Splitting the Dataset

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)


#### Building the LSTM Model

In [9]:
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.4))
model.add(LSTM(64))
model.add(Dropout(0.4))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.4))  #
model.add(Dense(5, activation='softmax'))


#### Compiling the Model

In [10]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

#### Training the Model

In [11]:
history = model.fit(X_train, y_train,
                    epochs=50,
                    batch_size=64,
                    validation_split=0.2,
                    verbose=1,
                    callbacks=[EarlyStopping(monitor='val_loss', patience=7, min_delta=0.01)]
                   )

Epoch 1/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 2s/step - accuracy: 0.2384 - loss: 1.6063 - val_accuracy: 0.2427 - val_loss: 1.5765
Epoch 2/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 2s/step - accuracy: 0.2947 - loss: 1.5817 - val_accuracy: 0.5649 - val_loss: 1.3884
Epoch 3/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 2s/step - accuracy: 0.5495 - loss: 1.2350 - val_accuracy: 0.6151 - val_loss: 0.8880
Epoch 4/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 2s/step - accuracy: 0.6872 - loss: 0.8139 - val_accuracy: 0.7950 - val_loss: 0.5085
Epoch 5/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 2s/step - accuracy: 0.7884 - loss: 0.5689 - val_accuracy: 0.6695 - val_loss: 0.7577
Epoch 6/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 3s/step - accuracy: 0.7901 - loss: 0.4997 - val_accuracy: 0.7741 - val_loss: 0.5436
Epoch 7/50
[1m15/15[0m [32m━━━━━━━━━━

#### Evaluating the Model

In [12]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
print(f'Test Accuracy: {accuracy * 100:.2f}%')


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 350ms/step - accuracy: 0.8624 - loss: 0.6610
Test Accuracy: 86.58%


##### Function to Predict Category for New Text

In [13]:
def predict_category(text):
    # Convert the input text to sequence and pad it to match the model input shape
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=500)

    # Predict the category and return the corresponding label
    pred = model.predict(padded)
    return label_encoder.inverse_transform([np.argmax(pred)])


##### Testing the Prediction

In [14]:
new_text = "While people on social media have been amused by Arshad Nadeem being gifted a buffalo for winning gold at Paris Olympics."
category = predict_category(new_text)
print(f'Predicted Category: {category}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 426ms/step
Predicted Category: ['business']
