In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense




In [2]:
# Load the CSV file
df = pd.read_csv(r'C:\Users\The Emoji Girl\OneDrive\Documents\Datasets\agnews_test.csv', names=['Class', 'Title', 'Description'])
df

Unnamed: 0,Class,Title,Description
0,Class Index,Title,Description
1,3,Fears for T N pension after talks,Unions representing workers at Turner Newall...
2,4,The Race is On: Second Private Team Sets Launc...,"SPACE.com - TORONTO, Canada -- A second\team o..."
3,4,Ky. Company Wins Grant to Study Peptides (AP),AP - A company founded by a chemistry research...
4,4,Prediction Unit Helps Forecast Wildfires (AP),AP - It's barely dawn when Mike Fitzpatrick st...
...,...,...,...
7596,1,Around the world,Ukrainian presidential candidate Viktor Yushch...
7597,2,Void is filled with Clement,With the supply of attractive pitching options...
7598,2,Martinez leaves bitter,Like Roger Clemens did almost exactly eight ye...
7599,3,5 of arthritis patients in Singapore take Bext...,SINGAPORE : Doctors in the United States have ...


In [18]:
# Remove header row
df = df[df['Class'] != 'Class Index'].copy()

# Convert class to integer and combine title and description
df['Class'] = df['Class'].astype(int)
df['Text'] = df['Title'] + ' ' + df['Description']
df

Unnamed: 0,Class,Title,Description,Text
1,3,Fears for T N pension after talks,Unions representing workers at Turner Newall...,Fears for T N pension after talks Unions repre...
2,4,The Race is On: Second Private Team Sets Launc...,"SPACE.com - TORONTO, Canada -- A second\team o...",The Race is On: Second Private Team Sets Launc...
3,4,Ky. Company Wins Grant to Study Peptides (AP),AP - A company founded by a chemistry research...,Ky. Company Wins Grant to Study Peptides (AP) ...
4,4,Prediction Unit Helps Forecast Wildfires (AP),AP - It's barely dawn when Mike Fitzpatrick st...,Prediction Unit Helps Forecast Wildfires (AP) ...
5,4,Calif. Aims to Limit Farm-Related Smog (AP),AP - Southern California's smog-fighting agenc...,Calif. Aims to Limit Farm-Related Smog (AP) AP...
...,...,...,...,...
7596,1,Around the world,Ukrainian presidential candidate Viktor Yushch...,Around the world Ukrainian presidential candid...
7597,2,Void is filled with Clement,With the supply of attractive pitching options...,Void is filled with Clement With the supply of...
7598,2,Martinez leaves bitter,Like Roger Clemens did almost exactly eight ye...,Martinez leaves bitter Like Roger Clemens did ...
7599,3,5 of arthritis patients in Singapore take Bext...,SINGAPORE : Doctors in the United States have ...,5 of arthritis patients in Singapore take Bext...


In [4]:
max_features = 10000  # vocabulary size

# Prepare the data
texts = df['Text'].tolist()
labels = (df['Class'] - 1).values  # Convert 1-4 to 0-3

# Split the data
from sklearn.model_selection import train_test_split
X_train_text, X_test_text, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)

# Print the shape of the data
print(f'Training data shape: {len(X_train_text)}, Training labels shape: {len(y_train)}')
print(f'Testing data shape: {len(X_test_text)}, Testing labels shape: {len(y_test)}')

Training data shape: 6080, Training labels shape: 6080
Testing data shape: 1520, Testing labels shape: 1520


In [5]:
# Tokenize the text data
tokenizer = Tokenizer(num_words=max_features, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train_text)

# Convert texts to sequences
X_train = tokenizer.texts_to_sequences(X_train_text)
X_test = tokenizer.texts_to_sequences(X_test_text)

X_train[0]

[122, 479, 217, 1, 594, 1193, 1, 2984, 78, 122, 8, 69, 4355, 60, 1736]

In [6]:
# Mapping of words index back to words (for understanding)
word_index = tokenizer.word_index
reverse_word_index = {value: key for key, value in word_index.items()}
reverse_word_index

{1: '<OOV>',
 2: 'the',
 3: 'to',
 4: 'a',
 5: 'of',
 6: 'in',
 7: 'and',
 8: 'on',
 9: 'for',
 10: '39',
 11: 's',
 12: 'that',
 13: 'with',
 14: 'at',
 15: 'as',
 16: 'its',
 17: 'is',
 18: 'new',
 19: 'by',
 20: 'said',
 21: 'it',
 22: 'has',
 23: 'reuters',
 24: 'from',
 25: 'ap',
 26: 'an',
 27: 'after',
 28: 'his',
 29: 'will',
 30: 'was',
 31: 'us',
 32: 'gt',
 33: 'lt',
 34: 'have',
 35: 'over',
 36: 'two',
 37: 'first',
 38: 'be',
 39: 'up',
 40: 'their',
 41: 'year',
 42: 'quot',
 43: 'are',
 44: 'this',
 45: 'he',
 46: 'more',
 47: 'but',
 48: 'one',
 49: 'monday',
 50: 'u',
 51: 'out',
 52: 'tuesday',
 53: '1',
 54: 'thursday',
 55: 'against',
 56: 'wednesday',
 57: 'world',
 58: 'oil',
 59: 'company',
 60: 'into',
 61: 'inc',
 62: '2',
 63: 'microsoft',
 64: 'than',
 65: 'not',
 66: 'who',
 67: 'last',
 68: 'they',
 69: 'friday',
 70: 'been',
 71: 'york',
 72: 'were',
 73: 'b',
 74: 'million',
 75: 'about',
 76: 'iraq',
 77: 'three',
 78: 'president',
 79: 'week',
 80: 'ti

In [7]:
sample_review = X_train[0]
sample_label = y_train[0]

print(f"Sample review (as integers): {sample_review}")
print(f'Sample label: {sample_label}')

Sample review (as integers): [122, 479, 217, 1, 594, 1193, 1, 2984, 78, 122, 8, 69, 4355, 60, 1736]
Sample label: 0


In [8]:
decoded_review = ' '.join([reverse_word_index.get(i, '?') for i in sample_review])
decoded_review

'bush kerry trade <OOV> following debate <OOV> pa president bush on friday ripped into sen'

In [9]:
from tensorflow.keras.preprocessing import sequence

max_len = 100

X_train = sequence.pad_sequences(X_train, maxlen=max_len)
X_test = sequence.pad_sequences(X_test, maxlen=max_len)
X_train

array([[   0,    0,    0, ..., 4355,   60, 1736],
       [   0,    0,    0, ...,  196,  156, 1663],
       [   0,    0,    0, ...,    5, 1834,  319],
       ...,
       [   0,    0,    0, ...,   33, 4535,   32],
       [   0,    0,    0, ...,    5,    1, 2470],
       [   0,    0,    0, ..., 1313, 8062,   12]])

In [10]:
## Train LSTM Model
model = Sequential()
model.add(Embedding(max_features, 128, input_length=max_len))  # Embedding Layer
model.add(LSTM(128, activation='relu'))
model.add(Dense(4, activation="softmax"))  # 4 classes: World, Sports, Business, Sci/Tech




In [11]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          1280000   
                                                                 
 lstm (LSTM)                 (None, 128)               131584    
                                                                 
 dense (Dense)               (None, 4)                 516       
                                                                 
Total params: 1412100 (5.39 MB)
Trainable params: 1412100 (5.39 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [12]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])




In [13]:
## Create an instance of EarlyStopping Callback
from tensorflow.keras.callbacks import EarlyStopping
earlystopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
earlystopping

<keras.src.callbacks.EarlyStopping at 0x25aa3ea18e0>

In [14]:
history = model.fit(
    X_train, y_train, epochs=10, batch_size=32,
    validation_split=0.2,
    callbacks=[earlystopping]
)

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
model.save('lstm_news_classification.h5')

  saving_api.save_model(


In [16]:
# Save the tokenizer for later use
import pickle
with open('news_tokenizer_lstm.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

In [17]:
# Evaluate the model on test data
test_loss, test_accuracy = model.evaluate(X_test, y_test)

print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_accuracy}')

Test Loss: 0.9053300023078918
Test Accuracy: 0.692105233669281
