In [4]:
import pickle
import re

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding,Dense,LSTM,Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [6]:
# Load the headline/category dataset directly from the project's GitHub repository.
url = 'https://raw.githubusercontent.com/fadhilhussain/news-category-rnn/refs/heads/main/data/data.csv'
df = pd.read_csv(filepath_or_buffer=url)

In [7]:
# Preview the loaded frame (pandas truncates the display to the first and last rows).
df

Unnamed: 0,headline,category
0,The Funniest Tweets From Parents This Week (Se...,PARENTING
1,Golden Globes Returning To NBC In January Afte...,ENTERTAINMENT
2,Biden Says U.S. Forces Would Defend Taiwan If ...,POLITICS
3,‘Beautiful And Sad At The Same Time’: Ukrainia...,POLITICS
4,James Cameron Says He 'Clashed' With Studio Be...,ENTERTAINMENT
...,...,...
124782,Walmart Waving Goodbye To Some Greeters,BUSINESS
124783,"At World Economic Forum, Fear of Global Contag...",BUSINESS
124784,Positive Customer Experience: What's the Retur...,BUSINESS
124785,"Sundance, Ice-T, and Shades of the American Ra...",ENTERTAINMENT


In [8]:
# Count the missing headlines explicitly: a bare `df[...]` expression placed
# before another statement is discarded, because a cell only displays its
# final expression.
missing_headlines = int(df['headline'].isnull().sum())

# Drop every row that contains a null in any column; clean_data() calls
# .lower() on each headline and would fail on a missing value.
df = df.dropna()

# Guard the invariant the later cells rely on.
assert not df['headline'].isnull().any(), "headline column still contains nulls"
print(f"Missing headlines before dropna: {missing_headlines}; rows kept: {len(df)}")

In [9]:
# Sanity check: list any rows whose headline is still missing (expected: none).
df.loc[df['headline'].isna()]

Unnamed: 0,headline,category


In [10]:
# Model input (headline text) and prediction target (category label).
X, y = df['headline'], df['category']

### Encoding the output feature label

In [11]:
# Distinct category labels present in the target column.
y.unique()

array(['PARENTING', 'ENTERTAINMENT', 'POLITICS', 'WELLNESS', 'BUSINESS',
       'STYLE & BEAUTY', 'FOOD & DRINK', 'QUEER VOICES', 'TRAVEL',
       'HEALTHY LIVING'], dtype=object)

In [12]:
# Learn the category -> integer id mapping, then encode the whole target with it.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

# Number of distinct categories the encoder found.
num_classes = len(label_encoder.classes_)

In [19]:
# Integer-encoded labels produced by the LabelEncoder.
y_encoded

array([4, 1, 5, ..., 0, 1, 1])

In [13]:
## clean the X data
def clean_data(text):
    """Normalise a headline to lowercase ASCII words separated by single spaces.

    Every character other than a-z (digits, punctuation, symbols, non-ASCII
    letters) is treated as a word separator. Runs of separators are collapsed
    and leading/trailing whitespace is removed, so the result contains no
    empty tokens.

    Args:
        text: Headline as a ``str``.

    Returns:
        The cleaned headline; an empty string when ``text`` contains no
        ASCII letters.
    """
    # After lower() no A-Z can remain, so only a-z needs to be whitelisted.
    letters_only = re.sub(r'[^a-z]', ' ', text.lower())
    # split()/join collapses the repeated spaces left by removed characters
    # and trims both ends.
    return ' '.join(letters_only.split())


# Clean every headline element-wise.
X = X.map(clean_data)

In [14]:
# Inspect the cleaned headlines.
X

Unnamed: 0,headline
0,the funniest tweets from parents this week se...
1,golden globes returning to nbc in january afte...
2,biden says u s forces would defend taiwan if ...
3,beautiful and sad at the same time ukrainia...
4,james cameron says he clashed with studio be...
...,...
124782,walmart waving goodbye to some greeters
124783,at world economic forum fear of global contag...
124784,positive customer experience what s the retur...
124785,sundance ice t and shades of the american ra...


### Text tokenization and padding

In [15]:
# Build the vocabulary: each distinct word is assigned its own integer id
# (stored in tokenizer.word_index, like a dictionary). Words not seen while
# fitting are represented by the "<OOV>" token.
# NOTE(review): the tokenizer is fitted on all headlines before the
# train/test split, so words from the test rows also enter the vocabulary.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(X)

# Convert every headline into the list of ids of its words.
sequences = tokenizer.texts_to_sequences(X)

# Length, in tokens, of the longest headline; shorter ones are padded at the
# end ('post') up to this length so that all rows share one shape.
max_length = max(map(len, sequences))
X_padded = pad_sequences(sequences, maxlen=max_length, padding='post')

# One more than the number of known words, leaving room for the 0 value
# that pad_sequences uses as filler.
vocblary_size = len(tokenizer.word_index) + 1

In [16]:
# Inspect the padded id matrix (trailing zeros are the 'post' padding).
X_padded

array([[   2, 3213,  412, ...,    0,    0,    0],
       [ 606, 1023, 2875, ...,    0,    0,    0],
       [ 279,   44,   68, ...,    0,    0,    0],
       ...,
       [1096, 2814, 1068, ...,    0,    0,    0],
       [3735,  722,   18, ...,    0,    0,    0],
       [ 323,   13,    2, ...,    0,    0,    0]], dtype=int32)

### Train/test split

In [17]:
# Hold out 20% of the samples. Stratifying on the labels keeps the class
# proportions the same in both subsets, and the fixed seed makes the split
# reproducible. (train_test_split is imported in the notebook's import cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X_padded,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)


## LSTM

In [20]:
# Two stacked LSTM layers over learned word embeddings, ending in a softmax
# over the news categories.
model = Sequential([
    # Declare the input with an explicit Input object; passing `input_shape`
    # to the Embedding layer triggers a Keras warning in Sequential models.
    tf.keras.Input(shape=(max_length,)),
    Embedding(input_dim=vocblary_size, output_dim=128),
    # return_sequences=True so the second LSTM receives the whole sequence
    # rather than only the last step.
    LSTM(128, return_sequences=True),
    Dropout(0.5),
    LSTM(64),
    # One output unit per encoded class, taken from the label encoder rather
    # than a hard-coded 10.
    Dense(num_classes, activation='softmax'),
])

  super().__init__(**kwargs)


In [21]:
# Show the layer-by-layer architecture of the model.
model.summary()

In [22]:
# Targets are integer class ids (not one-hot vectors), hence the *sparse*
# categorical cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [23]:
# Stop once validation loss has not improved for 5 consecutive epochs and
# restore the weights from the best epoch.
# (EarlyStopping is already imported in the notebook's first cell, so the
# duplicate import that used to live here is not needed.)
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

In [24]:
# Train for at most 50 epochs; the early-stopping callback may end the run sooner.
# NOTE(review): the held-out test split doubles as the validation set here, so
# early stopping is tuned on the same data that would later judge the model.
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_test, y_test),
    # Keras documents `callbacks` as a list of Callback instances.
    callbacks=[early_stopping],
)

Epoch 1/50
[1m3120/3120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 12ms/step - accuracy: 0.3478 - loss: 1.9296 - val_accuracy: 0.6798 - val_loss: 1.0108
Epoch 2/50
[1m3120/3120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 11ms/step - accuracy: 0.7363 - loss: 0.8362 - val_accuracy: 0.7711 - val_loss: 0.7337
Epoch 3/50
[1m3120/3120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 12ms/step - accuracy: 0.8333 - loss: 0.5208 - val_accuracy: 0.7708 - val_loss: 0.7333
Epoch 4/50
[1m3120/3120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 11ms/step - accuracy: 0.8731 - loss: 0.3876 - val_accuracy: 0.7689 - val_loss: 0.7814
Epoch 5/50
[1m3120/3120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 12ms/step - accuracy: 0.9029 - loss: 0.2949 - val_accuracy: 0.7604 - val_loss: 0.8858
Epoch 6/50
[1m3120/3120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 11ms/step - accuracy: 0.9244 - loss: 0.2290 - val_accuracy: 0.7513 - val_loss: 0.9787
Epoc

In [30]:
# Persist the trained model to an HDF5 (.h5) file in the working directory.
model.save(filepath='news_lstm.h5')



In [31]:
# Persist the fitted tokenizer so the same word-to-id mapping can be reused
# wherever the saved model is loaded. (pickle is imported in the notebook's
# import cell.)
# NOTE(review): label_encoder is not saved anywhere in this notebook; confirm
# that consumers have another way to map predicted ids back to category names.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [32]:
# google.colab only exists inside Google Colab; guard the import so the
# notebook still runs end-to-end in other environments.
try:
    from google.colab import files
except ImportError:
    files = None

if files is None:
    print("Not running in Google Colab; artifacts remain in the working directory.")
else:
    # Trigger browser downloads of the saved model and tokenizer.
    files.download('news_lstm.h5')
    files.download('tokenizer.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>