In [27]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Dense,SimpleRNN,Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [3]:
# read dataset 
data_path = '/home/fadhil/Desktop/ML/projects/news_classification/data/data.csv'
df = pd.read_csv(data_path)

In [4]:
df

Unnamed: 0,headline,category
0,The Funniest Tweets From Parents This Week (Se...,PARENTING
1,Golden Globes Returning To NBC In January Afte...,ENTERTAINMENT
2,Biden Says U.S. Forces Would Defend Taiwan If ...,POLITICS
3,‘Beautiful And Sad At The Same Time’: Ukrainia...,POLITICS
4,James Cameron Says He 'Clashed' With Studio Be...,ENTERTAINMENT
...,...,...
124782,Walmart Waving Goodbye To Some Greeters,BUSINESS
124783,"At World Economic Forum, Fear of Global Contag...",BUSINESS
124784,Positive Customer Experience: What's the Retur...,BUSINESS
124785,"Sundance, Ice-T, and Shades of the American Ra...",ENTERTAINMENT


In [5]:
df[df['headline'].isnull()]
df = df.dropna()

In [6]:
df[df['headline'].isnull()]

Unnamed: 0,headline,category


In [18]:
#prepare independent and dependent feature 
X = df['headline']
y = df['category']

### Encoding the output feature label 

In [19]:
y.unique()

array(['PARENTING', 'ENTERTAINMENT', 'POLITICS', 'WELLNESS', 'BUSINESS',
       'STYLE & BEAUTY', 'FOOD & DRINK', 'QUEER VOICES', 'TRAVEL',
       'HEALTHY LIVING'], dtype=object)

In [20]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
num_classes = len(label_encoder.classes_)

In [21]:
## clean the X data 
def clean_data(text):
    text = text.lower()
    text = re.sub('[^a-zA-Z]',' ',text) # remove numbers , punctuations 
    return text


X = X.apply(clean_data)   

In [23]:
X

0         the funniest tweets from parents this week  se...
1         golden globes returning to nbc in january afte...
2         biden says u s  forces would defend taiwan if ...
3          beautiful and sad at the same time   ukrainia...
4         james cameron says he  clashed  with studio be...
                                ...                        
124782              walmart waving goodbye to some greeters
124783    at world economic forum  fear of global contag...
124784    positive customer experience  what s the retur...
124785    sundance  ice t  and shades of the american ra...
124786     girl with the dragon tattoo  india release ca...
Name: headline, Length: 124784, dtype: object

### Text tokenize and padding

In [24]:
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(X) # oro words inum unique number kodukum like store in dictionary
sequences = tokenizer.texts_to_sequences(X) # Now it converts text into a list of numbers (based on the tokenizer dictionary).
max_length = max([len(seq) for seq in sequences]) # This finds the longest sentence (based on number of words), so that we can pad all shorter sentences to the same length.
X_padded = pad_sequences(sequences,maxlen=max_length,padding='post')
vocblary_size = len(tokenizer.word_index) + 1

In [25]:
X_padded

array([[   2, 3213,  412, ...,    0,    0,    0],
       [ 606, 1023, 2875, ...,    0,    0,    0],
       [ 279,   44,   68, ...,    0,    0,    0],
       ...,
       [1096, 2814, 1068, ...,    0,    0,    0],
       [3735,  722,   18, ...,    0,    0,    0],
       [ 323,   13,    2, ...,    0,    0,    0]], dtype=int32)

### Train_test_split

In [26]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X_padded,y_encoded,test_size=0.2,random_state=42,stratify=y_encoded)


## RNN model 

In [30]:
model = Sequential([
    Embedding(input_dim=vocblary_size,output_dim=128,input_shape=(max_length,)),
    SimpleRNN(128,activation='relu'),
    Dropout(0.5),
    Dense(10,activation='softmax')
]  
)

In [31]:
model.summary()

In [35]:
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [36]:
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss',patience=5,restore_best_weights=True)

In [None]:
history = model.fit(X_train,y_train,epochs=10,
                    batch_size=32,validation_split=0.2,
                    callbacks=early_stopping)

Epoch 1/10
[1m 365/2496[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m3:07[0m 88ms/step - accuracy: 0.2724 - loss: 2.1604

In [None]:
#https://www.kaggle.com/datasets/rmisra/news-category-dataset

#learn LSTM