In [1]:
import polars as pl
import numpy as np
import json

In [2]:
dataset = pl.read_csv('news_dataset.csv')
dataset.columns

['label', 'text']

In [3]:
x_train_data = dataset['text']
y_train_data = dataset['label']

In [4]:
x_train = []
y_train= []
for sentence in x_train_data:
    x_train.append(sentence)
for labels in y_train_data:
    if labels == 'REAL':
        y_train.append(1)
    else:
        y_train.append(0)

In [5]:
# Remove newline characters from sentences in x_train
x_train_cleaned = []
for sentence in x_train:
    if sentence is not None:
        x_train_cleaned.append(sentence.replace('\n', ' '))
    else:
        x_train_cleaned.append('')

In [6]:
x_train_cleaned[2]

'Republic Poll, a fake Twitter account imitating the Arnab Goswami-led Republic TV, is angering netizens with its controversial polls as Twitter users including journalists mistake it for the channel\'s official account.    The fake account (@RepublicPoll) uses a logo very similar to Republic TV\'s logo and does not mention in its bio whether it is related to Republic TV or if it is a fan account. Twitter\'s rules require that a fan account or parody account should indicate the same.  Also Read:Did Nita Ambani Ask For Support For CAA?      Siddharth Varadarajan, founding editor of The Wire.in, in a now deleted tweet, had shared a screenshot of a poll by the fake account claiming it was a \'poll run by a "nationalistic" media house\'.  This is hilarious. Despite the desperate phrasing of the question, this poll run by a "nationalist" media house has ended up condemning the innocent, "minority in JNU" ABVP. pic.twitter.com/gQGtzFEU26 — Siddharth (@svaradarajan) January 7, 2020          A

In [7]:
from tensorflow.keras.utils import to_categorical

y_train = to_categorical(y_train)
print(y_train[0])

[0. 1.]


In [8]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split

In [9]:
max_len = 50  # Maximum sequence length
vocab_size = 10000  # Number of words to consider
embedding_dim = 128  # Embedding dimension
lstm_units = 64 

tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(x_train_cleaned)
sequences = tokenizer.texts_to_sequences(x_train_cleaned)
padded_sequences = pad_sequences(sequences, maxlen=max_len)
padded_sequences = np.array(padded_sequences)

In [10]:
training_data, validation_data, training_labels, validation_labels = train_test_split(padded_sequences, y_train, test_size=0.2, random_state=42)

In [11]:
model = tf.keras.Sequential([
  Embedding(vocab_size, embedding_dim, input_length=max_len),
  LSTM(lstm_units),
  Dense(128, activation='relu'),
  Dense(2, activation='sigmoid')  # Output layer for binary classification
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(training_data, training_labels, epochs=3, validation_data=(validation_data, validation_labels))

Epoch 1/3


2024-06-18 15:23:04.890438: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-06-18 15:23:04.890469: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-06-18 15:23:04.890479: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-06-18 15:23:04.890772: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-06-18 15:23:04.890795: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2024-06-18 15:23:05.753893: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 52ms/step - accuracy: 0.7970 - loss: 0.4773 - val_accuracy: 0.9665 - val_loss: 0.0901
Epoch 2/3
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - accuracy: 0.9836 - loss: 0.0523 - val_accuracy: 0.9692 - val_loss: 0.0900
Epoch 3/3
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - accuracy: 0.9971 - loss: 0.0090 - val_accuracy: 0.9611 - val_loss: 0.1921


<keras.src.callbacks.history.History at 0x34be79d90>

In [12]:
new_text = "Mumbai hoarding collapse: 14 killed; traffic jams in Ghatkopar | Latest updates"  # Replace with your new text
new_sequence = tokenizer.texts_to_sequences([new_text])
padded_new_sequence = pad_sequences(new_sequence, maxlen=max_len)
prediction = model.predict(padded_new_sequence)
if np.argmax(prediction)==1:
    print("News is Real")
else:
    print("News is fake")
print(np.argmax(prediction))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 471ms/step
News is Real
1


In [13]:
tokenizer_json = tokenizer.to_json()
with open('tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii=False))

In [14]:
model.save('newsdetection.keras')

In [5]:
import tensorflow as tf
import json
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [8]:
with open('tokenizer.json', 'r') as t:
    data = json.load(t)
    tokenizer = tokenizer_from_json(data)
model = tf.keras.models.load_model('newsdetection.keras')

In [9]:
new_text = "Mumbai hoarding collapse: 14 killed; traffic jams in Ghatkopar | Latest updates"  # Replace with your new text
new_sequence = tokenizer.texts_to_sequences([new_text])
padded_new_sequence = pad_sequences(new_sequence, maxlen=50)
prediction = model.predict(padded_new_sequence)
if np.argmax(prediction)==1:
    print("News is Real")
else:
    print("News is fake")
print(np.argmax(prediction))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 161ms/step
News is Real
1


2024-06-18 15:30:33.727329: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.
