In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import tensorflow
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.models import Sequential

In [None]:
# Fetch only the NLTK resources this notebook uses instead of the complete
# 'all' collection, which pulls in every corpus and model NLTK distributes.
#   punkt / punkt_tab : tokenizer models behind word_tokenize (newer NLTK
#                       releases look for 'punkt_tab', older ones for 'punkt')
#   stopwords         : the English stop-word list
# NOTE(review): nltk.download is expected to report an unknown resource name
# and return False rather than raise, so asking for both punkt variants should
# be harmless on either NLTK generation -- confirm on the installed version.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

In [None]:
# Colab-only step: mount Google Drive at /content/drive so that files stored
# in Drive can be read through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Exercise 1: Fake News Classification

In [None]:
# Read the training articles from the mounted Drive and preview the first rows.
train_csv = '/content/drive/MyDrive/Datasets/train.csv'
df = pd.read_csv(train_csv)
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [None]:
# Drop only the rows that lack a field the model consumes (title, text,
# label). A bare dropna() would also throw away every article whose `author`
# is missing, even though that column is not used by the cells that follow.
df = df.dropna(subset=['title', 'text', 'label'])

In [None]:
# Model input is the title followed by the article text, separated by a
# single space; the target is the `label` column.
headline, body = df['title'], df['text']
X = headline + " " + body
y = df['label']

In [None]:
# Make sure the stop-word corpus is present, then hold the English list in a
# set so the membership checks during cleaning are cheap.
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

In [None]:
def _clean_document(document):
    """Return `document` reduced to lower-case, stop-word-free ASCII words.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and tokenized with word_tokenize, tokens found in
    `stop_words` are removed, and the survivors are re-joined with spaces.
    """
    letters_only = re.sub(r'[^a-zA-Z]', ' ', document).lower()
    kept = (token for token in word_tokenize(letters_only) if token not in stop_words)
    return ' '.join(kept)


# One cleaned string per input document, in the same order as X.
corpus = [_clean_document(document) for document in X]

In [None]:
# Turn each cleaned document into a fixed-length sequence of integer word ids.
# `max_words` caps the tokenizer vocabulary; '<OOV>' stands in for words
# outside it. Sequences are padded/truncated to 200 ids.
max_words = 10000
tokenizer = Tokenizer(oov_token='<OOV>', num_words=max_words)
tokenizer.fit_on_texts(corpus)
X_sequences = tokenizer.texts_to_sequences(corpus)
X_padded = pad_sequences(sequences=X_sequences, maxlen=200)

In [None]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_padded,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Binary classifier: learned word embeddings -> one LSTM layer -> sigmoid unit.
embedding_dim = 100
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=X_padded.shape[1]))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appends a stray "None" line after the table.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 100)          1000000   
                                                                 
 lstm (LSTM)                 (None, 128)               117248    
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1117377 (4.26 MB)
Trainable params: 1117377 (4.26 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [None]:
# Train for 5 epochs in mini-batches of 64. X_test/y_test are supplied as
# validation data so metrics on them are computed after every epoch.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=5,
    validation_data=(X_test, y_test),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7d6b75d276d0>

In [None]:
# Convert the sigmoid outputs into hard 0/1 labels with a 0.5 cut-off.
probabilities = model.predict(X_test)
y_pred = (probabilities > 0.5).astype(int)



In [None]:
# Score the hard predictions against the held-out labels, then report the
# confusion matrix and the accuracy as a percentage.
matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print("Confusion Matrix:")
print(matrix)
print("Accuracy:", 100*accuracy)

Confusion Matrix:
[[1942  140]
 [ 124 1451]]
Accuracy: 92.78096800656276


The fake news detection model reaches an accuracy of 92.78% on the held-out test split.

## Exercise 2: Fake News Classification for Hindi Dataset

In [None]:
# Fake and true Hindi news items are stored in separate CSV files. Tag each
# source with its class (1 = fake, 0 = true), stack both into one frame with a
# fresh 0..n-1 index, and preview the result.
fake_csv = '/content/drive/MyDrive/Datasets/Hindi_News/fake_news_basic.csv'
true_csv = '/content/drive/MyDrive/Datasets/Hindi_News/true_news_basic.csv'

df1 = pd.read_csv(fake_csv, index_col=0)
df1['label'] = 1

df2 = pd.read_csv(true_csv, index_col=0)
df2['label'] = 0

df = pd.concat([df1, df2], ignore_index=True)
df.head()

Unnamed: 0,short_description,label
0,बूम पाय इमरान खान भारत सरकार आलोच रह वर्तमान श...,1
1,सिख समुदाय के लोग हिंद साइन बोर्ड कालिख पोत दि...,1
2,सोशल मीडिय प्लेटफ़ॉर्म फ़ेसबुक ट्विटर दाव के बड़ ...,1
3,दाव भाजप के मा सरकार जन के गलत रह ।,1
4,मीडिय आउटलेट्स वायर एजेंस गलत तरीक दाव किय पाक...,1


In [None]:
# Show column names, dtypes and non-null counts for the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2010 entries, 0 to 2009
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   short_description  2010 non-null   object
 1   label              2010 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 31.5+ KB


In [None]:
# Class balance: how many rows carry each label value.
df['label'].value_counts()

1    1250
0     760
Name: label, dtype: int64

In [None]:
# Features are the short descriptions; the target is the fake/true label.
X, y = df['short_description'], df['label']

In [None]:
# Load the Hindi stop-word file, treating each non-blank line as one entry.
# A set makes the membership test in the filtering loop below O(1).
with open('/content/drive/MyDrive/Datasets/Hindi_News/final_stopwords.txt', 'r', encoding='utf-8') as file:
    stop_words = {line.strip() for line in file if line.strip()}

# Characters to blank out: everything except ASCII letters and Devanagari
# letters / vowel signs (U+0900-U+0963 and U+0971-U+097F). The gap
# U+0964-U+0970 (danda punctuation, Devanagari digits, abbreviation sign) is
# excluded on purpose so punctuation and digits are stripped.
# An ASCII-only pattern such as [^a-zA-Z] would replace every Hindi character
# with a space and leave each document empty, giving the model nothing to
# learn from.
non_text_pattern = re.compile(r'[^a-zA-Z\u0900-\u0963\u0971-\u097F]')

corpus = []
for text in X:
    text = non_text_pattern.sub(' ', text)
    text = text.lower()  # no effect on Devanagari; normalises any Latin words
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    corpus.append(' '.join(words))

In [None]:
# Map every cleaned document to a fixed-length vector of integer word ids.
# The tokenizer vocabulary is capped at `max_words`, with '<OOV>' standing in
# for anything outside it; sequences are padded/truncated to 200 ids.
max_words = 10000
tokenizer = Tokenizer(oov_token='<OOV>', num_words=max_words)
tokenizer.fit_on_texts(corpus)
X_sequences = tokenizer.texts_to_sequences(corpus)
X_padded = pad_sequences(sequences=X_sequences, maxlen=200)

In [None]:
# Reserve 20% of the samples for evaluation, using a fixed seed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_padded,
    y,
    random_state=1,
    test_size=0.2,
)

In [None]:
# Same architecture as a plain embedding + LSTM binary classifier:
# word embeddings -> LSTM(128) -> single sigmoid output.
embedding_dim = 100
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=X_padded.shape[1]))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() writes the layer table to stdout and returns None; calling it
# directly avoids the extra "None" line that print(model.summary()) produces.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 200, 100)          1000000   
                                                                 
 lstm_1 (LSTM)               (None, 128)               117248    
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 1117377 (4.26 MB)
Trainable params: 1117377 (4.26 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [None]:
# Train for 5 epochs with a batch size of 16 (both are tunable). The held-out
# split is passed as validation data so its metrics are computed every epoch.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=16,
    epochs=5,
    validation_data=(X_test, y_test),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7d6b4dea9a50>

In [None]:
# Threshold the predicted probabilities at 0.5 to obtain 0/1 class labels.
probabilities = model.predict(X_test)
y_pred = (probabilities > 0.5).astype(int)



In [None]:
# Compare the hard predictions with the held-out labels and print the
# confusion matrix followed by the accuracy in percent.
matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print("Confusion Matrix:")
print(matrix)
print("Accuracy:", 100*accuracy)

Confusion Matrix:
[[  0 156]
 [  0 246]]
Accuracy: 61.19402985074627


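The accuracy for Hindi fake news detection is 61.19%. However, the confusion matrix above shows that every test sample was predicted as class 1 (fake), so this figure only reflects the share of fake items in the test split (246 of 402) rather than any learned ability to separate the two classes.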