<a href="https://colab.research.google.com/github/dhanushvemulapalli/Detecting-Fake-News-Using-Information-Retrieval-Machine-Learning/blob/main/FakeNews_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
!pip install gensim

import gensim.downloader as api
from gensim.models import Word2Vec
# --- Word Embeddings (e.g., Word2Vec or GloVe) as an alternative to TF-IDF ---
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

!pip install lazypredict
from lazypredict.Supervised import LazyClassifier

from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier



nltk.download('punkt')
nltk.download('stopwords')




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
df1 = pd.read_csv("/content/WELFake_Dataset.csv")

In [5]:
df1.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [6]:
df2 = pd.read_csv("/content/fake_news_dataset.csv")

In [7]:
df2.head()

Unnamed: 0,title,text,date,source,author,category,label
0,Foreign Democrat final.,more tax development both store agreement lawy...,2023-03-10,NY Times,Paula George,Politics,real
1,To offer down resource great point.,probably guess western behind likely next inve...,2022-05-25,Fox News,Joseph Hill,Politics,fake
2,Himself church myself carry.,them identify forward present success risk sev...,2022-09-01,CNN,Julia Robinson,Business,fake
3,You unit its should.,phone which item yard Republican safe where po...,2023-02-07,Reuters,Mr. David Foster DDS,Science,fake
4,Billion believe employee summer how.,wonder myself fact difficult course forget exa...,2023-04-03,CNN,Austin Walker,Technology,fake


In [8]:
df1_subset = df1[['title', 'text', 'label']]
df2_subset = df2[['title', 'text', 'label']]
df2_subset['label'] = df2_subset['label'].map({'real': 0, 'fake': 1})
df = pd.concat([df1_subset, df2_subset], ignore_index=True)
df.tail()

Unnamed: 0,title,text,label
92129,House party born.,hit and television I change very our happy doo...,1
92130,Though nation people maybe price box.,fear most meet rock even sea value design stan...,0
92131,Yet exist with experience unit.,activity loss very provide eye west create wha...,0
92132,School wide itself item.,term point general common training watch respo...,1
92133,Offer chair cover senior born.,remain pressure glass me six senior though nor...,1


In [9]:
df['title'] = df['title'].fillna('')
df['text'] = df['text'].fillna('')
df['combined_text'] = df['title'] + ' ' + df['text']

ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')

def clean_text(text):
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    text = text.split()
    text = [ps.stem(word) for word in text if not word in set(all_stopwords)]
    text = ' '.join(text)
    return text

df['cleaned_text'] = df['combined_text'].apply(clean_text)

display(df.head())

Unnamed: 0,title,text,label,combined_text,cleaned_text
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,law enforc high alert follow threat cop white ...
1,,Did they post their votes for Hillary already?,1,Did they post their votes for Hillary already?,post vote hillari alreadi
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,unbeliev obama attorney gener say charlott rio...
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,"Bobby Jindal, raised Hindu, uses story of Chri...",bobbi jindal rais hindu use stori christian co...
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,SATAN 2: Russia unvelis an image of its terrif...,satan russia unv imag terrifi new supernuk wes...


## Data splitting

Split the data into training and testing sets.

In [10]:
X = df['cleaned_text']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

Training data shape: (73707,)
Testing data shape: (18427,)


In [11]:
X[0]

'law enforc high alert follow threat cop white blacklivesmatt fyf terrorist video comment expect barack obama member fyf fukyoflag blacklivesmatt movement call lynch hang white peopl cop encourag other radio show tuesday night turn tide kill white peopl cop send messag kill black peopl america one f yoflag organ call sunshin radio blog show host texa call sunshin f ing opinion radio show snapshot fyf lolatwhitefear twitter page p show urg support call fyf tonight continu dismantl illus white snapshot twitter radio call invit fyf radio show air p eastern standard time show caller clearli call lynch kill white peopl minut clip radio show heard provid breitbart texa someon would like refer hannib alreadi receiv death threat result interrupt fyf confer call unidentifi black man said mother f ker start f ing like us bunch ni er takin one us roll said caus alreadi roll gang anyway six seven black mother f cker see white person lynch ass let turn tabl conspir cop start lose peopl state emerg 

In [12]:

tfidf_v = TfidfVectorizer(max_features=5000, ngram_range=(1,3))
X_train = tfidf_v.fit_transform(X_train)
X_test = tfidf_v.transform(X_test)

print(f"Training data shape after vectorization: {X_train.shape}")
print(f"Testing data shape after vectorization: {X_test.shape}")

Training data shape after vectorization: (73707, 5000)
Testing data shape after vectorization: (18427, 5000)


In [13]:
X_train[0]

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 268 stored elements and shape (1, 5000)>

In [14]:
print(df['label'].value_counts())

label
1    47162
0    44972
Name: count, dtype: int64


In [15]:
# Train a Linear SVM model
svm_model = LinearSVC()
svm_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_svm = svm_model.predict(X_test)

# Evaluate the model
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm)
recall_svm = recall_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm)

print(f"Linear SVM Model Performance:")
print(f"Accuracy: {accuracy_svm}")
print(f"Precision: {precision_svm}")
print(f"Recall: {recall_svm}")
print(f"F1 Score: {f1_svm}")

Linear SVM Model Performance:
Accuracy: 0.8609648884788625
Precision: 0.8575145711906744
Recall: 0.8735156912637828
F1 Score: 0.8654411764705883


In [16]:

dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)

y_pred_dt = dt_model.predict(X_test)

accuracy_dt = accuracy_score(y_test, y_pred_dt)
precision_dt = precision_score(y_test, y_pred_dt)
recall_dt = recall_score(y_test, y_pred_dt)
f1_dt = f1_score(y_test, y_pred_dt)

print("\nDecision Tree Performance:")
print(f"Accuracy: {accuracy_dt}")
print(f"Precision: {precision_dt}")
print(f"Recall: {recall_dt}")
print(f"F1 Score: {f1_dt}")



Decision Tree Performance:
Accuracy: 0.8433277256200141
Precision: 0.8421327757449033
Recall: 0.8540076335877863
F1 Score: 0.8480286361004369


In [17]:

rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)

print("\nRandom Forest Performance:")
print(f"Accuracy: {accuracy_rf}")
print(f"Precision: {precision_rf}")
print(f"Recall: {recall_rf}")
print(f"F1 Score: {f1_rf}")



Random Forest Performance:
Accuracy: 0.8672599989146361
Precision: 0.8594360979625437
Recall: 0.8854961832061069
F1 Score: 0.8722715404699739


In [18]:

gb_model = GradientBoostingClassifier()
gb_model.fit(X_train, y_train)

y_pred_gb = gb_model.predict(X_test)

print("\nGradient Boosting Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_gb)}")
print(f"Precision: {precision_score(y_test, y_pred_gb)}")
print(f"Recall: {recall_score(y_test, y_pred_gb)}")
print(f"F1 Score: {f1_score(y_test, y_pred_gb)}")



Gradient Boosting Performance:
Accuracy: 0.8519563683724969
Precision: 0.8125699365908243
Recall: 0.9238761662425785
F1 Score: 0.8646556856519151


In [19]:

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_test)

print("\nXGBoost Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb)}")
print(f"Precision: {precision_score(y_test, y_pred_xgb)}")
print(f"Recall: {recall_score(y_test, y_pred_xgb)}")
print(f"F1 Score: {f1_score(y_test, y_pred_xgb)}")



XGBoost Performance:
Accuracy: 0.8683996309762848
Precision: 0.8581944586443104
Recall: 0.8899491094147582
F1 Score: 0.8737833758392755


In [20]:
from sklearn.metrics import confusion_matrix

# Calculate the confusion matrix
cm_xgb = confusion_matrix(y_test, y_pred_xgb)

print("\nXGBoost Confusion Matrix:")
print(cm_xgb)


XGBoost Confusion Matrix:
[[7608 1387]
 [1038 8394]]


In [21]:
print("Loading pre-trained GloVe model (50 dimensions)...")
word_vectors = api.load('word2vec-google-news-300')
print("Model loaded successfully ✅")

# 2. Function: Convert a text into an averaged word embedding vector
def text_to_vec(text, model):
    words = text.split()
    word_vecs = [model[word] for word in words if word in model]
    if len(word_vecs) == 0:
        return np.zeros(model.vector_size)
    return np.mean(word_vecs, axis=0)

# 3. Create sentence embeddings for all cleaned text
print("Converting text to word embeddings...")
X_w2v = np.array([text_to_vec(text, word_vectors) for text in df['cleaned_text']])
y = df['label'].values

# 4. Split into training and testing sets
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(
    X_w2v, y, test_size=0.2, random_state=42
)

print(f"Training data shape: {X_train_w2v.shape}")
print(f"Testing data shape: {X_test_w2v.shape}")

# 5. Train a Linear SVM classifier
print("Training SVM model with Word2Vec features...")
svm_model_w2v = LinearSVC()
svm_model_w2v.fit(X_train_w2v, y_train_w2v)

# 6. Evaluate
y_pred_svm_w2v = svm_model_w2v.predict(X_test_w2v)
accuracy_svm_w2v = accuracy_score(y_test_w2v, y_pred_svm_w2v)

print(f"✅ Linear SVM Accuracy with Word2Vec embeddings: {accuracy_svm_w2v:.4f}")


Loading pre-trained GloVe model (50 dimensions)...
Model loaded successfully ✅
Converting text to word embeddings...
Training data shape: (73707, 300)
Testing data shape: (18427, 300)
Training SVM model with Word2Vec features...
✅ Linear SVM Accuracy with Word2Vec embeddings: 0.7874


In [22]:
df.head()

Unnamed: 0,title,text,label,combined_text,cleaned_text
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,law enforc high alert follow threat cop white ...
1,,Did they post their votes for Hillary already?,1,Did they post their votes for Hillary already?,post vote hillari alreadi
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,unbeliev obama attorney gener say charlott rio...
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,"Bobby Jindal, raised Hindu, uses story of Chri...",bobbi jindal rais hindu use stori christian co...
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,SATAN 2: Russia unvelis an image of its terrif...,satan russia unv imag terrifi new supernuk wes...


In [23]:
X_w2v[3]

array([ 0.02461421,  0.03542606,  0.03085231,  0.08252139, -0.04775489,
       -0.00959517,  0.02772101, -0.05292339,  0.06015059,  0.05260823,
       -0.01160991, -0.08298039, -0.06227595,  0.05613851, -0.06422401,
        0.09496939,  0.01479747,  0.10951365, -0.01273841, -0.06263315,
        0.03262291,  0.03116759,  0.06418309, -0.00102976,  0.00858931,
       -0.03772848, -0.09183924,  0.04802933,  0.02937279, -0.0324758 ,
        0.01052141,  0.01011841, -0.06936894, -0.00684149, -0.02550313,
       -0.02518611,  0.04891577,  0.02372671,  0.03188686,  0.05224618,
        0.04067955, -0.04650906,  0.11703236,  0.00907142, -0.04587661,
       -0.06336109, -0.04944429, -0.01658902, -0.04052034,  0.06204056,
       -0.01858631,  0.04044271,  0.01437338,  0.00042826, -0.00017273,
        0.00887631, -0.09401307, -0.04692107,  0.0094351 , -0.08114656,
       -0.00295597,  0.02252945, -0.0661167 , -0.03549378,  0.0007249 ,
       -0.05366337, -0.04999902,  0.04547021, -0.00384725,  0.07

In [24]:
svm_model_w2v = LinearSVC()
svm_model_w2v.fit(X_train_w2v, y_train_w2v)
y_pred_svm_w2v = svm_model_w2v.predict(X_test_w2v)
accuracy_svm_w2v = accuracy_score(y_test_w2v, y_pred_svm_w2v)
print(f"Linear SVM Accuracy with Word2Vec: {accuracy_svm_w2v}")


Linear SVM Accuracy with Word2Vec: 0.7874314864058175


In [25]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [26]:
df.head()

Unnamed: 0,title,text,label,combined_text,cleaned_text
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,law enforc high alert follow threat cop white ...
1,,Did they post their votes for Hillary already?,1,Did they post their votes for Hillary already?,post vote hillari alreadi
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,unbeliev obama attorney gener say charlott rio...
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,"Bobby Jindal, raised Hindu, uses story of Chri...",bobbi jindal rais hindu use stori christian co...
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,SATAN 2: Russia unvelis an image of its terrif...,satan russia unv imag terrifi new supernuk wes...


In [27]:
df = df.sample(frac=0.1, random_state=42,)


In [28]:
df.head()

Unnamed: 0,title,text,label,combined_text,cleaned_text
55411,Trump Slapped With Lawsuit For Refusing To Re...,Donald Trump is being sued again. Three orga...,1,Trump Slapped With Lawsuit For Refusing To Re...,trump slap lawsuit refus releas white hous vis...
22724,Minutes After Saturday Night Live Sketch Ende...,Now that it appears that Donald Trump is the i...,1,Minutes After Saturday Night Live Sketch Ende...,minut saturday night live sketch end trump alr...
39636,SWEDEN HELP WANTED: Activities Coordinator for...,SWEDEN HELP WANTED: Activities Coordinator for...,1,SWEDEN HELP WANTED: Activities Coordinator for...,sweden help want activ coordin bore illeg alie...
9093,Hariri says his Saudi stay was to discuss Leba...,"BEIRUT (Reuters) - Saad al-Hariri, who resigne...",0,Hariri says his Saudi stay was to discuss Leba...,hariri say saudi stay discuss lebanon futur be...
32375,"Republicans Plead With Trump, Desperate To Pr...",Donald Trump recently met with the Republican ...,1,"Republicans Plead With Trump, Desperate To Pr...",republican plead trump desper prevent inevit b...


In [29]:
df.shape

(9213, 5)

In [30]:
print(df['label'].value_counts())

label
1    4749
0    4464
Name: count, dtype: int64


In [31]:
X_train_text, X_test_text, y_train, y_test = train_test_split(df['cleaned_text'], df['label'], test_size=0.2, random_state=42,stratify=df['label'])


In [32]:
total_words = X_train_text.apply(lambda x: len(str(x).split())).sum()
print(f"Total number of words in df['cleaned_text']: {total_words}")

Total number of words in df['cleaned_text']: 2149649


In [33]:
# Re-split the data to get text versions of X_train and X_test for the tokenizer

# Define hyperparameters and re-declare num_words and maxlen for robustness
num_words = 5000   # From previous successful execution in cell ab90ffe2
embedding_dim = 100  # Dimension of the word embeddings
lstm_units = 128     # Number of units in the LSTM layer

print("Loading pre-trained GloVe model (50 dimensions)...")
word_vectors = api.load('word2vec-google-news-300')
print("Model loaded successfully ✅")

# 2. Function: Convert a text into an averaged word embedding vector
def text_to_vec(text, model):
    words = text.split()
    word_vecs = [model[word] for word in words if word in model]
    if len(word_vecs) == 0:
        return np.zeros(model.vector_size)
    return np.mean(word_vecs, axis=0)

# 3. Create sentence embeddings for all cleaned text
print("Converting text to word embeddings...")
X_w2v = np.array([text_to_vec(text, word_vectors) for text in df['cleaned_text']])
y = df['label'].values

# Convert y_train and y_test into NumPy arrays
y_train_array = np.array(y_train)
y_test_array = np.array(y_test)

print(f"Shape of y_train_array: {y_train_array.shape}")
print(f"Shape of y_test_array: {y_test_array.shape}")



Loading pre-trained GloVe model (50 dimensions)...
Model loaded successfully ✅
Converting text to word embeddings...
Shape of y_train_array: (7370,)
Shape of y_test_array: (1843,)


In [34]:
print(f"Shape of X_train_padded: {X_w2v.shape}")
print(f"Shape of X_test_padded: {y.shape}")


Shape of X_train_padded: (9213, 300)
Shape of X_test_padded: (9213,)


In [35]:
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(
    X_w2v, y, test_size=0.2, random_state=42
)


In [36]:
X_test_w2v.shape

(1843, 300)

In [37]:
# Train a Linear SVM model
svm_model = LinearSVC()
svm_model.fit(X_train_w2v, y_train_w2v)

# Make predictions on the test set
y_pred_svm = svm_model.predict(X_test_w2v)

# Evaluate the model
accuracy_svm = accuracy_score(y_test_w2v, y_pred_svm)
precision_svm = precision_score(y_test_w2v, y_pred_svm)
recall_svm = recall_score(y_test_w2v, y_pred_svm)
f1_svm = f1_score(y_test_w2v, y_pred_svm)

print(f"Linear SVM Model Performance:")
print(f"Accuracy: {accuracy_svm}")
print(f"Precision: {precision_svm}")
print(f"Recall: {recall_svm}")
print(f"F1 Score: {f1_svm}")

Linear SVM Model Performance:
Accuracy: 0.7775366250678242
Precision: 0.7751479289940828
Recall: 0.8119834710743802
F1 Score: 0.79313824419778


In [38]:

dt_model = DecisionTreeClassifier()
dt_model.fit(X_train_w2v, y_train_w2v)

y_pred_dt = dt_model.predict(X_test_w2v)

accuracy_svm = accuracy_score(y_test_w2v, y_pred_dt)
precision_svm = precision_score(y_test_w2v, y_pred_dt)
recall_svm = recall_score(y_test_w2v, y_pred_dt)
f1_svm = f1_score(y_test_w2v, y_pred_dt)

print(f"Linear SVM Model Performance:")
print(f"Accuracy: {accuracy_svm}")
print(f"Precision: {precision_svm}")
print(f"Recall: {recall_svm}")
print(f"F1 Score: {f1_svm}")


Linear SVM Model Performance:
Accuracy: 0.6608790016277808
Precision: 0.6713286713286714
Recall: 0.6942148760330579
F1 Score: 0.6825799898425596


In [39]:

rf_model = RandomForestClassifier()
rf_model.fit(X_train_w2v, y_train_w2v)

y_pred_rf = rf_model.predict(X_test_w2v)

accuracy_svm = accuracy_score(y_test_w2v, y_pred_rf)
precision_svm = precision_score(y_test_w2v, y_pred_rf)
recall_svm = recall_score(y_test_w2v, y_pred_rf)
f1_svm = f1_score(y_test_w2v, y_pred_rf)

print(f"Linear SVM Model Performance:")
print(f"Accuracy: {accuracy_svm}")
print(f"Precision: {precision_svm}")
print(f"Recall: {recall_svm}")
print(f"F1 Score: {f1_svm}")

Linear SVM Model Performance:
Accuracy: 0.7509495387954422
Precision: 0.756294058408862
Recall: 0.7758264462809917
F1 Score: 0.7659357470678225


In [40]:

gb_model = GradientBoostingClassifier()
gb_model.fit(X_train_w2v, y_train_w2v)

y_pred_gb = gb_model.predict(X_test_w2v)

accuracy_svm = accuracy_score(y_test_w2v, y_pred_gb)
precision_svm = precision_score(y_test_w2v, y_pred_gb)
recall_svm = recall_score(y_test_w2v, y_pred_gb)
f1_svm = f1_score(y_test_w2v, y_pred_gb)

print(f"GradientBoostingClassifier Model Performance:")
print(f"Accuracy: {accuracy_svm}")
print(f"Precision: {precision_svm}")
print(f"Recall: {recall_svm}")
print(f"F1 Score: {f1_svm}")

GradientBoostingClassifier Model Performance:
Accuracy: 0.759631036353771
Precision: 0.7686796315250768
Recall: 0.7758264462809917
F1 Score: 0.7722365038560411


In [41]:

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train_w2v, y_train_w2v)

y_pred_xgb = xgb_model.predict(X_test_w2v)

print("\nXGBoost Performance:")
accuracy_svm = accuracy_score(y_test_w2v, y_pred_xgb)
precision_svm = precision_score(y_test_w2v, y_pred_xgb)
recall_svm = recall_score(y_test_w2v, y_pred_xgb)
f1_svm = f1_score(y_test_w2v, y_pred_xgb)

print(f"Linear SVM Model Performance:")
print(f"Accuracy: {accuracy_svm}")
print(f"Precision: {precision_svm}")
print(f"Recall: {recall_svm}")
print(f"F1 Score: {f1_svm}")


XGBoost Performance:
Linear SVM Model Performance:
Accuracy: 0.7742810634834509
Precision: 0.7716535433070866
Recall: 0.8099173553719008
F1 Score: 0.7903225806451613


In [42]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional


In [43]:
# ------------------------
# 1. Tokenize text
# ------------------------
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['cleaned_text'])

X_sequences = tokenizer.texts_to_sequences(df['cleaned_text'])

maxlen = 100  # Best practice: 50–200
X_padded = pad_sequences(X_sequences, maxlen=maxlen)

# Split again (matching earlier split)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_padded, df['label'], test_size=0.2, random_state=42)


In [44]:
# ------------------------
# 2. Build Embedding Matrix from Word2Vec
# ------------------------
vocab_size = min(5000, len(tokenizer.word_index) + 1)
embedding_dim = 300

embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    if i < vocab_size and word in word_vectors:
        embedding_matrix[i] = word_vectors[word]


In [45]:
# ------------------------
# 3. Build LSTM Model
# ------------------------
model = Sequential()
model.add(Embedding(input_dim=vocab_size,
                    output_dim=embedding_dim,
                    weights=[embedding_matrix],
                    input_length=maxlen,
                    trainable=False))  # freeze embedding layer

model.add(Bidirectional(LSTM(128, return_sequences=False)))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

print(model.summary())


None


In [46]:
# ------------------------
# 4. Train
# ------------------------
history = model.fit(X_train, y_train,
                    validation_data=(X_test, y_test),
                    epochs=8,
                    batch_size=32)


Epoch 1/8
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 252ms/step - accuracy: 0.6986 - loss: 0.5509 - val_accuracy: 0.7748 - val_loss: 0.4327
Epoch 2/8
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 232ms/step - accuracy: 0.7917 - loss: 0.4189 - val_accuracy: 0.7960 - val_loss: 0.4046
Epoch 3/8
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 228ms/step - accuracy: 0.8126 - loss: 0.3796 - val_accuracy: 0.7754 - val_loss: 0.4228
Epoch 4/8
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 226ms/step - accuracy: 0.8280 - loss: 0.3511 - val_accuracy: 0.8003 - val_loss: 0.3915
Epoch 5/8
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 225ms/step - accuracy: 0.8589 - loss: 0.3014 - val_accuracy: 0.7699 - val_loss: 0.4489
Epoch 6/8
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 231ms/step - accuracy: 0.8723 - loss: 0.2790 - val_accuracy: 0.8058 - val_loss: 0.3981
Epoch 7/8
[1m23

In [47]:
# ------------------------
# 5. Predict + Evaluate
# ------------------------
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_pred = (model.predict(X_test) > 0.5).astype(int)

print("\nLSTM with Word2Vec Performance:")
print(f"Accuracy:  {accuracy_score(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"Recall:    {recall_score(y_test, y_pred)}")
print(f"F1 Score:  {f1_score(y_test, y_pred)}")


[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 95ms/step

LSTM with Word2Vec Performance:
Accuracy:  0.8095496473141617
Precision: 0.7924170616113744
Recall:    0.8636363636363636
F1 Score:  0.8264953040039545
