In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import re
import joblib
import string

In [2]:
fake=pd.read_csv("Fake.csv")

In [3]:
true=pd.read_csv("True.csv")

In [4]:
fake['class']=0
true['class']=1


In [5]:
data=pd.concat([fake,true],axis=0)

In [6]:
data=data.drop(['title','subject','date'],axis=1)

In [7]:
data.reset_index(inplace=True)

In [8]:
data.drop(['index'],axis=1,inplace=True)

In [15]:

def clean_text(text):
    text = text.lower()
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub(r"\\W+", " ", text)
    text = re.sub(r"https?://\S+|www\.\S+", "", text)
    text = re.sub(r"<.*?>+", "", text)
    text = re.sub(r"[%s]" % re.escape(string.punctuation), " ", text)
    text = re.sub(r"\n", " ", text)
    text = re.sub(r"\w*\d\w*", "", text)
    return text



In [28]:
data['text']=data['text'].apply(clean_text)

In [None]:
x=data['text']
y=data['class']

In [None]:
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.25,random_state=42)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer=TfidfVectorizer()
vectorizer2=CountVectorizer()

xv_train=vectorizer.fit_transform(xtrain)
xv_test=vectorizer.transform(xtest)

xv_train2=vectorizer2.fit_transform(xtrain)
xv_test2=vectorizer2.transform(xtest)


In [None]:
lr=LogisticRegression()

In [None]:
lr.fit(xv_train,ytrain)

In [None]:
lr.score(xv_test,ytest)

0.9858351893095768

In [None]:
joblib.dump(vectorizer,"vectorizer.jb")
joblib.dump(lr,"lr_model.jb")

['lr_model.jb']

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(xv_train, ytrain)
print("KNN Accuracy:", knn.score(xv_test, ytest))
joblib.dump(knn, "knn_model.jb")

KNN Accuracy: 0.6920267260579065


['knn_model.jb']

In [None]:
joblib.dump(vectorizer2, "vectorizer2.jb")

['vectorizer2.jb']

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Train Decision Tree Classifier
dt = DecisionTreeClassifier()
dt.fit(xv_train2, ytrain)
print("Decision Tree Accuracy:", dt.score(xv_test2, ytest))
joblib.dump(dt, "dt_model.jb")

# Train Random Forest Classifier
rf = RandomForestClassifier()
rf.fit(xv_train2, ytrain)
print("Random Forest Accuracy:", rf.score(xv_test2, ytest))
joblib.dump(rf, "rf_model.jb")

# Train XGBoost Classifier
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb.fit(xv_train2, ytrain)
print("XGBoost Accuracy:", xgb.score(xv_test2, ytest))
joblib.dump(xgb, "xgb_model.jb")

Decision Tree Accuracy: 0.9966146993318485
Random Forest Accuracy: 0.9878841870824053


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Accuracy: 0.9974164810690423


['xgb_model.jb']

In [29]:
# Improve Decision Tree Classifier
dt_improved = DecisionTreeClassifier(max_depth=10, min_samples_split=10, min_samples_leaf=5)
dt_improved.fit(xv_train2, ytrain)
print("Improved Decision Tree Accuracy:", dt_improved.score(xv_test2, ytest))
joblib.dump(dt_improved, "dt_improved_model.jb")

# Improve XGBoost Classifier
xgb_improved = XGBClassifier(
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1,
    eval_metric='logloss',
    use_label_encoder=False
)
xgb_improved.fit(xv_train2, ytrain)
print("Improved XGBoost Accuracy:", xgb_improved.score(xv_test2, ytest))
joblib.dump(xgb_improved, "xgb_improved_model.jb")

Improved Decision Tree Accuracy: 0.995902004454343


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Improved XGBoost Accuracy: 0.9971492204899778


['xgb_improved_model.jb']

In [32]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.naive_bayes import MultinomialNB

# ANN Model
ann = Sequential([
    Dense(128, activation='relu', input_dim=xv_train.shape[1]),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


# Naive Bayes Model
nb = MultinomialNB()
nb.fit(xv_train2, ytrain)
nb_score = nb.score(xv_test2, ytest)
print("Naive Bayes Accuracy:", nb_score)
joblib.dump(nb, "nb_model.jb")

Naive Bayes Accuracy: 0.951358574610245


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


['nb_model.jb']

In [33]:
# Reduce training time for ANN
ann.fit(xv_train, ytrain, epochs=3, batch_size=64, verbose=1)
ann_score = ann.evaluate(xv_test, ytest, verbose=0)[1]
print("Reduced Training ANN Accuracy:", ann_score)
ann.save("ann_model_reduced.h5")

Epoch 1/3
[1m527/527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 130ms/step - accuracy: 0.9178 - loss: 0.2650
Epoch 2/3
[1m527/527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 124ms/step - accuracy: 0.9975 - loss: 0.0108
Epoch 3/3
[1m527/527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 128ms/step - accuracy: 0.9996 - loss: 0.0028




Reduced Training ANN Accuracy: 0.9924275875091553
