In [86]:
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
import joblib

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ertso\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier

# **Loading and processing data**

In [65]:
# Fake News Classification
# https://www.kaggle.com/datasets/aadyasingh55/fake-news-classification

# The three dataset splits are semicolon-delimited CSVs; they are stacked into
# one frame (fresh 0..n-1 index) and reduced to the columns used below.
csv_parts = ["evaluation.csv", "train (2).csv", "test (1).csv"]
frames = [pd.read_csv(csv_path, delimiter=";") for csv_path in csv_parts]

df = pd.concat(frames, ignore_index=True)
df = df[["title", "text", "label"]]

In [66]:
sw = stopwords.words('english')
# Frozen-set copy for O(1) membership tests; `sw` itself is left as the list
# returned by NLTK.
_sw_lookup = frozenset(sw)


def _strip_stopwords(raw: str) -> str:
    """Lower-case ``raw`` and drop whitespace-delimited tokens that are stop words.

    Matching is done on whole whitespace-delimited tokens, so a stop word with
    punctuation attached (e.g. ``"the,"``) is kept. Removed tokens are dropped
    outright rather than replaced by empty strings, so the result contains no
    runs of consecutive spaces.

    Args:
        raw: Text to clean.

    Returns:
        The surviving lower-cased tokens joined by single spaces.
    """
    lowered = (word.lower() for word in raw.split())
    return " ".join(word for word in lowered if word not in _sw_lookup)


# Same cleanup for both text columns.
df["title"] = df["title"].apply(_strip_stopwords)
df["text"] = df["text"].apply(_strip_stopwords)

In [67]:
# Target column shared by both models.
y = df["label"]

# One independent TF-IDF model per input column, each with scikit-learn's
# built-in English stop-word list.
# NOTE(review): both vectorizers are fitted on every row of `df`, i.e. before
# the train/test split done in a later cell, so vocabulary and IDF weights are
# also learned from rows that end up in the test split. The reported test
# scores may therefore be optimistic; fitting on the training rows only would
# avoid that.
vectorizer_title = TfidfVectorizer(stop_words="english")
X_title = vectorizer_title.fit_transform(df["title"])

vectorizer_text = TfidfVectorizer(stop_words="english")
X_text = vectorizer_text.fit_transform(df["text"])

In [68]:
# Split both feature matrices and the labels in ONE call so that every output
# is indexed by the same shuffled row selection. (Two separate calls only stay
# aligned as long as their random_state/test_size arguments are kept in sync,
# and the second call silently overwrites y_train / y_test.)
(
    X_text_train, X_text_test,
    X_title_train, X_title_test,
    y_train, y_test,
) = train_test_split(X_text, X_title, y, test_size=0.4, random_state=42)

# **Creating the model**

## **Identifying by text**

In [77]:
# Linear classifier trained with SGD on the TF-IDF features of the article text.
sgdc_text = SGDClassifier(
    # --- objective: hinge loss with an L2 penalty of strength alpha
    loss="hinge",
    penalty="l2",
    alpha=5e-6,
    # --- step size: inverse-scaling schedule starting from eta0
    learning_rate="invscaling",
    eta0=10,
    power_t=0.4,
    # --- stopping criteria
    max_iter=10000,
    tol=0.001,
    n_iter_no_change=1000,
    # NOTE(review): scikit-learn documents validation_fraction as only used
    # when early_stopping=True, which is not set here -- confirm intent.
    validation_fraction=0.01,
    # --- execution / reproducibility
    shuffle=True,
    random_state=42,
    n_jobs=-1,
)

sgdc_text.fit(X_text_train, y_train)

In [78]:
# Predicted labels for the held-out text features (the score calls below do
# not use this variable; they run their own predictions).
y_pred = sgdc_text.predict(X_text_test)

# score() is printed for the training split first, then for the test split.
for features, labels in ((X_text_train, y_train), (X_text_test, y_test)):
    print(sgdc_text.score(features, labels))

0.9998768068331143
0.9747459193101324


In [79]:
# The classifier was trained on the feature space of `vectorizer_text`, so it
# cannot score raw text without that fitted vectorizer: persist both.
joblib.dump(vectorizer_text, "news_text_vectorizer_tfidf.sav")
# Kept as the last expression so the cell still displays the classifier file.
joblib.dump(sgdc_text, "news_text_clf_sgdc.sav")

['news_text_clf_sgdc.sav']

## **Identifying by title**

In [81]:
# Linear classifier trained with SGD on the TF-IDF features of the headline.
sgdc_title = SGDClassifier(
    # --- objective: hinge loss with an L2 penalty of strength alpha
    loss="hinge",
    penalty="l2",
    alpha=5e-6,
    # --- step size: inverse-scaling schedule starting from eta0
    learning_rate="invscaling",
    eta0=10,
    power_t=0.4,
    # --- stopping criteria
    max_iter=10000,
    tol=0.001,
    n_iter_no_change=1000,
    # NOTE(review): scikit-learn documents validation_fraction as only used
    # when early_stopping=True, which is not set here -- confirm intent.
    validation_fraction=0.01,
    # --- execution / reproducibility
    shuffle=True,
    random_state=42,
    n_jobs=-1,
)

sgdc_title.fit(X_title_train, y_train)

In [82]:
# Predicted labels for the held-out title features (the score calls below do
# not use this variable; they run their own predictions).
y_pred = sgdc_title.predict(X_title_test)

# score() is printed for the training split first, then for the test split.
for features, labels in ((X_title_train, y_train), (X_title_test, y_test)):
    print(sgdc_title.score(features, labels))

0.9920745729303548
0.9038497074222359


In [83]:
# The classifier was trained on the feature space of `vectorizer_title`, so it
# cannot score raw titles without that fitted vectorizer: persist both.
joblib.dump(vectorizer_title, "news_title_vectorizer_tfidf.sav")
# Kept as the last expression so the cell still displays the classifier file.
joblib.dump(sgdc_title, "news_title_clf_sgdc.sav")

['news_title_clf_sgdc.sav']

# **Predicting the category**

In [84]:
def pred_by_title(title: str) -> str:
    """Classify a headline with the title model and describe the verdict.

    The headline is lower-cased and stripped of the notebook-level ``sw`` stop
    words (whole whitespace-delimited tokens only) -- the same cleanup the
    training titles received -- then vectorised with ``vectorizer_title`` and
    scored by ``sgdc_title``.

    Args:
        title: Raw headline text.

    Returns:
        ``"presumably fake news"`` when the predicted label equals 1,
        otherwise ``"presumably not fake news"``.
    """
    # Mirror the training-time preprocessing so the model sees the same kind
    # of input it was fitted on.
    cleaned = " ".join(
        token
        for token in (word.lower() for word in title.split())
        if token not in sw
    )

    # predict() returns one label per input row; exactly one row is passed, so
    # take that single label instead of comparing the whole array.
    label = sgdc_title.predict(vectorizer_title.transform([cleaned]))[0]

    if label == 1:
        return "presumably fake news"
    return "presumably not fake news"

In [85]:
def pred_by_text(title: str) -> str:
    """Classify an article body with the text model and describe the verdict.

    The text is lower-cased and stripped of the notebook-level ``sw`` stop
    words (whole whitespace-delimited tokens only) -- the same cleanup the
    training texts received -- then vectorised with ``vectorizer_text`` and
    scored by ``sgdc_text``.

    Args:
        title: Raw article body text. (The parameter is named ``title`` for
            backward compatibility with existing keyword callers; it is fed to
            the text vectorizer and text model.)

    Returns:
        ``"presumably fake news"`` when the predicted label equals 1,
        otherwise ``"presumably not fake news"``.
    """
    # Mirror the training-time preprocessing so the model sees the same kind
    # of input it was fitted on.
    cleaned = " ".join(
        token
        for token in (word.lower() for word in title.split())
        if token not in sw
    )

    # predict() returns one label per input row; exactly one row is passed, so
    # take that single label instead of comparing the whole array.
    label = sgdc_text.predict(vectorizer_text.transform([cleaned]))[0]

    if label == 1:
        return "presumably fake news"
    return "presumably not fake news"