In [1]:
# %pip install tweet-preprocessor

In [2]:
import pandas as pd
import pickle

import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
import preprocessor as p

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import classification_report

In [3]:
fake_news = pd.read_csv("data/fake_news.csv")

## Text pre-processing

In [4]:
def preprocess(text):
    text = p.clean(text)
    tokenization = nltk.word_tokenize(text)     
    tokenization = [w for w in tokenization if not w in stop_words]   
    return text

In [5]:
fake_news['tweet'] = fake_news['tweet'].apply(preprocess)

In [6]:
def binary_map(x):
    if x == "real":
        return 0
    else:
        return 1

fake_news["label"] = fake_news["label"].apply(binary_map)

In [7]:
X = fake_news['tweet'].values
y = fake_news['label'].values

In [8]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [9]:
# pickle.dump(vectorizer, open("models/misinformation/vectorizer.txt", "wb"))

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y, random_state=42)

## Logistic Regression

In [11]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
logreg.score(X_train, y_train), logreg.score(X_test, y_test)

(0.9508177570093458, 0.9191588785046729)

In [12]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.91      0.92      1120
           1       0.91      0.92      0.92      1020

    accuracy                           0.92      2140
   macro avg       0.92      0.92      0.92      2140
weighted avg       0.92      0.92      0.92      2140



## Multinomial Naive Bayes

In [13]:
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
y_pred = mnb.predict(X_test)
mnb.score(X_train, y_train), mnb.score(X_test, y_test)

(0.9436915887850468, 0.8943925233644859)

In [14]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.95      0.90      1120
           1       0.93      0.84      0.88      1020

    accuracy                           0.89      2140
   macro avg       0.90      0.89      0.89      2140
weighted avg       0.90      0.89      0.89      2140



## Passive Aggressive Classifier

In [15]:
pac = PassiveAggressiveClassifier(max_iter=50)
pac.fit(X_train, y_train)
y_pred = pac.predict(X_test)
pac.score(X_train, y_train), pac.score(X_test, y_test)

(1.0, 0.9205607476635514)

In [16]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.93      0.92      1120
           1       0.92      0.91      0.92      1020

    accuracy                           0.92      2140
   macro avg       0.92      0.92      0.92      2140
weighted avg       0.92      0.92      0.92      2140



## Best (Consistent) Model

In [17]:
# load vectorizer

vectorizer = pickle.load(open("models/misinformation/vectorizer.txt", "rb"))

In [18]:
fake = vectorizer.transform([preprocess("Politically Correct Woman (Almost) Uses Pandemic as Excuse Not to Reuse Plastic Bag https://t.co/thF8GuNFPe #coronavirus #nashville")])
real = vectorizer.transform([preprocess("The CDC currently reports 99031 deaths. In general the discrepancies in death counts between different sources are small and explicable. The death toll stands at roughly 100000 people today.")])

In [19]:
test = vectorizer.transform([preprocess("Both my girls contracted covid and theyâ€™re just 8 months old, wife and i will be taking care of them until theyâ€™re back to their best")])

In [20]:
def fake_or_real(pred):
    if pred == 0:
        return "Real"
    else:
        return "Fake"
    
fake_or_real(logreg.predict(fake)[0]), fake_or_real(logreg.predict(real)[0]), fake_or_real(logreg.predict(test)[0]) # ignore test because model was not trained on people's experiences

('Fake', 'Real', 'Fake')

In [21]:
# logreg_filename = "models/misinformation/logreg.txt"

In [22]:
# save model

# pickle.dump(logreg, open(logreg_filename, "wb"))

In [23]:
# load model

# logreg = pickle.load(open(logreg_filename, 'rb'))
# fake_or_real(logreg.predict(fake)[0]), fake_or_real(logreg.predict(real)[0])