In [12]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import PassiveAggressiveClassifier

from sklearn.metrics import accuracy_score, confusion_matrix

from joblib import dump, load

In [2]:
fake_news = pd.read_csv("data/news.csv")
fake_news = fake_news.loc[:, ~fake_news.columns.str.contains("^Unnamed")]

In [3]:
# shows that the classes are balanced
fake_news["label"].value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

In [4]:
X = fake_news[["title", "text"]]
y = fake_news["label"]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X["text"], y, test_size=0.2, random_state=1)

In [6]:
tfidf_vectorizer = TfidfVectorizer(stop_words="english", max_df=0.7)
X_train_vectorized = tfidf_vectorizer.fit_transform(X_train)

### Transform the X_test with the fitted Tf_Idf weights

In [7]:
X_test_vectorized = tfidf_vectorizer.transform(X_test)

### Train a Passive Aggressive Classifier

In [8]:
pac = PassiveAggressiveClassifier(max_iter=50, tol=1e-3)
pac.fit(X_train_vectorized, y_train)

PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
                            early_stopping=False, fit_intercept=True,
                            loss='hinge', max_iter=50, n_iter_no_change=5,
                            n_jobs=None, random_state=None, shuffle=True,
                            tol=0.001, validation_fraction=0.1, verbose=0,
                            warm_start=False)

In [9]:
y_pred = pac.predict(X_test_vectorized)
score = accuracy_score(y_test, y_pred)
print("Accuracy:", round(score*100, 2))

Accuracy: 94.55


### Load into pkl file

In [23]:
dump(pac, "fake_news_classifier.joblib")
dump(tfidf_vectorizer, "tfidf_vectorizer.joblib")

['tfidf_vectorizer.joblib']

### Load the model

In [24]:
pac = load("fake_news_classifier.joblib")
tfidf_vectorizer = load("tfidf_vectorizer.joblib")

### Run on a test case

In [25]:
vectorized_string = tfidf_vectorizer.transform(["Increased SO2 submissions detected above China indicates \
                                                that China maybe burning bodies without telling the rest of the \
                                                world. They say they have the disease under control but we all know \
                                                president Xi Jinping is snake oil salesman who will do anything \
                                                if it means he'll stay popular and remain in power. What an idiot! \
                                                God bless free America and our president, Donald Trump."])

In [26]:
prediction = pac.predict(vectorized_string)

In [27]:
Run 

array(['FAKE'], dtype='<U4')