In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

<h1>Data Prep

In [2]:
# read csv of news headlines, label 1 indicates a legitimate news source
data = pd.read_csv("news_labeled.csv")
data

Unnamed: 0,Headline,Label
0,Says the Annies List political group supports ...,0
1,Health care reform legislation is likely to ma...,0
2,The Chicago Bears have had more starting quart...,1
3,When Mitt Romney was governor of Massachusetts...,0
4,McCain opposed a requirement that the governme...,1
...,...,...
4549,Says Barack Obama promised to halve the defici...,1
4550,I am the only senator who turned down the stat...,1
4551,There is no system to vet refugees from the Mi...,0
4552,I think its seven or eight of the California s...,0


In [3]:
data["Label"].value_counts()

0    2501
1    2053
Name: Label, dtype: int64

In [4]:
# For balancing dataset
def equally_sample(df, column, n):
    equal_df = pd.DataFrame()
    uniques = df[column].unique()
    for val in uniques:
        df1 = df.loc[df[column] == val]
        df1 = df1.sample(n=n)
        equal_df = equal_df.append(df1)
    return equal_df

In [5]:
balanced_data = equally_sample(df=data, column="Label", n=2050)
balanced_data["Label"].value_counts()

  equal_df = equal_df.append(df1)
  equal_df = equal_df.append(df1)


0    2050
1    2050
Name: Label, dtype: int64

In [6]:
# Split into training and test sets
X = balanced_data["Headline"]
y = balanced_data["Label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35)

In [7]:
X_train.shape

(2665,)

In [8]:
X_test.shape

(1435,)

In [9]:
# Vectorize the text
vectorizer = TfidfVectorizer(min_df=5, ngram_range=(1,2), stop_words="english")
vectorizer.fit(X_train)
X_train_vectorized = vectorizer.transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [10]:
X_train_vectorized.toarray().shape

(2665, 1362)

In [11]:
X_test_vectorized.toarray().shape

(1435, 1362)

<h1>Model testing and selection

In [12]:
estimators = {'Logistic regression': LogisticRegression(),
              'Multinomial Naive Bayes': MultinomialNB(),
              'Decision Tree': DecisionTreeClassifier()}

In [13]:
for name, obj in estimators.items():
        model = obj.fit(X=X_train_vectorized, y=y_train)
        print(name + "\n\t Classification accuracy on training set: " + str(model.score(X_train_vectorized, y_train)) + 
              "\n\t Classification accuracy on testing set: " + str(model.score(X_test_vectorized, y_test)) + "\n")

Logistic regression
	 Classification accuracy on training set: 0.7928705440900563
	 Classification accuracy on testing set: 0.5874564459930314

Multinomial Naive Bayes
	 Classification accuracy on training set: 0.7692307692307693
	 Classification accuracy on testing set: 0.5944250871080139

Decision Tree
	 Classification accuracy on training set: 0.9966228893058161
	 Classification accuracy on testing set: 0.5442508710801394



In [19]:
# Selected Logistic Regression
model =  LogisticRegression().fit(X=X_train_vectorized, y=y_train)

def headline_checker(headline):
    feats = vectorizer.transform(headline)
    prediction = model.predict(feats)
    probability = model.predict_proba(feats)
    if prediction == 0:
        pred_str = "Fake News"
        probability_str = str(probability[0][0])
    elif prediction == 1:
        pred_str = "Real News"
        probability_str = str(probability[0][1])
    print("Model Classification:  " + pred_str + "\n Probability:  " + probability_str)

In [20]:
headline_checker(["The State adds new vaccine requirement for senate members"])

Model Classification:  Real News
 Probability:  0.7836517612982383


In [21]:
headline_checker(["Wisconsin Governer says he will never campaign again"])

Model Classification:  Fake News
 Probability:  0.817660350743087
