## I. Import Libraries and Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_curve, auc, precision_recall_fscore_support

In [None]:
# Load the fake/real news CSV into a DataFrame (path is relative to the notebook's folder).
fr = pd.read_csv(
    filepath_or_buffer="../data/fakerealnews_GeorgeMcIntire/fake_or_real_news.csv",
)

## II. Quick Look at Data

In [None]:
# Preview the first five rows (pandas' default for head) to inspect the columns.
fr.head(n=5)

In [None]:
# Report the frame's dimensions as a (rows, columns) tuple.
fr.shape

## III. Run Baseline Logistic Regression

In [None]:
# Hold out 20% of the articles for testing, with a fixed seed for repeatability.
# The target is encoded as 1 where the label equals 'REAL' and 0 for every other value.
x_train, x_test, y_train, y_test = train_test_split(
    fr['text'],
    np.where(fr['label'].eq('REAL'), 1, 0),
    test_size=.2,
    random_state=1,
)

In [None]:
vectorizer = CountVectorizer()
vectorizer.fit(x_train)

x_train_vec = vectorizer.transform(x_train)
x_test_vec  = vectorizer.transform(x_test)
x_train_vec

In [None]:
# Baseline model: logistic regression with scikit-learn's default settings,
# trained on the count-vectorized training texts.
classifier = LogisticRegression()
classifier.fit(X=x_train_vec, y=y_train)

## IV. Logistic Regression Metrics and ROC Curve

In [None]:
# Hard class predictions on the held-out split, computed once and reused below.
y_test_pred = classifier.predict(x_test_vec)

# Overall accuracy on the test split.
score = accuracy_score(y_test, y_test_pred)
print("Accuracy:", score)

# Per-class precision / recall / F1 (rounded to 2 decimals) plus support,
# laid out with one row per class label.
clf_rep = precision_recall_fscore_support(y_test, y_test_pred)
out_dict = {
    metric: values.round(2)
    for metric, values in zip(("precision", "recall", "f1-score"), clf_rep)
}
out_dict["support"] = clf_rep[3]
out_df = pd.DataFrame(out_dict, index = classifier.classes_)
print(out_df)

# ROC curve over all thresholds, using the predicted probability taken from
# the second column of predict_proba as the score.
probs = classifier.predict_proba(x_test_vec)
preds = probs[:, 1]
fpr, tpr, threshold = roc_curve(y_test, preds)
roc_auc = auc(fpr, tpr)

# Draw the ROC curve with the AUC in the legend; the dashed red diagonal is the
# chance-level reference line.
fig, ax = plt.subplots()
ax.set_title('Receiver Operating Characteristic')
ax.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
ax.legend(loc = 'lower right')
ax.plot([0, 1], [0, 1], 'r--')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()

## V. Show Where the LR Model Got It Wrong

In [None]:
# Predict on the test split, flag every disagreement with the true label, and
# split the misclassified texts by their true class (0 = fake, 1 = real).
y_test_pred = classifier.predict(x_test_vec)
misclassified = y_test_pred != y_test
wrong_actually_fake = x_test[misclassified & (y_test == 0)]
wrong_actually_real = x_test[misclassified & (y_test == 1)]

### 1. Predicted Real, Actually Fake

In [None]:
# Position 1 picks the second article among the fake ones predicted as real.
fake_example = wrong_actually_fake.iloc[1]
print(fake_example)

# Show the original row(s) in the full dataset whose text matches that article.
fr[fr['text'] == fake_example]

### 2. Predicted Fake, Actually Real

In [None]:
# Position 1 picks the second article among the real ones predicted as fake.
real_example = wrong_actually_real.iloc[1]
print(real_example)

# Show the original row(s) in the full dataset whose text matches that article.
fr[fr['text'] == real_example]