### Settings

In [1]:
import os
import string
import cufflinks as cf

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from wordcloud import WordCloud

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Presumably switches cufflinks/plotly to offline rendering so the .iplot() calls
# further down draw inside the notebook — confirm against the cufflinks docs.
cf.go_offline()
# NOTE(review): this persists offline=False / world_readable=True in the cufflinks
# config file, which looks at odds with go_offline() above — confirm it is intended.
cf.set_config_file(offline=False, world_readable=True)

In [3]:
def get_data(file='', title_text=True, subject=False, date=False, nb_samples=5_000):
    """Load the news CSV, draw a random sample and split it into features / target.

    Parameters
    ----------
    file : str, optional
        CSV file name, resolved relative to ``../raw_data/`` (an absolute path is
        used as-is). When empty, ``fake_real_data.csv`` is read.
    title_text : bool, optional
        Currently unused: the ``article`` feature (title + ' ' + text) is always built.
    subject : bool, optional
        When true, the ``subject`` column is added to the returned features.
    date : bool, optional
        When true, the ``date`` column is parsed with ``pd.to_datetime`` and added
        to the returned features.
    nb_samples : int, optional
        Number of rows to draw; capped at the number of rows available in the file.

    Returns
    -------
    X : pandas.DataFrame
        The selected feature columns (always ``article``; optionally ``date``, ``subject``).
    y : pandas.Series
        The ``target`` column of the sampled rows.

    Raises
    ------
    RuntimeError
        If no sample satisfying the class-ratio constraint is found after a
        bounded number of draws.
    """
    import os  # local import keeps the function self-contained

    path = os.path.join('..', 'raw_data', file or 'fake_real_data.csv')
    data = pd.read_csv(path)

    # Never ask for more rows than the file holds (sample() would raise).
    n_rows = min(nb_samples, len(data))

    # Redraw until class 1 is at most 10% larger than class 0. The number of
    # attempts is bounded so a heavily skewed file cannot loop forever, and
    # .get() tolerates a draw in which one class is absent.
    max_draws = 100
    for _ in range(max_draws):
        sample = data.sample(n=n_rows)
        counts = sample["target"].value_counts()
        n_zero, n_one = counts.get(0, 0), counts.get(1, 0)
        if n_zero:
            ratio = n_one / n_zero
        else:
            ratio = float('inf') if n_one else 0.0
        if ratio <= 1.1:
            break
    else:
        raise RuntimeError(
            "could not draw a sample with target ratio (1 / 0) <= 1.1 "
            "after %d attempts" % max_draws)

    # Work on an explicit copy so the column writes below never touch `data`.
    sample = sample.copy()

    features = ["article"]
    # Single-label assignment: a list key (data[["article"]] = Series) is expected
    # to be rejected by recent pandas with "Columns must be same length as key".
    sample["article"] = sample["title"] + ' ' + sample["text"]

    if date:
        sample["date"] = pd.to_datetime(sample["date"])
        features.append("date")

    if subject:
        features.append("subject")

    # Return an independent frame so callers can modify it without
    # SettingWithCopyWarning.
    X = sample[features].copy()
    y = sample["target"]

    return X, y

In [4]:
def clean_txt(txt):
    """Normalise one raw article string for vectorisation.

    Steps, in order: lower-case the text, strip every digit character,
    tokenise with ``word_tokenize``, drop English stop words, lemmatise each
    remaining token with ``WordNetLemmatizer`` and join the tokens back
    together with single spaces.

    Punctuation is not stripped here; punctuation tokens produced by the
    tokeniser stay in the output.

    Parameters
    ----------
    txt : str
        Raw text.

    Returns
    -------
    str
        The cleaned, space-joined tokens.
    """
    # Case folding, then character-level removal of digits.
    lowered = txt.lower()
    no_digits = ''.join(filter(lambda ch: not ch.isdigit(), lowered))

    # Tokenise and discard English stop words.
    english_stops = set(stopwords.words('english'))
    kept_tokens = [tok for tok in word_tokenize(no_digits) if tok not in english_stops]

    # Reduce each surviving token to its lemma and rebuild one string.
    wnl = WordNetLemmatizer()
    return " ".join(map(wnl.lemmatize, kept_tokens))

In [5]:
def _top_n_ngrams(corpus, n=None, ngram_range=(1, 1)):
    """Count n-grams over `corpus` and return the `n` most frequent ones.

    Parameters
    ----------
    corpus : iterable of str
        The documents to count over.
    n : int or None
        How many (term, count) pairs to return; ``None`` returns all of them.
    ngram_range : tuple of (int, int)
        Passed straight to ``CountVectorizer`` to select the n-gram sizes.

    Returns
    -------
    list of (str, int)
        Pairs sorted by decreasing count.
    """
    vec = CountVectorizer(ngram_range=ngram_range)
    # fit_transform learns the vocabulary and builds the count matrix in one
    # pass instead of reading the corpus twice with fit() then transform().
    bag_of_words = vec.fit_transform(corpus)
    # Column sums give the corpus-wide count of each vocabulary entry.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(term, sum_words[0, idx]) for term, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]


def get_top_n_words(corpus, n=None):
    """Return the `n` most frequent single words in `corpus` as (word, count) pairs."""
    return _top_n_ngrams(corpus, n, ngram_range=(1, 1))


def get_top_n_bigram(corpus, n=None):
    """Return the `n` most frequent two-word sequences in `corpus` as (bigram, count) pairs."""
    return _top_n_ngrams(corpus, n, ngram_range=(2, 2))


def get_top_n_trigram(corpus, n=None):
    """Return the `n` most frequent three-word sequences in `corpus` as (trigram, count) pairs."""
    return _top_n_ngrams(corpus, n, ngram_range=(3, 3))

# Exploration

## Import Data


In [None]:
# Draw a 5,000-row sample for exploration: X holds the features, y the target.
# The sample is random and unseeded, so every run explores different rows.
X, y = get_data(nb_samples=5_000)

## Clean Data

In [None]:
# Clean every article. assign() builds a new frame instead of writing into the
# frame returned by get_data (which is a slice of the loaded data), so pandas
# does not emit SettingWithCopyWarning.
X = X.assign(article=X["article"].map(clean_txt))

## Count N_Grams

In [None]:
# Top 20 single words across the cleaned articles, drawn as an interactive bar chart.
common_words = get_top_n_words(X.article, 20)
unigram_counts = (
    pd.DataFrame(common_words, columns=['word', 'count'])
    .groupby('word')['count']
    .sum()
    .sort_values(ascending=False)
)
unigram_counts.iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    title='Top 20 UNIgrams used in articles',
    color='blue',
)
# Drop the intermediate series so it does not linger in the kernel namespace.
del unigram_counts

In [None]:
# Top 20 two-word sequences across the cleaned articles, drawn as an interactive bar chart.
common_words = get_top_n_bigram(X.article, 20)
bigram_counts = (
    pd.DataFrame(common_words, columns=['words', 'count'])
    .groupby('words')['count']
    .sum()
    .sort_values(ascending=False)
)
bigram_counts.iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    title='Top 20 BIgrams used in articles',
    color='blue',
)
# Drop the intermediate series so it does not linger in the kernel namespace.
del bigram_counts

In [None]:
# Top 20 three-word sequences across the cleaned articles, drawn as an interactive bar chart.
common_words = get_top_n_trigram(X.article, 20)
trigram_counts = (
    pd.DataFrame(common_words, columns=['words', 'count'])
    .groupby('words')['count']
    .sum()
    .sort_values(ascending=False)
)
trigram_counts.iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    title='Top 20 TRIgrams used in articles',
    color='blue',
)
# Drop the intermediate series so it does not linger in the kernel namespace.
del trigram_counts

## WordCloud

In [None]:
# Word cloud over the whole cleaned sample: at most 100 words on a
# 1000x1000 black canvas.
wc = WordCloud(background_color="black", max_words=100,
               max_font_size=256,
               width=1000, height=1000)
# All articles are concatenated into one string before generating the cloud.
wc.generate(' '.join(X.article))
# Render the cloud as an image in an 8x8 figure with the axes hidden.
plt.figure(figsize=(8, 8))
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.show()

# Baseline

## Holdout

In [None]:
# Seed NumPy's global RNG so the random sample drawn inside get_data (pandas'
# sample() falls back to it when no random_state is given) is reproducible
# across Restart & Run All.
np.random.seed(42)
X, y = get_data(nb_samples=10_000)
# 70 / 30 holdout. stratify=y keeps the class proportions identical in both
# parts; random_state makes the split itself repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42)

In [None]:
# Clean the training articles. assign() builds a new frame instead of writing
# into the subset returned by train_test_split, which avoids pandas'
# SettingWithCopyWarning.
X_train = X_train.assign(article=X_train["article"].map(clean_txt))

In [None]:
# Clean the test articles with the same function used for the training set.
# assign() builds a new frame instead of writing into the subset returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning.
X_test = X_test.assign(article=X_test["article"].map(clean_txt))

## Vectorization

In [None]:
# TF-IDF with default settings. The vocabulary and IDF weights are learned from
# the training articles only; the test articles are merely projected onto them.
vec = TfidfVectorizer()
vec_train = vec.fit_transform(X_train.article)
vec_test = vec.transform(X_test.article)

## Model & Scoring

In [None]:
# Baseline 1: random forest with 100 trees, fitted on the TF-IDF training matrix.
# NOTE(review): no random_state is passed, so the score can vary between runs.
RdF = RandomForestClassifier(n_estimators=100, n_jobs=-1)
RdF.fit(vec_train, y_train)
# Score on the held-out split: overall accuracy, then the per-class report.
y_pred_rf = RdF.predict(vec_test)
score_rf = accuracy_score(y_test, y_pred_rf)
print("RandomForest Accuracy:  %0.3f" %score_rf)
print(classification_report(y_test, y_pred_rf))

In [None]:
# Baseline 2: AdaBoost over depth-10 decision trees with n_estimators=5,
# fitted on the same TF-IDF training matrix.
# NOTE(review): no random_state is passed, so the score can vary between runs.
AdaB = AdaBoostClassifier(DecisionTreeClassifier(max_depth=10), n_estimators=5)
AdaB.fit(vec_train, y_train)
# Score on the held-out split: overall accuracy, then the per-class report.
y_pred_ab = AdaB.predict(vec_test)
score_ab = accuracy_score(y_test, y_pred_ab)
print("AdaBoost Accuracy: %0.3f" %score_ab)
print(classification_report(y_test, y_pred_ab))

# External Test: George McIntire Dataset

In [None]:
# Load a second labelled news dataset, used only for evaluation below.
george = pd.read_csv('../raw_data/fake_or_real_news _george_mcintire.csv')
# Build the model input the same way get_data does: title + ' ' + text.
Xg = george.title + " " + george.text
# Encode the string labels as integers; any other label value maps to NaN.
# NOTE(review): assumes target == 1 means "fake" in the training data — confirm.
yg = george.label.map({"REAL": 0, "FAKE": 1})

In [None]:
# lower / number / stop / lemmatize / virg only exist as local helpers inside
# clean_txt, so calling them here raises NameError. clean_txt applies exactly
# those five steps in that order, so it is used directly instead.
Xg = Xg.apply(clean_txt)

In [None]:
# Project the external articles onto the TF-IDF vocabulary fitted on the
# training set (transform only, no refit).
vec_Xg = vec.transform(Xg)

In [None]:
# Evaluate the already-fitted random forest on the external dataset:
# overall accuracy, then the per-class report.
yg_pred_rf = RdF.predict(vec_Xg)
score_rf_g = accuracy_score(yg, yg_pred_rf)
print("RandomForest Accuracy:  %0.3f" %score_rf_g)
print(classification_report(yg, yg_pred_rf))

In [None]:
# Evaluate the already-fitted AdaBoost model on the external dataset:
# overall accuracy, then the per-class report.
yg_pred_ab = AdaB.predict(vec_Xg)
score_ab_g = accuracy_score(yg, yg_pred_ab)
print("AdaBoost Accuracy: %0.3f" %score_ab_g)
print(classification_report(yg, yg_pred_ab))

# Tuning

In [16]:
# from sklearn.pipeline import Pipeline
# from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# X, y = get_data(nb_samples=5_000)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# X_train = X_train.article.map(clean_txt)

# X_test = X_test.article.map(clean_txt)

# rf_pipeline = Pipeline([
#     ('vectorize', TfidfVectorizer(ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None)),
#     ('model', RandomForestClassifier(n_estimators=100, max_depth=None, n_jobs=-1))])

# ada_pipeline = Pipeline([
#     ('vectorize', TfidfVectorizer(ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None)),
#     ('model', AdaBoostClassifier(DecisionTreeClassifier(max_depth=10), n_estimators=5, learning_rate=1.0))])

# rf_pipeline.fit(X_train, y_train)
# y_pred = rf_pipeline.predict(X_test)
# score_rf = accuracy_score(y_test, y_pred)
# print("RandomForest Accuracy: %0.3f" %score_rf)
# print(classification_report(y_test, y_pred))