In [2]:
import pandas as pd

df_original = pd.read_json("http://fake-news-detector-api.herokuapp.com/links/all")

df_original[0:10]

Unnamed: 0,category_id,clickbait_title,content,count,id,title,url,verified_category_id,verified_clickbait_title
0,1,0.0,Tweet\n\nThe Environmental Protection Agency (...,1,2078,Wolf in Sheep’s Clothing (or a Scientist’s Lab...,http://www.pogo.org/blog/2018/05/wolf_in_sheep...,,
1,1,0.0,Reveja todos os finais de 'O Outro Lado'\n,1,2077,"globo.com - Absolutamente tudo sobre notícias,...",https://www.globo.com/,,
2,1,,A black Yale student was taking a nap in a com...,1,2076,White people keep calling the cops on black pe...,https://www.vox.com/identities/2018/5/11/17340...,,
3,1,,"Friday afternoon, the Justice Department relea...",1,2075,"Donald Trump, Bernie Sanders, and Jill Stein a...",https://www.vox.com/policy-and-politics/2018/2...,,
4,1,0.0,I regularly attend an annual security conferen...,1,2074,John McCain: ‘Vladimir Putin Is an Evil Man’,https://www.wsj.com/articles/john-mccain-vladi...,,
5,1,0.0,Updated at 10:55 a.m. ET\n\nKremlin-linked Rus...,1,2072,Documents Reveal How Russian Official Courted ...,https://www.npr.org/2018/05/11/610206357/docum...,,
6,1,0.0,(Reuters) – AT&T Inc on Friday ousted its top ...,1,2071,Exclusive - AT&T CEO: We made 'big mistake' hi...,https://www.politicususa.com/2018/05/11/exclus...,,
7,1,0.0,The Stormy Daniels lawyer who tweeted bombshel...,1,2070,Heat on Stormy Daniels' lawyer over past busin...,http://www.foxnews.com/politics/2018/05/11/hea...,,
8,1,0.0,The CBI probe in the politically sensitive Unn...,1,2069,CBI confirms rape charge against Unnao MLA Kul...,https://timesofindia.indiatimes.com/india/cbi-...,,
9,1,,An analysis of Sean Hannity’s real estate hold...,1,2067,Slumlord Sean Hannity Makes Money By Threateni...,http://eepurl.com/dunNz5,,


In [18]:
from sklearn.model_selection import train_test_split
import numpy as np

df = df_original.copy()

df.dropna(subset=["title"], inplace=True)

df["clickbait_title"] = df['verified_clickbait_title'].fillna(df['clickbait_title'])

df["is_clickbait"] = [ 0 if c == 0 else 1 if c == 1 else 0.5 for c in df['clickbait_title'] ]

df = df[["title", "is_clickbait"]]

print("Number of click bait samples", len(df[df["is_clickbait"] == 1]))

df

Number of click bait samples 162


Unnamed: 0,title,is_clickbait
0,Wolf in Sheep’s Clothing (or a Scientist’s Lab...,0.0
1,"globo.com - Absolutamente tudo sobre notícias,...",0.0
2,White people keep calling the cops on black pe...,0.5
3,"Donald Trump, Bernie Sanders, and Jill Stein a...",0.5
4,John McCain: ‘Vladimir Putin Is an Evil Man’,0.0
5,Documents Reveal How Russian Official Courted ...,0.0
6,Exclusive - AT&T CEO: We made 'big mistake' hi...,0.0
7,Heat on Stormy Daniels' lawyer over past busin...,0.0
8,CBI confirms rape charge against Unnao MLA Kul...,0.0
9,Slumlord Sean Hannity Makes Money By Threateni...,0.5


In [76]:
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score, recall_score, accuracy_score
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import TransformerMixin


positive_df = df[df["is_clickbait"] == 1]
negative_df = df[df["is_clickbait"] != 1].apply(np.random.permutation)[0:len(positive_df)]
balanced_df = positive_df.append(negative_df)

X = balanced_df
y = balanced_df["is_clickbait"]

X_train, X_test, y_train, y_test = train_test_split(X, y)

class ModelTransformer(TransformerMixin):

    def __init__(self, model):
        self.model = model

    def fit(self, *args, **kwargs):
        self.model.fit(*args, **kwargs)
        return self

    def transform(self, X, **transform_params):
        return pd.DataFrame(self.model.predict(X))

class RoundTransformer():
    def fit(self, *args, **kwargs):
        return self
        
    def predict(self, X):
        return [ round(x) for x in X[0] ]
    
    def score(self, X, y):
        return accuracy_score(self.predict(X), y)
    
pipeline = Pipeline([
    ('selector', FunctionTransformer(lambda x: x['title'], validate=False)),
    ('tfidf', TfidfVectorizer(strip_accents='ascii', ngram_range=(1, 3))),
    ('clf', ModelTransformer(RandomForestRegressor())),
    ('round', RoundTransformer())
])

clf = pipeline.fit(X_train, y_train)
y_pred = clf.predict(X_test)

y_test = [ 1.0 if x > 0.5 else 0.0 for x in y_test ]

f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("accuracy", accuracy)
print("f1", f1)
print("positive recall", recall)

accuracy 0.777777777778
f1 0.804347826087
positive recall 0.925
