In [7]:
import pandas as pd

df_original = pd.read_json("http://fake-news-detector-api.herokuapp.com/links/all")

df_original[0:10]

Unnamed: 0,category_id,clickbait_title,content,count,id,title,url,verified_category_id,verified_clickbait_title
0,2,1.0,"(Dubois, Wyo) – The Wyoming Wild Game Departme...",1,1777,First 90 Kangaroos released in Wyoming,https://buckrail.com/first-90-kangaroos-releas...,,
1,2,0.0,Sociedade — A grande imprensa tradicional perd...,1,1776,Imprensa tradicional quer calar mídia alternat...,http://odiarionacional.org/2018/03/31/imprensa...,,
2,1,0.0,And we know it can be frustrating when your be...,3,1775,Home - TipTopTails - Michigan Dog Training,https://tiptoptails.com/,,
3,1,0.0,,1,1774,"Sue Pedaline, Sallie Davis Kelton, Wendy Reavi...",https://www.facebook.com/liz.pendergrass/posts...,,
4,1,0.0,,1,1773,‘Granny Pods’ Now Allow Your Aging Parents to ...,http://www.luxurylifenews.com/house-for-parent...,,
5,5,0.0,WASHINGTON (The Borowitz Report)—As America’s ...,1,1772,Nation with Crumbling Bridges and Roads Excite...,https://www.newyorker.com/humor/borowitz-repor...,,
6,1,0.0,Tucked away in the quiet woods of Upstate New ...,1,1771,Secret Library Hidden in the Woods Is Every Bi...,https://trib.al/revSzRc,,
7,5,0.0,\n,1,1770,Snow White and the Seven Dwarfs - DVD Trailer ...,http://www.dailymotion.com/video/xiq5vr,,
8,1,0.0,I’ve never lost a close friend to death. My pa...,1,1769,SATURDAY OF LAZARUS by Archpriest Timothy Crem...,https://orthodoxyindialogue.com/2018/03/30/sat...,,
9,2,1.0,MOSCOW (Sputnik) - Kazakhstan’s National Bank ...,1,1768,Kazakhstan Mulls Complete Ban of Every Digital...,https://sputniknews.com/asia/20180330106308502...,,


In [166]:
from sklearn.model_selection import train_test_split
import numpy as np

df = df_original.copy()

df.dropna(subset=["title", "content"], inplace=True)

df = df[(df['content'].str.len() > 120) & (df['url'].str.contains('youtu.be|twitter.com|vimeo.com|facebook.com') == False)]

df["positive"] = [cid == 4 for cid in df["category_id"]]

print("Number of biased samples", len(df[df["positive"] == True]))

Number of biased samples 78


In [167]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score, make_scorer, recall_score
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer

def test_classifier(clf_):
    positive_df = df[df["positive"] == True]
    negative_df = df[df["positive"] == False].apply(np.random.permutation)[0:len(positive_df)]
    balanced_df = positive_df.append(negative_df)
    
    X = balanced_df[["title", "content"]]
    y = balanced_df["positive"]

    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    pipeline = Pipeline([
        ('features', FeatureUnion(
            transformer_list=[
                ('title', Pipeline([
                  ('selector1', FunctionTransformer(lambda x: x['title'], validate=False)),
                  ('vect1', CountVectorizer(ngram_range=(1, 3))),
                  ('tfidf1', TfidfTransformer())
                ])),
                ('content', Pipeline([
                  ('selector2', FunctionTransformer(lambda x: x['content'], validate=False)),
                  ('vect2', CountVectorizer(ngram_range=(1, 3))),
                  ('tfidf2', TfidfTransformer())
                ]))
            ],
            transformer_weights={
                'title': 1.0,
                'content': 0.8,
            },
        )),
        ('clf', clf_)
    ])
    
    clf = pipeline.fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)
    
    y_pred = clf.predict(X_test)
    avg_f1 = (f1_score(y_test, y_pred, pos_label=False) + f1_score(y_test, y_pred)) / 2
    positive_recall = recall_score(y_test, y_pred)
    
    return accuracy, avg_f1, positive_recall
    
    
def test_classifier_avg(name, clf, times=3):
    accuracies = []
    f1s = []
    positive_recalls = []
    
    for i in range(0, times):
        accuracy, f1, positive_recall = test_classifier(clf)
        accuracies.append(accuracy)
        f1s.append(f1)
        positive_recalls.append(positive_recall)
        
    
    print("accuracy", round(np.mean(accuracies), 3),
          "f1", round(np.mean(f1s), 3),
          "positive_recall", round(np.mean(positive_recalls), 3),
          "total weighted", round(np.mean(accuracies) + np.mean(f1s) + np.mean(positive_recalls) * 2, 3),
          name
         )

# np.random.seed(0)
    
test_classifier_avg("MultinomialNB", MultinomialNB())
test_classifier_avg("BernoulliNB", BernoulliNB())
test_classifier_avg("KNN", KNeighborsClassifier())
test_classifier_avg("SGDClassifier", SGDClassifier(max_iter=1000))
test_classifier_avg("RandomForest", RandomForestClassifier())
test_classifier_avg("DecisionTreeClassifier", DecisionTreeClassifier())
test_classifier_avg("SVC", SVC(kernel='rbf', probability=True))
test_classifier_avg("MultiLayerPerceptron", MLPClassifier(solver='adam', max_iter=100), times=1)
test_classifier_avg("VotingClassifier", VotingClassifier(estimators=[
            ('MultinomialNB', MultinomialNB()),
            ("MultiLayerPerceptron", MLPClassifier(solver='adam', max_iter=1000)),
            ("SGD", SGDClassifier(max_iter=1000))
        ]), times=1)

accuracy 0.641 f1 0.622 positive_recall 0.852 total weighted 2.966 MultinomialNB


  'precision', 'predicted', average, warn_for)


accuracy 0.624 f1 0.492 positive_recall 0.148 total weighted 1.411 BernoulliNB


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy 0.547 f1 0.353 positive_recall 0.0 total weighted 0.9 KNN
accuracy 0.872 f1 0.871 positive_recall 0.907 total weighted 3.557 SGDClassifier
accuracy 0.564 f1 0.548 positive_recall 0.333 total weighted 1.779 RandomForest
accuracy 0.658 f1 0.649 positive_recall 0.733 total weighted 2.773 DecisionTreeClassifier


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy 0.436 f1 0.303 positive_recall 0.333 total weighted 1.406 SVC
accuracy 0.821 f1 0.82 positive_recall 0.714 total weighted 3.069 MultiLayerPerceptron
accuracy 0.795 f1 0.791 positive_recall 0.9 total weighted 3.386 VotingClassifier
