In [8]:
import pandas as pd

df = pd.read_json("http://fake-news-detector-api.herokuapp.com/links/all")

df[0:10]

Unnamed: 0,category_id,content,count,id,title,url
0,5,Um recurso da Procuradoria-Geral da República ...,1,1,STF decide hoje se vídeo de Aécio explicando m...,http://www.sensacionalista.com.br/2017/09/26/s...
1,1,Há uma certa magia em viajar de trem pela Grã-...,1,2,15 destinos para conhecer de trem,http://bit.ly/2g8k2XR
2,3,Site Lança Cardápio Fit/Light (low-carb) e é N...,1,3,Site Lança Cardápio Fit (low-carb) e é Nova Se...,http://g3noticias.com.br/site_lanca_cardapio_f...
3,1,A hashtag #posteseuviralata alcançou o topo Tr...,1,4,A hashtag #posteseuviralata está enchendo o Tw...,http://bzfd.it/2yvqIu3
4,1,"""O que não me explicaram no dia da entrevista ...",1,5,Mulher acusa laboratório Fleury de racismo por...,https://www.buzzfeed.com/tatianafarah/mulher-a...
5,3,"Parabéns, você deveria ir ao cinema! Você sabe...",1,6,Você deveria ir ao cinema?,http://bzfd.it/2ysGJ3W
6,1,Aprovada em silêncio aos 45 minutos do segundo...,1,7,"Em silêncio, reforma eleitoral criou censura n...",http://bzfd.it/2xl7lin
7,3,Mas não espere que Kesha conte algum segredo d...,1,8,Kesha falou sobre que tipo de amiga a Taylor S...,http://bzfd.it/2yrVggv
8,5,Segundo deu em primeira mão o colunista Ancelm...,1,9,"Mudança na Rouanet proíbe a Bíblia, que está c...",http://www.sensacionalista.com.br/2017/10/06/m...
9,5,"Com as eleições presidenciais se aproximando, ...",1,10,Facebook retira comentários e deixa apenas bot...,http://www.sensacionalista.com.br/2017/10/06/f...


In [12]:
from sklearn.model_selection import train_test_split

df.dropna(subset=["title"], inplace=True)
df.dropna(subset=["content"], inplace=True)

df["is_biased"] = [cid == 4 for cid in df["category_id"]]

X = df["title"] + df["content"]
y = df["is_biased"]

print("Number of biased samples", len(df[df["is_biased"] == True]))

Number of biased samples 49


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score, make_scorer, recall_score

def test_classifier(clf_):
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('sampling', RandomUnderSampler()),
        ('clf', clf_)
    ])
    
    clf = pipeline.fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)
    
    y_pred = clf.predict(X_test)
    avg_f1 = (f1_score(y_test, y_pred, pos_label=False) + f1_score(y_test, y_pred)) / 2
    positive_recall = recall_score(y_test, y_pred)
    
    return accuracy, avg_f1, positive_recall
    
    
def test_classifier_avg(name, clf):
    times = 5
    total_accuracy = 0
    total_f1 = 0
    total_positive_recall = 0
    
    for i in range(0, times):
        accuracy, f1, positive_recall = test_classifier(clf)
        total_accuracy += accuracy
        total_f1 += f1
        total_positive_recall += positive_recall
    
    print("accuracy", round(total_accuracy / times, 3),
          "f1", round(total_f1 / times, 3),
          "positive_recall", round(total_positive_recall / times, 3),
          "total weighted", round((total_accuracy + total_f1 + total_positive_recall * 2) / times, 3),
          name
         )


test_classifier_avg("MultinomialNB", MultinomialNB())
test_classifier_avg("BernoulliNB", BernoulliNB())
test_classifier_avg("MultiLayerPerceptron", MLPClassifier(solver='adam', max_iter=1000))
test_classifier_avg("KNN", KNeighborsClassifier())
test_classifier_avg("SGDClassifier", SGDClassifier(max_iter=1000))
test_classifier_avg("RandomForest", RandomForestClassifier())
test_classifier_avg("DecisionTreeClassifier", DecisionTreeClassifier())
test_classifier_avg("SVC", SVC(kernel='rbf', probability=True))
# test_classifier_avg("VotingClassifier", VotingClassifier(estimators=[
#             ('MultinomialNB', MultinomialNB()),
#             ("MultiLayerPerceptron", MLPClassifier(solver='adam', max_iter=1000)),
#             ("SVC", SVC(kernel='rbf', probability=True))
#         ]))
# test_classifier_avg("VotingClassifier2", VotingClassifier(estimators=[
#             ('MultinomialNB', MultinomialNB()),
#             ("MultiLayerPerceptron", MLPClassifier(solver='adam', max_iter=1000)),
#             ("SDG", SGDClassifier(max_iter=1000))
#         ]))

accuracy 0.451 f1 0.443 positive_recall 0.988 total weighted 2.869 MultinomialNB
accuracy 0.628 f1 0.559 positive_recall 0.74 total weighted 2.667 BernoulliNB


In [391]:
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X, y)

pipeline = Pipeline([
    ('vect', CountVectorizer(ngram_range=(2,2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('sampling', RandomUnderSampler()),
    ('clf', MLPClassifier(max_iter=10000, solver="adam", activation="relu"))
])

parameters = {'clf__solver': ["lbfgs", "sgd", "adam"],
              'clf__activation': ["relu", "tanh"],
}

def my_custom_loss_func(y_test, y_pred):
    return (f1_score(y_test, y_pred, pos_label=False) + f1_score(y_test, y_pred)) / 2 + recall_score(y_test, y_pred)

scoring = make_scorer(my_custom_loss_func, greater_is_better=True)

gs_clf = GridSearchCV(pipeline, parameters, n_jobs=-1, scoring=scoring)
gs_clf = gs_clf.fit(X, y)

print("Best score", gs_clf.best_score_)

for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

Best score 0.52973168963
clf__activation: 'tanh'
clf__solver: 'lbfgs'


In [418]:
def my_custom_loss_func(y_test, y_pred):
    return (f1_score(y_test, y_pred, pos_label=False) + f1_score(y_test, y_pred)) / 2 + recall_score(y_test, y_pred)

scoring = make_scorer(my_custom_loss_func, greater_is_better=True)

pipeline = Pipeline([
    ('vect', CountVectorizer(ngram_range=(2,2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', SVC(kernel='rbf', probability=True))
])

parameters = {'clf': [
    SVC(),
    KNeighborsClassifier(),
    MultinomialNB(),
    BernoulliNB(),
#     MLPClassifier(max_iter=1000),
    SGDClassifier(max_iter=1000),
    RandomForestClassifier()
]}

gs_clf = GridSearchCV(pipeline, parameters, n_jobs=-1, cv=30, scoring="recall")
gs_clf = gs_clf.fit(X, y)

print("Best score", gs_clf.best_score_)

for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

Best score 0.144670050761
clf: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
