In [2]:
import pandas as pd

# Load the records served by the API's `links/all` endpoint into a DataFrame.
df = pd.read_json("http://fake-news-detector-api.herokuapp.com/links/all")

# Preview the first ten rows to see which columns are available.
df.head(10)

Unnamed: 0,category_id,content,count,id,title,url,verified_category_id
0,2,Vitória por quatro ou mais gols de diferença.\n,1,1241,Goleada,http://www.embasamentus.com/goleada/,
1,1,COUNTRY APPLE FRITTER BREAD 1 teaspoon ground ...,1,1240,COUNTRY APPLE FRITTER BREAD 1 teaspoon ground ...,https://www.facebook.com/9recipes/photos/a.696...,
2,4,"This material may not be published, broadcast,...",1,1239,California teacher slams military members as '...,http://www.foxnews.com/us/2018/01/28/californi...,
3,5,,1,1238,(5) Facebook,https://www.facebook.com/photo.php?fbid=202868...,
4,4,"LOL! Image by Occupy Democrats, LIKE our page ...",1,1237,LOL!,https://www.facebook.com/OccupyDemocrats/photo...,
5,2,,1,1236,(5) Facebook,https://www.facebook.com/Labor411/photos/a.145...,
6,1,,1,1235,El golpe esclavista,https://t.co/TvN9YxP72s,
7,5,Obama be like:,1,1234,Obama be like:,https://www.facebook.com/888888Eight/photos/a....,
8,1,KKKKKKKKKKKKKKKKKKKK É bem isso.,1,1233,KKKKKKKKKKKKKKKKKKKK É bem isso.,https://www.facebook.com/serggom007/posts/1964...,
9,1,Outra vez eu me sirvo deste menino que faz o t...,1,1232,"De Darcy para Lula: chore hoje, lute amanhã - ...",http://www.tijolaco.com.br/blog/de-darcy-para-...,


In [3]:
from sklearn.model_selection import train_test_split

# Classifier input: title and content joined by a single space. Concatenating
# with a missing value yields NaN, and those rows are removed by the dropna below.
df["text"] = df["title"] + ' ' + df["content"]

# Binary target: True where the row's category_id equals 2.
# NOTE(review): category 2 is presumably the "fake news" label in the API schema - confirm.
df["is_fakenews"] = df["category_id"] == 2

df.dropna(subset=["text"], inplace=True)

texts = df["text"]
y = df["is_fakenews"]

# Summing a boolean column counts its True entries.
print("Number of fake news samples", int(y.sum()))

Number of fake news samples 644


In [4]:
import nltk

# Smoke test of the tokenizer + POS tagger on a short Portuguese sentence.
# NOTE(review): pos_tag is called without a language argument, so it presumably
# falls back to NLTK's default tagger - confirm the tags are meaningful for Portuguese.
text = nltk.word_tokenize(
    "e agora algo completamente estranho um pudim",
    language='portuguese',
)
nltk.pos_tag(text)

[('e', 'NN'),
 ('agora', 'NNS'),
 ('algo', 'VBP'),
 ('completamente', 'JJ'),
 ('estranho', 'FW'),
 ('um', 'JJ'),
 ('pudim', 'NN')]

In [5]:
import nltk
import unidecode

def normalize_tokenize(text):
    """Normalize a text and return its POS-tagged tokens.

    The text is transliterated with ``unidecode``, lowercased, split into
    tokens with NLTK's Portuguese tokenizer and finally passed to
    ``nltk.pos_tag``.

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    list
        Whatever ``nltk.pos_tag`` returns for the token list; the notebook
        output shows ``(token, tag)`` tuples.
    """
    ascii_lower = unidecode.unidecode(text).lower()
    tokens = nltk.word_tokenize(ascii_lower, language='portuguese')
    return nltk.pos_tag(tokens)

# Apply the normalize / tokenize / tag step to every document in `texts`.
tagged_texts = list(map(normalize_tokenize, texts))

# Peek at the first ten tagged tokens of the first document.
print("tagged_texts[0][0:10]", tagged_texts[0][:10])

tagged_texts[0][0:10] [('goleada', 'NN'), ('vitoria', 'NNS'), ('por', 'VBP'), ('quatro', 'JJ'), ('ou', 'NN'), ('mais', 'NN'), ('gols', 'FW'), ('de', 'FW'), ('diferenca', 'FW'), ('.', '.')]


In [26]:
# Portuguese stop-word list from the NLTK corpus (loaded here, not used in this cell).
stop_words = nltk.corpus.stopwords.words('portuguese')

# Flatten each tagged document into one space-separated string in which every
# token is immediately followed by its tag, e.g. "goleada NN vitoria NNS ...".
X = [
    " ".join(piece for token_and_tag in tagged_doc for piece in token_and_tag)
    for tagged_doc in tagged_texts
]

X[0]

'goleada NN vitoria NNS por VBP quatro JJ ou NN mais NN gols FW de FW diferenca FW . .'

In [25]:
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, make_scorer, recall_score
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

def test_classifier(clf_, random_state=None):
    """Fit and score one classifier on a fresh random train/test split.

    The module-level ``X`` (documents) and ``y`` (labels) are split with
    ``train_test_split``. The training part is fed through a pipeline of
    1- to 4-gram counts, TF-IDF weighting, random under-sampling and ``clf_``.

    Parameters
    ----------
    clf_ : estimator
        Classifier used as the final pipeline step.
    random_state : int, RandomState instance or None, default None
        Seed forwarded to both the train/test split and the under-sampler so
        a run can be reproduced. ``None`` keeps the original unseeded
        behaviour.

    Returns
    -------
    tuple
        ``(accuracy, avg_f1, positive_recall)`` measured on the test split.
        ``avg_f1`` is the mean of the F1 scores of the ``False`` and ``True``
        classes; ``positive_recall`` is the recall of the ``True`` class.
    """
    # NOTE(review): the split is not stratified, so the share of positive
    # samples in the test part can vary between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

    pipeline = Pipeline([
        ('vect', CountVectorizer(strip_accents='ascii', ngram_range=(1, 4), max_df=0.7, min_df=2)),
        ('tfidf', TfidfTransformer(use_idf=True)),
        ('sampling', RandomUnderSampler(random_state=random_state)),
        ('clf', clf_)
    ])

    clf = pipeline.fit(X_train, y_train)

    # Predict once and derive every metric from the same predictions instead
    # of letting clf.score() run a second prediction pass over the test set.
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    avg_f1 = (f1_score(y_test, y_pred, pos_label=False) + f1_score(y_test, y_pred)) / 2
    positive_recall = recall_score(y_test, y_pred)

    return accuracy, avg_f1, positive_recall
    
    
def test_classifier_avg(name, clf):
    times = 3
    total_accuracy = 0
    total_f1 = 0
    total_positive_recall = 0
    
    for i in range(0, times):
        accuracy, f1, positive_recall = test_classifier(clf)
        total_accuracy += accuracy
        total_f1 += f1
        total_positive_recall += positive_recall
    
    print("accuracy", round(total_accuracy / times, 3),
          "f1", round(total_f1 / times, 3),
          "positive_recall", round(total_positive_recall / times, 3),
          "total weighted", round((total_accuracy + total_f1 + total_positive_recall * 2) / times, 3),
          name
         )


# Benchmark the Multinomial Naive Bayes classifier.
test_classifier_avg(name="MultinomialNB", clf=MultinomialNB())

# Alternative classifiers, currently disabled; uncomment a call to benchmark it.
# test_classifier_avg("BernoulliNB", BernoulliNB())
# test_classifier_avg("MultiLayerPerceptron", MLPClassifier(solver='adam', max_iter=1000))
# test_classifier_avg("KNN", KNeighborsClassifier())
# test_classifier_avg("SGDClassifier", SGDClassifier(max_iter=1000))
# test_classifier_avg("RandomForest", RandomForestClassifier())
# test_classifier_avg("DecisionTreeClassifier", DecisionTreeClassifier())
# test_classifier_avg("SVC", SVC(kernel='rbf', probability=True))
# test_classifier_avg("VotingClassifier", VotingClassifier(estimators=[
#             ('MultinomialNB', MultinomialNB()),
#             ("MultiLayerPerceptron", MLPClassifier(solver='adam', max_iter=1000)),
#             ("SVC", SVC(kernel='rbf', probability=True))
#         ]))
# test_classifier_avg("VotingClassifier2", VotingClassifier(estimators=[
#             ('MultinomialNB', MultinomialNB()),
#             ("MultiLayerPerceptron", MLPClassifier(solver='adam', max_iter=1000)),
#             ("SGD", SGDClassifier(max_iter=1000))
#         ]))

accuracy 0.783 f1 0.77 positive_recall 0.929 total weighted 3.411 MultinomialNB


In [44]:
from sklearn.model_selection import GridSearchCV

# Random train/test split of the full data set.
# NOTE(review): the grid search in this cell is fitted on the full X / y, so
# these four variables are not used here - confirm whether they are still needed.
X_train, X_test, y_train, y_test = train_test_split(X, y)

# 1- to 3-gram counts -> TF-IDF -> random under-sampling -> Multinomial NB
# with fit_prior=False.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 3), min_df=5, max_df=0.5)),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('sampling', RandomUnderSampler()),
    ('clf', MultinomialNB(fit_prior=False)),
])

# Hyper-parameter grid: only the NB `alpha` is searched. Disabled options:
#   'tfidf__use_idf': [True, False],
#   'vect__ngram_range': [(1, 2), (1, 3)],
#   'vect__max_df': [0.5, 0.6, 0.7, 0.8, 0.9],
#   'vect__min_df': [2, 5, 8, 10],
#   'clf__fit_prior': [True, False],
parameters = {
    'clf__alpha': [0.7, 0.9, 1.0],
}

def my_custom_loss_func(y_test, y_pred):
    """Return the mean of both classes' F1 scores plus the positive-class recall.

    Despite the name, the result is a score rather than a loss: it is a sum of
    F1 and recall terms, so larger values are better.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    """
    f1_negative = f1_score(y_test, y_pred, pos_label=False)
    f1_positive = f1_score(y_test, y_pred)
    mean_f1 = (f1_negative + f1_positive) / 2
    return mean_f1 + recall_score(y_test, y_pred)

# Wrap the custom metric as a scorer; greater_is_better=True marks larger values as better.
scoring = make_scorer(my_custom_loss_func, greater_is_better=True)

# Grid search over `parameters`, fitted on the full X / y, with n_jobs=-1.
gs_clf = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring=scoring,
    n_jobs=-1,
)
gs_clf = gs_clf.fit(X, y)

print("Best score", gs_clf.best_score_)

# Report the winning value of every searched parameter, in alphabetical order.
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

Best score 1.649749671371195
clf__alpha: 0.7


In [418]:
def my_custom_loss_func(y_test, y_pred):
    """Return the mean of both classes' F1 scores plus the positive-class recall.

    Despite the name, the result is a score rather than a loss: it is a sum of
    F1 and recall terms, so larger values are better.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    """
    f1_negative = f1_score(y_test, y_pred, pos_label=False)
    f1_positive = f1_score(y_test, y_pred)
    mean_f1 = (f1_negative + f1_positive) / 2
    return mean_f1 + recall_score(y_test, y_pred)

# Scorer built from the custom metric.
# NOTE(review): the GridSearchCV call later in this cell passes scoring="recall",
# so this object appears to be unused here - confirm which metric is intended.
scoring = make_scorer(my_custom_loss_func, greater_is_better=True)

# Bigram counts -> TF-IDF -> classifier. The grid below supplies candidates
# for the whole 'clf' step, so the SVC given here acts as the initial value.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(2, 2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', SVC(kernel='rbf', probability=True)),
])

# Candidate estimators for the 'clf' step (MLPClassifier is left disabled).
parameters = {
    'clf': [
        SVC(),
        KNeighborsClassifier(),
        MultinomialNB(),
        BernoulliNB(),
        # MLPClassifier(max_iter=1000),
        SGDClassifier(max_iter=1000),
        RandomForestClassifier(),
    ],
}

# Grid search with cv=30 over the candidate classifiers, scored by recall.
# NOTE(review): scoring="recall" is passed here rather than the `scoring`
# object created earlier in this cell - confirm that this is intended.
gs_clf = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring="recall",
    cv=30,
    n_jobs=-1,
)
gs_clf = gs_clf.fit(X, y)

print("Best score", gs_clf.best_score_)

# Report the winning value of every searched parameter, in alphabetical order.
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

Best score 0.144670050761
clf: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
