In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *

In [3]:
%%sh
# Root of the feature files, laid out as <split>/<source>/*.tsv.
FEATURES_PATH=./features

# For every split/source pair, glue all of its feature files side by side
# (column-wise, via paste) into one concat_<split>_<source>.tsv table.
for split in train test test2; do
    for source in fr en yt; do
        paste "$FEATURES_PATH/$split/$source"/*.tsv > "$FEATURES_PATH/concat_${split}_${source}.tsv"
    done
done

# Show the tables that were just written.
ls "$FEATURES_PATH"/*tsv

./features/concat_test2_en.tsv
./features/concat_test2_fr.tsv
./features/concat_test2_yt.tsv
./features/concat_test_en.tsv
./features/concat_test_fr.tsv
./features/concat_test_yt.tsv
./features/concat_train_en.tsv
./features/concat_train_fr.tsv
./features/concat_train_yt.tsv


In [4]:
FEATURES_PATH = "./features"

# Order matters: every training set must be processed before the test2 set of
# the same source, because test2 sets are labelled with the model fitted on
# the matching training set.
paths = ["train_yt", "train_fr", "train_en",
         "test2_en", "test2_fr", "test2_yt"]

# Feature matrices and labels, keyed by dataset name. For training sets
# y holds the `label` column; for test2 sets it holds the model predictions.
X, y = {}, {}

# Fitted pipelines, keyed by training-set name.
models = {}

# Columns (in order) each pipeline was fitted with, keyed by training-set name.
feature_columns = {}

# Candidate feature names; a dataset only uses the ones its file provides.
features = set('id proba1 proba2 proba3 partage nb_fakewords sim_tfidf sim_texte_tittre ponctuation topic_title_sim'.split())

# to_csv does not create missing directories, so make sure the target exists.
Path(FEATURES_PATH, "normalized").mkdir(parents=True, exist_ok=True)

for name in paths:
    df = pd.read_csv(f'{FEATURES_PATH}/concat_{name}.tsv', sep="\t")
    df["id"] = df["id"].astype(np.int32)
    df.set_index('id', inplace=True)
    # Missing feature values are encoded as -1.
    df = df.fillna(-1)
    df["ponctuation"] = df["ponctuation"].astype(np.int32)

    # Keep the file's own column order instead of iterating over a set:
    # set order depends on string hashing, which made the column order (and
    # the written TSV) vary between kernel sessions.
    cols = [col for col in df.columns if col in features]
    print(name, cols)
    X[name] = df[cols]
    X[name].to_csv(f"{FEATURES_PATH}/normalized/{name}.tsv", sep="\t")

    if "label" in df.columns:
        # Labelled data: fit a model for this source.
        y[name] = df.label
        m = LogisticRegression()
        # Standardize the features before the classifier.
        pipeline = make_pipeline(StandardScaler(), m)
        models[name] = pipeline.fit(X[name], y[name])
        feature_columns[name] = cols
    else:
        # Unlabelled data: predict with the model of the matching training set.
        # Select columns by the training order so each feature reaches the
        # coefficient it was fitted with; a missing feature raises KeyError
        # rather than silently shifting columns.
        train_name = name.replace("test2", "train")
        y[name] = models[train_name].predict(X[name][feature_columns[train_name]])
        
        
    

train_yt ['partage', 'sim_texte_tittre', 'nb_fakewords', 'ponctuation', 'proba2', 'proba1', 'sim_tfidf']
train_fr ['partage', 'sim_texte_tittre', 'nb_fakewords', 'ponctuation', 'topic_title_sim', 'proba2', 'proba1', 'sim_tfidf']
train_en ['proba3', 'partage', 'sim_texte_tittre', 'nb_fakewords', 'ponctuation', 'topic_title_sim', 'proba2', 'proba1', 'sim_tfidf']
test2_en ['proba3', 'partage', 'sim_texte_tittre', 'nb_fakewords', 'ponctuation', 'topic_title_sim', 'proba2', 'proba1', 'sim_tfidf']
test2_fr ['partage', 'sim_texte_tittre', 'nb_fakewords', 'ponctuation', 'topic_title_sim', 'proba2', 'proba1', 'sim_tfidf']
test2_yt ['partage', 'sim_texte_tittre', 'nb_fakewords', 'ponctuation', 'proba2', 'proba1', 'sim_tfidf']


In [5]:
def gen_predictions(predictions):
    """Yield one ``(id, prediction, source)`` tuple per test2 document.

    Args:
        predictions: mapping from dataset name to a sequence of predictions.
            Only entries whose name contains ``"test2_"`` are used.

    Yields:
        ``(int(doc_id), pred, source)`` where ``doc_id`` comes from the index
        of the module-level ``X[name]``, paired positionally with the
        predictions, and ``source`` is the dataset name with ``"test2_"``
        removed.
    """
    marker = "test2_"
    for dataset, labels in predictions.items():
        if marker not in dataset:
            continue
        source = dataset.replace(marker, "")
        doc_ids = X[dataset].index
        for doc_id, label in zip(doc_ids, labels):
            yield (int(doc_id), label, source)

# Gather the test2 predictions into an id-indexed table and save it as TSV.
results = pd.DataFrame(
    gen_predictions(y),
    columns=['id', 'type_pred', 'source'],
).set_index("id")
results.to_csv("./test2-results_full_features_concat.tsv", sep='\t')

## Évaluation

In [6]:
# Load the predictions stored in the fastText results file, indexed by id.
results_fasttext = pd.read_csv(
    "./test2-results-prob_fasttext.tsv", sep="\t"
).set_index("id")
results_fasttext.head()

Unnamed: 0_level_0,type_pred,score,source
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4274,trusted,0.87759,en
1022,fakeNews,0.5569,en
3914,fakeNews,0.884177,en
3470,fakeNews,0.565416,en
2004,fakeNews,0.914755,en


In [8]:
# Reference test2 data per source, extended with both sets of predictions.
test_data = {}
for name in ("fr", "en", "yt"):
    gold = pd.read_csv(
        f"../data/test2-full/storyzy_{name}_test2_full.tsv", sep="\t"
    ).set_index('id')
    test_data[name] = gold

    # Attach the predictions for this source by joining on the id index.
    own_preds = results[results.source == name]
    gold["y_pred"] = gold.join(own_preds)["type_pred"]
    fasttext_preds = results_fasttext[results_fasttext.source == name]
    gold["y_pred_fasttext"] = gold.join(fasttext_preds)["type_pred"]

    # Report each predictor against the reference `type` column.
    for title, column in (
        ('fasttext', 'y_pred_fasttext'),
        ('features + fasttext', 'y_pred'),
    ):
        print(name, title)
        print(classification_report(gold['type'], gold[column]))
    

fr fasttext
             precision    recall  f1-score   support

   fakeNews       0.00      0.00      0.00        94
    trusted       0.60      1.00      0.75       142

avg / total       0.36      0.60      0.45       236

fr features + fasttext
             precision    recall  f1-score   support

   fakeNews       0.35      0.40      0.37        94
    trusted       0.56      0.49      0.52       142

avg / total       0.47      0.46      0.46       236

en fasttext
             precision    recall  f1-score   support

   fakeNews       0.84      0.69      0.76       459
     satire       0.00      0.00      0.00        17
    trusted       0.83      0.93      0.88       801

avg / total       0.82      0.83      0.83      1277

en features + fasttext
             precision    recall  f1-score   support

   fakeNews       0.37      0.38      0.38       459
     satire       0.08      0.06      0.07        17
    trusted       0.63      0.63      0.63       801

avg / total       

  'precision', 'predicted', average, warn_for)
