# Patat : Plateforme Agnostique de Traitement et d'Analyse des Textes

### Notebook 4 : Prédictions !
Sur le Corpus

En utilisant les prédicteurs précédemment entraînés

### Synopsis

- Prédiction avec SpacyLogreg
- Prédiction avec SpacyGnb
- Comparaison sur le label Infox

### To do
- Regarder le score Kappa de Cohen (`sklearn.metrics.cohen_kappa_score`) dans sklearn pour mesurer l'accord entre les deux prédicteurs

## 0 - Préliminaires

In [1]:
cd ../..

/Users/fm/Desktop/Work/Patat


In [2]:
import importlib
import pandas as pd

## 1 - Chargement du corpus

In [3]:
df_corpus = pd.read_csv('data/corpus/220531-SpacyCorpus.csv')

## 2 - Prédiction avec SpacyLogreg

In [4]:
import patat.ml.spacy_analyser
analyser = patat.ml.spacy_analyser.SpacyAnalyser()

In [5]:
import patat.ml.logreg_predictor
predictor = patat.ml.logreg_predictor.LogregPredictor()

In [6]:
predictor.load('data/predictor/220601-SpacyLogreg.pp')

In [7]:
X_cols = analyser.feature_col_names()

In [8]:
df_pred = predictor.predict(df_corpus,X_cols)

In [9]:
df_corpus_logreg = pd.concat([df_corpus.reset_index(), df_pred], axis=1)

In [10]:
 df_corpus_logreg = df_corpus_logreg.rename(columns={'infox': 'infox_logreg'})

In [11]:
df_corpus_logreg[df_corpus_logreg['infox_logreg']==1]['url']

0      https://www.lelibrepenseur.org/les-rassembleme...
3      https://www.francesoir.fr/culture-celebrites/e...
8      https://www.dreuz.info/2021/12/la-reconnaissan...
12     https://www.francesoir.fr/opinions-societe/san...
14     https:/lesmoutonsrebelles.com/un-homme-qui-a-f...
                             ...                        
781    https://www.francesoir.fr/culture-celebrites/a...
787    https://www.francesoir.fr/politique-monde/rein...
793    https:/www.wikistrike.com/2021/12/angleterre-c...
797    https://www.dreuz.info/2022/01/linattendu-rebo...
799    https://www.lelibrepenseur.org/algerie-manifes...
Name: url, Length: 205, dtype: object

## 3 - Prédiction avec SpacyGnb

In [12]:
import patat.ml.gnb_predictor
predictor = patat.ml.gnb_predictor.GnbPredictor()

In [13]:
predictor.load('data/predictor/220601-SpacyGnb.pp')

In [14]:
X_cols = analyser.feature_col_names()

In [15]:
df_pred = predictor.predict(df_corpus,X_cols)

In [16]:
df_corpus_gnb = pd.concat([df_corpus.reset_index(), df_pred], axis=1)

In [19]:
 df_corpus_gnb = df_corpus_gnb.rename(columns={'infox': 'infox_gnb'})

In [20]:
df_corpus_gnb[df_corpus_gnb['infox_gnb']==1]['url'].shape

(22,)

## 4 - Comparaison des prédictions

In [21]:
df_test = pd.concat([df_corpus_logreg,df_corpus_gnb['infox_gnb']],axis=1)

In [22]:
def pred_status(p1,p2):
    """Classify a pair of predictions by whether they agree.

    Returns 'desaccord' when the two values differ, 'positif' when they
    are equal and the shared value is 1, and 'negatif' when they are
    equal with any other value.
    """
    # Guard clause: any mismatch is a disagreement, whatever the values are.
    if p1 != p2:
        return 'desaccord'
    return 'positif' if p1 == 1 else 'negatif'

In [23]:
# Label every row by passing its logreg and GNB predictions to pred_status
# (row-wise apply, axis=1); the result is stored in a new 'status' column.
df_test['status'] = df_test.apply(lambda row: pred_status(row['infox_logreg'], row['infox_gnb']), axis=1)

In [24]:
# Count rows per agreement status by summing boolean masks on the
# 'status' column, then print the three totals on one line.
nb_positif = int((df_test['status'] == 'positif').sum())
nb_negatif = int((df_test['status'] == 'negatif').sum())
nb_desaccord = int((df_test['status'] == 'desaccord').sum())
print(f'Positif : {nb_positif} Negatif : {nb_negatif} Desaccord : {nb_desaccord}')

Positif : 16 Negatif : 589 Desaccord : 195


# Prédictions avec LexAnalyser

In [None]:
df_corpus = pd.read_csv('data/corpus/220531-LexCorpus.csv')

In [None]:
import patat.ml.lex_analyser
lex = patat.ml.lex_analyser.LexAnalyser()

In [None]:
logreg = patat.ml.logreg_predictor.LogregPredictor()
gnb = patat.ml.gnb_predictor.GnbPredictor()

In [None]:
logreg.load('data/predictor/220531-LexLogreg.pp')
gnb.load('data/predictor/220531-LexGnb.pp')

In [None]:
X_cols = lex.feature_col_names()

In [None]:
X_cols

In [None]:
# Run both predictors on the corpus using the LexAnalyser feature columns,
# and rename each prediction column so the two outputs can be told apart.
# NOTE(review): the SpaCy section of this notebook renames 'infox' (lowercase)
# whereas this cell renames 'Infox'. DataFrame.rename ignores labels that do
# not exist by default, so a wrong key would silently leave the column
# unrenamed - confirm the column name produced by predict().
df_logreg_pred = logreg.predict(df_corpus,X_cols)
df_logreg_pred=df_logreg_pred.rename(columns={'Infox': 'infox_logreg'})
df_gnb_pred = gnb.predict(df_corpus,X_cols)
df_gnb_pred = df_gnb_pred.rename(columns={'Infox': 'infox_gnb'})


In [None]:
# Append the logreg predictions, then the GNB predictions, to the corpus
# column-wise (axis=1).
# reset_index() is called without drop=True, so the previous index is kept
# as an extra 'index' column in the result.
df_corpus_pred = pd.concat([df_corpus.reset_index(), df_logreg_pred], axis=1)
df_corpus_pred = pd.concat([df_corpus_pred, df_gnb_pred], axis=1)


In [None]:
def pred_status(p1,p2):
    """Classify a pair of predictions by whether they agree.

    Returns 'desaccord' when the two values differ, 'positif' when they
    are equal and the shared value is 1, and 'negatif' when they are
    equal with any other value.
    """
    # Guard clause: any mismatch is a disagreement, whatever the values are.
    if p1 != p2:
        return 'desaccord'
    return 'positif' if p1 == 1 else 'negatif'

In [None]:
# Label every row by passing its logreg and GNB predictions to pred_status
# (row-wise apply, axis=1); requires the 'infox_logreg' and 'infox_gnb'
# columns to be present in df_corpus_pred.
df_corpus_pred['status'] = df_corpus_pred.apply(lambda row: pred_status(row['infox_logreg'], row['infox_gnb']), axis=1)

In [None]:
# Count rows per agreement status by summing boolean masks on the
# 'status' column, then print the three totals on one line.
nb_positif = int((df_corpus_pred['status'] == 'positif').sum())
nb_negatif = int((df_corpus_pred['status'] == 'negatif').sum())
nb_desaccord = int((df_corpus_pred['status'] == 'desaccord').sum())
print(f'Positif : {nb_positif} Negatif : {nb_negatif} Desaccord : {nb_desaccord}')

In [None]:
df_gnb_pred[df_gnb_pred['infox_gnb']==1]

In [None]:
df_logreg_pred[df_logreg_pred['infox_logreg']==1]

In [None]:
df_test

In [None]:
df_pred

In [None]:
df_corpus

In [None]:
cd Work/Patat/


In [None]:
ls data/corpus

In [None]:
# Keep only the article metadata and the prediction/status columns for export.
# NOTE(review): df_test is the comparison frame built from the SpaCy-based
# predictions earlier in this notebook, not df_corpus_pred from the
# LexAnalyser section - confirm this is the intended source.
df_save = df_test[['url','title','author','date_published','afp','infox_logreg','infox_gnb','status']]

In [None]:
df_save.to_csv('data/corpus/220601-FrancesoirPredictions.csv')

In [None]:
df_save

In [None]:
df_spacy=pd.read_csv('data/corpus/220530-SpacyLabels.csv')

In [None]:
df_spacy