# Plateforme Agnostique de Traitement et d'Analyse des Textes
### Paillasse d'expérimentation
---

## Sujet : Prédiction BoW/RL pour tous les labels

---


## Initialisation de la paillasse 
---

In [3]:
cd ../..

/Users/fm/Desktop/Work/Patat


In [4]:
import importlib
import pandas as pd

## Observations et environnement
---

### Chargement du corpus

In [5]:
corpus_filename = 'corpus/221003-CorpusApp.csv'

In [7]:
dtypes = {'date_iso' : str}
df_corpus = pd.read_csv(f'data/{corpus_filename}', dtype = dtypes)
df_corpus = df_corpus.convert_dtypes()

In [8]:
df_corpus[df_corpus['url'].isna()]

Unnamed: 0,url,title,article,author,date_iso,site,infox,y_proba


In [9]:
df_corpus[df_corpus['infox'].notna()]

Unnamed: 0,url,title,article,author,date_iso,site,infox,y_proba
0,https://reseauinternational.net/tous-les-jeune...,"Tous les jeunes, portez la nouvelle (russe)",par Pepe Escobar. L’OCS à Samarcande et l’Asse...,,2022-09-30T00:00:00,reseauinternational.net,0,0.002944
1,https://reseauinternational.net/adhesion-a-la-...,Adhésion à la Russie : 93% pour le « oui » dan...,"Dans les régions de Zaporijia et de Kherson, 9...",,2022-09-30T00:00:00,reseauinternational.net,0,0.01263
2,https://lemediaen442.fr/onu-le-premier-ministr...,ONU – Le Premier ministre de Nouvelle-Zélande ...,L’argument principal de la ministre est que le...,,2022-09-29T00:00:00,lemediaen442.fr,1,0.963768
3,https://www.francesoir.fr/societe-environnemen...,Compostage humain: les “funérailles vertes” ga...,"Aux États-Unis, les différents gouvernements r...",Auteur(s) FranceSoir,2022-09-28T13:15:00,www.francesoir.fr,0,0.007914
18,https://www.dreuz.info/2022/09/qui-est-elle-an...,"Qui est-elle ? Anti-UE, pro-Otan, pro-Ukraine,...",Les journalistes ont le cerveau bloqué sur 193...,Publié par Jean-Patrick Grumberg le 27 septem...,2022-09-27T00:00:00,www.dreuz.info,0,0.002912
...,...,...,...,...,...,...,...,...
87372,https://www.profession-gendarme.com/leffet-kis...,L’effet Kiss Coll de la guerre,Par WD Nous sommes imbriqués dans l’Otan. Ce...,,,www.profession-gendarme.com,1,0.986896
87438,https://www.profession-gendarme.com/les-labora...,Les laboratoires du Pentagone et la dépopulation,par Dragomir Bojkov. La vice-présidente de la ...,,,www.profession-gendarme.com,1,0.692176
87510,https://www.profession-gendarme.com/lettre-ouv...,« LETTRE OUVERTE A LA FRANCE ET AUX NATIONS »,L’un de nos lecteurs (ou l’une de nos lectrice...,,,www.profession-gendarme.com,1,0.99413
88737,https://www.profession-gendarme.com/tour-de-fr...,Tour de France – Abandon de Victor Lafay :« On...,Un mal étrange traverse le peloton du tour de ...,,,www.profession-gendarme.com,1,0.133346


## Expérience
---

### Création des bags of words

#### Mots non signifiants

In [22]:
import patat.ml.lex_analyser

lex = patat.ml.lex_analyser.LexAnalyser()

def merge_texts(texts):
    """Concatenate texts into one string, terminating each with a newline.

    Args:
        texts: Iterable of strings to merge.

    Returns:
        A single string made of every input text followed by a newline
        character, in iteration order. An empty iterable yields ''.
    """
    # str.join assembles the result in one pass; the previous repeated
    # `result = result + ...` re-copied the accumulated string on each step.
    return ''.join(text + '\n' for text in texts)

def count_words_label(df, label):
    """Collect per-word token counts for each value taken by a label column.

    Rows whose ``label`` value is missing are ignored. The 'text' column of
    the remaining rows is merged per label value with ``merge_texts``, then
    each merged text is passed through ``lex.get_words`` and
    ``lex.count_tokens``.

    Args:
        df: DataFrame holding a 'text' column and the ``label`` column.
        label: Name of the label column to split the texts on.

    Returns:
        A dict mapping each word to a dict of
        ``'<label>_<label value>' -> count`` entries, where the count is the
        value ``lex.count_tokens`` associates with that word (presumably an
        occurrence count; confirm against LexAnalyser).
    """
    labelled = df[df[label].notna()]
    # One column per label value, a single 'text' row holding the merged text.
    merged = pd.pivot_table(labelled, values='text', index=None, columns=label, aggfunc=merge_texts)
    word_analysis = {}
    for value in merged.keys():
        counts = lex.count_tokens(lex.get_words(merged[value]['text']))
        column_name = label + '_' + str(value)
        for word in counts:
            # Create the per-word entry on first sight, then record the count.
            word_analysis.setdefault(word, {})[column_name] = counts[word]
    return word_analysis

In [23]:
import patat.imp.labels

In [24]:
label_names = patat.imp.labels.labels

In [25]:
count_words_label(df_corpus,'infox')

{}

In [26]:
pd.pivot_table(df_corpus, values='text', index=None, columns='signe', aggfunc=merge_texts)

signe
text


In [9]:
df_count = pd.DataFrame(count_words_label(df_corpus,'infox')).T
df_count = df_count.fillna(0)

In [10]:
df_count.shape

(0, 0)

In [11]:
df_count

#### Identification des mots rares

In [None]:
# Threshold used by is_rare: counts strictly below it are treated as rare.
occ_rare = 3
def is_rare(row):
    """Tell whether a word-count row is below ``occ_rare`` in both classes.

    Args:
        row: Mapping-like object indexed by 'infox_0.0' and 'infox_1.0'.

    Returns:
        The result of ``row['infox_0.0'] < occ_rare`` when it is falsy,
        otherwise the result of ``row['infox_1.0'] < occ_rare``.
    """
    below_in_0 = row['infox_0.0'] < occ_rare
    if not below_in_0:
        # Short-circuit: the second column is not read in this case.
        return below_in_0
    return row['infox_1.0'] < occ_rare

df_rare = df_count[df_count.apply(is_rare,axis=1)]

rare_words = list(df_rare.index)
len(rare_words)

#### Identification des mots communs

# Number of top-ranked words taken from each class.
common_size = 100
top_0 = df_count.sort_values('infox_0.0', ascending=False).head(common_size).index
top_1 = df_count.sort_values('infox_1.0', ascending=False).head(common_size).index
# Words present in both top lists, kept in the order of top_0.
common_words = [word for word in top_0 if word in top_1]
len(common_words)

common_words

# Words excluded from the vocabulary: the common ones plus the rare ones.
ignore_words = common_words + rare_words

In [None]:
common_words

#### Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
count_vectorizer = CountVectorizer(stop_words=ignore_words)
#count_vectorizer = CountVectorizer()

In [None]:
corpus = df_corpus['text']
X = count_vectorizer.fit_transform(corpus)
y = df_corpus['infox']

### Construction des Datasets d'entraînement et de test

In [None]:
# Training set: only the rows of df_corpus that carry an 'infox' label.
df_train = df_corpus[df_corpus['infox'].notna()]
# NOTE(review): rows of X are picked using df_train's index labels; this only
# matches row positions if df_corpus has a default 0..n-1 index — confirm.
X_train = X[df_train.index]
y_train = df_train['infox']

In [None]:
X_train.shape

### Entraînement du modèle

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

#lr = LogisticRegression(C=100.0, random_state=1, solver='lbfgs', multi_class='ovr', max_iter=1000)
classifier = LogisticRegression(max_iter=1000)

# Fit the model
classifier.fit(X_train, y_train)


### Prédictions sur le corpus

In [None]:
df_pred = df_corpus

In [None]:
# NOTE(review): `predictor` is not assigned in any cell visible in this
# notebook; presumably a dict holding a fitted vectorizer under 'descriptor'
# and a model under 'classifier' — confirm where it is loaded or built.
vectorizer = predictor['descriptor']
# Rebinds `classifier`, so any model previously bound to that name is replaced.
classifier = predictor['classifier']

In [None]:
%%time
X = vectorizer.transform(df_pred['text'])

In [None]:
%%time
y = classifier.predict(X)
y_proba = classifier.predict_proba(X)[:,1]




In [None]:
df_pred.loc[:,'y'] = y
df_pred.loc[:,'y_proba'] = y_proba

In [None]:
df_pred

### Mesure et observation des résultats

In [None]:
import numpy as np

In [None]:
# Import seaborn
import seaborn as sns

# Apply the default theme
sns.set_theme()

In [None]:
sns.displot(df_pred["y_proba"], binwidth=0.01)

In [None]:
df_recueil = df_corpus[df_corpus['infox'].notna()]
y_test = df_recueil['infox']
y_pred = df_recueil['y']

In [None]:
from sklearn import metrics
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
print('Matrice de confusion')
print(cnf_matrix)
print(f'Accuracy score {metrics.accuracy_score(y_test, y_pred)*100:.2f}%')
print(f'Recall score {metrics.recall_score(y_test, y_pred)*100:.2f}%')

In [None]:
def size(y):
    """Bucket a value into 'S', 'M' or 'L'.

    Args:
        y: Value to classify.

    Returns:
        'S' when ``y <= 0.25``, 'L' when ``y >= 0.75``, 'M' otherwise.
    """
    if y <= 0.25:
        return 'S'
    if y >= 0.75:
        return 'L'
    return 'M'

In [None]:
df_pred['y_bin']=df_pred['y_proba'].apply(size)

In [None]:
df_pivot_sites = pd.pivot_table(df_pred[['site','y_bin']],index='site',columns='y_bin',aggfunc=np.count_nonzero)

In [None]:
df_pivot_sites.fillna(0,inplace=True)

In [None]:
df_pivot_sites['total']=df_pivot_sites['S']+df_pivot_sites['M']+df_pivot_sites['L']

In [None]:
df_pivot_sites['S_freq']=df_pivot_sites['S']/df_pivot_sites['total']
df_pivot_sites['L_freq']=df_pivot_sites['L']/df_pivot_sites['total']

In [None]:
df_pivot_sites

## Sauvegarde des résultats
---

In [None]:
predict_filename = '221002-CorpusNewsLabelPred.csv'
df_pred.to_csv(f'data/corpus/{predict_filename}',index=False)

## Conclusions
---

- Production du corpus labellisé avec prédictions : OK

## Bricolages
---

In [None]:
df_pred