# Plateforme Agnostique de Traitement et d'Analyse des Textes
### Carnet d'expérimentation
---

## Sujet : Prédictions Infox - Comparaison Bag of Words

---

# Observations et environnement
---

In [1]:
# Move two directories up so the relative data/ paths used below resolve.
# Explicit magic: does not depend on automagic or on `cd` being unshadowed.
# NOTE: not idempotent — re-running this cell climbs two more directories.
%cd ../..

/Users/fm/Desktop/Work/Patat


In [2]:
import importlib
import pandas as pd
import seaborn as sns

## Paramètres globaux

In [3]:
# Single seed reused by every stochastic step in this notebook
# (sampling, train/test splits, classifiers, Keras).
random_state = 42

## Données d'entrainement

In [4]:
# Labelled corpus; its 'infox' column is the target used for training.
df_texts_recueil = pd.read_csv('data/demo/221123-TextInfox.csv')

In [5]:
# Summary of the target column; with 0/1 labels the mean is the share of rows labelled 1.
df_texts_recueil['infox'].describe()

count    611.000000
mean       0.489362
std        0.500296
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        1.000000
Name: infox, dtype: float64

In [6]:
# Second corpus (url, text, infox columns); presumably AFP wire texts — confirm the source.
df_texts_afp =  pd.read_csv('data/tmp/221118-TextsAfp.csv')

In [7]:
# Preview of the second corpus (pandas truncates the displayed rows).
df_texts_afp

Unnamed: 0,url,text,infox
0,https://www.francesoir.fr/politique-monde/pres...,Présidentielle au Brésil: Bolsonaro talonne Lu...,0.0
1,https://www.francesoir.fr/politique-france/la-...,La France lance un plan de sobriété énergétiqu...,0.0
2,https://www.francesoir.fr/politique-monde/mani...,Manifestations des femmes en Iran contre le po...,0.0
3,https://www.francesoir.fr/politique-france/aff...,Affaire Quatennens: LFI sous la pression de la...,0.0
4,https://www.francesoir.fr/politique-monde/covi...,"Covid-19: pour Joe Biden, ""la pandémie est ter...",0.0
...,...,...,...
3232,https://www.francesoir.fr/lifestyle-gastronomi...,La deuxième vie des coquilles d'huîtres\nLes h...,0.0
3233,https://www.francesoir.fr/lifestyle-gastronomi...,La truffe se fait rare\nVictime du réchauffeme...,0.0
3234,https://www.francesoir.fr/lifestyle-gastronomi...,Rungis prépare un Noël opulent malgré les atte...,0.0
3235,https://www.francesoir.fr/lifestyle-gastronomi...,Gastronomie: le Franco-Suisse Benoît Violier m...,0.0


In [8]:
# Read the whitespace-separated lemmas to exclude from the vocabulary.
# The encoding is pinned to UTF-8: the list holds accented French lemmas
# ('être', 'même', ...) and the platform default encoding is not UTF-8 everywhere.
with open('data/tmp/221110-IgnoreLemmas.txt', 'r', encoding='utf-8') as file:
    ignore_words = file.read().split()

In [9]:
# Peek at the first 100 ignored lemmas, then show the size of the whole list.
print(ignore_words[:100])
print(len(ignore_words))

['de', 'la', 'le', 'les', 'et', 'des', 'en', 'un', 'du', 'est', 'une', 'que', 'être', 'avoir', 'pour', 'dans', 'qui', 'il', 'par', 'sur', 'au', 'pas', 'ce', 'plaire', 'faire', 'ne', 'qu', 'pouvoir', 'se', 'avec', 'tout', 'ou', 'été', 'son', 'mais', 'aux', 'cette', 'on', 'nous', 'comme', 'elle', 'même', 'autre', 'devoir', 'ces', 'pays', 'leur', 'si', 'sa', 'ses', 'bien', 'état', 'contre', 'ils', 'vous', 'sans', 'voir', 'je', 'après', 'deux', 'depuis', 'mettre', 'russie', 'monde', 'entre', 'grand', 'an', 'dire', 'covid', 'invitant', 'guéguen', 'patiemment', 'méditation', 'eda', 'perdurer', 'inversement', 'guedj', 'penn', 'kang', 'officieux', 'jurisprudence', 'kant', 'hectare', 'islamisation', 'hebdo', 'guichard', 'entente', 'guihard', 'pepe', 'duel', 'haïr', 'naufrage', 'ironiquement', 'olivennes', 'juvénile', 'inégal', 'patois', 'justifiant', 'echos', 'jacky']
15203


# Expérience
---

## Pipeline

### Preprocessor

In [10]:
import patat.util.text

# Reload so edits to the module are picked up without restarting the kernel.
importlib.reload(patat.util.text)
# Callable passed to the vectorizer as its `preprocessor`.
preprocessor = patat.util.text.preprocess
# Alternative: fall back to the vectorizer's default preprocessing.
#preprocessor=None

### Tokenizer

In [11]:
from patat.ml.lex_analyser import LexAnalyser

lex = LexAnalyser()

# Bound method passed to the vectorizer as its `tokenizer`; presumably maps a
# text to its list of lemmas — confirm in patat.ml.lex_analyser.
tokenizer = lex.get_lemmas_from_text

### TfIdfVectorizer

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
# Number of rows sampled from the second corpus and appended to the labelled one.
afp_size = 3000
# Other values tried:
#afp_size = 120
#afp_size = 0
# ignore_index=True gives df_ml a fresh 0..n-1 index. Without it the two source
# indexes overlap, leaving duplicated labels that no longer match the RangeIndex
# of the feature matrix built from df_ml['text'].
df_ml = pd.concat(
    [df_texts_recueil, df_texts_afp.sample(afp_size, random_state=random_state)],
    ignore_index=True,
)

In [14]:
%%time
# Build the TF-IDF document-term matrix from the training texts.
# NOTE(review): with a custom `preprocessor`, scikit-learn skips its built-in
# lowercasing step, so `lowercase=True` probably has no effect here — confirm.
vectorizer = TfidfVectorizer(
    lowercase=True,
    preprocessor=preprocessor,
    tokenizer=tokenizer,
    stop_words=ignore_words,
)
#vectorizer = CountVectorizer(lowercase=True, preprocessor=preprocessor, tokenizer=tokenizer, stop_words=ignore_words)
count_matrix = vectorizer.fit_transform(df_ml['text'])
# Densify the sparse matrix: later cells call .to_numpy() on the resulting frame.
count_array = count_matrix.toarray()
df_tf = pd.DataFrame(count_array, columns=vectorizer.get_feature_names_out())

df_X = df_tf

# Cell output: number of features (one column per vocabulary term).
len(df_X.columns)



CPU times: user 1.31 s, sys: 80.7 ms, total: 1.39 s
Wall time: 1.39 s


37863

## Entrainement sklearn

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import metrics

In [None]:
# Feature matrix and target. Rows correspond by position: df_X was built from
# df_ml['text'] in order, so row i of X goes with the i-th label of y.
X = df_X
y=df_ml['infox']

In [None]:
# Sanity check: (number of documents, number of features).
df_X.shape

### Initialisations

In [None]:
# Hold out 20% of the rows for testing.
# NOTE(review): no `stratify=` is passed, so the class ratio may differ
# between the train and test parts.
train_size = 0.8
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=train_size,
    random_state=random_state,
)

In [None]:
def print_metrics(y_train, y_test, y_pred):
    """Print the split sizes and binary-classification scores of one model.

    Labels are expected to be 0/1: class counts are derived from ``sum()`` and
    the scores rely on scikit-learn's default positive label (1).

    Args:
        y_train: Training labels (object with ``.sum()``, e.g. a pandas Series
            or NumPy array); only used to report the class counts.
        y_test: Ground-truth labels of the test set (same kind of object).
        y_pred: Predicted labels for the test set, aligned with ``y_test``.

    Returns:
        None. Everything is written to standard output.
    """
    print(f'Train size \t{len(y_train)} = {int(len(y_train)-y_train.sum())} False + {int(y_train.sum())} True' )
    print(f'Test size \t{len(y_test)} = {int(len(y_test)-y_test.sum())} False + {int(y_test.sum())} True\n' )
    print(f'Accuracy score \t{metrics.accuracy_score(y_test, y_pred)*100:.2f}%')
    print(f'Recall score \t{metrics.recall_score(y_test, y_pred)*100:.2f}%')
    print(f'F1 score \t{metrics.f1_score(y_test, y_pred)*100:.2f}%')
    # Pin the label order so the matrix is always 2x2 with row/column 0 = class 0
    # and row/column 1 = class 1. Without `labels`, an evaluation where only one
    # class occurs yields a 1x1 matrix and the indexing below raises IndexError.
    # normalize='all' turns the cells into fractions of the whole test set.
    cnf_matrix = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1], normalize='all')
    print(f'False Positive \t{cnf_matrix[0,1]*100:.2f}%')
    print(f'False Negative \t{cnf_matrix[1,0]*100:.2f}%\n')

### Dummy Classifier

In [None]:
from sklearn.dummy import DummyClassifier
# Baselines that ignore the features; the real models' scores should be read
# against these.
#strategies = ['most_frequent', 'prior', 'stratified', 'uniform']
strategies = ['stratified', 'uniform']
for strategy in strategies:
    print('---------------')
    print(f'Strategy : {strategy}\n')
    classifier = DummyClassifier(strategy=strategy, random_state=random_state)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    print_metrics(y_train, y_test, y_pred)

### Confusion Matrix
||Pred 0|Pred 1|
|---|---|---|
|**Test 0**|True Negative|False Positive|
|**Test 1**|False Negative|True Positive|

### LogisticRegression

In [None]:
# Hyper-parameters of the LogisticRegression fitted in the next cell.
max_iter=500
#C=0.40
# C is the inverse of the regularisation strength: a large value means weak regularisation.
C=100
solver = 'lbfgs'

In [None]:
%%time
# Fit logistic regression on the training split and score it on the test split.
classifier = LogisticRegression(
    C=C,
    solver=solver,
    max_iter=max_iter,
    random_state=random_state,
)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
# Estimator repr followed by one blank line, then the metrics report.
print(classifier, end='\n\n')
print_metrics(y_train, y_test, y_pred)

In [None]:
# Cross validation: 5-fold F1 score over the full feature matrix.
# NOTE(review): this cell overwrites the notebook-level max_iter / C / solver and
# uses C=1000 while the single-split cell uses C=100 — confirm which is intended.
# NOTE(review): X comes from a TF-IDF fitted on every document, so each
# validation fold has already contributed to the vocabulary and IDF weights.
max_iter=500
#C=0.40
C=1000
solver = 'lbfgs'
classifier = LogisticRegression(C=C, random_state=random_state, solver=solver, max_iter=max_iter)
scores = cross_val_score(classifier, X, y, cv=5,scoring='f1')
print(scores)
print(f'Mean Score : {scores.mean()*100:.2f}%')
print(f'Deviation Score : {scores.std()*100:.2f}%')


### MLPClassifier

In [None]:
# Hyper-parameters of the MLPClassifier fitted in the next cell.
alpha=1e-4
# Two hidden layers: 256 units, then 64.
hidden_layer_sizes=(256,64)
# NOTE: reuses the name `solver`, replacing the value set for LogisticRegression.
solver = 'adam'
# Candidate solvers; not referenced by the visible cells.
solvers = ['lbfgs', 'sgd', 'adam']


In [None]:
%%time

# Fit the multi-layer perceptron on the training split and score it on the test split.
classifier = MLPClassifier(
    hidden_layer_sizes=hidden_layer_sizes,
    solver=solver,
    alpha=alpha,
    random_state=random_state,
)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
# Estimator repr followed by one blank line, then the metrics report.
print(classifier, end='\n\n')
print_metrics(y_train, y_test, y_pred)

## Keras

In [None]:
import tensorflow as tf
# Seeds Python's `random`, NumPy and TensorFlow in one call for repeatable Keras runs.
tf.keras.utils.set_random_seed(random_state)

In [None]:
# Carve a validation subset (10%) out of the training split to monitor Keras training.
learn_size = 0.90
X_learn, X_valid, y_learn, y_valid = train_test_split(
    X_train,
    y_train,
    train_size=learn_size,
    random_state=random_state,
)

In [None]:
from tensorflow import keras
from tensorflow.keras import layers

# Feed-forward binary classifier with 256 -> 64 -> 1 units (same hidden sizes
# as the MLPClassifier configuration).
model = keras.Sequential([
    # One input per column of X.
    layers.Dense(256, activation='relu', input_shape=[X.shape[1]]),
#    layers.Dropout(0.3),
#    layers.GaussianDropout(0.3, seed=random_state),
    layers.Dense(64, activation='relu'),    
#    layers.Dropout(0.4),
    # Noise-based regularisation before the output unit; seeded for repeatability.
    layers.GaussianDropout(0.5, seed=random_state),
    # Single sigmoid unit: output in (0, 1), thresholded later into a 0/1 label.
    layers.Dense(1, activation='sigmoid'),
])

In [None]:
# Binary cross-entropy pairs with the single sigmoid output; binary accuracy
# is tracked at every epoch.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['binary_accuracy'],
)

In [None]:
%%time
# Number of training epochs.
epoch = 15

# NOTE(review): this callback is built but never used — the `callbacks=` line in
# fit() below is commented out, so training always runs all `epoch` epochs.
early_stopping = keras.callbacks.EarlyStopping(
    patience=10,
    min_delta=0.0001,
    restore_best_weights=True,
)

# Train on the "learn" subset and evaluate on the validation subset after each epoch.
history = model.fit(
#    X_train.to_numpy(), y_train.to_numpy(),
#    validation_split=0.2,
    X_learn.to_numpy(), y_learn.to_numpy(),
    validation_data=(X_valid.to_numpy(), y_valid.to_numpy()),
    batch_size=24,
    epochs=epoch,
#    callbacks=[early_stopping],
    shuffle=True,
)

In [None]:
# One row per epoch; plot training against validation curves to spot over-fitting.
history_df = pd.DataFrame(history.history)
history_df[['loss', 'val_loss']].plot(title="Cross-entropy")
history_df[['binary_accuracy', 'val_binary_accuracy']].plot(title="Accuracy")

In [None]:
# Per-epoch training and validation metrics as a table.
history_df

In [None]:
# model.predict returns one sigmoid output per row with shape (n, 1): round it
# to a hard 0/1 label and flatten to 1-D, like the scikit-learn predictions.
y_pred = model.predict(X_test.to_numpy()).round().ravel()
# print(model) only shows the object repr; summary() lists the layers and
# their parameter counts.
model.summary()
print()
# The network was fitted on the "learn" subset (validation rows excluded), so
# report that subset as the training set rather than the full y_train.
print_metrics(y_learn, y_test, y_pred)

# Sauvegarde des résultats
---

# Conclusions
---

- 

# Bricolages
---

In [None]:
# Count the GPUs TensorFlow can see, using the non-experimental API like the
# other device-listing cells of this section.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

In [None]:
# Every physical device visible to TensorFlow (no device-type filter).
tf.config.list_physical_devices()

In [None]:
# NOTE(review): tf.device() only returns a context manager; outside a `with`
# block this line does not change where later operations are placed.
tf.device("/CPU:0")

In [None]:
# Log the device on which each operation is placed.
tf.debugging.set_log_device_placement(True)

# Create the two input tensors explicitly on the first GPU
with tf.device('/GPU:0'):
  a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
  b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])

# The matmul is issued outside the device scope, so TensorFlow picks its placement
c = tf.matmul(a, b)
print(c)

In [None]:
# NOTE: repeats the call already made in an earlier cell of this section.
tf.debugging.set_log_device_placement(True)

In [None]:
# GPUs visible to TensorFlow.
tf.config.list_physical_devices('GPU')