# Plateforme Agnostique de Traitement et d'Analyse des Textes
### Carnet d'expérimentation
---

## Sujet : TensorFlow

---

# Observations et environnement
---

In [1]:
cd ../..

/Users/fm/Desktop/Work/Patat


In [2]:
import importlib
import pandas as pd
import seaborn as sns

## Labels et Articles

In [3]:
# Load the labelled articles from the CSV export into a DataFrame.
filename = 'data/tmp/221110-TextsInfox.csv'
df_text = pd.read_csv(filepath_or_buffer=filename)

In [4]:
# (rows, columns) of the loaded frame.
df_text.shape

(455, 3)

In [5]:
# Preview the first rows of the loaded frame.
df_text.head()

Unnamed: 0,url,text,infox
0,https://www.breizh-info.com/2022/09/30/208531/...,Réunification. Les six « insoumis » de Bretagn...,0.0
1,https://www.breizh-info.com/2022/10/02/208574/...,"Stonehenge, patrimoine africain ? Ben voyons…\...",0.0
2,https://www.dreuz.info/2022/10/elections-de-mi...,Elections américaines de mi-mandat : suivi quo...,0.0
3,https://www.dreuz.info/2022/10/lukraine-est-en...,L’Ukraine est en train de gagner la guerre\nOn...,0.0
4,https://www.dreuz.info/2022/10/la-fda-va-publi...,La FDA va publier une étude sur les effets ind...,0.0


# Expérience
---

## Train Val Test Dataframes

In [91]:
# Split fractions: share of all rows sampled for training, then share of those
# training rows set aside for validation.
train_percent = .8
valid_percent = .2

# Rows not sampled for training form the test set.
n_train = int(len(df_text) * train_percent)
df_train = df_text.sample(n=n_train, random_state=42).copy()
df_test = df_text.drop(index=df_train.index).copy()

# Carve the validation rows out of the training sample and remove them from it.
n_valid = int(len(df_train) * valid_percent)
df_valid = df_train.sample(n=n_valid, random_state=42).copy()
df_train = df_train.drop(index=df_valid.index)

## Datasets

In [92]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
import string
import re

In [93]:
def make_ts_dataset(df_text):
    """Build an unbatched tf.data.Dataset of (text, infox) pairs.

    Args:
        df_text: frame providing a 'text' column and an 'infox' column.

    Returns:
        A dataset yielding one (text, infox) element per row of `df_text`.
    """
    texts = df_text['text'].values
    labels = df_text['infox'].values
    return tf.data.Dataset.from_tensor_slices((texts, labels))

In [94]:
# Build the dataset explicitly: `ds_raw` only exists as a local variable inside
# make_ts_dataset, so referencing it directly fails on a fresh kernel.
ds_raw = make_ts_dataset(df_text)
ds_raw_batch = ds_raw.batch(batch_size=64)

In [95]:
# Number of batches in ds_raw_batch.
ds_raw_batch.cardinality()

<tf.Tensor: shape=(), dtype=int64, numpy=8>

In [97]:
batch_size = 32

# One batched dataset per split; each element is a batch of (text, infox) pairs.
raw_train_ds = make_ts_dataset(df_train).batch(batch_size)
raw_val_ds = make_ts_dataset(df_valid).batch(batch_size)
raw_test_ds = make_ts_dataset(df_test).batch(batch_size)

# Report how many batches each split produces.
print(f"Number of batches in raw_train_ds: {raw_train_ds.cardinality()}")
print(f"Number of batches in raw_val_ds: {raw_val_ds.cardinality()}")
print(f"Number of batches in raw_test_ds: {raw_test_ds.cardinality()}")

Number of batches in raw_train_ds: 10
Number of batches in raw_val_ds: 3
Number of batches in raw_test_ds: 3


In [98]:


# The default TextVectorization standardizer is replaced by a custom one so
# that lowercasing is UTF-8 aware and '<br />' tags are turned into spaces.
def custom_standardization(input_data):
    """Lowercase the text, replace '<br />' with a space, strip ASCII punctuation.

    Args:
        input_data: string tensor of raw texts.

    Returns:
        A string tensor with the standardized texts.
    """
    # encoding="utf-8" is required to lowercase non-ASCII letters (e.g. 'É');
    # the default encoding only lowercases ASCII characters.
    lowercase = tf.strings.lower(input_data, encoding="utf-8")
    stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
    # string.punctuation is ASCII-only: typographic marks such as « » ’ – are
    # left in place by this pattern.
    return tf.strings.regex_replace(
        stripped_html, f"[{re.escape(string.punctuation)}]", ""
    )


# Model constants.
max_features = 50000
embedding_dim = 128
sequence_length = 500

# Instantiate the text vectorization layer. It normalizes, splits, and maps
# strings to integers, so 'output_mode' is set to 'int'.
# The default split function is used together with the custom standardization
# defined above.
# An explicit maximum sequence length is set, since the CNNs later in the
# model won't support ragged sequences.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

# Build the vocabulary by calling `adapt` on a text-only dataset.
# Only the training split is used: adapting on the full frame would let
# validation/test vocabulary leak into the vectorizer.
text_ds = raw_train_ds.map(lambda text, label: text)
# Let's call `adapt`:
vectorize_layer.adapt(text_ds)

2022-11-12 16:37:15.752904: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


In [99]:
# String-input head: raw text -> token ids -> embeddings.
# NOTE(review): `x` is reassigned by the next cell and `text_input` is not
# referenced again in the visible notebook — confirm whether this cell is still needed.
text_input = tf.keras.Input(name='text', dtype=tf.string, shape=(1,))
x = vectorize_layer(text_input)
x = layers.Embedding(input_dim=max_features + 1, output_dim=embedding_dim)(x)

In [100]:
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and
# dropout are repeatable across runs (some GPU kernels may remain
# nondeterministic).
tf.keras.utils.set_random_seed(42)

# An integer input for vocab indices.
inputs = tf.keras.Input(shape=(None,), dtype="int64")

# Next, we add a layer to map those vocab indices into a space of dimensionality
# 'embedding_dim'.
x = layers.Embedding(max_features, embedding_dim)(inputs)
x = layers.Dropout(0.5)(x)

# Conv1D + global max pooling
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.GlobalMaxPooling1D()(x)

# We add a vanilla hidden layer:
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)

# We project onto a single unit output layer, and squash it with a sigmoid:
predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)

model = tf.keras.Model(inputs, predictions)

# Compile the model with binary crossentropy loss and an adam optimizer.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [101]:
def vectorize_text(text, label):
    """Map a batch of raw strings to token ids; the label is passed through."""
    # The vectorization layer is applied to a trailing axis of size 1.
    return vectorize_layer(tf.expand_dims(text, axis=-1)), label


def _vectorized(raw_ds):
    """Vectorize a raw dataset, then cache and prefetch it."""
    return raw_ds.map(vectorize_text).cache().prefetch(buffer_size=10)


# Vectorized, cached and prefetched datasets for each split.
train_ds = _vectorized(raw_train_ds)
val_ds = _vectorized(raw_val_ds)
test_ds = _vectorized(raw_test_ds)

In [102]:
# Display the vectorized training dataset (shows its element spec).
train_ds

<PrefetchDataset element_spec=(TensorSpec(shape=(None, 500), dtype=tf.int64, name=None), TensorSpec(shape=(None,), dtype=tf.float64, name=None))>

In [103]:
epochs = 3

# Train on train_ds and evaluate on val_ds at the end of each epoch.
model.fit(train_ds, epochs=epochs, validation_data=val_ds)

Epoch 1/3


2022-11-12 16:37:18.735301: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/3
 1/10 [==>...........................] - ETA: 0s - loss: 0.6659 - accuracy: 0.7500

2022-11-12 16:37:19.573916: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 3/3


<keras.callbacks.History at 0x2f6062d30>

In [90]:
# Loss and accuracy on the held-out test split.
# NOTE(review): this cell's execution count (90) is lower than the training
# cell's (103), so any saved output predates the latest fit — re-run after training.
model.evaluate(test_ds)



[0.6955097317695618, 0.4780219793319702]

# Mesure et observation des résultats
---

# Sauvegarde des résultats
---

# Conclusions
---

- ...

# Bricolages
---

In [25]:
# Scratch: (text, label) dataset over the full frame, labels cast to int32.
dataset = tf.data.Dataset.from_tensor_slices(
    (df_text['text'].values, tf.cast(df_text['infox'].values, tf.int32))
)

In [26]:
# Peek at a few elements only: materializing the whole dataset dumps every
# article's full text into the cell output.
list(dataset.take(3).as_numpy_iterator())

[(b'R\xc3\xa9unification. Les six \xc2\xab\xc2\xa0insoumis\xc2\xa0\xc2\xbb de Bretagne\nDans l\xe2\x80\x99op\xc3\xa9ration que pr\xc3\xa9pare Bretagne r\xc3\xa9unie, il sera int\xc3\xa9ressant de suivre le comportement des 37 d\xc3\xa9put\xc3\xa9s et des 19 s\xc3\xa9nateurs de Bretagne. A coup s\xc3\xbbr, il faudra les assi\xc3\xa9ger et m\xc3\xa9diatiser leur attitude pour les contraindre \xc3\xa0 soutenir l\xe2\x80\x99action de l\xe2\x80\x99association. Un traitement particulier devra \xc3\xaatre r\xc3\xa9serv\xc3\xa9 aux six LFI : Murielle Lepvraud (Guingamp), Fr\xc3\xa9d\xc3\xa9ric Mathieu (Rennes Bruz), Mathilde Hignet (Redon), Andy Kerbrat (Nantes centre), S\xc3\xa9gol\xc3\xa8ne Amiot (Nantes-Saint-Herblain), Mathias Tavel (Saint-Nazaire). Leur patron et leur parti \xc3\xa9tant farouchement jacobins \xe2\x80\x93 donc hostiles dans les faits \xc3\xa0 la cause bretonne -, la logique veut qu\xe2\x80\x99ils adoptent une attitude d\xc3\xa9favorable au projet de Bretagne r\xc3\xa9unie.