In [8]:
# !pip install --upgrade tensorflow --user
# !pip install keras --user

In [255]:
import io, json, talos, itertools
from collections import Counter

from sklearn.preprocessing import MultiLabelBinarizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, GlobalMaxPool1D, Dropout
from keras.layers import Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Dropout, Conv1D
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint

In [None]:
import numpy as np

In [12]:
with io.open("data/train.txt", "r", encoding="utf-8") as f:
    data = f.read().splitlines()

In [230]:
with open("./data/data_train.json", 'r') as f:
    data = json.load(f)

In [231]:
MAX_WORDS = 5000
EMBEDDING_SIZE = 20
MAX_LENGTH = 200

In [97]:
import unidecode

In [234]:
text = list(itertools.chain.from_iterable([[contrib["answer"],contrib["question"] ] for contrib in data]))

In [237]:
tokenizer = Tokenizer(num_words=MAX_WORDS, lower=True, )
tokenizer.fit_on_texts(text)

In [238]:
def sequences_from_list_of_text(text_list):
    sequences = tokenizer.texts_to_sequences([unidecode.unidecode(text) for text in text_list])
    return pad_sequences(sequences, maxlen=MAX_LENGTH)

In [242]:
data_questions = sequences_from_list_of_text([contrib["question"] for contrib in data])
data_answers = sequences_from_list_of_text([contrib["answer"] for contrib in data])

In [243]:
data_tags = [contrib["target"] for contrib in data]

In [247]:

all_tags = Counter(itertools.chain.from_iterable(data_tags))

In [248]:
sup_100_tags = [key for key, value in dict(all_tags).items() if value >100]

In [299]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit([sup_100_tags])
classes = multilabel_binarizer.classes_
y = multilabel_binarizer.transform(data_tags)

In [303]:
x_question_train, x_question_test, \
x_answer_train, x_answer_test,\
y_train, y_test = train_test_split(data_questions, data_answers, y, test_size=0.2, random_state=42)


In [None]:
model = Sequential()
model.add(Embedding(MAX_WORDS, EMBEDDING_SIZE, input_length=MAX_LENGTH))
model.add(Dropout(0.15))
model.add(GlobalMaxPool1D())
model.add(Dense(len(classes), activation='sigmoid'))

model.compile(optimizer=Adam(0.015), loss='binary_crossentropy', metrics=['categorical_accuracy'])
callbacks = [
    ReduceLROnPlateau(),
    EarlyStopping(patience=4),
    ModelCheckpoint(filepath='model-simple.h5', save_best_only=True)
]

history = model.fit(x_answer_train, y_train,
                    epochs=20,
                    batch_size=32,
                    validation_split=0.1,
                    callbacks=callbacks)

Train on 90498 samples, validate on 10056 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20

In [None]:
simple_model = load_model('model-simple.h5')
metrics = simple_model.evaluate(x_answer_test, y_test)
print("{}: {}".format(simple_model.metrics_names[0], metrics[0]))
print("{}: {}".format(simple_model.metrics_names[1], metrics[1]))

In [None]:
filter_length = 300

model = Sequential()
model.add(Embedding(MAX_WORDS, EMBEDDING_SIZE, input_length=MAX_LENGTH))
model.add(Dropout(0.1))
model.add(Conv1D(filter_length, 3, padding='valid', activation='relu', strides=1))
model.add(GlobalMaxPool1D())
model.add(Dense(len(classes)))
model.add(Activation('sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['categorical_accuracy'])
model.summary()

callbacks = [
    ReduceLROnPlateau(),
    EarlyStopping(patience=4),
    ModelCheckpoint(filepath='model-conv1d.h5', save_best_only=True)
]

history = model.fit(x_answer_train, y_train,
                    epochs=20,
                    batch_size=32,
                    validation_split=0.1,
                    callbacks=callbacks)

In [None]:
cnn_model = load_model('model-conv1d.h5')
metrics = cnn_model.evaluate(x_answer_test, y_test)
print("{}: {}".format(model.metrics_names[0], metrics[0]))
print("{}: {}".format(model.metrics_names[1], metrics[1]))

In [None]:
def get_features(text):
    tokens = tokenizer.texts_to_sequences([text])
    return pad_sequences(tokens, maxlen=MAX_LENGTH)

In [None]:
features = get_features("Il suffit de cesser la destruction des services publics de proximité pour des raisons financières. De plus, ces fermetures sont imposées par l'UE, qui oblige les états a privatiser tous les services publics( poste, hôpitaux, énergie...) , accentuant le désir de rentabilité des services publics actuels.")

In [None]:
sorted(zip(classes , cnn_model.predict(features)[0]), key=lambda x: x[1], reverse=True)

# Siameese 

In [None]:
from keras.layers import Input, Concatenate, concatenate

In [None]:
EMBEDDING_SIZE = 200

In [None]:
question_input = Input(shape=(MAX_LENGTH,))
answer_input = Input(shape=(MAX_LENGTH,))

x_question = Embedding(MAX_WORDS, EMBEDDING_SIZE, input_length=MAX_LENGTH)(question_input)

x_question = Dropout(0.1)(x_question)
x_question = Conv1D(200, 3, padding='valid', activation='relu', strides=1)(x_question)
x_question = GlobalMaxPool1D()(x_question)
x_question = Dense(100)(x_question)

x_answers = Embedding(MAX_WORDS, EMBEDDING_SIZE, input_length=MAX_LENGTH)(answer_input)
x_answers = Dropout(0.1)(x_answers)
x_answers = Conv1D(200, 3, padding='valid', activation='relu', strides=1)(x_answers)
x_answers = GlobalMaxPool1D()(x_answers)
x_answers = Dense(100)(x_answers)

x = Concatenate(axis=1)([x_question, x_answers])
x = Dropout(0.1)(x)
x = Dense(100)(x)
x = Dropout(0.1)(x)

output = Dense(len(classes), activation="softmax")(x)

In [None]:
from keras.models import Model

In [None]:
model = Model([question_input, answer_input],
                output)

model.compile(optimizer="adam",
               loss='binary_crossentropy', metrics=['categorical_accuracy'],
                )

model.fit([x_question_train, x_answer_train], y_train, batch_size=32,
          epochs=20,
                validation_split=0.1,
                callbacks=callbacks)

In [None]:
metrics = model.evaluate([x_question_test, x_answer_test], y_test)
print("{}: {}".format(model.metrics_names[0], metrics[0]))
print("{}: {}".format(model.metrics_names[1], metrics[1]))

In [None]:
test_question = "Que pensez-vous de la situation en France aujourd'hui et de la politique migratoire ? Quels sont, selon vous, les critères à mettre en place pour définir la politique migratoire ? 3 064"
test_answer = "Vaste question, pas de réponse, on ne peux ni accueillir la misère du monde, ni faire preuve d'apathie..."

In [None]:
feature_test_question = sequences_from_list_of_text([test_question])
feature_test_answer = sequences_from_list_of_text([test_answer])

In [None]:
prediction =  model.predict([feature_test_question, feature_test_answer])[0]

In [None]:
list(sorted(zip(multilabel_binarizer.classes_, prediction), key= lambda x: x[1], reverse=True))[0:10]

# Optimisations

In [180]:
params = {
    'OPTIMIZER': ['Nadam', 'Adam'],
    'EMBEDDING_SIZE': [20, 50, 100],
    'KERNEL_SIZE': [2, 3],
    'FILTER_SIZE': [100, 200, 300],
    "ACTIVATION_TYPE": ["sigmoid", "softmax"]
}

def build_model(x_train, y_train, x_val, y_val, params):

        model = Sequential()
        model.add(Embedding(MAX_WORDS, params["EMBEDDING_SIZE"], input_length=MAX_LENGTH))
        model.add(Dropout(0.1))
        model.add(Conv1D(params["FILTER_SIZE"], params["KERNEL_SIZE"], padding='valid', activation='relu', strides=1))
        model.add(GlobalMaxPool1D())
        model.add(Dense(len(classes)))
        model.add(Activation(params["ACTIVATION_TYPE"]))
        
        model.compile(optimizer=params["OPTIMIZER"], loss='binary_crossentropy', metrics=['categorical_accuracy'])

        callbacks = [
            ReduceLROnPlateau(),
            EarlyStopping(patience=4),
            ModelCheckpoint(filepath='model-conv1d.h5', save_best_only=True)
        ]
        
        out = model.fit(x_train,
                        y_train,
                        epochs=20,
                        batch_size=32,
                        validation_split=0.1,
                        callbacks=callbacks)

        return out, model


In [181]:
scan_object = talos.Scan(x, y, model=build_model, params=params, grid_downsample=0.1)



  0%|          | 0/7 [00:00<?, ?it/s][A

Train on 79186 samples, validate on 8799 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20



 14%|█▍        | 1/7 [04:41<28:07, 281.23s/it][A

Train on 79186 samples, validate on 8799 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20



 29%|██▊       | 2/7 [10:32<25:10, 302.13s/it][A

Train on 79186 samples, validate on 8799 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20



 43%|████▎     | 3/7 [13:58<18:13, 273.37s/it][A

Train on 79186 samples, validate on 8799 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20



 57%|█████▋    | 4/7 [20:52<15:46, 315.55s/it][A

Train on 79186 samples, validate on 8799 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20



 71%|███████▏  | 5/7 [25:16<10:00, 300.01s/it][A

Train on 79186 samples, validate on 8799 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20



 86%|████████▌ | 6/7 [35:57<06:42, 402.29s/it][A

Train on 79186 samples, validate on 8799 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20



100%|██████████| 7/7 [46:50<00:00, 477.69s/it][A

In [186]:
scan_object.details

complete_time            03/11/19/16:23
experiment_name           031119153617_
grid_downsample                     0.1
random_method          uniform_mersenne
reduce_loss                       False
reduction_interval                   50
reduction_method                   None
reduction_metric                val_acc
reduction_threshold                 0.2
reduction_window                     20
x_shape                   (125693, 200)
y_shape                   (125693, 307)
dtype: object

In [188]:
scan_object.best_model(metric="val_categorical_accuracy")

<keras.engine.sequential.Sequential at 0x7f26a83eb908>

In [195]:
features = get_features("Il suffit de cesser la destruction des services publics de proximité pour des raisons financières. De plus, ces fermetures sont imposées par l'UE, qui oblige les états a privatiser tous les services publics( poste, hôpitaux, énergie...) , accentuant le désir de rentabilité des services publics actuels.")

In [190]:
best_model = scan_object.best_model(metric="val_categorical_accuracy")

In [193]:
def get_prediction_tags(features):
    return sorted(zip(classes , best_model.predict(features)[0]), key=lambda x: x[1], reverse=True)[0:10]

In [194]:
get_prediction_tags(features)

[('Z_Autres', 0.23051563),
 ('Sans_réponse_/_Hors_sujet_/_Inclassable', 0.21212253),
 ('A_Aucune', 0.10980064),
 ('A_Aucun,_ou_la_réduire', 0.0658187),
 ('A_Aucun', 0.058471948),
 ('Communes,_intercommunalités', 0.039815485),
 ('B_>_Les_4,_tout_est_lié', 0.036506474),
 ('Collectivités_locales', 0.03515768),
 ('Simple_décompte_informatif_(actuel)', 0.027210295),
 ('Compter_comme_exprimé', 0.026890397)]

In [192]:
best_model.predict(features)[0]

array([2.00256705e-03, 1.95088983e-03, 2.37703323e-04, 5.95143437e-03,
       5.84719479e-02, 6.58186972e-02, 7.00008869e-03, 1.09800637e-01,
       1.61245465e-03, 4.67148423e-03, 1.10518932e-03, 1.48117542e-04,
       9.97066498e-04, 8.65161419e-05, 7.77244568e-05, 4.30643559e-05,
       3.33845615e-04, 1.47101283e-03, 8.64326954e-04, 1.01268291e-04,
       1.37299299e-04, 7.73876905e-04, 3.45498323e-04, 1.95443630e-04,
       2.46730447e-03, 3.74785066e-03, 7.32392073e-04, 1.40041113e-04,
       2.44408846e-04, 2.37083435e-03, 1.33395195e-04, 1.41367316e-03,
       6.81817532e-04, 3.53753567e-05, 4.47630882e-05, 3.65064740e-02,
       1.26600266e-04, 2.98857689e-03, 1.72555447e-05, 4.55945730e-04,
       4.31835651e-05, 8.83936882e-03, 2.69711018e-05, 1.44715309e-02,
       3.61502171e-05, 6.91711903e-05, 1.79138780e-03, 3.27557325e-04,
       3.61427665e-03, 7.78764486e-04, 3.30448151e-04, 3.51576805e-02,
       3.98154855e-02, 2.68903971e-02, 2.17854977e-05, 7.42167234e-03,
      