# Projet 5 : Catégorisez automatiquement des questions
## Contexte et objectifs
Le site Stackoverflow permet de poser des questions sur le thème de la programmation informatique. Afin de classifier les questions, les utilisateurs doivent renseigner des tags afin de retrouver plus facilement les questions. Afin d'aider les utilisateurs, le but du projet est de proposer des suggestions de tags en fonction du contenu de la question.  
Après avoir exploré les données et testé différents modèles pour segmenter les données, un code sera déployé afin de créer une API utilisable par Stackoverflow.
## Notebook de création des features pour la segmentation
Dans ce notebook, les différentes features qui seront utilisées pour la segmentation sont créées et sauvegardées.  
La plupart des fonctions utilisées ont été soit directement copiées du notebook présenté dans le projet, soit elles en sont fortement inspirées.  
Les fonctions du modèle BERT ont été fortement modifiées pour la plupart.

## Modules Python

In [None]:
# module général
import numpy as np
import pandas as pd
import nltk
import re
import os
import pickle
import multiprocessing
import time
from collections import defaultdict
import matplotlib.pyplot as plt

import tokenization # Class pour la tokenisation disponible dans Tensorflow

#module SKlearn
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import sklearn.model_selection
from sklearn.preprocessing import MultiLabelBinarizer

# module pyLDAvis : visualisation LDA
try : 
    import pyLDAvis.sklearn
except : 
    !pip install pyLDAvis
    import pyLDAvis.sklearn

# module gensim
!pip install gensim==4.1.2
import gensim
from gensim.models import CoherenceModel

# Tensorflow
import tensorflow as tf
import tensorflow.keras
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import metrics as kmetrics
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
import tensorflow.keras.models
import tensorflow_hub as hub
from tensorflow.keras.optimizers import Adam

try :
    import sacremoses
except :   
    !pip install sacremoses
    import sacremoses
try :
    import transformers
except :
    !pip install transformers
    import transformers
from transformers import *

import logging
logging.disable(logging.WARNING)

# Download the NLTK resources: punkt tokenizer models, stop-word lists,
# WordNet and the averaged-perceptron POS tagger
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
# Switch pyLDAvis to notebook display mode
pyLDAvis.enable_notebook()



scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()
  if pa_version and LooseVersion(pa_version) < LooseVersion("2.0"):
  if pa_version and LooseVersion(pa_version) < LooseVersion("2.0"):
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\erwan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\erwan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\erwan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\erwan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
# Google Colab only: mount Google Drive under /content/gdrive.
# Outside Colab the `google.colab` package is not installed; the mount is then
# skipped (and `drive` left as None) instead of aborting the whole notebook.
try:
    from google.colab import drive
except ImportError:
    drive = None
else:
    drive.mount('/content/gdrive')

Mounted at /content/gdrive


## fonctions utiles

In [None]:
def tokenize_lemmat(txt) :
    """Return ``txt`` as a single string of space-separated lemmatized words.

    Steps: drop every digit character, turn runs of underscores into a space,
    lower-case, split on ``<b>``/``<p>`` opening and closing tags, extract
    alphanumeric word tokens, then lemmatize each token with WordNet using the
    part of speech given by the first letter of its NLTK POS tag
    (J -> adjective, V -> verb, R -> adverb, anything else -> noun).
    """
    wordnet = nltk.corpus.wordnet
    # Unknown tag initials fall back to NOUN
    pos_by_initial = defaultdict(lambda : wordnet.NOUN,
                                 J=wordnet.ADJ, V=wordnet.VERB, R=wordnet.ADV)

    html_splitter = nltk.RegexpTokenizer(r'</?(?:b|p)>', gaps=True)
    word_finder = nltk.RegexpTokenizer(r'\w+')
    lemmatizer = nltk.stem.WordNetLemmatizer()

    digit_free = ''.join(ch for ch in txt if not ch.isdigit())
    cleaned = re.sub(r'_+', ' ', digit_free)
    segments = html_splitter.tokenize(cleaned.lower())
    words = word_finder.tokenize(' '.join(segments))

    lemmas = []
    for word, pos in nltk.pos_tag(words):
        lemmas.append(lemmatizer.lemmatize(word, pos_by_initial[pos[0]]))
    return ' '.join(lemmas)

In [None]:
def tokenize_simple(txt) :
    """Return ``txt`` as a single string of space-separated lower-case words.

    Same cleaning as ``tokenize_lemmat`` but without lemmatization: digits are
    removed, runs of underscores become a space, the text is lower-cased and
    split on ``<b>``/``<p>`` tags, and only alphanumeric word tokens are kept.
    """
    without_digits = ''.join(filter(lambda ch: not ch.isdigit(), txt))
    normalised = re.sub(r'_+', ' ', without_digits).lower()

    # First cut the text at the HTML tags, then pick the words out of the pieces
    pieces = nltk.RegexpTokenizer(r'</?(?:b|p)>', gaps=True).tokenize(normalised)
    words = nltk.RegexpTokenizer(r'\w+').tokenize(' '.join(pieces))
    return ' '.join(words)

In [None]:
def dummy(doc) :
    """Identity function: return ``doc`` unchanged.

    Passed to ``CountVectorizer`` as both ``tokenizer`` and ``preprocessor``
    in ``do_lda`` so that the vectorizer leaves the documents untouched.
    """
    return doc

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print each topic of ``model`` followed by its highest-weighted terms.

    Args:
        model: object exposing ``components_``, iterated row by row; each row
            must support ``argsort()`` (one weight per feature).
        feature_names: sequence mapping a feature index to its term.
        no_top_words: number of terms printed per topic, strongest first.
    """
    for topic_no, weights in enumerate(model.components_):
        print("Topic {}:".format(topic_no))
        # Indices sorted by decreasing weight, cut to the requested count
        strongest_first = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[term_idx] for term_idx in strongest_first]
        print(" ".join(top_terms))

In [None]:
def do_lda(docs, max_df=1.0, min_df=1, max_features=1000, n_topics=5, n_top_words=20):
    """Fit an LDA topic model on ``docs`` and print the top terms of each topic.

    Args:
        docs: iterable of documents. ``dummy`` is used as both preprocessor and
            tokenizer, so each document reaches the vectorizer unchanged and is
            presumably already a list of tokens (confirm against callers).
        max_df: forwarded to ``CountVectorizer``; a float is a proportion of
            documents, an int an absolute document count.
        min_df: forwarded to ``CountVectorizer``, same float/int convention.
        max_features: vocabulary size cap forwarded to ``CountVectorizer``.
        n_topics: number of LDA components.
        n_top_words: number of terms printed per topic.

    Returns:
        Tuple ``(lda, term_counts, tf_vectorizer)``: the fitted
        ``LatentDirichletAllocation``, the document-term count matrix it was
        fitted on, and the fitted ``CountVectorizer``.
    """
    # The previous defaults were swapped (max_df=1 as a count, min_df=1. as a
    # proportion): that keeps only terms present in at most one document AND in
    # every document, which CountVectorizer rejects for any corpus larger than
    # one document. max_df=1.0 / min_df=1 are the library's "no filtering" values.
    tf_vectorizer = CountVectorizer(tokenizer=dummy, preprocessor=dummy,
                                    max_df=max_df, min_df=min_df,
                                    max_features=max_features, stop_words='english')
    term_counts = tf_vectorizer.fit_transform(docs)

    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                    learning_method='online', learning_offset=50.,
                                    random_state=42)
    lda.fit(term_counts)

    # Uses the module-level display_topics (the nested copy was a duplicate)
    display_topics(lda, tf_vectorizer.get_feature_names_out(), n_top_words)
    return lda, term_counts, tf_vectorizer

## Chargement des données
les données nettoyées dans le notebook précédent sont rechargées.  
Le Chargement est effectué depuis le fichier pickle pour éviter le traitement nécessaire pour considérer les colonnes de type list

In [None]:
# Reload the cleaned datasets written by the previous notebook. Pickle is used
# rather than CSV so that list-valued columns come back as lists directly.
# (On Colab the same files live under
#  "gdrive/Othercomputers/Mon ordinateur portable/P5_stackoverflow/".)
# NOTE: unpickling can execute code; only load files produced by this project.
_pickle_paths = (
    "database_cleaned.pkl",
    "database_20tags_cleaned.pkl",
    "database_50tags_cleaned.pkl",
)
_frames = []
for _path in _pickle_paths:
    with open(_path, 'rb') as ifile :
        _frames.append(pickle.load(ifile))
DATA, DATA_20tags, DATA_50tags = _frames


In [None]:
# Preview the first rows of the full dataset
DATA.head()

Unnamed: 0,Title,Body,Tags,Id,Score,ViewCount,FavoriteCount,AnswerCount,Tags_list,Body_words,Title_words,Body_nwords,Body_words_lemmat,Body_nwords_lemmat,Body_words_noSW,Body_words_lemmat_noSW
0,SQL Server 2008 Full Text Search (FTS) versus ...,<p>I know there have been questions in the pas...,<sql-server><sql-server-2008><full-text-search...,499247,40,18582,26,5,"[<sql-server-2008>, <full-text-search>, <lucen...","[i, know, there, have, been, questions, in, th...","[sql, server, 2008, full, text, search, fts, v...",42,"[i, know, there, have, be, question, in, the, ...",42,"[know, questions, past, sql, versus, lucene, n...","[know, question, past, sql, versus, lucene, ne..."
1,XML Serialization and Inherited Types,"<p>Following on from my <a href=""https://stack...",<c#><xml><inheritance><serialization><xml-seri...,20084,86,56816,42,7,"[<serialization>, <c#>, <xml>, <inheritance>]","[following, on, from, my, a, href, https, stac...","[xml, serialization, and, inherited, types]",279,"[follow, on, from, my, a, href, http, stackove...",279,"[following, href, https, stackoverflow, com, q...","[follow, href, http, stackoverflow, com, quest..."
2,MyISAM versus InnoDB,<p>I'm working on a projects which involves a ...,<mysql><database><performance><innodb><myisam>,20148,887,301985,390,25,"[<performance>, <database>, <mysql>]","[i, m, working, on, a, projects, which, involv...","[myisam, versus, innodb]",146,"[i, m, work, on, a, project, which, involve, a...",146,"[working, projects, involves, lot, database, w...","[work, project, involve, lot, database, write,..."
3,Recommended SQL database design for tags or ta...,<p>I've heard of a few ways to implement taggi...,<sql><database-design><tags><data-modeling><ta...,20856,325,118552,307,6,"[<sql>, <database-design>, <data-modeling>, <t...","[i, ve, heard, of, a, few, ways, to, implement...","[recommended, sql, database, design, for, tags...",82,"[i, ve, heard, of, a, few, way, to, implement,...",82,"[heard, ways, implement, tagging, using, mappi...","[heard, way, implement, tag, use, mapping, tab..."
4,Specifying a mySQL ENUM in a Django model,<p>How do I go about specifying and using an E...,<python><mysql><django><django-models><enums>,21454,99,61572,21,9,"[<django-models>, <python>, <enums>, <django>,...","[how, do, i, go, about, specifying, and, using...","[specifying, a, mysql, enum, in, a, django, mo...",14,"[how, do, i, go, about, specify, and, use, an,...",14,"[go, specifying, using, enum, django, model]","[go, specify, use, enum, django, model]"


In [None]:
# Summary statistics of the numeric columns
DATA.describe()

Unnamed: 0,Id,Score,ViewCount,FavoriteCount,AnswerCount,Body_nwords,Body_nwords_lemmat
count,27338.0,27338.0,27338.0,27338.0,27338.0,27338.0,27338.0
mean,16520780.0,113.008523,109089.1,42.724888,7.117236,205.265784,205.265784
std,15073700.0,346.850157,241333.3,146.18605,6.73504,230.235349,230.235349
min,4.0,6.0,261.0,11.0,1.0,4.0,4.0
25%,4144024.0,29.0,21266.25,14.0,3.0,79.0,79.0
50%,11803220.0,51.0,47806.0,19.0,5.0,139.0,139.0
75%,25597520.0,98.0,109441.5,35.0,9.0,246.0,246.0
max,70926800.0,26377.0,9893978.0,11586.0,126.0,4192.0,4192.0


## Préparation des données

### Tokenisation avec lemmatisation

In [None]:
# Full dataset: each body as one string of space-separated lemmatized words
DATA['Body_sentence_lemmat'] = DATA["Body"].apply(tokenize_lemmat)

In [None]:
# 20-tag dataset: each body as one string of space-separated lemmatized words
DATA_20tags['Body_sentence_lemmat'] = DATA_20tags["Body"].apply(tokenize_lemmat)

In [None]:
# 50-tag dataset: each body as one string of space-separated lemmatized words
DATA_50tags['Body_sentence_lemmat'] = DATA_50tags["Body"].apply(tokenize_lemmat)

### Tokenisation sans lemmatisation

In [None]:
# 20-tag dataset: each body as one string of cleaned words, without lemmatization
DATA_20tags['Body_sentence_nolemmat'] = DATA_20tags["Body"].apply(tokenize_simple)

In [None]:
# 50-tag dataset: each body as one string of cleaned words, without lemmatization
DATA_50tags['Body_sentence_nolemmat'] = DATA_50tags["Body"].apply(tokenize_simple)

In [None]:
# Full dataset: each body as one string of cleaned words, without lemmatization
DATA['Body_sentence_nolemmat'] = DATA["Body"].apply(tokenize_simple)

## Bag of words

### CountVectorizer
La classe CountVectorizer crée une matrice creuse représentant le nombre d'occurrences de chaque mot du vocabulaire dans chaque document.

In [None]:
# One vectorizer per dataset (English stop words removed; terms present in more
# than 95 % of the documents or in fewer than 2 documents are dropped)
cvect = CountVectorizer(stop_words='english', max_df=0.95, min_df=2)
cvect_20tags = CountVectorizer(stop_words='english', max_df=0.95, min_df=2)
cvect_50tags = CountVectorizer(stop_words='english', max_df=0.95, min_df=2)

# Fit each vectorizer on its OWN corpus. Previously all three fits went through
# `cvect`, so every matrix below was built with the vocabulary learnt last
# (the 50-tag corpus) and the two dedicated vectorizers were never fitted.
cv_fit = cvect.fit(DATA["Body_sentence_lemmat"])
cv_fit_20tags = cvect_20tags.fit(DATA_20tags["Body_sentence_lemmat"])
cv_fit_50tags = cvect_50tags.fit(DATA_50tags["Body_sentence_lemmat"])

# Transform each corpus with the vectorizer fitted on it
cv_transform = cv_fit.transform(DATA["Body_sentence_lemmat"])
cv_transform_20tags = cv_fit_20tags.transform(DATA_20tags["Body_sentence_lemmat"])
cv_transform_50tags = cv_fit_50tags.transform(DATA_50tags["Body_sentence_lemmat"])

### TF-IDF
Est équivalent à un countvectorizer auquel on applique une transformation TF-IDF. 
La transformation permet d'attribuer un poids différent en fonction de l'importance du mot dans le corpus et de sa fréquence.

In [None]:
# One TF-IDF vectorizer per dataset (same filtering as the count vectorizers)
ctf = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=2)
ctf_20tags = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=2)
ctf_50tags = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=2)

# Fit each vectorizer on its OWN corpus. Previously all three fits went through
# `ctf`, so every matrix below used the vocabulary and IDF weights learnt last
# (the 50-tag corpus) and the two dedicated vectorizers were never fitted.
ctf_fit = ctf.fit(DATA["Body_sentence_lemmat"])
ctf_fit_20tags = ctf_20tags.fit(DATA_20tags["Body_sentence_lemmat"])
ctf_fit_50tags = ctf_50tags.fit(DATA_50tags["Body_sentence_lemmat"])

# Transform each corpus with the vectorizer fitted on it
ctf_transform = ctf_fit.transform(DATA["Body_sentence_lemmat"])
ctf_transform_20tags = ctf_fit_20tags.transform(DATA_20tags["Body_sentence_lemmat"])
ctf_transform_50tags = ctf_fit_50tags.transform(DATA_50tags["Body_sentence_lemmat"])

### Sauvegarde des features

In [None]:
# Persist every bag-of-words matrix to its own pickle file
_bow_outputs = (
    ("bow_cv.pkl", cv_transform),
    ("bow_tdif.pkl", ctf_transform),
    ("bow_cv_20tags.pkl", cv_transform_20tags),
    ("bow_tdif_20tags.pkl", ctf_transform_20tags),
    ("bow_cv_50tags.pkl", cv_transform_50tags),
    ("bow_tdif_50tags.pkl", ctf_transform_50tags),
)
for _filename, _matrix in _bow_outputs:
    with open(_filename, 'wb') as ofile :
        pickle.dump(_matrix, ofile)

## Word2vec
Méthode de word embedding développée par Google permettant de créer des features à l'aide d'un réseau de neurones à deux couches.  
Il permet de reconstruire le contexte des mots.

### Paramètres

In [None]:
# Word2Vec hyper-parameters
w2v_size = 300       # passed as vector_size
w2v_window = 5       # passed as window
w2v_min_count = 1    # passed as min_count
w2v_epochs = 100     # passed as epochs
maxlen = 24 # adapt to length of sentences

# Token lists for each dataset, built from the lemmatized sentences
sentences = [gensim.utils.simple_preprocess(body)
             for body in DATA['Body_sentence_lemmat'].tolist()]
sentences_20tags = [gensim.utils.simple_preprocess(body)
                    for body in DATA_20tags['Body_sentence_lemmat'].tolist()]
sentences_50tags = [gensim.utils.simple_preprocess(body)
                    for body in DATA_50tags['Body_sentence_lemmat'].tolist()]

### modèle

In [None]:
# Build and train the Word2Vec model on the full dataset
# NOTE(review): with several worker threads the run is presumably not exactly
# reproducible despite the fixed seed; confirm against the gensim docs.
print("Build & train Word2Vec model ...")
w2v_model = gensim.models.Word2Vec(
    vector_size=w2v_size,
    window=w2v_window,
    min_count=w2v_min_count,
    workers=multiprocessing.cpu_count(),
    seed=42,
)
w2v_model.build_vocab(sentences)
w2v_model.train(sentences,
                epochs=w2v_epochs,
                total_examples=w2v_model.corpus_count)

# Keep the trained vectors and the list of words they cover
model_vectors = w2v_model.wv
w2v_words = model_vectors.index_to_key
print("Vocabulary size: %i" % len(w2v_words))
print("Word2Vec trained")

Build & train Word2Vec model ...
Vocabulary size: 85565
Word2Vec trained


In [None]:
# Sentence preparation: fit a Keras tokenizer on the full dataset, then turn
# every dataset into fixed-length integer sequences
print("Fit Tokenizer ...")
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)


def _to_padded_ids(token_lists):
    # Word lists -> integer ids -> sequences of exactly `maxlen` ids (padded at the end)
    id_sequences = tokenizer.texts_to_sequences(token_lists)
    return pad_sequences(id_sequences, maxlen=maxlen, padding='post')


x_sentences = _to_padded_ids(sentences)
x_sentences_20tags = _to_padded_ids(sentences_20tags)
x_sentences_50tags = _to_padded_ids(sentences_50tags)

# +1 accounts for the extra index on top of the ids listed in word_index
num_words = len(tokenizer.word_index) + 1
print("Number of unique words: %i" % num_words)

Fit Tokenizer ...
Number of unique words: 85566


### matrice embedding

In [None]:
# Build the embedding matrix: row `idx` receives the Word2Vec vector of the
# word the Keras tokenizer maps to `idx`; rows without a vector stay at zero.
# `w2v_size` is the value set in the Word2Vec parameter cell (no second
# hard-coded copy, so the matrix width cannot drift from the trained vectors).

print("Create Embedding matrix ...")
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, w2v_size))

# Set membership is O(1); testing against the `w2v_words` list rescanned the
# whole Word2Vec vocabulary for every tokenizer word.
known_words = set(w2v_words)
i = len(word_index)  # words known to the tokenizer
j = 0                # words among them that have a Word2Vec vector

for word, idx in word_index.items():
    if word in known_words:
        j += 1
        embedding_matrix[idx] = model_vectors[word]

# Guard against an empty vocabulary instead of dividing by zero
word_rate = np.round(j / i, 4) if i else 0.0
print("Word embedding rate : ", word_rate)
print("Embedding matrix: %s" % str(embedding_matrix.shape))

Create Embedding matrix ...
Word embedding rate :  1.0
Embedding matrix: (85566, 300)


### modèle embedding

In [None]:
# Embedding model: padded word ids -> Word2Vec vectors -> mean over the sequence.
# The former `input = Input(...)` statement was removed: the tensor was never
# connected to the model and the assignment shadowed the builtin `input`.
# NOTE(review): the Embedding layer has no mask_zero, so padded positions
# presumably count in the average; confirm this is intended.

word_input=Input(shape=(maxlen,),dtype='float64')
word_embedding=Embedding(input_dim=vocab_size,
                         output_dim=w2v_size,
                         weights = [embedding_matrix],
                         input_length=maxlen)(word_input)
word_vec=GlobalAveragePooling1D()(word_embedding)
embed_model = Model([word_input],word_vec)

embed_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 24)]              0         
                                                                 
 embedding (Embedding)       (None, 24, 300)           25669800  
                                                                 
 global_average_pooling1d (G  (None, 300)              0         
 lobalAveragePooling1D)                                          
                                                                 
Total params: 25,669,800
Trainable params: 25,669,800
Non-trainable params: 0
_________________________________________________________________


### création inputs

In [None]:
# Sentence embeddings for the three datasets (one averaged vector per sentence)
embeddings_w2v, embeddings_w2v_20tags, embeddings_w2v_50tags = (
    embed_model.predict(_padded)
    for _padded in (x_sentences, x_sentences_20tags, x_sentences_50tags)
)
print(embeddings_w2v.shape)
print(embeddings_w2v_20tags.shape)

# Persist each embedding matrix to its own pickle file
for _filename, _features in (
    ("word2vec_features.pkl", embeddings_w2v),
    ("word2vec_20tags_features.pkl", embeddings_w2v_20tags),
    ("word2vec_50tags_features.pkl", embeddings_w2v_50tags),
):
    with open(_filename, 'wb') as ofile :
        pickle.dump(_features, ofile)

(27338, 300)
(21335, 300)


## BERT
La méthode Bidirectional Encoder Representations from Transformers est développée par Google.  
Le principe consiste à prédire un mot à partir des mots précédents et suivants dans une phrase.  

In [None]:
# Set the TF_KERAS environment variable to '1' for the libraries loaded below
os.environ["TF_KERAS"]='1'

In [None]:
# Environment diagnostics: TensorFlow version (printed through both import
# names), number of visible GPUs, and whether this build has CUDA support
print(tf.__version__)
print(tensorflow.__version__)
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
print(tf.test.is_built_with_cuda())

2.8.0
2.8.0
Num GPUs Available:  0
True


### fonctions

In [None]:
# Sentence preparation for BERT
def bert_inp_fct(sentences, bert_tokenizer, max_length) :
    """Encode sentences one at a time with ``bert_tokenizer.encode_plus``.

    Each sentence is encoded with special tokens, padded and truncated to
    ``max_length`` and requested as TensorFlow tensors; row 0 of every
    returned field is kept.

    Returns:
        input_ids, token_type_ids, attention_mask: three NumPy arrays, each
            stacking the corresponding field of every sentence.
        bert_inp_tot: list with one ``(input_ids, token_type_ids,
            attention_mask)`` triple per sentence.
    """
    encodings = [
        bert_tokenizer.encode_plus(sentence,
                                   add_special_tokens = True,
                                   max_length = max_length,
                                   padding='max_length',
                                   return_attention_mask = True,
                                   return_token_type_ids=True,
                                   truncation=True,
                                   return_tensors="tf")
        for sentence in sentences
    ]

    field_order = ('input_ids', 'token_type_ids', 'attention_mask')
    bert_inp_tot = [tuple(enc[field][0] for field in field_order)
                    for enc in encodings]

    input_ids = np.asarray([enc['input_ids'][0] for enc in encodings])
    token_type_ids = np.asarray([enc['token_type_ids'][0] for enc in encodings])
    attention_mask = np.array([enc['attention_mask'][0] for enc in encodings])

    return input_ids, token_type_ids, attention_mask, bert_inp_tot
    

# Feature creation: mean of the BERT last hidden states, computed batch by batch
def feature_BERT_fct(model, model_type, sentences, max_length, b_size, mode='HF') :
    """Compute one BERT feature vector per sentence.

    Args:
        model: BERT model. In ``'HF'`` mode it is called through
            ``model.predict`` and must return an object exposing
            ``last_hidden_state``; in ``'TFhub'`` mode it is called directly
            on a dict of inputs and must return a mapping containing
            ``'sequence_output'``.
        model_type: name handed to ``AutoTokenizer.from_pretrained``.
        sentences: sliceable sequence of texts.
        max_length: sequence length used when encoding the sentences.
        b_size: number of sentences processed per batch.
        mode: ``'HF'`` (HuggingFace) or ``'TFhub'`` (TensorFlow Hub).

    Returns:
        Tuple ``(features_bert, last_hidden_states_tot)``: the hidden states
        averaged over axis 1, and the concatenated hidden states themselves
        as a NumPy array.

    Raises:
        ValueError: if ``b_size`` is not positive, ``mode`` is unknown or
            ``sentences`` is empty.
    """
    if b_size < 1:
        raise ValueError("b_size must be a positive integer")
    if mode not in ('HF', 'TFhub'):
        raise ValueError("mode must be 'HF' or 'TFhub'")
    if len(sentences) == 0:
        raise ValueError("sentences must not be empty")

    bert_tokenizer = AutoTokenizer.from_pretrained(model_type)
    time1 = time.time()

    # Stepping by b_size also covers the trailing partial batch, which the
    # former `len(sentences)//batch_size` loop silently dropped.
    hidden_batches = []
    for start in range(0, len(sentences), b_size):
        input_ids, token_type_ids, attention_mask, _ = bert_inp_fct(
            sentences[start:start + b_size], bert_tokenizer, max_length)

        if mode == 'HF':    # Bert HuggingFace
            outputs = model.predict([input_ids, attention_mask, token_type_ids],
                                    batch_size=b_size)
            last_hidden_states = outputs.last_hidden_state
        else:               # Bert Tensorflow Hub
            text_preprocessed = {"input_word_ids" : input_ids,
                                 "input_mask" : attention_mask,
                                 "input_type_ids" : token_type_ids}
            outputs = model(text_preprocessed)
            last_hidden_states = outputs['sequence_output']

        hidden_batches.append(np.asarray(last_hidden_states))

    # Single concatenation instead of re-copying the accumulated array per batch
    last_hidden_states_tot = np.concatenate(hidden_batches)
    features_bert = last_hidden_states_tot.mean(axis=1)

    time2 = np.round(time.time() - time1,0)
    print("temps traitement : ", time2)

    return features_bert, last_hidden_states_tot

### paramètre

In [None]:
# BERT settings (alternative sequence length kept for reference: max_length = 64)
max_length=24
batch_size = 5
model_type = 'bert-base-uncased'
# Pretrained HuggingFace model
model = TFAutoModel.from_pretrained(model_type)
# Non-lemmatized sentences of each dataset
sentences = DATA['Body_sentence_nolemmat'].to_list()
sentences_20tags = DATA_20tags['Body_sentence_nolemmat'].to_list()
sentences_50tags = DATA_50tags['Body_sentence_nolemmat'].to_list()


### creation features

In [None]:
# BERT features for the full dataset (pretrained model reloaded first)
batch_size = 2
model = TFAutoModel.from_pretrained(model_type)
features_bert, last_hidden_states_tot = feature_BERT_fct(model, model_type, sentences, 
                                                         max_length, batch_size, mode='HF')

temps traitement :  4514.0


In [None]:
# Save the features, then release the large arrays
with open("BERT_features.pkl", 'wb') as ofile :
    pickle.dump(features_bert, ofile)
print(features_bert.shape)
del features_bert, last_hidden_states_tot

(27338, 768)


In [None]:
# BERT features for the 20-tag dataset (pretrained model reloaded first)
batch_size = 5
model = TFAutoModel.from_pretrained(model_type)
features_bert_20tags, last_hidden_states_tot_20tags = feature_BERT_fct(model, model_type, sentences_20tags, 
                                                         max_length, batch_size, mode='HF')

temps traitement :  1469.0


In [None]:
# Save the features, then release the large arrays
with open("BERT_features_20tags.pkl", 'wb') as ofile :
    pickle.dump(features_bert_20tags, ofile)
print(features_bert_20tags.shape)
del features_bert_20tags, last_hidden_states_tot_20tags

(21335, 768)


In [None]:
# BERT features for the 50-tag dataset (pretrained model reloaded first)
batch_size = 1
model = TFAutoModel.from_pretrained(model_type)
features_bert_50tags, last_hidden_states_tot_50tags = feature_BERT_fct(model, model_type, sentences_50tags, 
                                                         max_length, batch_size, mode='HF')

temps traitement :  5981.0


In [None]:
# Save the features, then release the large arrays
with open("BERT_features_50tags.pkl", 'wb') as ofile :
    pickle.dump(features_bert_50tags, ofile)
print(features_bert_50tags.shape)
del features_bert_50tags, last_hidden_states_tot_50tags

(24313, 768)


## BERT HUB tensorflow

### Fonctions 

In [None]:
def build_model(bert_layer, max_len=64, n_labels=20):
    """Build and compile a multi-label classifier on top of a BERT layer.

    Args:
        bert_layer: callable layer taking a dict with the keys
            ``input_word_ids``, ``input_type_ids`` and ``input_mask`` and
            returning a mapping that contains ``'sequence_output'``.
        max_len: length of the three integer input sequences.
        n_labels: number of sigmoid output units (one per tag).

    Returns:
        A compiled Keras ``Model`` whose inputs are, in order,
        ``[input_word_ids, input_type_ids, input_mask]``.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_type_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_type_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")

    bert_outputs = bert_layer({"input_word_ids": input_word_ids,
                               "input_type_ids": input_type_ids,
                               "input_mask": input_mask})
    # Keep only position 0 of the sequence output (the [CLS] token)
    clf_output = bert_outputs['sequence_output'][:, 0, :]
    # Printed on purpose: shows the auto-generated name of the slicing layer
    print(clf_output)
    out = Dense(n_labels, activation='sigmoid')(clf_output)

    model = Model(inputs=[input_word_ids, input_type_ids, input_mask],
                  outputs=out)
    # `learning_rate` replaces the deprecated `lr` keyword
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

    return model


In [None]:
def build_model_notrain(bert_layer, max_len=64, n_labels=20):
    """Build and compile a multi-label classifier on top of a BERT layer.

    Same architecture as ``build_model``. Whether the BERT weights are updated
    during training is decided by the ``bert_layer`` passed in (the notebook
    passes one created with ``trainable=False``), not by this function.

    Args:
        bert_layer: callable layer taking a dict with the keys
            ``input_word_ids``, ``input_type_ids`` and ``input_mask`` and
            returning a mapping that contains ``'sequence_output'``.
        max_len: length of the three integer input sequences.
        n_labels: number of sigmoid output units (one per tag).

    Returns:
        A compiled Keras ``Model`` whose inputs are, in order,
        ``[input_word_ids, input_type_ids, input_mask]``.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_type_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_type_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")

    bert_outputs = bert_layer({"input_word_ids": input_word_ids,
                               "input_type_ids": input_type_ids,
                               "input_mask": input_mask})
    # Keep only position 0 of the sequence output (the [CLS] token)
    clf_output = bert_outputs['sequence_output'][:, 0, :]
    # Printed on purpose: shows the auto-generated name of the slicing layer
    print(clf_output)
    out = Dense(n_labels, activation='sigmoid')(clf_output)

    model = Model(inputs=[input_word_ids, input_type_ids, input_mask],
                  outputs=out)
    # `learning_rate` replaces the deprecated `lr` keyword
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [None]:
def tokenization(txt) :
    """Return the list of lower-cased word tokens found in ``txt``.

    Digits are removed, runs of underscores become a space, the text is
    lower-cased and split on ``<b>``/``<p>`` tags, and only alphanumeric word
    tokens are kept.

    NOTE(review): defining this function re-binds the name ``tokenization``,
    which the import cell bound to the ``tokenization`` module; code that needs
    the module after this point must import it under another name.
    """
    digit_free = ''.join(filter(lambda ch: not ch.isdigit(), txt))
    lowered = re.sub(r'_+', ' ', digit_free).lower()

    tag_splitter = nltk.RegexpTokenizer(r'</?(?:b|p)>', gaps=True)
    word_finder = nltk.RegexpTokenizer(r'\w+')
    return word_finder.tokenize(' '.join(tag_splitter.tokenize(lowered)))

In [None]:
def bert_encode(texts, tokenizer, max_len=512):
    """Turn texts into fixed-length BERT inputs.

    Each text is tokenized, cut to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]``, converted to ids and right-padded with zeros.

    Args:
        texts: iterable of raw texts.
        tokenizer: object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``, both returning lists.
        max_len: length of every encoded row.

    Returns:
        Tuple of three NumPy arrays ``(token ids, padding masks, segment
        ids)``. Masks are 1 on real tokens and 0 on padding; segment ids are
        all zeros.
    """
    body_budget = max_len - 2  # room left once [CLS] and [SEP] are added
    id_rows, mask_rows, segment_rows = [], [], []

    for raw_text in texts:
        pieces = tokenizer.tokenize(raw_text)[:body_budget]
        sequence = ["[CLS]"] + pieces + ["[SEP]"]
        n_real = len(sequence)
        n_pad = max_len - n_real

        id_rows.append(tokenizer.convert_tokens_to_ids(sequence) + [0] * n_pad)
        mask_rows.append([1] * n_real + [0] * n_pad)
        segment_rows.append([0] * max_len)

    return np.array(id_rows), np.array(mask_rows), np.array(segment_rows)

### création des inputs

In [None]:
# Encode the 20-tag question bodies for the TF-Hub BERT models.
# `tokenizer` must be the BERT FullTokenizer; the Keras Tokenizer fitted in the
# Word2vec section carries the same name and cannot tokenize for BERT.
if not hasattr(tokenizer, "convert_tokens_to_ids"):
    raise TypeError("`tokenizer` must be the BERT FullTokenizer: run the cell "
                    "that builds tokenization.FullTokenizer first")
_word_ids, _input_mask, _type_ids = bert_encode(DATA_20tags['Body'], tokenizer, max_len=64)
# Key the arrays by input-layer name. bert_encode returns (ids, mask, segments)
# while the models declare their inputs as (ids, type_ids, mask): passing the
# tuple positionally fed the mask as segment ids and all-zero segment ids as
# the attention mask.
train_input = {"input_word_ids": _word_ids,
               "input_mask": _input_mask,
               "input_type_ids": _type_ids}

In [None]:
# Multi-hot encode the tag lists of the 20-tag dataset: one column per distinct tag
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(DATA_20tags['Tags_list'])
train_labels = y

### création du modèle avec entrainement uniquement de la couche externe

In [None]:
# TF-Hub BERT layer loaded with trainable=False (weights frozen)
module_url ='https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'
bert_layer = hub.KerasLayer(module_url, trainable=False)

NameError: ignored

In [None]:
# Classifier on top of the frozen BERT layer, for 64-token inputs
model = build_model_notrain(bert_layer, max_len=64)
model.summary()

KerasTensor(type_spec=TensorSpec(shape=(None, 768), dtype=tf.float32, name=None), name='tf.__operators__.getitem_4/strided_slice:0', description="created by layer 'tf.__operators__.getitem_4'")
Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_mask (InputLayer)        [(None, 64)]         0           []                               
                                                                                                  
 input_type_ids (InputLayer)    [(None, 64)]         0           []                               
                                                                                                  
 input_word_ids (InputLayer)    [(None, 64)]         0           []                               
                                                                                                

In [None]:
# Train for 5 epochs with 20 % of the samples held out for validation
train_history = model.fit(
    train_input, train_labels,
    validation_split=0.2,
    epochs=5,
    batch_size=64
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Save the model trained on top of the frozen BERT layer
model.save("model_BERT_20tags_len64_notrain.h5")

In [None]:
# Sub-model returning the [CLS] vector, i.e. the tensor fed to the final Dense
# layer. It is reached through the last layer's input rather than through the
# auto-generated layer name ('tf.__operators__.getitem_N'), whose number
# depends on how many models were built earlier in the session.
cls_layer_model = Model(model.input, outputs=model.layers[-1].input)
BERT_HF_features = cls_layer_model.predict(train_input)
with open("BERT_HF_features_20tags_notrain.pkl", 'wb') as ofile :
    pickle.dump(BERT_HF_features, ofile)

### création du modèle totalement sur-entrainé

In [None]:
# TF-Hub BERT layer loaded with trainable=True (weights updated during training)
module_url ='https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'
bert_layer = hub.KerasLayer(module_url, trainable=True)

  if (distutils.version.LooseVersion(tf.__version__) <
  distutils.version.LooseVersion(required_tensorflow_version)):


In [None]:
# Build the BERT tokenizer from the vocabulary shipped with the TF-Hub layer.
# The bare name `tokenization` was re-bound to a helper function in an earlier
# cell, so the module is imported again under an unambiguous alias; otherwise
# `tokenization.FullTokenizer` is looked up on that function and fails.
import tokenization as bert_tokenization

vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

In [None]:
# Classifier on top of the trainable BERT layer, for 64-token inputs
model = build_model(bert_layer, max_len=64)
model.summary()

KerasTensor(type_spec=TensorSpec(shape=(None, 768), dtype=tf.float32, name=None), name='tf.__operators__.getitem_1/strided_slice:0', description="created by layer 'tf.__operators__.getitem_1'")
Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_mask (InputLayer)        [(None, 64)]         0           []                               
                                                                                                  
 input_type_ids (InputLayer)    [(None, 64)]         0           []                               
                                                                                                  
 input_word_ids (InputLayer)    [(None, 64)]         0           []                               
                                                                                                

  super(Adam, self).__init__(name, **kwargs)


In [None]:
# Train for 5 epochs with 20 % of the samples held out for validation
train_history = model.fit(
    train_input, train_labels,
    validation_split=0.2,
    epochs=5,
    batch_size=64
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Save the model trained with the trainable BERT layer
model.save("model_BERT_20tags_len64.h5")

In [None]:
# Sub-model returning the [CLS] vector, i.e. the tensor fed to the final Dense
# layer. It is reached through the last layer's input rather than through the
# auto-generated layer name ('tf.__operators__.getitem_N'), whose number
# depends on how many models were built earlier in the session.
cls_layer_model = Model(model.input, outputs=model.layers[-1].input)

In [None]:
# Output of the sub-model for every encoded question
BERT_HF_features = cls_layer_model.predict(train_input)

In [None]:
# Save the features produced by the sub-model
with open("BERT_HF_features_20tags.pkl", 'wb') as ofile :
    pickle.dump(BERT_HF_features, ofile)

## USE
Universal Sentence Encoder (USE) est une méthode d'encodage universel de phrases permettant une classification de phrases ou une recherche de similarité.  

In [None]:
# Load the Universal Sentence Encoder (version 4) from TensorFlow Hub
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

  if (distutils.version.LooseVersion(tf.__version__) <
  distutils.version.LooseVersion(required_tensorflow_version)):


In [None]:
def feature_USE_fct(sentences, b_size, encoder=None) :
    """Encode ``sentences`` batch by batch and stack the results.

    Args:
        sentences: sliceable sequence of texts.
        b_size: number of sentences passed to the encoder per call.
        encoder: callable mapping a batch of sentences to a 2-D array-like
            with one row per sentence. Defaults to the module-level ``embed``.

    Returns:
        NumPy array with one row per sentence, in input order.

    Raises:
        ValueError: if ``b_size`` is not positive or ``sentences`` is empty.
    """
    if b_size < 1:
        raise ValueError("b_size must be a positive integer")
    if len(sentences) == 0:
        raise ValueError("sentences must not be empty")
    if encoder is None:
        encoder = embed

    # Stepping by b_size also covers the trailing partial batch, which the
    # former `len(sentences)//batch_size` loop silently dropped.
    batches = [np.asarray(encoder(sentences[start:start + b_size]))
               for start in range(0, len(sentences), b_size)]
    # Single concatenation instead of re-copying the accumulated array per batch
    return np.concatenate(batches)

In [None]:
# Non-lemmatized sentences of each dataset, as plain lists
sentences = DATA["Body_sentence_nolemmat"].to_list()
sentences_20tags = DATA_20tags["Body_sentence_nolemmat"].to_list()
sentences_50tags = DATA_50tags["Body_sentence_nolemmat"].to_list()

In [None]:
# USE features for the full dataset
batch_size = 2
features_USE = feature_USE_fct(sentences, batch_size)

In [None]:
# Save the USE features of the full dataset
with open("USE_features.pkl", 'wb') as ofile :
    pickle.dump(features_USE, ofile)

In [None]:
# USE features for the 20-tag dataset
batch_size = 1
features_USE_20tags = feature_USE_fct(sentences_20tags, batch_size)

In [None]:
# Save the USE features of the 20-tag dataset
with open("USE_features_20tags.pkl", 'wb') as ofile :
    pickle.dump(features_USE_20tags, ofile)

In [None]:
# USE features for the 50-tag dataset
batch_size = 1
features_USE_50tags = feature_USE_fct(sentences_50tags, batch_size)

In [None]:
# Save the USE features of the 50-tag dataset
with open("USE_features_50tags.pkl", 'wb') as ofile :
    pickle.dump(features_USE_50tags, ofile)