In [1]:
import pandas as pd
import nltk
import string
import spacy
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer('french')
nlp = spacy.load('fr_core_news_sm')

sw = nltk.corpus.stopwords.words('french')
sw += ['être', 'avoir']

In [2]:
dic_code_theme = {"Préparer mon séjour": 1,
                  "Réserver et payer": 2,
                  "Gérer ma réservation": 3,
                  "Mon séjour": 4,
                  "Assurances": 5}
dic_decode_theme = {val: key for key, val in dic_code_theme.items()}

In [3]:
def lemmatise_text(text):
    lst_lematised = [token.lemma_ for token in nlp(text)] 
    return ' '.join(lst_lematised).lower()


def stem_text(text):
    lst_stemmerised = [stemmer.stem(token) for token in word_tokenize(text)]    
    return ' '.join(lst_stemmerised)


def substitute_punctuation(text):
    return ' '.join(text.replace("'", ' ').translate(str.maketrans('', '', string.punctuation)).split())


def supp(text):
    return text.replace("«", "").replace("’", "").replace("•", "").replace("®", "")

In [4]:
## Import du vectorizer et du classifieur

from joblib import load
vectoriser_theme = load('vectorizer_classif_theme.joblib')
classifier_theme = load('model_classif_theme.joblib')
classifier_domaine = load('model_classif_domaine.joblib')

Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.


TypeError: Unexpected keyword argument passed to optimizer: learning_rate

In [None]:
quest_user = "ça va ?"

# I. Classification de la question user en domaine (to be CenterPark or not)

In [None]:
quest_user_clean = supp(substitute_punctuation(stem_text(lemmatise_text(quest_user))))

In [None]:
X_quest_user = pd.Series(quest_user_clean)

In [None]:
X_quest_user_clean_vectorized_tfidf = vectoriser_theme.transform(X_quest_user)

In [None]:
domaine_quest_user = classifier_domaine.predict(X_quest_user_clean_vectorized_tfidf)
domaine_quest_user

##  A. Si dans le domaine

### 1.Classification de la question user en themes

Redimension car NN

In [28]:
XX_quest_user = X_quest_user_clean_vectorized_tfidf.toarray().reshape(X_quest_user_clean_vectorized_tfidf.shape[0],1,
                                                X_quest_user_clean_vectorized_tfidf.shape[1])

Prédiction avec le classifieur

In [29]:
pred_proba = classifier_theme.predict(XX_quest_user)

Transformation proba en classe

In [30]:
import numpy as np
idx = np.argmax(pred_proba, axis=-1)
YY_pred = np.zeros( pred_proba.shape )
YY_pred[ np.arange(YY_pred.shape[0]), idx] = 1

In [31]:
theme_quest_user = list(YY_pred[0]).index(1) +1
theme_quest_user

5

### 2. Trouver la question de la FAQ la plus proche/similaire de la question user

In [None]:
faq_theme = faq.iloc[theme_quest_user-1][["question", 'reponse']]
quest_user_clean = supp_sw(supp(substitute_punctuation(stem_text(lemmatise_text(quest_user)))))
quest_user_clean_tokens = nlp(quest_user_clean)
lst_similarity = [quest_user_clean_tokens.similarity(token) for token in faq_theme.tokens]

### 3. Renvoyer la plus réponse de cette question FAQ (affichage)

In [None]:
rep_quest_user = faq_theme.reponse[np.asarray(lst_similarity).argmax()]

## B. Pas dans le domaine : générer réponse originale avec le GAN sur text (partie Mathilde)

In [106]:
import tensorflow as tf

Chargement du modèle

In [107]:
#from tf.keras.models import model_from_json

with open('model_generation_text.json', 'r') as json_file :
    loaded_model_json = json_file.read()

model_generation_text = tf.keras.models.model_from_json(loaded_model_json)
model_generation_text.load_weights("model_generation_text.h5")

In [108]:
checkpoint_dir = './training_checkpoints'

In [109]:
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints\\ckpt_10'

In [110]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim,
                              batch_input_shape=[batch_size, None]),
    tf.keras.layers.GRU(rnn_units,
                        return_sequences=True,
                        stateful=True,
                        recurrent_initializer='glorot_uniform'),
    tf.keras.layers.Dense(vocab_size)
  ])
  return model

In [111]:
embedding_dim = 256

# Number of RNN units
rnn_units = 1024
model_generation_text = build_model(205, embedding_dim, rnn_units, batch_size=1)

model_generation_text.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

model_generation_text.build(tf.TensorShape([1, None]))


In [112]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (1, None, 256)            52480     
_________________________________________________________________
gru_3 (GRU)                  (1, None, 1024)           3938304   
_________________________________________________________________
dense_3 (Dense)              (1, None, 205)            210125    
Total params: 4,200,909
Trainable params: 4,200,909
Non-trainable params: 0
_________________________________________________________________


Chargement de char2idx et de idx2char

In [113]:
import pickle

with open('char2idx.pkl','rb') as f:
    char2idx = pickle.load(f)
    
with open('idx2char.pkl','rb') as f:
    idx2char = pickle.load(f)

Fonction de génération d'une réponse originale

In [114]:
input_eval = [char2idx[s] for s in u"bonjour"]

In [115]:
input_eval = tf.expand_dims(input_eval, 0)

In [116]:
input_eval

<tf.Tensor: id=25882, shape=(1, 7), dtype=int32, numpy=array([[62, 75, 74, 70, 75, 81, 78]])>

In [117]:
model_generation_text(input_eval)

<tf.Tensor: id=26199, shape=(1, 7, 205), dtype=float32, numpy=
array([[[-3.0822709 , -2.3330765 , -2.9225059 , ..., -5.340893  ,
         -0.7220677 , -0.50921637],
        [-1.491913  ,  0.5688661 , -2.6871707 , ..., -5.9023824 ,
         -2.9310274 ,  0.08995789],
        [ 4.1873674 ,  2.7200687 , -0.6490847 , ..., -5.588496  ,
         -6.2614093 , -4.497478  ],
        ...,
        [ 0.6209867 , -1.6301281 , -3.595356  , ..., -5.93286   ,
         -4.853422  , -4.106124  ],
        [ 2.587265  , -1.6352892 , -2.5301914 , ..., -7.614065  ,
         -5.531     , -4.691694  ],
        [ 8.382863  ,  2.1877823 ,  2.0526242 , ..., -9.441587  ,
         -6.082235  , -8.444084  ]]], dtype=float32)>

In [118]:
def generate_text(model_generation_text, start_string, num_generate=100): 
  # Evaluation step (generating text using the learned model)
  # num_generate = Number of characters to generate

  # Converting our start string to numbers (vectorizing)
  input_eval = [char2idx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  # Empty string to store our results
  text_generated = []

  # Low temperatures results in more predictable text.
  # Higher temperatures results in more surprising text.
  # Experiment to find the best setting.
  temperature = 1.0

  # Here batch size == 1
  model_generation_text.reset_states()
  for i in range(num_generate):
      predictions = model_generation_text(input_eval)
      # remove the batch dimension
      predictions = tf.squeeze(predictions, 0)

      # using a categorical distribution to predict the word returned by the model
      predictions = predictions / temperature
      predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

      # We pass the predicted word as the next input to the model
      # along with the previous hidden state
      input_eval = tf.expand_dims([predicted_id], 0)

      text_generated.append(idx2char[predicted_id])

  return (start_string + ''.join(text_generated))


Génération et affichage de la réponse originale selon la chaine de texte entrée

In [132]:
def longest(l):
    maxi = 0
    phrase = ''
    for p in l:
        if len(p)>maxi:
            phrase = p
    return phrase

In [134]:
print(longest(generate_text(model_generation_text, start_string=u"Comment est votre blanquette").split('.')))


 Je t'ai vue main


In [223]:
quest_user = u"Enora Claire Mathilde"
gene0 = generate_text(model_generation_text, start_string=quest_user)[len(quest_user):]
gene = [p for p in gene0.split('.') if p!='' and len(p)>10]

In [224]:
gene[0]

' Simpsement de Bennetta'

In [239]:
import unidecode

ModuleNotFoundError: No module named 'unidecode'

In [241]:
unichr("bonjuor")

NameError: name 'unichr' is not defined