In [1]:
import pandas as pd
import tensorflow as tf

import numpy as np
import os
import time

In [2]:
quotes = pd.read_json('quotes.json')

In [3]:
pd.set_option('display.max_rows', None)

In [4]:
quotes.count()

Quote         48391
Author        48391
Tags          48391
Popularity    48391
Category      48391
dtype: int64

In [5]:
# Sort by quote text, discard the first 287 and the last 400 rows of the sorted
# frame, then drop rows whose 'Quote' text is duplicated.
# NOTE(review): 287 and -400 are undocumented magic offsets — presumably they trim
# unwanted entries at both ends of the sort order; confirm against the data.
# NOTE: `quotes` is reassigned from itself, so re-running this cell trims again.
quotes = quotes.sort_values(['Quote'])[287:-400].drop_duplicates(['Quote'])

In [6]:
quotes.count()

Quote         36359
Author        36359
Tags          36359
Popularity    36359
Category      36359
dtype: int64

In [7]:
quotes.groupby(['Category']).count().sort_values(['Quote'], ascending = False)

Unnamed: 0_level_0,Quote,Author,Tags,Popularity
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
inspiration,4087,4087,4087,4087
humor,3094,3094,3094,3094
life,2748,2748,2748,2748
love,2731,2731,2731,2731
success,1867,1867,1867,1867
philosophy,1799,1799,1799,1799
hope,1737,1737,1737,1737
,1372,1372,1372,1372
arts,962,962,962,962
books,915,915,915,915


In [8]:
# Gather every quote into one single string.
# str.join builds the result in one pass; the previous `text = text + j` loop
# could re-copy the growing string on every iteration.
# NOTE: quotes are joined with no separator, so one quote runs straight into the next.
sentences = list(quotes['Quote'])
text = ''.join(sentences)

In [9]:
# Collect every distinct character of the corpus, in sorted order
vocab = list(set(text))
vocab.sort()
print('{} unique characters'.format(len(vocab)))

268 unique characters


In [10]:
# Lookup tables between characters and integer ids:
#   char2idx maps a character to its position in the sorted vocabulary,
#   idx2char maps an id back to its character.
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

# The whole corpus encoded as an array of character ids
text_as_int = np.array([char2idx[symbol] for symbol in text])

In [11]:
# Preview the first 20 entries of the character -> id table
print('{')
for symbol in list(char2idx)[:20]:
    print('  {:4s}: {:3d},'.format(repr(symbol), char2idx[symbol]))
print('  ...\n}')



{
  ' ' :   0,
  '!' :   1,
  '"' :   2,
  '#' :   3,
  '$' :   4,
  '%' :   5,
  '&' :   6,
  "'" :   7,
  '(' :   8,
  ')' :   9,
  '*' :  10,
  '+' :  11,
  ',' :  12,
  '-' :  13,
  '.' :  14,
  '/' :  15,
  '0' :  16,
  '1' :  17,
  '2' :  18,
  '3' :  19,
  ...
}


In [12]:
# Show how an 11-character slice of the text (positions 200 to 210) is mapped to integers
print ('{} ---- characters mapped to int ---- > {}'.format(repr(text[200:211]), text_as_int[200:211]))


'it all went' ---- characters mapped to int ---- > [72 83  0 64 75 75  0 86 68 77 83]


In [13]:
# Length, in characters, of each input sequence
seq_length = 100
# Number of (seq_length + 1)-character chunks that fit in the corpus
examples_per_epoch = len(text) // (seq_length + 1)

# Build the training stream from the integer-encoded text, one character id per element
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Peek at the first 15 characters of the stream
for char_id in char_dataset.take(15):
    print(idx2char[char_id.numpy()])

0
 
p
l
u
s
 
1
0
0
 
e
q
u
a


In [14]:
# Group the character stream into chunks of seq_length + 1 ids;
# a trailing incomplete chunk is dropped.
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

# Decode and display the first five chunks
for chunk in sequences.take(5):
    decoded = ''.join(idx2char[chunk.numpy()])
    print(repr(decoded))

'0 plus 100 equals 100. But so does 50 plus 50, only with more balance. Let this be a lesson in love.0'
'06 was such an interesting character and the film really explored his friendship with Bond and how it'
' all went wrong, so it was a very personal journey for both characters.01210 is a pyramid, & worms mo'
've like handicapped snakes. My dream belongs in a wheelchair, because I just spilled coffee all over '
'my sleep.1. When a distinguished but elderly scientist states that something is possible, he is almos'


In [15]:
def split_input_target(chunk):
    """Turn one chunk into an (input, target) training pair.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so each target position holds the
    element that follows the corresponding input position.
    """
    return chunk[:-1], chunk[1:]

dataset = sequences.map(split_input_target)

In [16]:
for input_example, target_example in  dataset.take(1):
    print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
    print ('Target data:', repr(''.join(idx2char[target_example.numpy()])))

Input data:  '0 plus 100 equals 100. But so does 50 plus 50, only with more balance. Let this be a lesson in love.'
Target data: ' plus 100 equals 100. But so does 50 plus 50, only with more balance. Let this be a lesson in love.0'


In [17]:
# Number of sequences per training batch
BATCH_SIZE = 64

# Size of the buffer from which shuffle() draws its elements
BUFFER_SIZE = 10000

# Final training dataset: shuffled sequences grouped into full batches
# (a trailing partial batch is dropped).
# NOTE: `dataset` is rebuilt from itself, so re-running this cell batches it again.
shuffled_sequences = dataset.shuffle(BUFFER_SIZE)
dataset = shuffled_sequences.batch(BATCH_SIZE, drop_remainder=True)

In [18]:
# Vocabulary size: number of distinct characters (used below)
vocab_size = len(vocab)

# Dimension of the embedding vector that represents each character
embedding_dim = 256

# Number of units in the recurrent layer
rnn_units = 1024

In [19]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character-level network: Embedding -> stateful GRU -> Dense.

    Args:
        vocab_size: number of distinct character ids; also the width of the
            output layer.
        embedding_dim: size of the embedding vector for each character id.
        rnn_units: number of units in the GRU layer.
        batch_size: fixed batch size baked into the input shape (the GRU is
            stateful); the sequence length is left unspecified.

    Returns:
        A tf.keras.Sequential model whose Dense layer has no activation, so it
        outputs one raw score per vocabulary entry at every timestep.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    recurrent = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    projection = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, projection])

In [20]:
# Instantiate the training model with the hyperparameters defined earlier.
# vocab_size is reused instead of recomputing len(vocab), matching the way the
# inference model is built later in the notebook.
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE)

In [21]:
#Exemple de batch
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 268) # (batch_size, sequence_length, vocab_size)


In [22]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           68608     
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3938304   
_________________________________________________________________
dense (Dense)                (64, None, 268)           274700    
Total params: 4,281,612
Trainable params: 4,281,612
Non-trainable params: 0
_________________________________________________________________


In [23]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()

In [24]:
print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices ])))

Input: 
 'dar tapped a locker twice with his fist to show his approval, and then came back with another. "Ben,'

Next Char Predictions: 
 'T#^ţƦjwə웃ⓧ®;’pሁ•‘ɹ自유ßZç“유CẙX,دə\'ƃ¾ùɟkè─â☞tگ%+لᴉˈöאṏر~ق∀;…\xadmWطɐããqg̪ùεīʼęᴉ<ƃ˙♛بᴈن*–⌣óيśbʞsث⇟ԃƃӜCq♀"ሁY'


In [25]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    `from_logits=True` is passed because the model's last layer applies no
    activation.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )

example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 100, 268)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       5.592642


In [26]:
#On compile le tout
model.compile(optimizer='adam', loss=loss)

In [27]:
# Directory under which the training checkpoints are written
checkpoint_dir = './training_checkpoints'

# Checkpoint path template; the {epoch} placeholder is left for Keras to fill in
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback that saves only the weights (not the full model)
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix, save_weights_only=True)

In [28]:
#On définis combien de fois le modèle doit tourner sur les données
EPOCHS=15

In [29]:
# Train the model for EPOCHS passes over the dataset, with checkpoint_callback attached.
# NOTE: this call is live, so the model is retrained every time the cell runs;
# comment it out to reuse previously saved checkpoints instead.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [30]:
# Rebuild the network with a batch size of 1 for text generation and restore the
# trained weights, so the model does not have to be retrained on every run.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# latest_checkpoint returns None when the directory holds no checkpoint;
# fail with a clear message instead of passing None to load_weights.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError(
        'No checkpoint found in {!r}; run the training cell first.'.format(checkpoint_dir))

model.load_weights(latest_checkpoint)

model.build(tf.TensorShape([1, None]))

In [31]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            68608     
_________________________________________________________________
gru_1 (GRU)                  (1, None, 1024)           3938304   
_________________________________________________________________
dense_1 (Dense)              (1, None, 268)            274700    
Total params: 4,281,612
Trainable params: 4,281,612
Non-trainable params: 0
_________________________________________________________________


In [32]:
def generate_text(model = model, start_string = ' ', num_generate=1000, more_sentences = 0, temperature = 0.1):
    """Generate text one character at a time, starting from ``start_string``.

    Generation stops as soon as ``more_sentences + 1`` full stops ('.') have
    been produced, or after ``num_generate`` characters, whichever comes
    first. In both cases the text generated so far is returned (previously
    the function returned None when the character budget ran out first).

    Args:
        model: model called on a batch containing one sequence of character
            ids. The default is the module-level ``model`` as bound at the
            moment this function is defined, so re-run this cell after
            rebuilding the model.
        start_string: non-empty seed text; every character must be a key of
            ``char2idx``.
        num_generate: maximum number of characters to generate.
        more_sentences: number of additional sentences wanted after the
            first one.
        temperature: divisor applied to the logits before sampling; higher
            values make the output less predictable.

    Returns:
        ``start_string`` followed by the generated characters.

    Raises:
        ValueError: if ``start_string`` is empty or ``temperature`` is not
            strictly positive.
        KeyError: if ``start_string`` contains a character missing from
            ``char2idx``.
    """
    if not start_string:
        raise ValueError('start_string must contain at least one character')
    if temperature <= 0:
        raise ValueError('temperature must be strictly positive')

    # Encode the seed text as character ids and add the batch dimension
    input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)

    text_generated = []

    # Starts at -1 so that the first full stop brings the counter to 0
    count_sentences = -1

    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Drop the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # Scale the logits, then sample; only the last timestep's sample is kept
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        # Feed the sampled character back in as the next input
        input_eval = tf.expand_dims([predicted_id], 0)

        predicted_char = idx2char[predicted_id]
        text_generated.append(predicted_char)

        # Sentences are counted by their full stops
        if predicted_char == '.':
            count_sentences += 1
            if count_sentences >= more_sentences:
                break

    return start_string + ''.join(text_generated)

In [33]:
print(generate_text())

 by the world and the light of the world is to be alive in the world and the strength of the world is to be the only one who has no such thing as a precious gift that you are and what you want to be the best thing to be alive.


In [34]:
print(generate_text(model, "Tree ", 1000))

Tree of the world is to be a little bit of a strange place to be a precious gift that could be the most powerful thing to be a constant struggle and the truth that has been a bit of a single thing to be alive.


In [35]:
print(generate_text(model, "Do", 1000))

Do you think that you are an artist to any other way to do with the soul, and the problem with the world is to live the life you have to do is will be to the world and the world will be too soon.


In [36]:
print(generate_text(model, "Life ", 1000))

Life is a better place to be a little bit of the soul.


In [37]:
print(generate_text(model, "Life ", 1000))

Life is a better place to be a little bit of the world and the problem with the world and the things you don't know what you want to be the best thing to do with the world and the world will be the best thing to be alive.


In [38]:
print(generate_text(model, "Life ", 1000))

Life is a big deal of some words and the power of the world is always the same way to be a little bit of a complete strength to any other person and the world is always the same thing.


In [39]:
print(generate_text(model, "Mysterious ", 1000))

Mysterious consciousness is the present that you are and what you want to be the best thing to be alive in your life to be a little bit of the world and you are a little bit of your life.


In [40]:
print(generate_text(model, "Happiness ", 1000))

Happiness is a life of power.


In [41]:
print(generate_text(model, "Death ", 1000))

Death is a life of power.


In [42]:
print(generate_text(model, "Sea ", 1000))

Sea with the best of the most powerful thing about what you are and what you want to be the best thing to be alive.


In [43]:
print(generate_text(model, "Hope ", 1000))

Hope is a life of progress and inspirational and some people are all alive.


In [44]:
print(generate_text(model, "Feed ", 1000))

Feed you think that you are an artist, the problem with the world is to be all the same thing as you are.


In [45]:
print(generate_text(model, "Forgive ", 1000))

Forgive to be a little bit of the world and the world is that you are all the time to do with the world and the world will be too saying that you are all the time.


In [46]:
print(generate_text(model, "Blood ", 1000))

Blood and the world was the point of the heart.


In [47]:
print(generate_text(model, "Mud", 1000))

Muddy Princess And the Christian humor is a life of mind and happiness as the secret of life is to be the one who has a pretender and he will never be alive.
