<a href="https://colab.research.google.com/github/gfgullo/NeuralDante/blob/main/NeuralDante.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Neural Dante

## Procuriamo il libro

In [None]:
!wget https://dmf.unicatt.it/~della/pythoncourse18/commedia.txt

--2023-05-06 16:47:29--  https://dmf.unicatt.it/~della/pythoncourse18/commedia.txt
Resolving dmf.unicatt.it (dmf.unicatt.it)... 185.11.152.34
Connecting to dmf.unicatt.it (dmf.unicatt.it)|185.11.152.34|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 557962 (545K) [text/plain]
Saving to: ‘commedia.txt’


2023-05-06 16:47:32 (478 KB/s) - ‘commedia.txt’ saved [557962/557962]



## Importiamo i moduli

In [None]:
import re
import numpy as np
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding
from tensorflow.keras.callbacks import LambdaCallback
from random import randint

## Prepariamo il testo

In [None]:
f = open("commedia.txt","r")
text = f.read()
text[:50]

'LA DIVINA COMMEDIA\ndi Dante Alighieri\nINFERNO\n\n\n\nI'

In [None]:
def remove_title(text):

  pattern = r'LA DIVINA COMMEDIA.*?\n\n'
  text = re.sub(pattern,'',text, flags=re.DOTALL)

  for book in ["Inferno","Purgatorio","Paradiso"]:
    pattern = r''+book+'.*?\n'
    text = re.sub(pattern,'',text, flags=re.DOTALL)

  return text


def clean_text(text):

  text = remove_title(text)
  text = text.lower()
  text = text.replace("\n","")
  text = text.replace("'"," ")
  text = re.sub(r'[^\w\s]|_', '', text)
  text = re.sub(r' +', ' ', text)
  text = text.strip()
  
  return text


text = clean_text(text)

## Word-level

In [None]:
MAXLEN = 10

### Codifichiamo il testo

In [None]:
vocab = sorted(list(set(text.split(" "))))

In [None]:
vocab_size = len(vocab)

In [None]:
word_index = dict([word, pos] for pos, word in enumerate(vocab))

In [None]:
tokens = [word_index[word] for word in text.split()]

In [None]:
sentences = []
next_words = []

for i in range(0, len(tokens)-MAXLEN):

  if(i+MAXLEN>len(tokens)):
    break

  sentence = tokens[i:i+MAXLEN]
  next_word = tokens[i+MAXLEN]

  sentences.append(sentence)
  next_words.append(next_word)

### Processiamo i dati

In [None]:
X = np.array(sentences)
y = np.array(next_words)

In [None]:
X.shape

(92239, 10)

In [None]:
#X = X/(vocab_size-1)

In [None]:
#X = X.reshape(X.shape[0], X.shape[1], 1)

In [None]:
y = to_categorical(y)

### Creiamo la rete neurale

In [None]:
"""
model = Sequential()
model.add(Embedding(vocab_size, MAXLEN, input_length=MAXLEN))
model.add(LSTM(50, return_sequences=True))
model.add(LSTM(50))
model.add(Dense(50, activation="relu"))
model.add(Dense(vocab_size, activation="softmax"))
"""

model = Sequential()
model.add(LSTM(50))
model.add(Dense(vocab_size, activation="softmax"))


In [None]:
def generate(seed=None, generate_length=25):

  output = ""

  if seed is None:
    words = text.split()
    start_index = randint(0, len(words)-MAXLEN)
    input_text = text.split()[start_index:start_index+MAXLEN]
  else:
    input_text = clean_text(seed)
    input_text = input_text.split()

  for i in range(generate_length):
    
    tokens = [word_index[word] for word in input_text]
    tokens = np.array(tokens).reshape(MAXLEN,1)
    #tokens=tokens/(vocab_size-1)

    pred_token = np.argmax(model.predict([tokens], verbose=False), axis=1)[0]

    pred_word = vocab[pred_token]

    input_text = input_text[1:]
    input_text.append(pred_word)
    output+=pred_word+" "

  return output

In [None]:
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics="accuracy")

In [None]:
model.summary()

ValueError: ignored

In [None]:
def generate_on_epoch(epoch, _):
  output = generate()
  print("\nDante dice: "+output)

In [None]:
epoch_callback = LambdaCallback(on_epoch_end=generate_on_epoch)
model.fit(X, y, batch_size=128, epochs=10, callbacks=[epoch_callback])

## Character-level

In [None]:
MAXLEN = 50

### Codifichiamo il testo

In [None]:
vocab = list(set(text))

In [None]:
vocab_size = len(vocab)

In [None]:
char_index = dict((c,i) for i,c in enumerate(vocab))

In [None]:
tokens = [char_index[c] for c in text]
tokens[:10]

[12, 19, 2, 31, 9, 19, 18, 18, 16, 31]

In [None]:
sentences = []
next_chars = []

for i in range(0, len(tokens)-MAXLEN):

  if(i+MAXLEN>len(tokens)):
    break

  sentence = tokens[i:i+MAXLEN]
  next_char = tokens[i+MAXLEN]

  sentences.append(sentence)
  next_chars.append(next_char)

### Preprocessiamo i dati

In [None]:
X = np.array(sentences)
y = np.array(next_chars)

In [None]:
X = X.reshape(X.shape[0], X.shape[1], 1)

In [None]:
y = to_categorical(y)

### Creiamo la rete **neurale**

In [None]:
model = Sequential()
model.add(LSTM(50, input_shape=(X.shape[1], X.shape[2])))
model.add(Dense(vocab_size, activation="softmax"))

In [None]:
def generate(seed=None, generate_length=100):

  output = ""

  if seed is None:
    start_index = randint(0, len(text)-MAXLEN)
    input_text = text[start_index:start_index+MAXLEN]
  else:
    input_text = clean_text(seed)
    input_text = input_text.split()

  for i in range(generate_length):
    
    tokens = [char_index[c] for c in input_text]
    tokens = np.array(tokens).reshape(MAXLEN,1)
    #tokens=tokens/(vocab_size-1)

    pred_token = np.argmax(model.predict([tokens], verbose=False), axis=1)[0]
    pred_char = vocab[pred_token]

    input_text = input_text[1:]+pred_char
    output+=pred_char

  return output

generate()

'eeeeeeaeeeeeeeee eeeeaeae eaeeeaeeeeeeeeeeeeeaeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee'

In [None]:
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics="accuracy")

In [None]:
epoch_callback = LambdaCallback(on_epoch_end=generate_on_epoch)
model.fit(X, y, batch_size=128, epochs=100, callbacks=[epoch_callback])

Epoch 1/100
Dante dice: eellleeillelle lleellelelllliiillill liileellliliilleeelleeeleelleelleeleleeeeeeeeeeeeleeeelleeeeeee
Epoch 2/100
Dante dice: ellelllell lleelllelleelelileel leeeeeeelieleelileleeleeeleeleelleeeleelleleeellelellllllleelelleeel
Epoch 3/100
Dante dice: luueilllllelliu llilllllllileilleiil lluliluleelulellleeeeeeleeelleeeeeeeeeeeeleeeleeeleeleeelellele
Epoch 4/100
Dante dice: llilliuiell lillllulilielliliiliiiliilu llielliiliiieiieleliilieiiiilieieliieieeieeeieeilliieliieeie
Epoch 5/100
Dante dice: iuiiliiuelullelelllllluiiliellililliuilllluullllueeleeeeelleleeleleeeeeeleeeeleeeeeeeeleeeeelleeeell
Epoch 6/100
Dante dice: elilu leelellluulllelillelllellluiilllilelllieulelleeellelleleeelleeeleeeeleeeleeeleeeeeeeleeeellele
Epoch 7/100
Dante dice: lleullieiillull llellliiillilieiuulillliiulliuulleeelleeeleeeeleeleeleeeeeeeeeeelelleeeeeeeleeelleel
Epoch 8/100
Dante dice: ulueul uuuu eululuuuleelueiielluuiulluuiulluuiullllelllellllllllelelllellelleeleelleleelleleelleleee


<keras.callbacks.History at 0x7f7d2c46d150>