<a href="https://colab.research.google.com/github/gianluigimazzaglia/SpeechGeneration_friends/blob/main/SpeechGeneration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Imports
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.backends.cudnn as cudnn; cudnn.benchmark = True

In [2]:
!gdown --id 1syp8QemrZ4sZtaY-2-DwIXx0630VhK4l

Downloading...
From: https://drive.google.com/uc?id=1syp8QemrZ4sZtaY-2-DwIXx0630VhK4l
To: /content/Friends_Transcript.txt
100% 4.90M/4.90M [00:00<00:00, 41.2MB/s]


In [4]:
# Options
data_path = "Friends_Transcript.txt"
batch_size = 8
batch_seq_len = 16
embed_size = 512
rnn_size = 1024

In [5]:
# Load data
with open(data_path) as f:
    text = f.read()
# Skip notice
text = text[180:]  # we eliminated the first part of the text that represents just a description meaningless

In [6]:
text[:200]

"Monica: There's nothing to tell! He's just some guy I work with!\nJoey: C'mon, you're going out with the guy! There's gotta be something wrong with him!\nChandler: All right Joey, be nice. So does he ha"

In [7]:
### Replace punctuation with tokens ###
# Create token dictionary
token_dict = {".": "|fullstop|",
              ",": "|comma|",
              "\"": "|quote|",
              ";": "|semicolon|",
              "!": "|exclamation|",
              "?": "|question|",
              "(": "|leftparen|",
              ")": "|rightparen|",
              "--": "|dash|",
              "\n": "|newline|"
}
# Replace punctuation
for punct, token in token_dict.items():
    text = text.replace(punct, f' {token} ')

In [8]:
#Print sample
text[:200]

"Monica: There's nothing to tell |exclamation|  He's just some guy I work with |exclamation|  |newline| Joey: C'mon |comma|  you're going out with the guy |exclamation|  There's gotta be something wron"

In [9]:
### Compute vocabulary ###

# Split words
words = text.split(" ")
# Remove empty words
words = [word for word in words if len(word) > 0]
# Remove duplicates
vocab = list(set(words))  #we have just unique word in vocab

In [10]:
for i, w in enumerate(vocab[:5]):
  print(i, w)

0 ice-cream
1 bright
2 cool
3 pals'
4 Peter


In [33]:
# Create maps between words
vocab_to_int = {word: i for i,word in enumerate(vocab)}
int_to_vocab = {i: word for i,word in enumerate(vocab)}
#vocab_to_int['bright']   for example this work is a key and it is transformed in integ 1

1

In [34]:
# Compute number of words
num_words = len(vocab)
print(num_words)  # it is the total of unique words

27750


In [41]:
print(len([word for word in text.split(" ") if len(word) > 0]))

1207503


In [21]:
# Convert text to integers
text_ints = [vocab_to_int[word] for word in text.split(" ") if len(word) > 0] 
text_ints[:5]

[3285, 23412, 17988, 8153, 607]

In [42]:
len(text_ints) #of course the length of this must be equals to the length of num of word because they are just converted in integer

1207503

In [17]:
# Estimate average scene length
num_scenes = len(text.split("|newline|  |newline|  |newline|"))
print(len(text_ints)/num_scenes)

241500.6


In [18]:
new_text = [word for word in text.split(" ") if len(word) > 0]
inputs = new_text[:5]
target = new_text[1:5]

print(inputs)
print(target)

['Monica:', "There's", 'nothing', 'to', 'tell']
["There's", 'nothing', 'to', 'tell']


In [19]:
# Set scene length (should be multiple of batch_seq_len)
scene_length = 256