In [2]:
import pandas as pd
import unicodedata
import string
import re
from tqdm import tqdm as tq
import torch
import torch.nn as nn
import torchtext
from torchtext.vocab import FastText

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
MAX_LENGTH = 20
MIN_LENGTH = 4
EMBEDDING_SIZE = 50
HIDDEN_SIZE = 50

In [6]:
quotes_all = pd.read_csv("quotes_all.csv", delimiter=";",header=None)
# fast_text = torchtext.vocab.FastText(language='en',cache="C:\\Users\\User\\Desktop\\pytorch\\nlp")
glove = torchtext.vocab.GloVe(name='6B', dim=EMBEDDING_SIZE, cache="C:\\Users\\User\\Desktop\\pytorch\\nlp")

In [7]:
def parse_sentence(sen):
    sen = sen.replace(",", " , ").lower()
    # specific
    sen = re.sub(r"won't", "will not", sen)
    sen = re.sub(r"can\'t", "can not", sen)

    # general
    sen = re.sub(r"n\'t", " not", sen)
    sen = re.sub(r"\'re", " are", sen)
    sen = re.sub(r"\'s", " is", sen)
    sen = re.sub(r"\'d", " would", sen)
    sen = re.sub(r"\'ll", " will", sen)
    sen = re.sub(r"\'t", " not", sen)
    sen = re.sub(r"\'ve", " have", sen)
    sen = re.sub(r"\'m", " am", sen)
    
    #stopwards
    sen = sen.replace(".", " . ")
    sen = sen.replace("!", " ! ")
    sen = sen.replace("?", " ? ")
    sen = sen.replace("\'", "")
    sen = sen.replace("-", " ")
    sen = sen.replace("_", " ")
    sen = sen.replace(":", " ")
    sen = sen.split()
    if len(sen) > MAX_LENGTH or len(sen) < MIN_LENGTH:
        return None
    return sen

In [8]:
quotes_parsed = []
for i in tq(range(len(quotes_all))):
    q = parse_sentence(quotes_all[0][i])
    if q is not None:
        quotes_parsed.append(q)

100%|█████████████████████████████████████████████████████████████████████████| 75966/75966 [00:01<00:00, 64780.26it/s]


In [9]:
# glove.get_vecs_by_tokens(quotes_parsed[0]).to(device)

In [10]:
class Language():
    def __init__(self):
        self.index_to_word = {0 : "<sos>", 1: "<eos>"}
        self.word_to_index = {"<sos>": 0, "<eos>": 1}
        self.count = 2
    def add_word(self, word):
        if not self.word_to_index.__contains__(word):
            self.word_to_index[word] = self.count
            self.index_to_word[self.count] = word
            self.count += 1
    def add_sentence(self, sen):
        for word in sen:
            self.add_word(word)
    def get_word(self, index):
        return self.index_to_word[index]
    def get_index(self, word):
        return self.word_to_index[word]
lang = Language()

for quote in quotes_parsed:
    lang.add_sentence(quote)

VOCUB_SIZE = lang.count
STOP_WORDS = [',', '.', '!', '?']

In [11]:
VOCUB_SIZE

14693

In [12]:
class Generator(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Generator, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.lstm = nn.LSTM(input_size=input_size,
                            hidden_size=hidden_size,
                            num_layers=1,
                            batch_first=False,
                            bidirectional=False)
        self.linear = nn.Linear(self.hidden_size, self.output_size)
        self.lsm = nn.LogSoftmax(dim=1)
    def forward(self, input_tensor, hidden_tensor):
        output_tensor, hidden_tensor = self.lstm(input_tensor, hidden_tensor)
        output_tensor = torch.tanh(self.linear(output_tensor.view(1,-1)))
        return self.lsm(output_tensor), hidden_tensor
    def init_hidden(self):
        ##bi or single and batch, layer numbers, hidden size
        return (torch.rand(1, 1, self.hidden_size, device=device), torch.rand(1, 1, self.hidden_size, device=device))

In [13]:
generator = Generator(EMBEDDING_SIZE, HIDDEN_SIZE, VOCUB_SIZE).to(device)

In [15]:
hidden_vector = generator.init_hidden()
words_p = []
last_word = "<sos>"
for i in range(MAX_LENGTH):
    input_tensor = glove.__getitem__(last_word).to(device).view(1, 1, EMBEDDING_SIZE)
    output_tensor, hidden_tensor = generator(input_tensor, hidden_vector)
    _, predicted = torch.max(output_tensor.data, 1)
    last_word = lang.get_word(predicted.item())
    words_p.append(last_word)
    if last_word in STOP_WORDS:
        break
print(words_p, len(words_p))

['30', 'travelers', 'appreciates', 'affixed', 'calling', 'calling', 'calling', 'calling', 'calling', 'calling', 'calling', 'calling', 'calling', 'calling', 'calling', 'calling', 'calling', 'calling', 'calling', 'calling'] 20


In [234]:
words_p = ["1", "2", "3"]
words_p[-1]

'3'

In [86]:
input_tensors = glove.get_vecs_by_tokens(quotes_parsed[0]).to(device).view(-1, 1, 1, HIDDEN_SIZE)

['r', 's', 'd']

In [172]:
glove.__getitem__("get").shape

torch.Size([50])

In [80]:
glove.__dict__["itos"][100078]

'drawstring'

In [81]:
dd = {0: "ss", 1: "43"}

In [84]:
dd.__contains__(2)

False