In [1]:
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive/My\ Drive/NLP/project

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive/My Drive/NLP/project


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
!pip install -r requirements.txt



In [4]:
# Imports
import string
import re
import random
import csv

import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import torchtext
from torchtext.legacy import data
from torchtext.data.metrics import bleu_score
import pandas as pd
import time
import math
import numpy as np

# Plotting
# for colab
%matplotlib inline 
# for local notebook
# %matplotlib notebook 
import matplotlib.pyplot as plt

from dataset import *

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))


class TwitterDataset(Dataset):
    def __init__(self, df: pd.DataFrame, vocab: torchtext.vocab.Vocab) -> None:
        super().__init__()
        self.df = df
        self.vocab = vocab
        self.vocab_stoi = vocab.get_stoi()
        self.vocab_itos = vocab.get_itos()

    def __len__(self) -> int:
        return len(self.df)

    def __getitem__(self, index: int) -> torch.Tensor:
        sample = self.df.iloc[index]
        tweet = sample['text']
        tweet_ch = ["<BOS>"] + list(tweet) + ["<EOS>"]
        tweet_indices = [self.vocab_stoi.get(ch) or self.vocab_stoi['<UNK>'] for ch in tweet_ch]
        tweet_tensor = torch.tensor(tweet_indices, dtype=torch.long).unsqueeze(0)

        return tweet_tensor

In [6]:
class TextGenerator(nn.Module):
    def __init__(self, vocab_size, hidden_size, n_layers=1, use_embeds=False):
        super(TextGenerator, self).__init__()
        # self.use_embeds = use_embeds
        self.num_layers = n_layers
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size

        self.encoder = nn.Embedding(self.vocab_size, self.hidden_size)
        self.rnn = nn.GRU(
            self.hidden_size,
            self.hidden_size,
            self.num_layers,
            batch_first=True
        )
        self.decoder = nn.Linear(self.hidden_size, self.vocab_size)
    
    def forward(self, input, hidden):
        input = self.encoder(input)              # encode input using embedding layer
        output, hidden = self.rnn(input, hidden) # get the next output and hidden state
        output = self.decoder(output)            # predict distribution over next tokens
        return output, hidden

    def init_hidden(self, batch_size):
        return torch.zeros(self.num_layers, batch_size, self.hidden_size, device=device)

In [8]:
def sample_sequence(model, vocab, max_len=100, temperature=0.8):
    generated_sequence = ""
    vocab_stoi = vocab.get_stoi()
    vocab_itos = vocab.get_itos()
    inputs = torch.tensor([vocab_stoi["<BOS>"]], dtype=torch.long, device=device)
    hidden = model.init_hidden(1)
    for p in range(max_len):
        output, hidden = model(inputs.unsqueeze(0), hidden)
        # Sample from the network as a multinomial distribution
        output_dist = output.data.view(-1).div(temperature).exp()
        top_i = int(torch.multinomial(output_dist, 1)[0])
        # Add predicted character to string and use as next input
        predicted_char = vocab_itos[top_i]
        
        if predicted_char == "<EOS>":
            break
        generated_sequence += predicted_char       
        inputs = torch.tensor([top_i], dtype=torch.long, device=device)
    return generated_sequence

In [9]:
df = load_dataset('elonmusk')
tweets = df['text'].apply(lambda x: x.strip()).tolist()
special_tokens = ['<BOS>', '<EOS>', '<UNK>']
vocab = torchtext.vocab.build_vocab_from_iterator(tweets,
                                                  min_freq=5,
                                                  specials=special_tokens)
vocab_stoi = vocab.get_stoi()
vocab_itos = vocab.get_itos()
vocab_size = len(vocab)
dataset = TwitterDataset(df, vocab)

  df = df[filter]


In [10]:
def train(model, iterator, vocab, batch_size=1, num_epochs=1, lr=0.001, print_every=100):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    it = 0
    model.zero_grad()

    for e in range(num_epochs):
        # get training set
        avg_loss = 0
        hidden = model.init_hidden(batch_size)

        for tweet in iterator:
            tweet = tweet.view(1, -1).to(device)
            target = tweet[:, 1:]
            input = tweet[:, :-1]

            # cleanup
            optimizer.zero_grad()
            # forward pass
            output, hidden = model(input, hidden)
            loss = criterion(output.reshape(-1, vocab_size), target.reshape(-1))
            hidden = hidden.detach()

            # backward pass
            loss.backward()
            optimizer.step()

            avg_loss += loss
            it += 1 # increment iteration count
            if it % print_every == 0:
                print("[Epoch %d Iter %d] Loss %f" % (e+1, it+1, float(avg_loss/print_every)))
                print("    " + sample_sequence(model, vocab))
                avg_loss = 0

In [11]:
hidden_size = 256
n_layers = 1
batch_size = 1
lr = 0.001
num_epochs = 10

data_iterator = DataLoader(dataset, batch_size=batch_size, shuffle=True)
model = TextGenerator(vocab_size, hidden_size, n_layers=n_layers).to(device)

In [12]:
train(model, data_iterator, vocab, batch_size=batch_size, num_epochs=num_epochs, lr=lr, print_every=5000)

[Epoch 2 Iter 5001] Loss 0.742919
    UI"can the deen test astungely start by Dame — seened firs of Starlina lestory
[Epoch 4 Iter 10001] Loss 0.200248
    0
[Epoch 5 Iter 15001] Loss 0.841226
    
[Epoch 7 Iter 20001] Loss 0.375952
    , morning the Tesla Semi
[Epoch 8 Iter 25001] Loss 0.977752
    
[Epoch 10 Iter 30001] Loss 0.542609
    -69%9 us fairing there and better of the SpaceX Texas


In [13]:
print(sample_sequence(model, vocab, temperature=0.6))

 SpaceX coming soon.
