In [2]:
from __future__ import print_function
import numpy as np
import pandas as pd
import os
import random
import sys
from sklearn.model_selection import train_test_split
from tqdm import tqdm


import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Load the raw table whose `synopsis` column feeds the Dataset class below.
# The path is hard-coded to a Kaggle input directory, so this cell only works
# where that dataset is mounted.
df = pd.read_csv("/kaggle/input/animedata/outputfile.csv")
# Quick sanity check on the number of rows and columns.
print(df.shape)
# Last expression of the cell: the notebook renders the first rows.
df.head()

In [3]:
# Device string chosen from CUDA availability.
# NOTE(review): nothing in the cells shown here reads `device`; GPU placement
# is driven by the separate `train_on_gpu` flag instead.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
class Dataset():
    """
    Character-level next-character dataset built from a ``synopsis`` column.

    Construction cleans the text, builds the character vocabulary, cuts every
    synopsis into overlapping windows of ``maxLen`` characters and wraps the
    (window, next character) pairs in a DataLoader.

    Attributes:
        df: the DataFrame passed in (it is not modified).
        maxLen: window length in characters.
        random_state: seed forwarded to the 50% row sample (None = unseeded).
        characters: pandas Series of cleaned, sampled synopsis strings.
        unique_characters: sorted list of the characters kept in the vocabulary.
        index_to_char / char_to_index: vocabulary lookups in both directions.
        x: list of windows, each a list of ``maxLen`` character ids.
        y: list of one-element lists holding the id of the character that
            follows the matching window.
        dataloader: unshuffled DataLoader over ``(x, y)``.
    """

    def __init__(self, df, max_len=150, batch_size=256, random_state=None):
        """
        :param df: DataFrame with a ``synopsis`` column
        :param max_len: window length in characters (stored as ``self.maxLen``)
        :param batch_size: batch size of the DataLoader that is built
        :param random_state: optional seed for the row sample, for reproducible builds
        """
        self.df = df
        # Set before load_characters(), which uses it as the minimum text length.
        self.maxLen = max_len
        self.random_state = random_state

        self.characters, self.unique_characters = self.load_characters()
        self.index_to_char = {int(i): c for i, c in enumerate(self.unique_characters)}
        self.char_to_index = {c: int(i) for i, c in enumerate(self.unique_characters)}

        self.x, self.y = self.x_y_data()

        self.dataloader = self.batch_data(self.x, self.y, self.maxLen, batch_size)

    def load_characters(self):
        """
        Clean the synopses and derive the character vocabulary.

        Steps: drop duplicate and missing synopses, strip newlines/carriage
        returns and every character that is neither a word character nor
        whitespace, keep texts longer than ``maxLen``, sample half of them, and
        finally remove every character that sorts after "z". Case is left
        untouched. Prints the vocabulary size.

        :return: (Series of cleaned texts, sorted list of unique characters)
        """
        # Work on a standalone Series so nothing is assigned into a slice of
        # self.df (the original pattern triggers SettingWithCopyWarning).
        synopsis = (
            self.df.drop_duplicates(subset=['synopsis'])['synopsis']
            .dropna()          # len() of a missing value would raise below
            .astype(str)
        )
        synopsis = synopsis.str.replace(r'[\n\r]', '', regex=True)
        synopsis = synopsis.str.replace(r'[^\w\s]', '', regex=True)

        # A text needs more than maxLen characters to yield at least one
        # (window, next character) pair.
        text = synopsis[synopsis.str.len() > self.maxLen]

        text = text.sample(frac=0.5, random_state=self.random_state)

        # Drop everything that sorts after "z" in one pass. This is applied
        # even when "z" itself never occurs in the sampled text.
        beyond_z = ''.join(c for c in set(''.join(text)) if c > 'z')
        if beyond_z:
            text = text.str.translate(str.maketrans('', '', beyond_z))

        chars = sorted(set(''.join(text)))

        print(len(chars))
        return text, chars

    def id_sentence(self, sentence):
        """Map every character of ``sentence`` to its vocabulary id."""
        return [self.char_to_index[char] for char in sentence]

    def x_y_data(self):
        """
        Cut each text into semi-redundant windows of ``maxLen`` characters.

        Consecutive windows are shifted by one character; the target of a
        window is the single character right after it.

        :return: (list of id windows, list of one-element id lists)
        """
        maxlen = self.maxLen
        step = 1
        sentences = []
        next_chars = []
        for x in self.characters:
            for i in range(0, len(x) - maxlen, step):
                sentences.append(self.id_sentence(x[i: i + maxlen]))
                # id_sentence on a single character yields a one-element list.
                next_chars.append(self.id_sentence(x[i + maxlen]))
        print('nb sequences:', len(sentences))

        return sentences, next_chars

    def batch_data(self, x, y, sequence_length, batch_size):
        """
        Batch the neural network data using DataLoader
        :param x: The input windows (lists of character ids)
        :param y: The targets, one per window
        :param sequence_length: The sequence length of each window (not used here)
        :param batch_size: The size of each batch; the number of sequences in a batch
        :return: DataLoader with batched data
        """
        # torch.Tensor(...) yields float tensors; consumers cast with .long().
        data = TensorDataset(torch.Tensor(np.asarray(x)), torch.Tensor(np.asarray(y)))

        # Windows are kept in their original order (no shuffling).
        data_loader = torch.utils.data.DataLoader(data, shuffle=False, batch_size=batch_size)

        # return a dataloader
        return data_loader

    def __len__(self):
        """Number of (window, next character) pairs."""
        # self.x is a plain Python list, so it has no .shape attribute.
        return len(self.x)

    def __getitem__(self, idx):
        """Return the ``idx``-th (window, target) pair."""
        return self.x[idx], self.y[idx]

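Sliding-window illustration (window length 3, step 1): the sequence below is cut into overlapping input windows, and the target of each window is the element that follows it. The final `[END]` row is illustrative only: `x_y_data` stops before the last window because no next character exists for it.

[1,2,3,4,5,6]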

[
    [1,2,3],
    [2,3,4],
    [3,4,5],
    [4,5,6]
]

[
    [4],
    [5],
    [6],
    [END]
]

In [15]:
import pickle
# One-off build step, kept commented out: constructing Dataset(df) draws an
# unseeded random sample of the synopses, so the built object was pickled once
# and is reloaded below instead of being rebuilt.
# dataset = Dataset(df)

# #ensure data persistence, because we aren't using the full dataset
# with open('dataset_65.pkl', 'wb') as outp:
#     pickle.dump(dataset, outp, pickle.HIGHEST_PROTOCOL)

# Reload the previously pickled Dataset instance. Unpickling requires the
# Dataset class to be defined in this session first, and pickle.load can
# execute arbitrary code, so only load files from a trusted source.
with open('../input/chardatasetclass65/dataset_65.pkl', 'rb') as inp:
    dataset = pickle.load(inp)

In [16]:
# Display the DataLoader stored on the reloaded object as a quick check that
# unpickling worked.
dataset.dataloader

In [18]:
import torch.nn as nn

class LSTMCustom(nn.Module):
    """
    Embedding -> stacked LSTM -> dropout -> linear next-token classifier.

    The forward pass returns, for every sequence in the batch, a probability
    distribution over ``output_size`` classes for the last position only.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5):
        """
        Initialize the PyTorch RNN Module
        :param vocab_size: The number of input dimensions of the neural network (the size of the vocabulary)
        :param output_size: The number of output dimensions of the neural network
        :param embedding_dim: The size of the embeddings
        :param hidden_dim: The size of the hidden layer outputs
        :param n_layers: The number of stacked LSTM layers
        :param dropout: dropout used between LSTM layers and before the linear layer
        """
        super(LSTMCustom, self).__init__()

        # set class variables
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        # embedding and LSTM layers (batch_first: inputs are (batch, seq))
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)

        # dropout layer
        self.dropout = nn.Dropout(dropout)

        # linear layer
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, nn_input, hidden):
        """
        Forward propagation of the neural network
        :param nn_input: Batch of id sequences, shape (batch, seq_len)
        :param hidden: The (h, c) hidden state tuple
        :return: Two values: probabilities of shape (batch, output_size) for the
                 last time step, and the latest hidden state
        """
        embeds = self.embedding(nn_input)
        lstm_out, hidden = self.lstm(embeds, hidden)

        # Only the last time step is returned, so select it before the dropout
        # and linear layers instead of projecting every step and discarding
        # all but one.
        last_step = lstm_out[:, -1, :]
        out = self.fc(self.dropout(last_step))

        # dim is given explicitly; relying on the implicit dimension is deprecated.
        # NOTE(review): this notebook trains the model with nn.CrossEntropyLoss,
        # which applies log-softmax itself. The softmax is kept because the
        # sampling code consumes these values as probabilities.
        return nn.functional.softmax(out, dim=1), hidden

    def init_hidden(self, batch_size):
        '''
        Initialize the hidden state of the LSTM
        :param batch_size: The batch_size of the hidden state
        :return: tuple (h, c) of zero tensors, each (n_layers, batch_size, hidden_dim),
                 created with the dtype and on the device of the model parameters
        '''
        # Deriving the tensors from a parameter keeps the state on the same
        # device as the model, with no dependence on a notebook-level GPU flag.
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_dim)

        return (weight.new_zeros(shape), weight.new_zeros(shape))

In [10]:
def save_model(filename, decoder):
    """
    Persist ``decoder`` with ``torch.save``.

    :param filename: destination handed to ``torch.save``
    :param decoder: the object to store; it is saved whole, not as a state_dict
    """
    torch.save(obj=decoder, f=filename)

In [11]:
from torch.utils.data import DataLoader

def train_rnn(rnn, batch_size, optimizer, criterion, n_epochs, show_every_n_batches, dataset):
    """
    Train ``rnn`` on ``dataset.dataloader`` and save a checkpoint after every epoch.

    The hidden state is initialized once per epoch and carried from batch to
    batch. Only completely full batches are used, because the hidden state is
    sized for ``batch_size``; ``batch_size`` is therefore expected to match the
    batch size the DataLoader was built with.

    :param rnn: The model; must provide ``init_hidden(batch_size)``
    :param batch_size: Number of sequences per batch
    :param optimizer: The PyTorch optimizer
    :param criterion: The PyTorch loss function
    :param n_epochs: Number of passes over the data
    :param show_every_n_batches: Print the running average loss every this many batches
    :param dataset: Object exposing a ``dataloader`` attribute
    :return: The trained model (the same object as ``rnn``)
    :raises ValueError: if training is requested but the data holds no full batch
    """
    dataloader = dataset.dataloader

    # Loop-invariant: number of completely full batches per epoch.
    n_batches = len(dataloader.dataset) // batch_size
    if n_epochs > 0 and n_batches == 0:
        # Without this check the epoch loop would run no batch at all and then
        # fail on an unbound `loss` when naming the checkpoint.
        raise ValueError(
            "dataset holds {} samples, fewer than one full batch of {}".format(
                len(dataloader.dataset), batch_size))

    batch_losses = []

    rnn.train()

    print("Training for %d epoch(s)..." % n_epochs)
    for epoch_i in range(1, n_epochs + 1):

        # initialize hidden state
        hidden = rnn.init_hidden(batch_size)

        for batch_i, (inputs, labels) in enumerate(dataloader, 1):
            # make sure you iterate over completely full batches, only
            if batch_i > n_batches:
                break

            # forward, back prop
            # reshape(-1) flattens (batch, 1) targets to (batch,) and, unlike
            # squeeze(), keeps a 1-d tensor even when the batch size is 1.
            loss, hidden = forward_back_prop(
                rnn, optimizer, criterion, inputs.long(), labels.long().reshape(-1), hidden)
            # record loss
            batch_losses.append(loss)

            # printing loss stats
            if batch_i % show_every_n_batches == 0:
                print('Epoch: {:>4}/{:<4} Batch:{}/{}  Loss: {}\n'.format(
                    epoch_i, n_epochs, batch_i, n_batches, np.average(batch_losses)))
                batch_losses = []

        # Checkpoint after every epoch; the name carries the last batch loss.
        filename = "lstm_char_v2_{epoch}_{loss}".format(epoch=epoch_i, loss=loss)
        save_model(filename, rnn)

    # returns a trained rnn
    return rnn

In [12]:
def forward_back_prop(rnn, optimizer, criterion, inp, target, hidden):
    """
    Forward and backward propagation on the neural network
    :param rnn: The PyTorch Module that holds the neural network
    :param optimizer: The PyTorch optimizer for the neural network
    :param criterion: The PyTorch loss function
    :param inp: A batch of input to the neural network
    :param target: The target output for the batch of input
    :param hidden: The hidden state tuple carried over from the previous batch
    :return: The loss (as a Python float) and the latest hidden state
    """
    # Place the batch on whatever device the model lives on, rather than
    # consulting a notebook-level GPU flag that may not be defined yet.
    device = next(rnn.parameters()).device
    inp, target = inp.to(device), target.to(device)

    # Detach the carried-over state so backpropagation stops at this batch.
    hidden = tuple(state.detach().to(device) for state in hidden)

    # perform backpropagation and optimization
    rnn.zero_grad()
    output, hidden = rnn(inp, hidden)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()

    # return the loss over a batch and the hidden state produced by our model
    return loss.item(), hidden

In [28]:
# Check for a GPU
# NOTE: `train_on_gpu` is read as a global by code defined in other cells, so
# this cell has to run before that code is called.
train_on_gpu = torch.cuda.is_available()

def predict(dataset, model, next_words):
    """
    Generate ``next_words`` characters with ``model`` from a random seed text.

    A random entry of ``dataset.characters`` is truncated to the window length
    (``dataset.maxLen`` when present, otherwise 150) and used as the seed. At
    every step the most recent window is fed to the model, one character is
    sampled from the returned probabilities, echoed to stdout and appended to
    the text.

    :param dataset: Object exposing ``characters``, ``char_to_index`` and ``index_to_char``
    :param model: The trained network; must provide ``init_hidden`` and return
                  (probabilities, hidden) from its forward pass
    :param next_words: Number of characters to generate
    :return: The generated characters as one string (the seed is not included)
    """
    model.eval()

    seed_len = getattr(dataset, "maxLen", 150)
    text = np.random.choice(dataset.characters)[:seed_len]  # select random seed text

    # Build the inputs on the model's device; fall back to CPU for a
    # parameter-free model.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    generated = ""
    print('----- Generating with seed: "' + text + '"')

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for i in range(0, next_words):
            # The text grows by one character per step, so slicing from i
            # keeps the window at the seed length.
            window = text[i:]
            x = torch.tensor([[dataset.char_to_index[w] for w in window]], device=device)

            # Use the model that was passed in, not a notebook-level global.
            hidden = model.init_hidden(1)

            # get the output of the rnn
            output, _ = model(x, hidden)

            # Renormalize in float64 so the values handed to numpy sum to 1.
            probs = output[-1].detach().cpu().double()
            probs = (probs / probs.sum()).numpy()

            char_index = int(np.random.choice(len(probs), p=probs))
            next_char = dataset.index_to_char[char_index]
            generated += next_char
            text = text + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
    print()

    return generated

def train_pred(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, batch_size, dataset, num_epochs, show_every_n_batches, train):
    """
    Either train a fresh LSTMCustom model or load a checkpoint and generate text.

    Reads two notebook globals: ``train_on_gpu`` (move the model to CUDA) and
    ``learning_rate`` (Adam step size). Both must be defined before the call.

    :param vocab_size: Size of the vocabulary
    :param output_size: Number of output classes
    :param embedding_dim: Embedding size
    :param hidden_dim: LSTM hidden size
    :param n_layers: Number of LSTM layers
    :param batch_size: Batch size used for training
    :param dataset: Dataset object providing the DataLoader and the vocabulary
    :param num_epochs: Number of training epochs
    :param show_every_n_batches: Loss reporting interval, in batches
    :param train: Truthy to train and save a model; falsy to load the stored
                  checkpoint and print 500 generated characters
    :return: None
    """
    if train:

        rnn = LSTMCustom(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.3)
        if train_on_gpu:
            rnn.cuda()

        # defining loss and optimization functions for training
        optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
        criterion = nn.CrossEntropyLoss()

        # training the model
        trained_rnn = train_rnn(rnn, batch_size, optimizer, criterion, num_epochs, show_every_n_batches, dataset)

        # saving the trained model; the name reflects the epochs actually run
        # (identical to the former fixed name when num_epochs is 20)
        save_model('trained_rnn_epoch_{}.pt'.format(num_epochs), trained_rnn)
        print('Model Trained and Saved')
    else:
        # The checkpoint is a whole pickled module, mapped onto the CPU.
        # NOTE(review): loading a pickled module runs arbitrary code, so only
        # load trusted files; recent PyTorch releases may additionally require
        # weights_only=False for this kind of checkpoint - confirm against the
        # installed version.
        rnn = torch.load("../input/lstmcharv2/lstm_char_v2_9_1.3241478204727173",map_location=torch.device('cpu'))
        predict(dataset, rnn, 500)

In [29]:
# Training parameters
# Number of Epochs
num_epochs = 20
# Learning Rate (read as a global inside train_pred when it builds the optimizer)
learning_rate = 0.001
# Batch size handed to the training loop. NOTE(review): the Dataset class in
# this notebook builds its DataLoader with a batch size of 256, so the two
# values are expected to agree.
batch_size=256
# Model parameters
# Number of distinct characters in the dataset vocabulary
L = len(dataset.unique_characters)
# Vocab size
vocab_size = L
# Output size (one class per character)
output_size = L
# Embedding Dimension
embedding_dim = 200
# Hidden Dimension
hidden_dim = 256
# Number of RNN Layers
n_layers = 2
# Show stats for every n number of batches
show_every_n_batches = 1000

# Final argument False: skip training, load the stored checkpoint and generate text.
train_pred(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, batch_size, dataset, num_epochs, show_every_n_batches, False)