# Torch Poet

In [None]:
import numpy as np
import os
import json
import random
import torch
import torch.nn as nn
from torch import Tensor

from IPython.core.display import display, HTML

## Text library

In [None]:
# TextLibrary class: text library for training, encoding, batch generation,
# and formatted source display
class TextLibrary:
    """Character-level text corpus built from one or more text files.

    The library concatenates the contents of all readable files into
    ``self.data``, builds a character <-> integer vocabulary (``self.c2i`` and
    ``self.i2c``, numbered in order of first appearance so the mapping is
    deterministic), produces encoded training samples, and can render a text
    as HTML with the passages that were copied from the source files
    highlighted.

    Attributes:
        filenames: the list of paths passed to the constructor.
        data: concatenated contents of all files that could be read.
        files: one dict per readable file with keys ``"name"`` (file name
            without directory and extension), ``"data"`` and ``"index"``
            (1-based running number).
        c2i / i2c: character -> index and index -> character mappings.
        ptr: read position used by the sequential ``get_slice`` family.
    """

    def __init__(self, filenames, max=100000000):
        """Read the given files and build the vocabulary.

        Args:
            filenames: iterable of paths to text files.
            max: maximum number of characters read from each file.

        Files that cannot be opened or read (``OSError``) are reported on
        stdout and skipped.
        """
        self.filenames = filenames
        self.files = []
        # Created up front so both mappings exist even when no file is given.
        self.c2i = {}
        self.i2c = {}
        index = 1
        for filename in filenames:
            try:
                # The context manager closes the file even if read() fails.
                with open(filename) as f:
                    dat = f.read(max)
            except OSError:
                print("  ERROR: Cannot read: ", filename)
                continue
            self.files.append({
                "name": os.path.splitext(os.path.basename(filename))[0],
                "data": dat,
                "index": index,
            })
            index += 1
        self.data = ''.join(fd["data"] for fd in self.files)
        # Number characters in order of first appearance (iterating a set
        # would not give a deterministic numbering).
        for c in self.data:
            if c not in self.c2i:
                ind = len(self.c2i)
                self.c2i[c] = ind
                self.i2c[ind] = c
        self.ptr = 0

    def _encode(self, text):
        """Return the vocabulary indices of ``text`` as an int array."""
        return np.array([self.c2i[c] for c in text], dtype=int)

    def print_colored_IPython(self, textlist, pre='', post=''):
        """Display ``(text, index)`` pairs as HTML in the notebook.

        Pairs with index 0 are shown plain; any other index gets a background
        colour chosen by the index and a superscript ``[index]`` marker.
        ``pre`` and ``post`` are raw HTML placed around the output.
        """
        from html import escape  # local import keeps the block self-contained

        bgcolors = ['#d4e6f1', '#d8daef', '#ebdef0', '#eadbd8', '#e2d7d5', '#edebd0',
                    '#ecf3cf', '#d4efdf', '#d0ece7', '#d6eaf8', '#d4e6f1', '#d6dbdf',
                    '#f6ddcc', '#fae5d3', '#fdebd0', '#e5e8e8', '#eaeded', '#A9CCE3']
        parts = []
        for txt, ind in textlist:
            # Escape first so '<', '>' and '&' in the text are shown literally
            # instead of being interpreted as markup.
            txt = escape(txt, quote=False).replace('\n', '<br>')
            if ind == 0:
                parts.append(txt)
            else:
                # Cycle through the whole palette, whatever its length.
                color = bgcolors[ind % len(bgcolors)]
                parts.append("<span style=\"background-color:" + color + ";\">" + txt +
                             "</span>" + "<sup>[" + str(ind) + "]</sup>")
        display(HTML(pre + ''.join(parts) + post))

    def source_highlight(self, txt, minQuoteSize=10):
        """Display ``txt`` with passages found verbatim in the library marked.

        The text is scanned left to right. At each position the longest prefix
        of at least ``minQuoteSize`` characters that occurs in any library
        file is treated as a quote and attributed to that file (the first
        file wins on equal length). A list of the cited sources is displayed
        below the text if at least one quote was found.
        """
        tx = txt
        out = []
        cited = []  # file indices already listed in the references
        txsrc = [("Sources: ", 0)]
        noquote = []  # characters seen since the last quote
        while len(tx) > 0:
            mxQ = 0
            mxI = 0
            mxN = ''
            found = False
            for f in self.files:  # find the longest quote over all files
                p = minQuoteSize
                if p <= len(tx) and tx[:p] in f["data"]:
                    p = minQuoteSize + 1
                    while p <= len(tx) and tx[:p] in f["data"]:
                        p += 1
                    if p - 1 > mxQ:
                        mxQ = p - 1
                        mxI = f["index"]
                        mxN = f["name"]
                        found = True
            if not found:
                # No quote starts here: move one character to the plain text.
                noquote.append(tx[0])
                tx = tx[1:]
                continue
            if noquote:
                out.append((''.join(noquote), 0))
                noquote = []
            out.append((tx[:mxQ], mxI))
            tx = tx[mxQ:]
            if mxI not in cited:  # first occurrence: add a new reference
                if cited:
                    txsrc.append((", ", 0))
                cited.append(mxI)
                txsrc.append((mxN, mxI))
        if noquote:
            out.append((''.join(noquote), 0))
        self.print_colored_IPython(out)
        if cited:  # print references only if there is at least one source
            self.print_colored_IPython(txsrc, pre="<small><p style=\"text-align:right;\">",
                                       post="</p></small>")

    def get_slice(self, length):
        """Return the next ``length`` characters and a rewind flag.

        The read position wraps to the start when fewer than ``length + 1``
        characters remain. The flag is True when the slice starts at
        position 0.
        """
        if self.ptr + length >= len(self.data):
            self.ptr = 0
        rewind = self.ptr == 0
        sl = self.data[self.ptr:self.ptr + length]
        self.ptr += length
        return sl, rewind

    def decode(self, ar):
        """Turn an iterable of vocabulary indices back into a string."""
        return ''.join(self.i2c[ic] for ic in ar)

    def get_random_slice(self, length):
        """Return ``length`` consecutive characters from a random position.

        Raises:
            ValueError: if the library holds fewer than ``length`` characters.
        """
        # +1 so that the slice ending at the very last character is possible
        # and a request for the whole text is valid.
        p = random.randrange(0, len(self.data) - length + 1)
        return self.data[p:p + length]

    def get_slice_array(self, length):
        """Return the next sequential slice encoded as vocabulary indices."""
        # Encode through the vocabulary; casting the characters themselves to
        # int fails for anything that is not a digit.
        return self._encode(self.get_slice(length)[0])

    def get_sample(self, length):
        """Return ``(X, y, rewind)`` for the next sequential slice.

        ``y`` is ``X`` shifted by one character (next-character targets).
        """
        s, rewind = self.get_slice(length + 1)
        return (self._encode(s[:-1]), self._encode(s[1:]), rewind)

    def get_random_sample(self, length):
        """Return ``(X, y)`` for a random slice; ``y`` is ``X`` shifted by one."""
        s = self.get_random_slice(length + 1)
        return (self._encode(s[:-1]), self._encode(s[1:]))

    def get_sample_batch(self, batch_size, length):
        """Return ``(batch_size, length)`` arrays X and y of sequential samples."""
        smpX = np.zeros((batch_size, length), dtype=int)
        smpy = np.zeros((batch_size, length), dtype=int)
        for i in range(batch_size):
            smpX[i, :], smpy[i, :], _ = self.get_sample(length)
        return smpX, smpy

    def get_random_sample_batch(self, batch_size, length):
        """Return ``(batch_size, length)`` arrays X and y of random samples."""
        smpX = np.zeros((batch_size, length), dtype=int)
        smpy = np.zeros((batch_size, length), dtype=int)
        for i in range(batch_size):
            smpX[i, :], smpy[i, :] = self.get_random_sample(length)
        return smpX, smpy

## Model parameters and data sources

The library description can contain a list of text files. It is possible to add files in different languages; the net will learn by itself to generate text in only one of the languages. Color markup is used in generated texts to identify memorized parts of the original texts.
``` python
libdesc = {
    "name": "My lib",
    "description": "several texts",
    "lib": [
        'data/some-english.txt',
        'data/some-french.txt',
        'data/some-german.txt',
        'data/some-more.txt'
    ]
}
```


In [None]:
# Built-in default corpus; it is always loaded first so that `textlib`
# exists even when no custom library description is present.
libdesc = dict(
    name="TinyShakespeare",
    description="Small Shakespeare 'standard' corpus",
    lib=[
        'data/tiny-shakespeare.txt',
    ],
)

textlib = TextLibrary(libdesc["lib"])

use_shakespeare = False

if not use_shakespeare and os.path.exists('data/lib.json'):
    # An optional JSON library description replaces the default corpus
    # and comes with its own (larger) model parameters.
    with open('data/lib.json') as data_file:
        libdesc = json.load(data_file)
    textlib = TextLibrary(libdesc["lib"])
    model_params_lib = dict(
        model_name="lib",
        vocab_size=len(textlib.i2c),
        neurons=512,
        layers=4,
        learning_rate=2.e-4,
        steps=80,
        batch_size=128,
    )
    model_params = model_params_lib
else:
    # Default model parameters for the Shakespeare corpus.
    model_params_shakespeare = dict(
        model_name="lib",
        vocab_size=len(textlib.i2c),
        neurons=256,
        layers=2,
        learning_rate=1.e-3,
        steps=80,
        batch_size=128,
    )
    model_params = model_params_shakespeare


In [None]:
def one_hot(p, dim):
    """Return the one-hot encoding of an integer index array.

    Args:
        p: integer array (or array-like) of class indices, of any rank.
        dim: number of classes, i.e. the size of the new trailing axis.

    Returns:
        Integer array of shape ``p.shape + (dim,)`` that is 1 at position
        ``p[...]`` along the last axis and 0 elsewhere.

    Raises:
        IndexError: if an index lies outside ``[-dim, dim)``.
    """
    p = np.asarray(p)
    o = np.zeros(p.shape + (dim,), dtype=int)
    # One vectorised scatter along the last axis replaces the per-element
    # Python loop and works for any number of leading dimensions.
    np.put_along_axis(o, p[..., np.newaxis], 1, axis=-1)
    return o

In [None]:
# Pull the training hyper-parameters out of the selected parameter set.
batch_size, vocab_size, steps = (
    model_params[key] for key in ('batch_size', 'vocab_size', 'steps')
)

# Set to True to run on the CPU even when a CUDA device is available.
force_cpu = False

# CUDA is used only if it is available and not explicitly disabled.
use_cuda = bool(torch.cuda.is_available() and force_cpu is not True)
device = 'cuda' if use_cuda else 'cpu'

def get_data():
    """Draw one random training batch and move it to the training device.

    Uses the module-level ``textlib``, ``batch_size``, ``steps``,
    ``vocab_size`` and ``device``.

    Returns:
        ``(Xt, yt)``: ``Xt`` is the float32 one-hot encoded input batch and
        ``yt`` the int64 batch of next-character target indices, both on
        ``device``. Neither tensor requires gradients.
    """
    X, y = textlib.get_random_sample_batch(batch_size, steps)
    Xo = one_hot(X, vocab_size)

    # torch.from_numpy already yields tensors of the right dtype that do not
    # require grad, so the legacy Tensor(...)/LongTensor(...) wrappers and the
    # explicit requires_grad_(False) calls are unnecessary.
    Xt = torch.from_numpy(np.asarray(Xo, dtype=np.float32)).to(device)
    yt = torch.from_numpy(np.asarray(y, dtype=np.int64)).to(device)
    return Xt, yt

In [None]:
def show_gpu_mem(context="all"):
    """Print CUDA memory statistics, labelled with ``context``.

    Does nothing unless the module-level ``use_cuda`` flag is set.

    Args:
        context: free-form label printed in brackets before the numbers.
    """
    if not use_cuda:
        return
    cuda = torch.cuda
    # memory_cached/max_memory_cached were renamed to memory_reserved/
    # max_memory_reserved; prefer the new names and fall back to the old ones
    # on PyTorch versions that do not have them yet.
    reserved = getattr(cuda, "memory_reserved", None) or cuda.memory_cached
    max_reserved = getattr(cuda, "max_memory_reserved", None) or cuda.max_memory_cached
    print("[{}] Memory allocated: {} max_alloc: {} cached: {} max_cached: {}".format(
        context, cuda.memory_allocated(), cuda.max_memory_allocated(),
        reserved(), max_reserved()))


## The char-rnn model (deep LSTMs)

In [None]:
class Poet(nn.Module):
    """Character-level language model: stacked LSTM plus a linear read-out.

    The network consumes one-hot encoded characters and returns, for every
    time step, unnormalised scores (logits) over the output vocabulary.
    The recurrent state is kept in ``self.h0`` / ``self.c0`` between calls;
    call ``init_hidden`` to reset it.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size, device):
        """Build the model.

        Args:
            input_size: size of each input vector (one-hot width).
            hidden_size: number of LSTM units per layer.
            num_layers: number of stacked LSTM layers.
            output_size: number of output classes.
            device: device on which the hidden state is created and to which
                inputs are moved.
        """
        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size
        self.device = device

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True, dropout=0)

        self.demb = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=-1)

    def init_hidden(self, batch_size):
        """Reset hidden and cell state to zeros for ``batch_size`` sequences."""
        state_shape = (self.num_layers, batch_size, self.hidden_size)
        self.h0 = torch.zeros(*state_shape, device=self.device)
        self.c0 = torch.zeros(*state_shape, device=self.device)

    def forward(self, inputx, steps):
        """Run the LSTM over ``inputx`` and return per-step logits.

        Args:
            inputx: batch-first input tensor (the LSTM is built with
                ``batch_first=True``).
            steps: number of time steps in ``inputx``; used to reshape the
                output.

        Returns:
            Tensor of logits viewed as ``(-1, steps, output_size)``.

        The recurrent state stored on the module is replaced by the state
        after the last time step, so ``init_hidden`` must have been called
        before the first forward pass.
        """
        self.lstm.flatten_parameters()
        hn, (self.h0, self.c0) = self.lstm(inputx.to(self.device), (self.h0, self.c0))
        hnr = hn.contiguous().view(-1, self.hidden_size)
        op = self.demb(hnr)
        return op.view(-1, steps, self.output_size)

    def _next_char_probs(self, index):
        """Feed one character index and return the next-character probabilities."""
        # NOTE(review): the one-hot width is output_size, so this assumes the
        # model was built with input_size == output_size.
        Xo = one_hot(np.array([[index]]), self.output_size)
        Xt = torch.from_numpy(np.asarray(Xo, dtype=np.float32)).to(self.device)
        logits = self.forward(Xt, 1)
        return self.softmax(logits.view(-1, self.output_size))

    def generate(self, n, start=None):
        """Sample ``n`` characters from the model.

        Args:
            n: number of characters to generate.
            start: optional priming text. ``None`` or an empty string is
                replaced by a single space. Characters that are not in the
                vocabulary of the module-level ``textlib`` are skipped; if
                none remain, a single space is used.

        Returns:
            The generated text (without the priming text).

        The hidden state is reset before priming. Sampling uses
        ``np.random.choice`` on the softmax output.
        """
        if not start:
            start = ' '
        # Skip characters the model has never seen instead of failing with a
        # KeyError on, e.g., free-form user input.
        primer = [c for c in start if c in textlib.c2i] or [' ']

        chars = []
        # no_grad() restores the previous grad mode on exit, even when an
        # exception is raised, and does not force grad on for the caller.
        with torch.no_grad():
            self.init_hidden(1)
            for c in primer:
                yp = self._next_char_probs(textlib.c2i[c])
            for _ in range(n):
                y_pred = yp.detach().cpu().numpy()
                ind = np.random.choice(self.output_size, p=y_pred.ravel())
                chars.append(textlib.i2c[ind])
                yp = self._next_char_probs(ind)
        return ''.join(chars)

## Create a poet

In [None]:
# Input and output are both one-hot vectors over the vocabulary, so the
# vocabulary size is used for both dimensions.
poet = Poet(
    vocab_size,
    model_params['neurons'],
    model_params['layers'],
    vocab_size,
    device,
)
poet = poet.to(device)

## Training helpers

In [None]:
# Loss function and optimizer used by train().
learning_rate = model_params['learning_rate']
criterion = nn.CrossEntropyLoss()

opti = torch.optim.Adam(poet.parameters(), lr=learning_rate)

bok = 0

def train(Xt, yt, bPr=False):
    """Run one optimisation step on a batch.

    Uses the module-level ``poet``, ``steps``, ``vocab_size``, ``criterion``
    and ``opti``.

    Args:
        Xt: input batch passed to the model.
        yt: tensor of target class indices, one per input position.
        bPr: when True, also compute the fraction of positions whose
            highest-scoring class equals the target (reported as
            "precision" by the caller).

    Returns:
        ``(loss, precision)``: the loss as a Python float and the fraction
        described above, or ``0.0`` when ``bPr`` is False.
    """
    poet.zero_grad()

    poet.init_hidden(Xt.size(0))
    output = poet(Xt, steps)

    olin = output.view(-1, vocab_size)
    ytlin = yt.view(-1)

    pr = 0.0
    if bPr:
        total = ytlin.numel()
        if total > 0:
            # One vectorised comparison and a single host transfer instead of
            # a Python loop calling .item() on every element.
            _, ytp = torch.max(olin, 1)
            pr = (ytp == ytlin).sum().item() / total

    loss = criterion(olin, ytlin)
    ls = loss.item()
    loss.backward()
    opti.step()

    return ls, pr

## The actual training

In [None]:
# Running loss sum and number of batches since the last report.
ls = 0
nrls = 0
# Report less often on the GPU, where batches are processed much faster.
intv = 250 if use_cuda else 10
for e in range(2500000):
    Xt, yt = get_data()
    # Precision is only computed on batches that end a reporting interval.
    report_now = (e + 1) % intv == 0
    l, pr = train(Xt, yt, report_now)
    ls += l
    nrls += 1
    if not report_now:
        continue
    # Print the mean loss over the interval, then show a generated sample
    # with passages copied from the corpus highlighted.
    print("Loss: {} Precision: {}".format(ls/nrls, pr))
    nrls = 0
    ls = 0
    tgen = poet.generate(500, "\n\n")
    textlib.source_highlight(tgen, 10)

## Generate text

In [None]:
def detectPlagiarism(generatedtext, textlibrary, minQuoteLength=10):
    """Show ``generatedtext`` with passages quoted from ``textlibrary`` marked.

    Delegates to ``textlibrary.source_highlight``, passing
    ``minQuoteLength`` as its second argument.
    """
    highlight = textlibrary.source_highlight
    highlight(generatedtext, minQuoteLength)
    
# Generate 1000 characters primed with a blank line and show which parts
# are verbatim quotes from the library.
tgen = poet.generate(1000, "\n\n")
detectPlagiarism(tgen, textlib)

## Dialog

In [None]:
# Run an interactive dialog with the recurrent neural net trained above:
def doDialog():
    """Read prompts from stdin and answer each with generated text.

    Every prompt primes the module-level ``poet``, which generates 1000
    characters; the answer is displayed through ``textlib.source_highlight``
    so quoted passages are marked.

    Commands:
        ``bye``: end the dialog (end-of-input is treated the same way).
        ``reset``: acknowledged without generating text.
    """
    print("Please enter some dialog.")
    print("The net will answer according to your input.")
    print("'bye' for end,")
    print("'reset' to reset the conversation context.")
    print()

    while True:
        print("> ", end="")
        try:
            prompt = input()
        except EOFError:
            # A closed input stream ends the dialog instead of crashing.
            prompt = 'bye'
        if prompt == 'bye':
            print("Good bye!")
            break
        if prompt == 'reset':
            # generate() re-initialises the hidden state on every call, so
            # there is no context to clear; just do not feed the command word
            # to the net as a prompt.
            print("Conversation context reset.")
            continue
        tgen = poet.generate(1000, prompt)
        textlib.source_highlight(tgen, 10)
    return

In [None]:
# Start the interactive dialog; it keeps prompting until 'bye' is entered.
doDialog()

In [None]:
def save_checkpoint(state, filename='checkpoint.pth.tar', is_best=False):
    """Serialise ``state`` to ``filename`` with ``torch.save``.

    Args:
        state: object to save (e.g. a dict with model and optimizer state).
        filename: destination path of the checkpoint.
        is_best: when True, the checkpoint is additionally copied to
            ``'model_best.pth.tar'`` in the current working directory.
    """
    # Imported here because the module never imports shutil at the top;
    # without it the is_best branch fails with a NameError.
    import shutil

    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')

# Hard-coded precision value stored alongside the weights.
best_prec1 = 64.4

# Persist the last training step together with model and optimizer state.
save_checkpoint(dict(
    epoch=e,
    arch="poet8",
    state_dict=poet.state_dict(),
    best_prec1=best_prec1,
    optimizer=opti.state_dict(),
))
