<a href="https://colab.research.google.com/github/domschl/torch-poet/blob/master/torch_poet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import numpy as np
import os
import shutil
from enum import Enum
import re
import time
import logging
import sys
import json
import random
import torch
import torch.nn as nn
from torch import Tensor

try:
    from urllib.request import urlopen  # Py3
except:
    print("This notebook requires Python 3.")
try:
    import pathlib
except:
    print("At least python 3.5 is needed.")
    
try: # Colab instance?
    from google.colab import drive
except: # Not? ignore.
    pass

from IPython.core.display import display, HTML

# 0. System configuration

This notebook can either run on a local jupyter server, or on google cloud.
If a GPU is available, it will be used for training (if `force_cpu` is not set to `True`).

By default snapshots of the trained net are stored locally for jupyter instances, and on user's google drive for Google Colab instances. The snapshots allow the restart of training or inference at any time, e.g. after the Colab session was terminated.

Similarily, the text corpora that are used for training, can be cached on drive or locally.

In [10]:
# force_cpu=True: use CPU for training, even if a GPU is available.
#    Note: inference uses CPU always, because that is faster.
force_cpu=False

# Define where snapshots of training data are stored:
colab_google_drive_snapshots=True

# Define if training data (the texts downloaded from internet) are cached:
colab_google_drive_data_cache=True  # In colab mode cache to google drive
local_jupyter_data_cache=True       # In local jupyter mode cache to local path

In [11]:
is_colab_notebook = 'google.colab' in sys.modules
torch_version = torch.__version__

if torch.cuda.is_available() and force_cpu is not True:
    device='cuda'
    use_cuda = True
    print(f"PyTorch {torch_version}, running on GPU")
    if is_colab_notebook:
        card = !nvidia-smi
        if len(card)>=8:
            try:
                gpu_type=card[7][6:25]
                gpu_memory=card[8][33:54]
                print(f"Colab GPU: {gpu_type}, GPU Memory: {gpu_memory}")
            except Exception as e:
                pass
else:
    device='cpu'
    use_cuda = False
    print(f"{torch_version}, running on CPU")
    if colab_notebook:
        print("Note: on Google Colab, make sure to select:")
        print("      Runtime / Change Runtime Type / Hardware accelerator: GPU")

PyTorch 1.4.0, running on GPU


In [12]:
if is_colab_notebook:
    if colab_google_drive_snapshots:
        mountpoint='/content/drive'
        root_path='/content/drive/My Drive'
        if not os.path.exists(root_path):
            drive.mount(mountpoint)
        if not os.path.exists(root_path):
            print("Something went wrong with Google Drive access. Cannot save snapshots to GD.")
            colab_google_drive_snapshots=False
    else:
        print("Since google drive snapshots are not active, training data will be lost as soon as the Colab session terminates!")
        print("Set `colab_google_drive_snapshots` to `True` to make training data persistent.")
else:
    root_path='.'

In [13]:
def one_hot(p, dim):
    o=np.zeros(p.shape+(dim,), dtype=int)
    for y in range(p.shape[0]):
        for x in range(p.shape[1]):
            o[y,x,p[y,x]]=1
    return o

# 1. Text data collection

**Important note:** the following `project_name` determines the root directory for training data and model snapshots, so it should be changed whenever datasets of model configurations are changed.

In [14]:
project_name = "philosophers_lang_eng"
project_description = "A model trained on several books of philosophers in English language."

In [15]:
if is_colab_notebook:
    if colab_google_drive_data_cache is True:
        data_cache_path=os.path.join(root_path,f"Colab Notebooks/{project_name}/Data")
    else:
        data_cache_path=None
else:
    if local_jupyter_data_cache is True:
        data_cache_path=os.path.join(root_path,f"{project_name}/Data")
    else:
        data_cache_path=None

if data_cache_path is not None:
    pathlib.Path(data_cache_path).mkdir(parents=True, exist_ok=True)
    if not os.path.exists(data_cache_path):
        print("ERROR, the cache directory does not exist. This will fail.")
            
def get_cache_name(cache_path, author, title):
    if cache_path is None:
        return None
    cname=f"{author} - {title}.txt"
    cname=cname.replace('?','_')  # Gutenberg index is pre-Unicode-mess and some titles contain '?' for bad conversions.
    cache_filepath=os.path.join(cache_path, cname)
    return cache_filepath

## 1.1 Project Gutenberg data source

Search, filter, clean and download books from Project Gutenberg

In [16]:
logging.basicConfig(level=logging.INFO)

In [17]:
class GutenbergLib:
    """ A fuzzy, lightweight library to access, search and filter Project Gutenberg resources """
    def __init__(self, root_url="http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg", cache_dir="gutenberg"):
        """ GutenbergLib by default uses a mirror's root URL
        
        root_url -- url of Project Gutenberg or any mirror URL.
        cache_dir -- path to a directory that will be used to cache the Gutenberg index and already downloaded texts
        """
        self.log = logging.getLogger('GutenbergLib')
        self.root_url = root_url
        self.index=None
        self.NEAR=2048
        try:
            if not os.path.exists(cache_dir):
                os.makedirs(cache_dir)
            self.cache_dir=cache_dir
        except Exception as e:
            self.cache_dir=None
            self.log.error(f"Failed to create cache directory {cache_dir}, {e}")

    def _parse_record(self,record,verbose=True):
        """ internal function to recreate some consistent record information from near-freestyle text """
        rl=record.split('\n')
        white=str(chr(160))+str(chr(9))+" " # non-breaking space, TAB, and space
        ebook_no=""
        while len(rl[0])>0 and rl[0][-1] in white:
            rl[0]=rl[0][:-1]
        while len(rl[0])>0 and not rl[0][-1] in white:
            ebook_no=rl[0][-1]+ebook_no
            rl[0]=rl[0][:-1]
        while len(rl[0])>0 and rl[0][-1] in white:
            rl[0]=rl[0][:-1]
        
        # Sanity check
        try:
            fa=re.findall(ebook_no,"\A[0-9]+[A-C]\Z")
        except Exception as e:
            fa=None
            if verbose is True:
                self.log.debug(f"Failed to apply regex on >{ebook_no}<")
            
        if len(rl[0])<5 or fa==None or len(ebook_no)>7:
            if verbose is True:
                print("-------------------------------------")
                print(record)
                print("- - - - - - - - - - - - - - - - - - -")
                print(f"Dodgy record: {rl[0]}")
                print(f"    ebook-id:  >{ebook_no}<")
            return None
        
        for i in range(len(rl)):
            rl[i]=rl[i].strip()
            
        p=0
        while p<len(rl)-1:
            if len(rl[p+1])==0:
                print(f"Invalid rec: {record}")
                p+=1
            else:
                if rl[p+1][0]!="[":
                    rl[p]+=" "+rl[p+1]
                    del rl[p+1]
                    if rl[p][-1]==']':
                        p+=1
                else:
                    p+=1
        
        rec={}
        l0=rl[0].split(", by ")
        rec['title']=l0[0]
        rec['ebook_id']=ebook_no
        # if len(l0)>2:
        #    print(f"Chaos title: {rl[0]}")
        if len(l0)>1:
            rec['author']=l0[-1]
        for r in rl[1:]:
            if r[0]!='[' or r[-1]!=']':
                if r[0]=='[':
                    ind=r.rfind(']')
                    if ind != -1:
                        # print(f"Garbage trail {r}")
                        r=r[:ind+1]
                        # print(f"Fixed: {r}")
                    else:
                        # print(f"Missing closing ] {r}")
                        r+=']'
                        # print(f"Fixed: {r}")
            if r[0]=='[' and r[-1]==']':
                r=r[1:-1]
                i1=r.find(':')
                if i1==-1:
                    r=r.replace("Author a.k.a.","Author a.k.a.:")
                    i1=r.find(':')
                if i1!=-1:
                    i2=r[i1:].find(' ')+i1
                else:
                    i2=-1
                if i1==-1 and i2==-1:
                    pass
                    # print(f"Invalid attribut in {rl}::{r}")
                else:
                    if i2-i1==1:
                        key=r[:i1]
                        val=r[i2+1:]
                        if '[' in key or ']' in key or '[' in val or ']' in val or len(key)>15:
                            pass
                            # print("messy key/val")
                        else:
                            rec[key.strip().lower()]=val.strip()
                    else:
                        pass
                        # print(f"Bad attribute name terminator, missing ': ' {r}")
            else:
                pass
                # print(f"Invalid attribut in {rl}::{r}")
        if len(rec)>1:
            if "language" not in rec.keys():
                rec["language"]="English"
        return rec
        
    def _parse_index(self, lines):
        """ internal function to parse the fuzzy text-based Gutenberg table of content """
        class State(Enum):
            NONE=1,
            SYNC_START=2,
            SYNC_REC=3,
            END=5
    
        white=str(chr(160))+str(chr(9))+" " # non-breaking space, TAB, and space
        state=State.NONE
        start_token="~ ~ ~ ~"
        stop_token=["====="]
        end_token="<==End"
        ignore_headers=["TITLE and AUTHOR"]
        ignore_content=["Not in the Posted Archives","human-read audio ebooks", "Audio:"]
        empty_lines=0
        records=[]
        for line in lines:
            if line[:len(end_token)]==end_token:
                state=State.END
                break

            if state==State.NONE:
                if line[:len(start_token)]==start_token:
                    state=State.SYNC_START
                    empty_lines=0
                    continue
            if state==State.SYNC_START:
                if len(line.strip())==0:
                    empty_lines+=1
                    if empty_lines>1:
                        state=State.NONE
                        continue
                else:
                    stopped=False
                    for stop in stop_token:
                        if line[:len(stop)]==stop:
                            stopped=True
                            break
                    if stopped is True:
                        state=State.NONE
                        empty_lines=0
                        continue
                    ignore=False
                    for header in ignore_headers:
                        if line[:len(header)]==header:
                            empty_lines=0
                            ignore=True
                    for token in ignore_content:
                        if token in line:
                            empty_lines=0
                            ignore=True
                    if ignore is True:
                        continue
                    rec=line
                    state=State.SYNC_REC
                    continue
            if state==State.SYNC_REC:
                if len(line.strip())==0 or line[0] not in white:
                    if len(records)<10:
                        parsed_rec=self._parse_record(rec, verbose=True)
                    else:
                        parsed_rec=self._parse_record(rec, verbose=False)
                        
                    if parsed_rec is not None:
                        records.append(parsed_rec)
                    empty_lines=1
                    if len(line.strip())==0:
                        state=State.SYNC_START
                        continue
                    else:
                        rec=line
                        continue
                rec=rec+"\n"+line
        return records
                    
    def load_index(self, cache=True, cache_expire_days=30):
        """ This function loads the Gutenberg record index, either from cache, or from a website
        
        cache -- default True, use the cache directory to cache both index and text files. Index
        expires after cache_expire_days, text files never expire. Should *NOT* be set to False
        in order to prevent unnecessary re-downloading.
        cache_expire_days -- Number of days after which the index is re-downloaded."""
        raw_index=None
        if self.cache_dir is None:
            self.log.error("Cannot cache library index, no valid cache directory.")
            return False
        ts_file=os.path.join(self.cache_dir,"timestamp")
        cache_file=os.path.join(self.cache_dir,"gutenberg_index")
        expired=True
        read_from_cache=False
        if os.path.isfile(ts_file) and os.path.isfile(cache_file):
            try:
                with open(ts_file,'r') as f:
                    ts=float(f.read())
                if time.time()-ts<cache_expire_days*24*3600:
                    expired=False
                    read_from_cache = True
                    self.log.debug("Cache timestamp read.")
                else:
                    self.log.debug("Cache for index is expired, reloading from web.")
            except:
                self.log.debug("Failed to read cache timestamp, reloading from web.")
        if expired is False and os.path.isfile(cache_file):
            try:
                with open(cache_file,'r') as f:
                    raw_index=f.read()
                    self.log.info(f"Gutenberg index read from {cache_file}")
            except:
                expired=True
                self.log.debug("Failed to read cached index, reloading from web.")
        if expired is True:
            index_url=self.root_url+"/GUTINDEX.ALL"
            try:
                raw_index = urlopen(index_url).read().decode('utf-8')
                if raw_index[0]=='\ufeff':  # Ignore BOM
                    raw_index=raw_index[1:]
                raw_index=raw_index.replace('\r','')
                self.log.info(f"Gutenberg index read from {index_url}")
            except Exception as e:
                self.log.error(f"Failed to download Gutenberg index from {index_rul}, {e}")
                return False
        if cache is True and read_from_cache is False:
            try:
                with open(ts_file,'w') as f:
                    f.write(str(time.time()))
                    self.log.debug("Wrote read cache timestamp.")
            except Exception as e:
                print(f"Failed to write cache timestamp to {ts_file}, {e}")
            try:
                with open(cache_file,'w') as f:
                    f.write(raw_index)
                    self.log.debug("Wrote read cached index.")
            except Exception as e:
                print(f"Failed to write cached index to {cache_file}, {e}")
        lines=raw_index.split('\n')
        self.records=self._parse_index(lines)
    
    def load_book(self, ebook_id):
        """ get text of an ebook from Gutenberg by ebook_id 
        
        ebook_id -- Gutenberg id
        """
        if ebook_id is None or len(ebook_id)==0:
            return None
        if ebook_id[-1]=='C':
            ebook_id=ebook_id[:-1]
        path_stub=""
        
        for i in range(len(ebook_id)-1):
            path_stub+="/"+ebook_id[i]
        path_stub+="/"+ebook_id+"/"
        filenames=[(ebook_id+"-0.txt",'utf-8'), (ebook_id+".txt",'utf-8'), (ebook_id+"-8.txt","latin1")]
        cache_name=ebook_id+".txt"
        if self.cache_dir is not None:
            cache_file=os.path.join(self.cache_dir,cache_name)
            if os.path.isfile(cache_file):
                try:
                    with open(cache_file,'r') as f:
                        data=f.read()
                        self.log.info(f"Book read from cache at {cache_file}")
                        return data
                except Exception as e:
                    self.log.error(f"Failed to read cached file {cache_file}")
        data=None
        for filename, encoding in filenames:
            file_url=self.root_url+path_stub+filename
            try:
                data = urlopen(file_url).read().decode(encoding)
                self.log.info(f"Book downloaded from {file_url}")
                break
            except Exception as e:
                self.log.debug(f"URL-Download failed: {file_url}, {e}")
                pass
        if data is None:
            self.log.error(f"Failed to download {filenames}")
            return None
        if self.cache_dir is not None:
            try:
                with open(cache_file,'w') as f:
                    f.write(data)
            except:
                self.log.error(f"Failed to cache file {cache_file}")
        return data
    
    def filter_text(self, book_text):
        """ Heuristically remove header and trailer texts not part of the actual book 
        """
        start_tokens=["*** START OF THIS PROJECT", "E-text prepared by", "This book was generously provided by the "]
        near_start_tokens=["produced by ", "Produced by ", "Transcriber's Note", "Transcriber's note:", "Anmerkungen zur Tanskription"]
        end_tokens=["End of the Project Gutenberg", "*** END OF THIS PROJECT", "***END OF THE PROJECT GUTENBER",
                   "Ende dieses Projekt Gutenberg", "End of Project Gutenberg", "Transcriber's Note"]
        blen=len(book_text)
        
        pstart=0
        for token in start_tokens:
            pos=book_text.find(token)
            if pos > pstart:
                pstart = pos
                self.log.debug(f"Start-token [{token}] found at position {pos}")
        if pstart>0:
            pos=book_text[pstart:].find("\n\n")
            if pos>=0 and pos <= self.NEAR:
                pos += pstart
                while book_text[pos]=='\n':
                    pos += 1  # eof?!
                pstart=pos
        if pstart>blen/2:
            self.log.warning("Preamble is taking more than half of the book!")
        new_book=book_text[pstart:]
        
        xpos=-1
        for token in near_start_tokens:
            pos=new_book.find(token)
            if pos>=0 and pos<=self.NEAR:
                self.log.debug(f"Near-Start-token [{token}] found at position {pos}")
                if pos>xpos:
                    xpos=pos
        if xpos > -1:
            pos2=new_book[xpos:].find("\n\n")
            self.log.debug(f"Trying extra skipping for {pos2}...")
            if pos2<=self.NEAR and pos2>0:
                self.log.debug("Trying extra skipping (2)...")
                while new_book[xpos+pos2]=='\n':
                    pos2 += 1
                new_book=new_book[xpos+pos2:]
                self.log.debug(f"Additionally shortened start by {xpos+pos2} chars")
        
        pend=len(new_book)
        for token in end_tokens:
            pos=new_book.find(token)
            if pos!=-1 and pos < pend:
                self.log.debug(f"End-token [{token}] found at pos {pos}")
                pend = pos
        if pend<len(new_book):
            pos=new_book[:pend].rfind("\n\n")
            if pos>0:
                while new_book[pos]=='\n':
                    pos -= 1  # eof?!
                pend=pos+1
        else:
            self.log.debug("No end token found!")
        if pend<len(new_book)/2:
            self.log.warning("End-text is taking more than half of the book!")
        new_book=new_book[:pend]
        return new_book
        
    def find_keywords(self,*search_keys):
        """ Search of an arbitrary number of keywords in a book record
        
        returns -- list of records that contain all keywords in any field. """
        frecs=[]
        for rec in self.records:
            found=True
            for sk in search_keys:
                subkey=False
                for key in rec.keys():
                    if sk.lower() in key.lower() or sk.lower() in rec[key].lower():
                        subkey=True
                        break
                if subkey is False:
                    found=False
                    break
            if found is True:
                frecs += [rec]
        return frecs
    
    def search(self, search_dict):
        """ Search for book record with key specific key values
        For a list of valid keys, use `get_record_keys()`
        Standard keys are:
        ebook_id, author, language, title
        example: search({"title": ["philosoph","phenomen","physic","hermeneu","logic"], "language":"english"})
        Find all books whose titles contain at least one the keywords, language english. Search keys can either be
        search for a single keyword (e.g. english), or an array of keywords. 
        returns -- list of records """
        frecs=[]
        for rec in self.records:
            found=True
            for sk in search_dict:
                if sk not in rec:
                    found=False
                    break
                else:
                    skl=search_dict[sk]
                    if not isinstance(skl,list):
                        skl=[skl]
                    nf=0
                    for skli in skl:
                        if skli.lower() in rec[sk].lower():
                            nf=nf+1
                    if nf==0:
                        found=False
                        break
            if found is True:
                frecs += [rec]
        return frecs
        
    
    def get_record_keys(self):
        """ Get a list of all keys that are used within records. Standard keys are:
        ebook_id, author, language, title
        
        returns -- list of all different keys that are somehow used."""
        rks=[]
        for r in self.records:
            rks=set(list(rks) + list(r.keys()))
        return rks

    def get_unique_record_values(self, key):
        """ Get a list of all unique values a given keys has for all records.
        get_unique_records_values('language') returns all languages in Gutenberg."""
        uv=[]
        if key not in self.get_record_keys():
            print(f"{key} is not a key used in any record!")
            return None
        for r in self.records:
            if key in r:
                uv=set(list(uv)+[r[key]])
        uv=sorted(uv)
        return uv

In [18]:
# Get the list of available books on Gutenberg.
gbl=GutenbergLib(cache_dir=os.path.join(root_path, 'gutenberg_cache'))
gbl.load_index()

INFO:GutenbergLib:Gutenberg index read from ./gutenberg_cache/gutenberg_index


In [19]:
# sample searches
search_specs=[
    {"title": ["love", "hate", "emotion", "drama"], "language": ["english"]},
    {"author": ["brontë","Jane Austen", "Woolf", "goethe", "kant"], "language": ["english", "german"]},
    {"title": ["philosoph", "physic", "phenomen", "logic"], "language": ["english"]},
]
for search_spec in search_specs:
    book_list=gbl.search(search_spec)
    print(f"{len(book_list)} matching books found with search {search_spec}.")
# a search spec can be used by the following text library as datasource, it will automatically download, filter and prepare the content of the books requested.

305 matching books found with search {'title': ['love', 'hate', 'emotion', 'drama'], 'language': ['english']}.
105 matching books found with search {'author': ['brontë', 'Jane Austen', 'Woolf', 'goethe', 'kant'], 'language': ['english', 'german']}.
283 matching books found with search {'title': ['philosoph', 'physic', 'phenomen', 'logic'], 'language': ['english']}.


In [20]:
def create_libdesc(project_name, description, cache_path, book_list):
    libdesc={"name": project_name, "description": description, "lib": []}
    if cache_path is None or not os.path.exists(cache_path):
        print(f"A valid cache {cache_path} is needed!")
        return None
    for book_entry in book_list:
        try:
            book_raw_content=gbl.load_book(book_entry['ebook_id'])
        except Exception as e:
            print(f"Failed to download ebook_id {book_entry}, {e}")
            continue
        if book_raw_content is not None:
            try:
                book_text=gbl.filter_text(book_raw_content)
            except Exception as e:
                print(f"Internal error when filtering {book_entry}, {e}")
                continue
            filename=get_cache_name(cache_path, book_entry['author'], book_entry['title'])
            try:
                with open(filename,'w') as f:
                    f.write(book_text)
                    print(f"Cached {filename}")
                    libdesc["lib"].append((filename, book_entry['author'], book_entry['title']))
            except Exception as e:
                print(f"Failed to cache {filename}", {e})
    return libdesc

In [21]:
book_list=gbl.search({"author": ["platon", "descartes", "john locke", "david hume", "kant", "schopenhauer", "leibniz", "kierkegaard", "hegel", "nietzsche", "heidegger", "fichte"], "language": ["english"]})
print(f"{len(book_list)} books found.")

57 books found.


In [22]:
book_list

[{'title': 'Selections from the Writings of Kierkegaard',
  'ebook_id': '60333',
  'author': 'Søren Kierkegaard',
  'language': 'English'},
 {'title': 'Vignettes',
  'ebook_id': '60193',
  'author': 'Hubert Crackanthorpe',
  'subtitle': 'A Miniature Journal of Whim and Sentiment',
  'language': 'English'},
 {'title': "Hume's Political Discourses",
  'ebook_id': '59792',
  'author': 'David Hume',
  'language': 'English'},
 {'title': 'The History of Philosophy: Volume 3 of 3',
  'ebook_id': '58169',
  'author': 'Georg Wilhelm Hegel',
  'language': 'English'},
 {'title': 'The Philosophy of Fine Art, Volume 4 of 4',
  'ebook_id': '55731',
  'author': 'G. W. F. Hegel',
  'subtitle': "Hegel's Aesthetik",
  'language': 'English'},
 {'title': 'The Philosophy of Fine Art, Volume 3 of 4',
  'ebook_id': '55623',
  'author': 'G. W. F. Hegel',
  'subtitle': "Hegel's Aesthetik",
  'language': 'English'},
 {'title': 'The Philosophy of Fine Art, Vol. 2 of 4',
  'ebook_id': '55445',
  'author': 'G. W. 

In [23]:
# this will download the books! make sure it's a reasonable number of books
libdesc=create_libdesc(project_name, project_description, data_cache_path, book_list)

with open(os.path.join(data_cache_path,'libdesc.json'),'w') as f:
    json.dump(libdesc,f,indent=4)

INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/6/0/3/3/60333/60333-0.txt
INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/6/0/1/9/60193/60193-0.txt


Cached ./philosophers_lang_eng/Data/Søren Kierkegaard - Selections from the Writings of Kierkegaard.txt
Cached ./philosophers_lang_eng/Data/Hubert Crackanthorpe - Vignettes.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/5/9/7/9/59792/59792-0.txt


Cached ./philosophers_lang_eng/Data/David Hume - Hume's Political Discourses.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/5/8/1/6/58169/58169-0.txt


Cached ./philosophers_lang_eng/Data/Georg Wilhelm Hegel - The History of Philosophy: Volume 3 of 3.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/5/5/7/3/55731/55731-0.txt


Cached ./philosophers_lang_eng/Data/G. W. F. Hegel - The Philosophy of Fine Art, Volume 4 of 4.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/5/5/6/2/55623/55623-0.txt


Cached ./philosophers_lang_eng/Data/G. W. F. Hegel - The Philosophy of Fine Art, Volume 3 of 4.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/5/5/4/4/55445/55445-0.txt


Cached ./philosophers_lang_eng/Data/G. W. F. Hegel - The Philosophy of Fine Art, Vol. 2 of 4.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/5/5/3/3/55334/55334-0.txt


Cached ./philosophers_lang_eng/Data/G. W. F. Hegel - The Philosophy of Fine Art, Vol. 1 of 4.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/5/5/1/0/55108/55108-0.txt


Cached ./philosophers_lang_eng/Data/G. W. F. Hegel - The Logic of Hegel.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/5/4/9/9/54992/54992-0.txt


Cached ./philosophers_lang_eng/Data/William Wallace and G. W. F. Hegel - Prolegomena to the Study of Hegel's Philosophy.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/5/3/7/9/53792/53792-8.txt


Cached ./philosophers_lang_eng/Data/David Hume - Philosophical Works, Vol. 2 of 4.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/5/3/7/9/53791/53791-8.txt


Cached ./philosophers_lang_eng/Data/David Hume - Philosophical Works, Vol. 1 of 4.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/5/2/9/1/52915/52915-0.txt


Cached ./philosophers_lang_eng/Data/Friedrich Nietzsche - The Will to Power, Books III and IV.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/5/2/9/1/52914/52914-0.txt


Cached ./philosophers_lang_eng/Data/Friedrich Nietzsche - The Will to Power, Books I and II.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/5/2/8/8/52881/52881-0.txt
INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/5/2/8/2/52821/52821-0.txt


Cached ./philosophers_lang_eng/Data/Friedrich Nietzsche - The Joyful Wisdom.txt
Cached ./philosophers_lang_eng/Data/Immanuel Kant - Kant's Prolegomena.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/5/2/3/1/52319/52319-0.txt


Cached ./philosophers_lang_eng/Data/Friedrich Wilhelm Nietzsche - The Genealogy of Morals.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/5/2/2/6/52263/52263-0.txt


Cached ./philosophers_lang_eng/Data/Friedrich Wilhelm Nietzsche - Twilight of the Idols - The Antichrist.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/5/2/1/9/52190/52190-0.txt


Cached ./philosophers_lang_eng/Data/Friedrich Wilhelm Nietzsche - Ecce Homo.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/5/1/7/1/51710/51710-8.txt


Cached ./philosophers_lang_eng/Data/Friedrich Wilhelm Nietzsche - Thoughts out of Season, Part I.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/5/1/6/3/51636/51636-0.txt


Cached ./philosophers_lang_eng/Data/Georg Wilhelm Hegel - Hegel's Lectures on the History of Philosophy: Vol. 2 of 3.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/5/1/6/3/51635/51635-0.txt
INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/5/1/5/8/51580/51580-0.txt


Cached ./philosophers_lang_eng/Data/Georg Wilhelm Hegel - Hegel's Lectures on the History of Philosophy: Vol. 1 of 3.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/5/1/5/4/51548/51548-0.txt


Cached ./philosophers_lang_eng/Data/Friedrich Wilhelm Nietzsche - On the Future of our Educational Institutions - Homer and Classical Philology.txt
Cached ./philosophers_lang_eng/Data/Friedrich Nietzsche - Early Greek Philosophy & Other Essays.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/5/0/9/6/50966/50966-0.txt
INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/5/0/9/2/50922/50922-0.txt


Cached ./philosophers_lang_eng/Data/Arthur Schopenhauer - On the Fourfold Root of the Principle of Sufficient Reason and On the Will in Nature: Two Essays (revised edition).txt
Cached ./philosophers_lang_eng/Data/Immanuel Kant - Perpetual Peace.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/4/8/4/3/48433/48433-0.txt


Cached ./philosophers_lang_eng/Data/Immanuel Kant - Kant's Critique of Judgement.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/4/6/3/3/46330/46330.txt
INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/4/6/0/6/46060/46060.txt


Cached ./philosophers_lang_eng/Data/Georg Hegel - The Introduction to Hegel's Philosophy of Fine Arts.txt
Cached ./philosophers_lang_eng/Data/Emanuel Kant - Of the Injustice of Counterfeiting Books.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/4/4/9/2/44929/44929-0.txt


Cached ./philosophers_lang_eng/Data/Arthur Schopenhauer - The Basis of Morality.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/4/0/8/6/40868/40868-0.txt


Cached ./philosophers_lang_eng/Data/Arthur Schopenhauer - The World as Will and Idea (Vol. 3 of 3).txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/4/0/0/9/40097/40097-0.txt


Cached ./philosophers_lang_eng/Data/Arthur Schopenhauer - The World as Will and Idea (Vol. 2 of 3).txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/3/9/9/5/39955/39955-0.txt


Cached ./philosophers_lang_eng/Data/Friedrich Wilhelm Nietzsche - The Dawn of Day.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/3/9/0/6/39064/39064-0.txt


Cached ./philosophers_lang_eng/Data/Georg Wilhelm Friedrich Hegel - Hegel's Philosophy of Mind.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/3/8/4/2/38427/38427-0.txt


Cached ./philosophers_lang_eng/Data/Arthur Schopenhauer - The World as Will and Idea (Vol. 1 of 3).txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/3/8/2/2/38226/38226-8.txt


Cached ./philosophers_lang_eng/Data/Friedrich Nietzsche - Thoughts Out of Season, Part 2.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/3/8/1/4/38145/38145.txt


Cached ./philosophers_lang_eng/Data/Friedrich Nietzsche - Human, All Too Human.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/3/7/8/4/37841/37841-0.txt
INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/3/6/1/2/36120/36120-0.txt


Cached ./philosophers_lang_eng/Data/Friedrich Wilhelm Nietzsche - Human, All-Too-Human, Part II.txt
Cached ./philosophers_lang_eng/Data/David Hume - Essays.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/2/5/8/3/25830/25830.txt


Cached ./philosophers_lang_eng/Data/Rene Descartes - Discourse of a Method for the Well Guiding of Reason.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/1/7/1/4/17147/17147.txt


Cached ./philosophers_lang_eng/Data/G. W. Leibniz - Theodicy.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/9/6/6/9662/9662.txt


Cached ./philosophers_lang_eng/Data/David Hume and L. A. Selby-Bigge - Enquiry Concerning Human Understanding.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/7/3/7/7370/7370.txt


Cached ./philosophers_lang_eng/Data/John Locke - Second Treatise of Government.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/7/2/0/7207/7207-8.txt


Cached ./philosophers_lang_eng/Data/Friedrich Wilhelm Nietzsche - Menschliches, Allzumenschliches.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/5/6/8/5684/5684.txt


Cached ./philosophers_lang_eng/Data/Immanuel Kant - The Metaphysical Elements of Ethics.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/5/6/8/5683/5683.txt


Cached ./philosophers_lang_eng/Data/Immanuel Kant - The Critique of Practical Reason.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/5/6/8/5682/5682.txt


Cached ./philosophers_lang_eng/Data/Immanuel Kant - Fundamental Principles of the Metaphysic of Morals.txt


ERROR:GutenbergLib:Failed to download [('5652-0.txt', 'utf-8'), ('5652.txt', 'utf-8'), ('5652-8.txt', 'latin1')]
INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/5/6/3/5637/5637.txt


Cached ./philosophers_lang_eng/Data/Montaigne, Michel, Sainte-Beuve, Charles-Augustin; Renan, Ernest, Lessing, Gotthold Ephraim, Von Schiller, J.C., Kant, Immanuel, Mazzini, and Giuseppe - Literary and Philosophical Essays.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/4/7/0/4705/4705.txt


Cached ./philosophers_lang_eng/Data/David Hume - A Treatise of Human Nature, Vols. 1 & 2.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/4/5/8/4583/4583.txt


Cached ./philosophers_lang_eng/Data/David Hume - Dialogues Concerning Natural Religion.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/4/3/9/4391/4391.txt


Cached ./philosophers_lang_eng/Data/Rene Descartes - Selections From The Principles of Philosophy.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/4/3/6/4363/4363.txt


Cached ./philosophers_lang_eng/Data/Friedrich Nietzsche - Beyond Good and Evil.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/4/3/2/4320/4320.txt


Cached ./philosophers_lang_eng/Data/David Hume - An Enquiry Concerning the Principles of Morals.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/4/2/8/4280/4280-0.txt


Cached ./philosophers_lang_eng/Data/Immanuel Kant - The Critique of Pure Reason.txt


INFO:GutenbergLib:Book downloaded from http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/1/9/9/1998/1998-0.txt


Cached ./philosophers_lang_eng/Data/Friedrich Nietzsche - Thus Spake Zarathustra.txt


ERROR:GutenbergLib:Failed to download [('59-0.txt', 'utf-8'), ('59.txt', 'utf-8'), ('59-8.txt', 'latin1')]


In [24]:
libdesc

{'name': 'philosophers_lang_eng',
 'description': 'A model trained on several books of philosophers in English language.',
 'lib': [('./philosophers_lang_eng/Data/Søren Kierkegaard - Selections from the Writings of Kierkegaard.txt',
   'Søren Kierkegaard',
   'Selections from the Writings of Kierkegaard'),
  ('./philosophers_lang_eng/Data/Hubert Crackanthorpe - Vignettes.txt',
   'Hubert Crackanthorpe',
   'Vignettes'),
  ("./philosophers_lang_eng/Data/David Hume - Hume's Political Discourses.txt",
   'David Hume',
   "Hume's Political Discourses"),
  ('./philosophers_lang_eng/Data/Georg Wilhelm Hegel - The History of Philosophy: Volume 3 of 3.txt',
   'Georg Wilhelm Hegel',
   'The History of Philosophy: Volume 3 of 3'),
  ('./philosophers_lang_eng/Data/G. W. F. Hegel - The Philosophy of Fine Art, Volume 4 of 4.txt',
   'G. W. F. Hegel',
   'The Philosophy of Fine Art, Volume 4 of 4'),
  ('./philosophers_lang_eng/Data/G. W. F. Hegel - The Philosophy of Fine Art, Volume 3 of 4.txt',
  

## 1.2 Text library

`TextLibrary` class: text library for training, encoding, batch generation,
and formatted source display. It read some books from Project Gutenberg
and supports creation of training batches. The output functions support
highlighting to allow to compare generated texts with the actual sources
to help to identify identical (memorized) parts of a given length.

In [25]:
use_dark_mode=False  # Set to false for white background

In [76]:
class TextLibrary:
    def __init__(self, descriptors, text_data_cache_directory=None, max=100000000):
        self.descriptors = descriptors
        self.data = ''
        self.cache_dir=text_data_cache_directory
        self.files = []
        self.c2i = {}
        self.i2c = {}
        index = 1
        for descriptor, author, title in descriptors:
            fd = {}
            cache_name=get_cache_name(self.cache_dir, author, title)
            if os.path.exists(cache_name):
                is_cached=True
            else:
                is_cached=False
            valid=False
            if descriptor[:4] == 'http' and is_cached is False:
                try:
                    print(f"Downloading {cache_name}")
                    dat = urlopen(descriptor).read().decode('utf-8')
                    if dat[0]=='\ufeff':  # Ignore BOM
                        dat=dat[1:]
                    dat=dat.replace('\r', '')  # get rid of pesky LFs 
                    self.data += dat
                    fd["title"] = title
                    fd["author"] = author
                    fd["data"] = dat
                    fd["index"] = index
                    index += 1
                    valid=True
                    self.files.append(fd)
                except Exception as e:
                    print(f"Can't download {descriptor}: {e}")
            else:
                fd["title"] = title
                fd["author"] = author
                try:
                    if is_cached is True:
                        print(f"Reading {cache_name} from cache")
                        f = open(cache_name)
                    else:    
                        f = open(descriptor)
                    dat = f.read(max)
                    self.data += dat
                    fd["data"] = dat
                    fd["index"] = index
                    index += 1
                    self.files.append(fd)
                    f.close()
                    valid=True
                except Exception as e:
                    print(f"ERROR: Cannot read: {filename}: {e}")
            if valid is True and is_cached is False and self.cache_dir is not None:
                try:
                    print(f"Caching {cache_name}")
                    f = open(cache_name, 'w')
                    f.write(dat)
                    f.close()
                except Exception as e:
                    print(f"ERROR: failed to save cache {cache_name}: {e}")
                
                
        ind = 0
        for c in self.data:  # sets are not deterministic
            if c not in self.c2i:
                self.c2i[c] = ind
                self.i2c[ind] = c
                ind += 1
        self.ptr = 0
        
    def display_colored_html(self, textlist, dark_mode=False, display_ref_anchor=True, pre='', post=''):
        bgcolorsWht = ['#d4e6e1', '#d8daef', '#ebdef0', '#eadbd8', '#e2d7d5', '#edebd0',
                    '#ecf3cf', '#d4efdf', '#d0ece7', '#d6eaf8', '#d4e6f1', '#d6dbdf',
                    '#f6ddcc', '#fae5d3', '#fdebd0', '#e5e8e8', '#eaeded', '#A9CCE3']
        bgcolorsDrk = ['#342621','#483a2f', '#3b4e20', '#2a3b48', '#324745', '#3d3b30',
                    '#3c235f', '#443f4f', '#403c37', '#463a28', '#443621', '#364b5f',
                    '#264d4c', '#2a3553', '#3d2b40', '#354838', '#3a3d4d', '#594C23']
        if dark_mode is False:
            bgcolors=bgcolorsWht
        else:
            bgcolors=bgcolorsDrk
        out = ''
        for txt, ind in textlist:
            txt = txt.replace('\n', '<br>')
            if ind == 0:
                out += txt
            else:
                if display_ref_anchor is True:
                    anchor="<sup>[" + str(ind) + "]</sup>"
                else:
                    anchor=""
                out += "<span style=\"background-color:"+bgcolors[ind % 16]+";\">" + \
                       txt + "</span>"+ anchor
        display(HTML(pre+out+post))

    def source_highlight(self, txt, minQuoteSize=10, dark_mode=False, display_ref_anchor=True):
        tx = txt
        out = []
        qts = []
        txsrc = [("Sources: ", 0)]
        sc = False
        noquote = ''
        while len(tx) > 0:  # search all library files for quote 'txt'
            mxQ = 0
            mxI = 0
            mxN = ''
            found = False
            for f in self.files:  # find longest quote in all texts
                p = minQuoteSize
                if p <= len(tx) and tx[:p] in f["data"]:
                    p = minQuoteSize + 1
                    while p <= len(tx) and tx[:p] in f["data"]:
                        p += 1
                    if p-1 > mxQ:
                        mxQ = p-1
                        mxI = f["index"]
                        mxN = f"{f['author']}: {f['title']}"
                        found = True
            if found:  # save longest quote for colorizing
                if len(noquote) > 0:
                    out.append((noquote, 0))
                    noquote = ''
                out.append((tx[:mxQ], mxI))
                tx = tx[mxQ:]
                if mxI not in qts:  # create a new reference, if first occurence
                    qts.append(mxI)
                    if sc:
                        txsrc.append((", ", 0))
                    sc = True
                    txsrc.append((mxN, mxI))
            else:
                noquote += tx[0]
                tx = tx[1:]
        if len(noquote) > 0:
            out.append((noquote, 0))
            noquote = ''
        self.display_colored_html(out, dark_mode=dark_mode, display_ref_anchor=display_ref_anchor)
        if len(qts) > 0:  # print references, if there is at least one source
            self.display_colored_html(txsrc, dark_mode=dark_mode, display_ref_anchor=display_ref_anchor, pre="<small><p style=\"text-align:right;\">",
                                     post="</p></small>")

    def get_slice(self, length):
        if (self.ptr + length >= len(self.data)):
            self.ptr = 0
        if self.ptr == 0:
            rst = True
        else:
            rst = False
        sl = self.data[self.ptr:self.ptr+length]
        self.ptr += length
        return sl, rst

    def decode(self, ar):
        return ''.join([self.i2c[ic] for ic in ar])

    def encode(self, s):
        return [self.c2i[c] for c in s]
        
    def get_random_slice(self, length):
        p = random.randrange(0, len(self.data)-length)
        sl = self.data[p:p+length]
        return sl

    def get_slice_array(self, length):
        ar = np.array([c for c in self.get_slice(length)[0]])
        return ar

    def get_encoded_slice(self, length):
        s, rst = self.get_slice(length)
        X = [self.c2i[c] for c in s]
        return X
        
    def get_encoded_slice_array(self, length):
        return np.array(self.get_encoded_slice(length))

    def get_sample(self, length):
        s, rst = self.get_slice(length+1)
        X = [self.c2i[c] for c in s[:-1]]
        y = [self.c2i[c] for c in s[1:]]
        return (X, y, rst)

    def get_random_sample(self, length):
        s = self.get_random_slice(length+1)
        X = [self.c2i[c] for c in s[:-1]]
        y = [self.c2i[c] for c in s[1:]]
        return (X, y)

    def get_sample_batch(self, batch_size, length):
        smpX = []
        smpy = []
        for i in range(batch_size):
            Xi, yi, rst = self.get_sample(length)
            smpX.append(Xi)
            smpy.append(yi)
        return smpX, smpy, rst

    def get_random_sample_batch(self, batch_size, length):
        smpX = []
        smpy = []
        for i in range(batch_size):
            Xi, yi = self.get_random_sample(length)
            smpX.append(Xi)
            smpy.append(yi)
        return np.array(smpX), np.array(smpy)
    
    def get_random_onehot_sample_batch(self, batch_size, length):
        X, y = self.get_random_sample_batch(batch_size, length)
        return one_hot(X,len(self.i2c)), y

## 1.3 Data sources

Data sources can either be:

1. files from local filesystem, or for colab notebooks from google drive, 
2. http(s) links

The `name` given will be use as directory name for both snapshots and model data caches.

Each entry in the `lib` array contains of:

1. (1) a local filename or (2) https(s) link
2. an Author's name
3. a title

Samples: (we are using the `libdesc` created above from `GutenbergLib`
```
libdesc = {
    "name": "Women-Writers",
    "description": "A collection of works of Woolf, Austen and Brontë",
    "lib": [
        # local file:
        # ('data/tiny-shakespeare.txt', 'William Shakespeare', 'Some parts'),

        # http URLs:
        # ('http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/1/0/100/100-0.txt', 'Shakespeare', 'Collected Works'),
        # ('http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/3/7/4/3/37431/37431.txt', 'Jane Austen', 'Pride and Prejudice'),
        # ('http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/7/6/768/768.txt', 'Emily Brontë', 'Wuthering Heights'),         
        # ('http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/1/4/144/144.txt', 'Virginia Woolf', 'Voyage out'),
        # ('http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/1/5/158/158.txt', 'Jane Austen', 'Emma'),
    ]
}
```

In [77]:
textlib = TextLibrary(libdesc["lib"], text_data_cache_directory=data_cache_path)

Reading ./philosophers_lang_eng/Data/Søren Kierkegaard - Selections from the Writings of Kierkegaard.txt from cache
Reading ./philosophers_lang_eng/Data/Hubert Crackanthorpe - Vignettes.txt from cache
Reading ./philosophers_lang_eng/Data/David Hume - Hume's Political Discourses.txt from cache
Reading ./philosophers_lang_eng/Data/Georg Wilhelm Hegel - The History of Philosophy: Volume 3 of 3.txt from cache
Reading ./philosophers_lang_eng/Data/G. W. F. Hegel - The Philosophy of Fine Art, Volume 4 of 4.txt from cache
Reading ./philosophers_lang_eng/Data/G. W. F. Hegel - The Philosophy of Fine Art, Volume 3 of 4.txt from cache
Reading ./philosophers_lang_eng/Data/G. W. F. Hegel - The Philosophy of Fine Art, Vol. 2 of 4.txt from cache
Reading ./philosophers_lang_eng/Data/G. W. F. Hegel - The Philosophy of Fine Art, Vol. 1 of 4.txt from cache
Reading ./philosophers_lang_eng/Data/G. W. F. Hegel - The Logic of Hegel.txt from cache
Reading ./philosophers_lang_eng/Data/William Wallace and G. W. 

In [90]:
class TextLibraryDataset(torch.utils.data.Dataset):
    def __init__(self, textlib, sample_length, encode=True, text_quanta=10):
        self.textlib=textlib
        self.encode=encode
        self.text_quanta=text_quanta
        self.sample_length=sample_length
        self.length=int((len(self.textlib.data)-sample_length)/text_quanta)
        
    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        if type(idx)==list:
            ans=[]
            for id in idx:
                if id>=0 and id<self.length:
                    if self.encode is True:
                        ansi=textlib.encode(self.textlib.data[id*self.text_quanta:id*self.text_quanta+self.sample_length])
                    else:
                        ansi=self.textlib.data[id*self.text_quanta:id*self.text_quanta+self.sample_length]                        
                    ans.append(ansi)
            return ans
        else:
            if idx>=self.length:
                return None
            if self.encode is True:
                ansi=textlib.encode(self.textlib.data[idx*self.text_quanta:idx*self.text_quanta+self.sample_length])
            else:
                ansi=self.textlib.data[idx*self.text_quanta:idx*self.text_quanta+self.sample_length]                        
            return ansi        

In [91]:
tld=TextLibraryDataset(textlib,80,False,10)

In [93]:
tld[1]

'OF THIS PROJECT GUTENBERG EBOOK WRITINGS OF KIERKEGAARD ***\n\n\n\n\nProduced by Laur'

In [95]:
dl=torch.utils.data.DataLoader(tld,batch_size=3, shuffle=True, num_workers=4)

# 2. The deep LSTM model

# 2.1 Model configuration parameters

In [30]:
model_params = {
    "model_name": libdesc['name'],
    "vocab_size": len(textlib.i2c),
    "neurons": 256,
    "layers": 2,
    "learning_rate": 1.e-3,
    "steps": 60,
    "batch_size": 256
}

## 2.2 The char-rnn model class

In [31]:
class Poet(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, device):
        super(Poet, self).__init__()
        
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size
        self.device=device
        
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True, dropout=0)
        
        self.demb = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=-1)  # negative dims are a recent thing (as 2018-03), remove for old vers.
    
    def init_hidden(self, batch_size):
        self.h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size, device=self.device)
        self.c0 = torch.zeros(self.num_layers, batch_size, self.hidden_size, device=self.device)

    def forward(self, inputx, steps):
        self.lstm.flatten_parameters()
        hn, (self.h0, self.c0) = self.lstm(inputx.to(self.device), (self.h0, self.c0))
        hnr = hn.contiguous().view(-1,self.hidden_size)
        op = self.demb(hnr)
        opr = op.view(-1, steps ,self.output_size)
        return opr

    def generate(self, n, start=None, temperature=1.0):
        s=''
        torch.set_grad_enabled(False)
        if start==None or len(start)==0:
            start=' '
        self.init_hidden(1)
        for c in start:
            X=np.array([[textlib.c2i[c]]])
            Xo=one_hot(X,self.output_size)
            Xt = Tensor(torch.from_numpy(np.array(Xo,dtype=np.float32))).to(self.device)
            ypl = self.forward(Xt,1)
            ypl2 = ypl.view(-1,self.output_size)
            if temperature>0.0:
                ypl2 = ypl2 / temperature
            yp = self.softmax(ypl2)
        for i in range(n):
            ypc=Tensor.cpu(yp.detach()) # .cpu()
            y_pred=ypc.numpy()
            inds=list(range(self.output_size))
            ind = np.random.choice(inds, p=y_pred.ravel())
            s=s+textlib.i2c[ind]
            X=np.array([[ind]])
            Xo=one_hot(X,self.output_size)
            Xt = Tensor(torch.from_numpy(np.array(Xo,dtype=np.float32))).to(self.device)
            ypl = self.forward(Xt,1)
            ypl2 = ypl.view(-1,self.output_size)
            if temperature>0.0:
                ypl2 = ypl2 / temperature
            yp = self.softmax(ypl2)
        torch.set_grad_enabled(True)
        return s    

## 2.3 Model instance

In [32]:
poet = Poet(model_params['vocab_size'], model_params['neurons'], model_params['layers'], model_params['vocab_size'], device).to(device)

## 2.4 Optimizer

In [33]:
criterion = nn.CrossEntropyLoss()
learning_rate = model_params['learning_rate']

opti = torch.optim.Adam(poet.parameters(),lr=learning_rate);

## 2.5 Helper Functions

These allow to save or restore the training data. Saving and restoring can either be performed:

* Jupyter: store/restore in a local directory,
* Colab: store/restore on google drive. The training-code (using load_checkpoint()) will display an authentication url and code input-box in order to be able to access your google drive from this notebook. This allows to continue training sessions (or inference) after the Colab session was terminated.

In [34]:
if is_colab_notebook:
    if colab_google_drive_snapshots is True:
        snapshot_path=os.path.join(root_path,f"Colab Notebooks/{model_params['model_name']}/Snapshots")
    else:
        snapshot_path=None
else:
    snapshot_path=os.path.join(root_path,f"{model_params['model_name']}/Snapshots")

In [35]:
def get_project_path():
    if snapshot_path is None:
        return None
    project_path_ext=f"model-{model_params['vocab_size']}x{model_params['steps']}x{model_params['layers']}x{model_params['neurons']}"
    return os.path.join(snapshot_path, project_path_ext)

def create_project_path():
    if snapshot_path is None:
        return None
    ppath=get_project_path()
    pathlib.Path(ppath).mkdir(parents=True, exist_ok=True)

In [36]:
if snapshot_path is not None:
    pathlib.Path(snapshot_path).mkdir(parents=True, exist_ok=True)
    create_project_path()
    with open(os.path.join(get_project_path(),'model_params.json'),'w') as f:
        json.dump(model_params,f,indent=4)

In [37]:
def save_checkpoint(epoch, loss, pr, best_pr, filename='checkpoint.pth.tar'):
    if snapshot_path is None:
        return
    state={
            'epoch': epoch,
            'model_config': model_params,
            'state_dict': poet.state_dict(),
            'optimizer' : opti.state_dict(),
            'precision': pr,
            'loss': loss,
        }
    project_path=get_project_path()
    save_file=os.path.join(project_path,filename)
    best_file=os.path.join(project_path,'model_best.pth.tar')
    torch.save(state, save_file)
    if pr>best_pr:
        best_pr=pr
        shutil.copyfile(save_file, best_file )
        print(f"Saved best precision model, prec={pr}")
    else:
        print(f"saved last model data, prec={pr}")

def save_history(history, filename="history.json"):
    if snapshot_path is None:
        return
    project_path=get_project_path()
    save_file=os.path.join(project_path,filename)
    try:
        with open(save_file, 'w') as f:
            json.dump(history, f)
    except Exception as e:
        print(f"Failed to write training history file {save_file}, {e}")

def load_history(filename="history.json"):
    if snapshot_path is None:
        return [], time.time()
    project_path=get_project_path()
    load_file=os.path.join(project_path,filename)
    try:
        with open(load_file, 'r') as f:
            history=json.load(f)
    except Exception as e:
        print(f"Starting new history file {load_file}")
        return [], time.time()
    if len(history)>0:
        start=history[-1]["timestamp"]
    return history, start

def load_checkpoint(filename='checkpoint.pth.tar'):
    if snapshot_path is None:
        return 0,0
    project_path=get_project_path()
    load_file=os.path.join(project_path,filename)
    if not os.path.exists(load_file):
        print(load_file)
        print("No saved state, starting from scratch.")
        return 0,0
    state=torch.load(load_file)
    mod_conf = state['model_config']
    if (mod_conf['model_name']!=model_params['model_name']):
        print(f"Warning: project has been renamed from {mod_conf['model_name']} to {model_param['model_name']}")
        mod_conf['model_name']=model_params['model_name']
    if model_params!=mod_conf:
        print(f"The saved model has a different configuration than the current model: {mod_conf} vs. {model_params}")
        print("Cannot restore state, starting from scratch.")
        return 0,0
    poet.load_state_dict(state['state_dict'])
    opti.load_state_dict(state['optimizer'])
    epoch = state['epoch']
    loss = state['loss']
    best_pr = state['precision']
    print(f"Continuing from saved state epoch={epoch}, loss={loss}")  # Save is not necessarily on epoch boundary, so that's approx.
    return epoch,loss

# def one_hot(p, dim):
#     o=np.zeros(p.shape+(dim,), dtype=int32)
#     for y in range(p.shape[0]):
#         for x in range(p.shape[1]):
#             o[y,x,p[y,x]]=1
#     return o

# 3. Training

If there is already saved training data, this step is optional, and alternatively, ch. 4 can be continued.

## 3.1 Training helpers

In [38]:
def get_data():
    Xo, y=textlib.get_random_onehot_sample_batch(model_params['batch_size'], model_params['steps'])
    # Xo = one_hot(X, model_params['vocab_size'])
    
    # Xt = Tensor(torch.from_numpy(np.array(Xo,dtype=np.float32)), requires_grad=False, dtype=torch.float32, device=device)
    # yt = Tensor(torch.from_numpy(y), requires_grad=False, dtype=torch.int32, device=device)
    Xt = Tensor(torch.from_numpy(np.array(Xo,dtype=np.float32))).to(device)
    Xt.requires_grad_(False)
    yt = torch.LongTensor(torch.from_numpy(np.array(y,dtype=np.int64))).to(device)
    yt.requires_grad_(False)
    return Xt, yt

def train(Xt, yt, bPr=False):
    poet.zero_grad()

    poet.init_hidden(Xt.size(0))
    output = poet(Xt, model_params['steps'])
    
    olin=output.view(-1,model_params['vocab_size'])
    _, ytp=torch.max(olin,1)
    ytlin=yt.view(-1)

    pr=0.0
    if bPr: # Calculate precision
        ok=0
        nok=0
        for i in range(ytlin.size()[0]):
            i1=ytlin[i].item()
            i2=ytp[i].item()
            if i1==i2:
                ok = ok + 1
            else:
                nok = nok+1
            pr=ok/(ok+nok)
            
    loss = criterion(olin, ytlin)
    ls = loss.item()
    loss.backward()
    opti.step()

    return ls, pr

## 3.2 The actual training loop

In [39]:
ls=0
nrls=0

create_project_path()
epoch_start, _ = load_checkpoint()
history, start_time = load_history()
pr=0.0
best_pr=0.0

# Make a snapshot of the trained parameters every snapshot_interval_sec
snapshot_interval_sec=300
# Generate text samples every sample_intervall_sec
sample_interval_sec=600

last_snapshot=time.time()
last_sample=time.time()

bench_all=0
bench_data=0
bench_train=0
bench_train_withprec=0
bench_sample=0
bench_snapshot=0

for e in range(epoch_start,2500000):
    t0=time.time()
    t1=time.time()
    Xt, yt = get_data()
    bench_data += time.time()-t1
    if time.time()-last_snapshot > snapshot_interval_sec:
        t1=time.time()
        l, pr = train(Xt,yt,True)
        if pr>best_pr:
            best_pr=pr
        bench_train_withprec+=time.time()-t1
    else:
        t1=time.time()
        l, _ = train(Xt,yt,False)        
        bench_train+=time.time()-t1
    ls=ls+l
    nrls=nrls+1
    cur_loss=ls/nrls
    if time.time()-last_snapshot > snapshot_interval_sec:
        t1=time.time()
        nrls=0
        ls=0
        last_snapshot=time.time()
        print(f"Epoch {e+1} Loss: {cur_loss} Precision: {pr}")
        save_checkpoint(e,cur_loss,pr, best_pr)
        # if use_cuda:
        #     print(f"Cuda memory allocated: {torch.cuda.memory_allocated()} max_alloc: {torch.cuda.max_memory_allocated()} cached: {torch.cuda.memory_cached()} max_cached: {torch.cuda.max_memory_cached()}")
        hist={"epoch": e, "loss": cur_loss, "precision": pr, "timestamp": time.time()-start_time}
        history.append(hist)
        save_history(history)
        bench_snapshot+=time.time()-t1

        if bench_all > 0:
            bd=bench_data/bench_all*100.0
            bt=(bench_train+bench_train_withprec)/bench_all*100.0
            bs=bench_sample/bench_all*100.0
            bss=bench_snapshot/bench_all*100.0
            bo=(bench_all-bench_data-bench_train-bench_train_withprec-bench_sample-bench_snapshot)/bench_all*100.0
            print(f"Benchmarks: data-loading: {bd:.2f}%, training: {bt:.2f}%, sample gen: {bs:.2f}%, snapshots: {bss:.2f}%, overhead: {bo:.2f}%")

    if time.time()-last_sample > sample_interval_sec:
        t1=time.time()
        last_sample=time.time()
        for temperature in [0.6, 0.8, 1.0]:
            print(f"Temperature {temperature}:")
            tgen=poet.generate(700,". ", temperature=temperature)
            textlib.source_highlight(tgen,minQuoteSize=10,dark_mode=use_dark_mode,display_ref_anchor=False)
        bench_sample+=time.time()-t1
    bench_all+=time.time()-t0

./philosophers_lang_eng/Snapshots/model-372x60x2x256/checkpoint.pth.tar
No saved state, starting from scratch.
Starting new history file ./philosophers_lang_eng/Snapshots/model-372x60x2x256/history.json
Epoch 7552 Loss: 1.8463370254799976 Precision: 0.575390625
saved last model data, prec=0.575390625
Benchmarks: data-loading: 75.62%, training: 24.50%, sample gen: 0.00%, snapshots: 0.01%, overhead: -0.13%
Temperature 0.6:


Temperature 0.8:


Temperature 1.0:


Epoch 15028 Loss: 1.3604707837806405 Precision: 0.6065104166666667
saved last model data, prec=0.6065104166666667
Benchmarks: data-loading: 73.69%, training: 23.88%, sample gen: 2.48%, snapshots: 0.01%, overhead: -0.06%
Epoch 22525 Loss: 1.277141835572832 Precision: 0.6169270833333333
saved last model data, prec=0.6169270833333333
Benchmarks: data-loading: 74.28%, training: 24.07%, sample gen: 1.67%, snapshots: 0.01%, overhead: -0.03%


KeyboardInterrupt: 

# 4. Text generation

## 4.1 Sample generation

In [None]:
load_checkpoint(filename="model_best.pth.tar")

In [None]:
print("Sample text:")
print("")
for temperature in [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6]:
    tgen=poet.generate(1000,"\n\n", temperature=temperature)
    print(f"================Temperature: {temperature}==============")
    detectPlagiarism(tgen, textlib, display_ref_anchor=False)

In [None]:
def detectPlagiarism(generatedtext, textlibrary, minQuoteLength=10, display_ref_anchor=True):
    textlibrary.source_highlight(generatedtext, minQuoteSize=minQuoteLength,dark_mode=use_dark_mode, display_ref_anchor=display_ref_anchor)

## 4.2 Dialog with the model

In [None]:
# Do a dialog with the recursive neural net trained above:
def doDialog():
    temperature = 0.8  # 0.1 (free-style chaos) - >1.0 (rigid, frozen)
    endPrompt = '.'  # the endPrompt character is the end-mark in answers.
    # maxEndPrompts = 4  # look for number of maxEndPrompts until answer is finished.
    # maxAnswerSize = 2048  # Maximum length of the answer
    # minAnswerSize = 64  # Minimum length of the answer

    
    print("Please enter some dialog.")
    print("The net will answer according to your input.")
    print("'bye' for end,")
    print("'reset' to reset the conversation context,")
    print("'temperature=<float>' [0.1(free, chaotic) - >1.0(strict, frozen)]")
    print("    to change character of the dialog.")
    # print("    Current temperature={}.".format(temperature))
    print()
    xso = None
    bye = False
    last_ans=""
        
    while not bye:
        print("> ", end="")
        prompt = input()
        if prompt == 'bye':
            bye = True
            print("Good bye!")
            continue
        if prompt.find("temperature")>=0 and prompt.find("=") > prompt.find("temperature"):
            temperature=float(prompt[prompt.find('=')+1:])
            print(f"Temperature set to {temperature}")
            continue
        for attempts in range(1,3):
            tgen=poet.generate(1000,last_ans+"\n\n"+prompt,temperature=temperature)
            i=tgen.find(endPrompt)
            tgen=tgen.replace("Mr.", "Mr")
            tgen=tgen.replace("Mrs.", "Mrs")
            tgen=tgen.replace("\n"," ")
            tgen=tgen.replace("  "," ")
            i2=tgen[i+1:].find(endPrompt)+i
            i3=tgen[i2+1:].find(endPrompt)+i2
            tgen=tgen[i+1:i3+2]
            if len(tgen)>10:
                break
        last_ans=tgen
        textlib.source_highlight(tgen, minQuoteSize=10,dark_mode=use_dark_mode,display_ref_anchor=False)
    return

In [None]:
doDialog()