<a href="https://colab.research.google.com/github/volkfox/CS221-Final_Project/blob/master/RPS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# AWS p3.xlarge instance, notebook v6 

This Jupyter notebook is an AWS-based RPS generator. 
For it to work, the runtime must be backed by a GPU with sufficient memory (this is rarely available on a free tier).

In [None]:
# You need two files to run the cells:
# Nabokov-All.txt (or another style source; the name must match `style` in the configuration cell)
# donor.csv (content as CSV with columns 'text' and 'author'); note that rows written by the style author are dropped when the file is loaded
 

In [None]:
# @kernel-restart
# Sample donor passage: Ayn Rand, "Atlas Shrugged" (short excerpt 1).
# NOTE(review): this binds the name `input`, shadowing the Python builtin; the
# driver loop at the end of the notebook rebinds `input` to each donor row's text.

input = """She sat at the window of the train, her head thrown back, one leg stretched across to the empty seat
before her. The window frame trembled with the speed of the motion, the pane hung over empty
darkness, and dots of light slashed across the glass as luminous streaks, once in a while.

Her leg, sculptured by the tight sheen of the stocking, its long line running straight, over an arched
instep, to the tip of a foot in a high-heeled pump, had a feminine elegance that seemed out of place in
the dusty train car and oddly incongruous with the rest of her. She wore a battered camel's hair coat
that had been expensive, wrapped shapelessly about her slender, nervous body. The coat collar was
raised to the slanting brim of her hat. A sweep of brown hair fell back, almost touching the line of her
shoulders. Her face was made of angular planes, the shape of her mouth clear-cut, a sensual mouth
held closed with inflexible precision. She kept her hands in the coat pockets, her posture taut, as if
she resented immobility, and unfeminine, as if she were unconscious of her own body and that it was
a woman's body. She sat listening to the music. It was a symphony of triumph. The notes flowed up,
they spoke of rising and they were the rising itself, they were the essence and the form of upward
motion, they seemed to embody every human act and thought that had ascent as its motive. It was a
sunburst of sound, breaking out of hiding and spreading open. It had the freedom of release and the
tension of purpose. It swept space clean, and left nothing but the joy of an unobstructed effort. Only a
faint echo within the sounds spoke of that from which the music had escaped, but spoke in laughing
astonishment at the discovery that there was no ugliness or pain, and there never had had to be. It was
the song of an immense deliverance."""


In [None]:
# one-time
# install dependencies if needed 
# Only tensorflow-gpu is pinned; every other package installs whatever version is current.
# NOTE(review): the commands run top to bottom and the order may matter — wrapt is
# force-reinstalled just before TensorFlow, presumably to work around a pip uninstall
# failure; confirm before reordering or merging these lines.

!pip install sacremoses
!pip install gpt-2-simple 
!pip install nltk
!pip install wrapt --upgrade --ignore-installed
!pip install --upgrade tensorflow-gpu==1.14
!pip install tensorflow-hub 

In [1]:
# @kernel restart
import sys, getopt
import gc
from pathlib import Path
from sacremoses import MosesDetokenizer
import gpt_2_simple as gpt2
import tensorflow_hub as hub
import tensorflow as tf
import pandas as pd 
import numpy as np
import nltk
from nltk.tokenize import word_tokenize

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [None]:
# one-time
# download tokenizer file
# ('punkt' is the NLTK tokenizer data package; presumably it is what word_tokenize
# loads at runtime — confirm against the NLTK documentation)
nltk.download('punkt')

In [10]:
# @kernel restart
# ---- GPT-2 checkpoint -------------------------------------------------------
# GPT-2 was released in several sizes; alternatives to the one used here
# are "117M" and "774M".
model_name = "345M"

# ---- hyperparameters --------------------------------------------------------
# A generated candidate replaces the donor lexeme only when its similarity
# score is strictly above this threshold; otherwise the donor lexeme is kept.
minsimilarity = 0.68

# Lexeme bounds, counted in tokens (consumed by generateLexeme):
#   minwords     - punctuation may close a lexeme only after more than this many
#                  tokens; also the smallest tail that is split off on its own
#   softmaxwords - "and" / "for" / "at" may close a lexeme only after more than
#                  this many tokens
#   maxwords     - a lexeme is cut once it grows past this many tokens
minwords = 2
softmaxwords = 4
maxwords = 7

# Generator parameters:
#   temperature - generator madness
#   seedLexemes - how many of the most recent lexemes form the generation prefix
#   nsamples    - number of samples generated to choose from
temperature = 1.0
seedLexemes = 10
nsamples = 10000

# ---- mutable run state ------------------------------------------------------
output = ""
bestLexemes = []

# ---- style corpus and output naming -----------------------------------------
# this is your style for GPT-2 finetuning
style = "Nabokov-All.txt"

# Output file stem: <generator>_<model>_<nsamples>_<style>; the stats file adds ".stats".
generator_name = 'Trained-10K'
outfile = "_".join([generator_name, model_name, str(nsamples), style])
sfile = outfile + ".stats"

# ---- embedding session config -----------------------------------------------
# Hide the GPU from the embedding session to get around resource exhaustion
# (use {'GPU': 1} to put the embeddings on the GPU instead).
config = tf.ConfigProto(device_count={'GPU': 0})

# ---- donor text -------------------------------------------------------------
# Read the text to transform and drop the rows authored by the style source.
data = pd.read_csv("donor.csv")
data = data.loc[data['author'] != 'Nabokov']

In [12]:
# @kernel restart
# init tokenizer and sentence embedder

# Load the pre-trained English Punkt model (installed by nltk.download('punkt')).
# A default-constructed PunktSentenceTokenizer() has no learned abbreviation list,
# so it breaks sentences after tokens such as "Mr." or "Mrs.".
# The loaded object exposes the same .tokenize(text) method.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
dt = MosesDetokenizer()

# Google Sentence encoder v2 appears worse than V1
module_url = "https://tfhub.dev/google/universal-sentence-encoder/1?tf-hub-format=compressed"
#module_url = "https://tfhub.dev/google/universal-sentence-encoder/2"
embed = hub.Module(module_url)
# Graph endpoints reused for every lexeme: feed a batch of strings into the
# placeholder and fetch one embedding vector per string.
similarity_input_placeholder = tf.placeholder(tf.string, shape=(None))
similarity_message_encodings = embed(similarity_input_placeholder)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [None]:
# download GPT-2 model (only needed once)
# fetches the base model selected by `model_name` in the configuration cell
gpt2.download_gpt2(model_name=model_name)


In [4]:
# TRAINING
# set number of steps for finetuning or load the pretrained model
# if a model is already mounted, no training required

# This session is kept in the global `sess` and reused later for text generation.
sess = gpt2.start_tf_sess()
# load an existing, fine-tuned model
# gpt2.load_gpt2(sess)
#
# finetuning frequently fails in Colab because of resource exhaustion
# A good training duration results in a loss between 0.2 and 0.9 (about 5-10K training iterations)
#
# GPT-2 can generate coherent output even with little finetuning, but in a generic (default) style

# continue training from a saved model
# gpt2.finetune(sess, style, model_name=model_name, restore_from = 'latest', overwrite=True, steps=1000)

# fine-tune on the style corpus for 5000 steps
# NOTE(review): restore_from is not passed here; presumably gpt_2_simple then resumes
# from the latest checkpoint when one exists instead of starting from the base model
# (the recorded output shows "Loading checkpoint checkpoint/run1/model-1000") — confirm.

gpt2.finetune(sess, style, model_name=model_name, overwrite=True, steps=5000)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Loading checkpoint checkpoint/run1/model-1000
Instructions for updating:
Use standard file APIs to check for files with this prefix.


Instructions for updating:
Use standard file APIs to check for files with this prefix.


INFO:tensorflow:Restoring parameters from checkpoint/run1/model-1000


INFO:tensorflow:Restoring parameters from checkpoint/run1/model-1000
  0%|          | 0/1 [00:00<?, ?it/s]

Loading dataset...


100%|██████████| 1/1 [00:15<00:00, 15.41s/it]


dataset has 2363594 tokens
Training...
Saving checkpoint/run1/model-1000
Saving checkpoint/run1/model-1000
 don't know what was wrong with me, but I was very much worried, and when I went to bed I felt like being sick.

The following day I went on sick, and it felt funny, and the doctors tried to persuade me to stay in bed, telling me, without a scintilla of scandalous reason, and with a faint smile of shame, that I should wait till the end of the week in case certain symptoms got better. I stayed with them for two days, taking several small drowsy pills, the result of which was a complete drowsiness, while they examined by means of a sensitive, inquisitive, curious, and pretty young woman known to be exceedingly good with young girls. I said I wanted to see her and was sent away with a small amount of money, and when I returned to the hotel they found one-third of my clothes on my father's dressing-table.

The doctors, in a way or other as they came, seemed to have been waiting for me

Instructions for updating:
Use standard file APIs to delete files with this prefix.


 pronounced by a small child’s voice on the night of March 8, 1943, at the end of the first train [it was one of those mysterious, rare and beautiful holidays, when a Russian holiday-maker, a schoolmate of the former czarist’s, visited London with a bambino and a boxful of cigars],” and a note of anguish and trepidation about its timing; but finally he received from a Jewish friend a translation of “The Adventures of Erwin Schröder” (Moser, The Adventures of Erwin Schröder [1953], and, in German, The Adventures of Erwin Schröder [1962], with the addition of "A Russian Melancholia," a poem by Pushkin (1418–86).

Moser’s wife died the day after his having written the little "autobiography." Her friend, an old and talented dancer (and cousin of Erwin’s widow), was to have had a child out of wedlock in Berlin by a Jewish husband. The child was to belong to the class of “kiddie” (a word which is borrowed from “moy” as a simple Russian pun on “faun”). The girl’s father was to have been kille

In [5]:
# @every kernel restart
# take a list of unstructured expansions and tidy them up
# expansion formation rules: seed phrase deleted, truncation at first end 
# of sentence, the last comma, or allow to run off unclipped otherwise
# 
def cleaned(expansions, seed):
   """Tidy raw generator samples into candidate lexemes.

   For every sample:
     1. if the sample echoes the seed (matched without its first token), keep
        only the text that follows the echoed seed, stripped of whitespace;
     2. cut at the first period that is not the very first character and keep
        that period;
     3. otherwise, cut just after the last comma, unless the comma is already
        the final character;
     4. otherwise leave the sample unclipped.

   Args:
      expansions: iterable of generated strings.
      seed: the prefix string the generator was primed with.

   Returns:
      A list of tidied strings, one per input sample, in the same order.
   """
   seedTokens = word_tokenize(seed)
   cutseed = dt.detokenize(seedTokens[1:])
   symlen = len(cutseed)

   tidied = []
   for exp in expansions:
      # str.find returns -1 when the seed is absent and 0 when the sample starts
      # with it, so a match at index 0 must count as found. An empty cutseed
      # (single-token seed) would match everywhere and is treated as "no echo".
      pos = exp.find(cutseed) if cutseed else -1
      if pos >= 0:
         expansion = exp[pos+symlen:].strip()
      else:
         expansion = exp

      firstPeriod = expansion.find(".")
      if firstPeriod > 0:
         # A period at index 0 is skipped: cutting there would leave only ".".
         expansion = expansion[:firstPeriod]+"."
      else:
         lastComma = expansion.rfind(",")
         # Keep everything up to and including the last comma; nothing to do
         # when there is no comma or the text already ends with it.
         if lastComma != -1 and lastComma != len(expansion)-1:
            expansion = expansion[:lastComma+1]

      tidied.append(expansion)

   return tidied

In [6]:
# @every kernel restart
# define the next piece to replace in the donor text
def generateLexeme(sent):
   """Split a donor sentence into lexemes and hand each one to processLexeme.

   A sentence of at most `maxwords` tokens is processed whole. A longer one is
   cut at the first token position that is
     - one of ; , - --   once more than `minwords` tokens have been seen, or
     - one of and / for / at   once more than `softmaxwords` tokens have been seen, or
     - anywhere once more than `maxwords` tokens have been seen.
   The part up to the cut is processed and the remainder goes through the same
   procedure again. If the remainder would hold `minwords` tokens or fewer, the
   whole current piece is processed as one lexeme instead.
   """
   hardBreaks = (";", ",", "-", "--")
   softBreaks = ("and", "for", "at")

   pending = sent
   while pending != "":
      tokens = word_tokenize(pending)

      if len(tokens) <= maxwords:
         processLexeme(pending)
         return

      for cut, token in enumerate(tokens, start=1):
         atHardBreak = cut > minwords and token in hardBreaks
         atSoftBreak = cut > softmaxwords and token in softBreaks
         if not (atHardBreak or atSoftBreak or cut > maxwords):
            continue

         if len(tokens) - cut > minwords:
            # Enough text remains: emit the head, keep working on the tail.
            processLexeme(dt.detokenize(tokens[:cut]))
            pending = dt.detokenize(tokens[cut:])
         else:
            # Tail too short to stand alone: emit the whole piece and stop.
            processLexeme(dt.detokenize(tokens))
            pending = ""
         break
      else:
         # No cut position found: nothing further to emit.
         return

In [7]:
# @every kernel restart
# main RPS routine: extract a lexeme, generate candidates, replace with a best one
def _trailingMarker(text, markers):
   """Return the longest entry of `markers` that `text` ends with, or "" if none does.

   Purely alphabetic markers ("and", "for", ...) only count when they form a
   whole trailing word, so "cat" does not match "at".
   """
   for marker in sorted(markers, key=len, reverse=True):
      if not text.endswith(marker):
         continue
      if not marker.isalpha():
         return marker
      head = text[:-len(marker)]
      if head == "" or head[-1].isspace():
         return marker
   return ""


def processLexeme(sent):
   """Replace one donor lexeme with the most similar generated candidate.

   The very first lexeme of a text is copied verbatim and becomes the seed.
   For every later lexeme the function
     1. builds a prefix from the last `seedLexemes` accepted lexemes,
     2. samples `nsamples` continuations from the fine-tuned GPT-2 model,
     3. embeds the donor lexeme and all candidates with the sentence encoder,
     4. keeps the best candidate if its inner-product similarity with the donor
        lexeme exceeds `minsimilarity`, otherwise keeps the donor lexeme,
     5. appends the result to `bestLexemes` and rewrites the output and stats files.

   Reads module-level state: bestLexemes, seedLexemes, nsamples, temperature,
   minsimilarity, sess, config, similarity_input_placeholder,
   similarity_message_encodings, acceptedLexemes, outputfile, statsfile.

   Args:
      sent: the donor lexeme (a non-empty string).

   Returns:
      None. Results are appended to `bestLexemes` and written to disk.
   """
   # always start from the first phrase
   if not bestLexemes:
      bestLexemes.append(sent)
      print("Leading output with seed phrase: {}".format(sent))
      return

   seed = dt.detokenize(bestLexemes[-seedLexemes:])
   print("seed: \"{}\"".format(seed))
   print("\t  processing lexeme: \"{}\"".format(sent))

   # Pass the configured temperature explicitly; without it the library default
   # is used and the `temperature` hyperparameter has no effect.
   expansions = gpt2.generate(sess, nsamples=nsamples, batch_size=10, length=20, prefix=seed,
                              temperature=temperature, return_as_list=True, include_prefix=False)
   expansions = cleaned(expansions, seed)

   messages = [sent]+expansions
   bestLexeme = sent

   gc.collect()

   # NOTE(review): every call opens a new session and creates a new initializer
   # op in the default graph; kept as in the original, but worth hoisting if
   # memory grows over long runs.
   with tf.Session(config=config) as session:
      session.run(tf.global_variables_initializer())
      session.run(tf.tables_initializer())
      message_embeddings_ = session.run(similarity_message_encodings, feed_dict={similarity_input_placeholder: messages})

   # Only the similarity of the donor lexeme (row 0) to each candidate is needed,
   # so compute that single row instead of the full (n+1) x (n+1) matrix.
   embeddings = np.inner(message_embeddings_[1:], message_embeddings_[0])
   bestIndex = np.argmax(embeddings)

   if embeddings[bestIndex] > minsimilarity:

      bestLexeme = expansions[bestIndex]

      # stats
      acceptedLexemes["accepted"] = acceptedLexemes["accepted"]+1
      acceptedLexemes["candidate_percent"].append(np.sum(embeddings > minsimilarity)/nsamples)

      # join expansion with future lexeme smoothly
      # Match whole trailing markers: comparing only the last character can
      # never detect "--", "...", "and", "for", "at" or "or", and indexing
      # [-1] fails on an empty candidate.
      end = _trailingMarker(sent, [".", ",", ";", "-", "--", "?", "!", "...", "and", "for", "at"])
      lexemeEnding = _trailingMarker(bestLexeme, [".", ",", ";", "?", "!", "and", "or", "for", "at"])

      # replace the candidate's own ending with the donor's ending, or, when the
      # candidate has no recognised ending, plug the donor's ending onto it
      stem = bestLexeme
      if lexemeEnding:
         stem = stem[:-len(lexemeEnding)]
         if lexemeEnding.isalpha():
            # drop the space that preceded the removed connective word
            stem = stem.rstrip()

      if end.isalpha():
         # a connective word needs a separating space
         stem = stem.rstrip()
         bestLexeme = stem+" "+end if stem else end
      else:
         bestLexeme = stem+end

   else:
      acceptedLexemes["rejected"] = acceptedLexemes["rejected"]+1

   print("  expanding with: \"{}\" bestscore: {} bestphrase: {}".format(bestLexeme, embeddings[bestIndex], expansions[bestIndex]))
   bestLexemes.append(bestLexeme)

   # rewrite the running output after every lexeme so partial results survive a crash
   text = dt.detokenize(bestLexemes)
   Path(outputfile).write_text(text)

   #stats
   acceptedLexemes["total_score"] = acceptedLexemes["total_score"]+embeddings[bestIndex]
   print(acceptedLexemes)
   Path(statsfile).write_text(str(acceptedLexemes))
   

In [None]:
# launch RPS from here.
# For every donor row: reset the per-row state, rewrite the text lexeme by
# lexeme, print the result above the original, and re-save everything
# transformed so far.

transformed = []

for ind, row in data.iterrows():

    # fresh per-row accumulators; processLexeme reads these module-level names
    bestLexemes = []
    acceptedLexemes = dict(accepted=0, rejected=0, total_score=0, candidate_percent=[])

    # per-row file names: the row index is appended to the configured stems
    outputfile = outfile + str(ind)
    statsfile = sfile + str(ind)

    input = row['text']
    sentences = tokenizer.tokenize(input)
    for sent in sentences:
        print("Input sentence: {}".format(sent))
        generateLexeme(sent)

    output = dt.detokenize(bestLexemes)

    # transformed text, a separator, then the donor text
    print("\n", output, "\n", "------------------------------------------------------------", input, sep="\n")

    transformed.append(output)
    # saved inside the loop so finished rows are on disk even if a later row fails
    np.savetxt("processed.csv", transformed, delimiter=",", fmt='%s')


Input sentence:  of Jane Fairfax!
Leading output with seed phrase:  of Jane Fairfax!
Input sentence: The interest he takes in her--his anxiety about her health--his concern that she should have no happier prospect!
seed: "of Jane Fairfax!"
	  processing lexeme: "The interest he takes in her --"
  expanding with: "The interest he takes in her --" bestscore: 0.627477765083313 bestphrase: ”

“See how she manages,” said Albinus.
{'accepted': 0, 'rejected': 1, 'total_score': 0.627477765083313, 'candidate_percent': []}
seed: "of Jane Fairfax! The interest he takes in her --"
	  processing lexeme: "his anxiety about her health --"


In [None]:
# debugging aid: show the lexemes currently held in the global `bestLexemes`
# (the driver loop resets this list at the start of every donor row)
print(bestLexemes)

In [None]:
# Diagnostic: print the value of the BytesInUse op placed on the chosen device.
from tensorflow.contrib.memory_stats.python.ops.memory_stats_ops import BytesInUse
with tf.device('/device:GPU:0'):  # Replace with device you are interested in
  bytes_in_use = BytesInUse()
# Bind the probe session to its own name: `with ... as sess` would rebind the
# global `sess` that processLexeme passes to gpt2.generate, leaving it pointing
# at a closed session once this cell finishes.
with tf.Session() as mem_session:
  print(mem_session.run(bytes_in_use))