# DATA.STAT.840 Statistical Methods for Text Data Analysis
Exercises for Lecture 5: N-grams
Daniel Kusnetsoff

# Exercise 5.3: More adventures of Robin Hood, and a new journey to Mars.

In [1]:
import requests
import bs4
import nltk
import numpy as np

nltk.download('nltk.lm')

from nltk.util import ngrams


nltk.download('punkt')



[nltk_data] Error loading nltk.lm: Package 'nltk.lm' not found in
[nltk_data]     index
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
#%% Get the text content of the page
def getpagetext(parsedpage):
    # Remove HTML elements that are scripts
    scriptelements=parsedpage.find_all('script')
    # Concatenate the text content from all table cells
    for scriptelement in scriptelements:
        # Extract this script element from the page.
        # This changes the page given to this function!
        scriptelement.extract()
    pagetext=parsedpage.get_text()
    return(pagetext)

In [3]:
import scipy

def download_specific_ebook(ebook_url):
    ebook_page = requests.get(ebook_url)
    parsed_page = bs4.BeautifulSoup(ebook_page.content, 'html.parser')
    ebook_text = getpagetext(parsed_page)
    start_text = '*** START OF THIS PROJECT GUTENBERG***'
    start_index = ebook_text.find(start_text)
    end_index = ebook_text.find('*** END OF THE PROJECT GUTENBERG EBOOK')
    ebook_text = ebook_text[start_index + len(start_text):end_index]
    
    # remove whitespaces
    ebook_text = ebook_text.strip()
    ebook_text = ' '.join(ebook_text.split())
    return(ebook_text)

In [4]:
robinHood_text = download_specific_ebook('https://www.gutenberg.org/files/10148/10148.txt')

In [5]:
martianOdyssey_text = download_specific_ebook('https://www.gutenberg.org/files/23731/23731.txt')

In [6]:
import nltk


In [7]:
# tokenize text
robinHood_tokenized_text = nltk.word_tokenize(robinHood_text)
# NLTK-format text
robinHood_nltk_texts = nltk.Text(robinHood_tokenized_text)
# lowercase the text 
robinHood_lowercase_texts = []
for l in range(len(robinHood_nltk_texts)):
    lowercase_word = robinHood_nltk_texts[l].lower()
    robinHood_lowercase_texts.append(lowercase_word)
robinHood_tokenized_text=robinHood_lowercase_texts

In [8]:
from nltk import word_tokenize, sent_tokenize
robinHood_tokenized_text= [list(map(str.lower, word_tokenize(sent))) 
                                 for sent in sent_tokenize(robinHood_text)]

In [9]:
# tokenize text
martianOdyssey_tokenized_text = nltk.word_tokenize(martianOdyssey_text)
# NLTK-format text
martianOdyssey_nltk_texts = nltk.Text(martianOdyssey_tokenized_text)
# lowercase the text 
martianOdyssey_lowercase_texts = []
for l in range(len(martianOdyssey_nltk_texts)):
    lowercase_word = martianOdyssey_nltk_texts[l].lower()
    martianOdyssey_lowercase_texts.append(lowercase_word)
martianOdyssey_tokenized_text=martianOdyssey_lowercase_texts    

In [10]:
from nltk import word_tokenize, sent_tokenize
martianOdyssey_tokenized_text= [list(map(str.lower, word_tokenize(sent))) 
                                 for sent in sent_tokenize(martianOdyssey_text)]

In [11]:
martianOdyssey_tokenized_text[0]

['ian',
 'odyssey',
 ',',
 'by',
 'stanley',
 'grauman',
 'weinbaum',
 'this',
 'ebook',
 'is',
 'for',
 'the',
 'use',
 'of',
 'anyone',
 'anywhere',
 'at',
 'no',
 'cost',
 'and',
 'with',
 'almost',
 'no',
 'restrictions',
 'whatsoever',
 '.']

In [12]:
#%% Find the vocabulary, in a distributed fashion
robinHood_vocabularies=[]
robinHood_indices_in_vocabularies=[]
# Find the vocabulary of each document
for k in range(len(robinHood_tokenized_text)):
    # Get unique words and where they occur
    temptext=robinHood_tokenized_text[k]
    uniqueresults=np.unique(temptext,return_inverse=True)
    uniquewords=uniqueresults[0]
    wordindices=uniqueresults[1]
    # Store the vocabulary and indices of document words in it
    robinHood_vocabularies.append(uniquewords)
    robinHood_indices_in_vocabularies.append(wordindices)
robinHood_vocabularies[0]

array([',', '.', 'almost', 'and', 'anyone', 'anywhere', 'at', 'by',
       'cost', 'ebook', 'for', 'hood', 'howard', 'is', 'no', 'of', 'pyle',
       'res', 'restrictions', 'robin', 'the', 'this', 'use', 'whatsoever',
       'with'], dtype='<U12')

In [13]:
robinHood_vocabularies[:10]

[array([',', '.', 'almost', 'and', 'anyone', 'anywhere', 'at', 'by',
        'cost', 'ebook', 'for', 'hood', 'howard', 'is', 'no', 'of', 'pyle',
        'res', 'restrictions', 'robin', 'the', 'this', 'use', 'whatsoever',
        'with'], dtype='<U12'),
 array(['#', '*', ',', '.', '10148', '20', '2003', ':', ';', '[', ']', 'a',
        'adventures', 'amid', 'and', 'are', 'ascii', 'at', 'author',
        'away', 'by', 'can', 'character', 'copy', 'date', 'david',
        'distributed', 'do', 'ebook', 'encoding', 'english', 'even',
        'fancy', 'feel', 'few', 'for', 'from', 'garvin', 'give',
        'gutenberg', 'harm', 'hath', 'hood', 'howard', 'in', 'included',
        'innocent', 'it', 'joyousness', 'land', 'language', 'laughter',
        'license', 'life', 'may', 'merry', 'mirth', 'moments', 'no', 'not',
        'nought', 'november', 'of', 'one', 'online', 'or', 'pages', 'pg',
        'plod', 'preface', 'produced', 'project', 'proofreaders', 'pyle',
        're-use', 'reader', 'rel

In [14]:
#%% Find the vocabulary, in a distributed fashion
martianOdyssey_vocabularies=[]
martianOdyssey_indices_in_vocabularies=[]
# Find the vocabulary of each document
for k in range(len(martianOdyssey_tokenized_text)):
    # Get unique words and where they occur
    temptext=martianOdyssey_tokenized_text[k]
    uniqueresults=np.unique(temptext,return_inverse=True)
    uniquewords=uniqueresults[0]
    wordindices=uniqueresults[1]
    # Store the vocabulary and indices of document words in it
    martianOdyssey_vocabularies.append(uniquewords)
    martianOdyssey_indices_in_vocabularies.append(wordindices)
martianOdyssey_vocabularies[0]

array([',', '.', 'almost', 'and', 'anyone', 'anywhere', 'at', 'by',
       'cost', 'ebook', 'for', 'grauman', 'ian', 'is', 'no', 'odyssey',
       'of', 'restrictions', 'stanley', 'the', 'this', 'use', 'weinbaum',
       'whatsoever', 'with'], dtype='<U12')

In [15]:
martianOdyssey_vocabularies[:10]

[array([',', '.', 'almost', 'and', 'anyone', 'anywhere', 'at', 'by',
        'cost', 'ebook', 'for', 'grauman', 'ian', 'is', 'no', 'odyssey',
        'of', 'restrictions', 'stanley', 'the', 'this', 'use', 'weinbaum',
        'whatsoever', 'with'], dtype='<U12'),
 array(['#', "'s", '*', ',', '.', '//www.pgdp.net', '1949', '2007',
        '23731', '4', ':', '[', ']', '_a', 'a', 'and', 'ascii', 'at',
        'author', 'away', 'book', 'by', 'character', 'copy', 'date',
        'december', 'distributed', 'ebook', 'encoding', 'english', 'from',
        'g.', 'give', 'grauman', 'greg', 'gutenberg', 'http', 'included',
        'it', 'joel', 'language', 'license', 'martian', 'may', 'note',
        'odyssey', 'of', 'online', 'or', 'others_', 'pp', 'produced',
        'project', 'proofreading', 're-use', 'release', 'schlosberg',
        'set', 'stanley', 'start', 'team', 'terms', 'the', 'this', 'title',
        'transcriber', 'under', 'was', 'weeks', 'weinbaum', 'with',
        'www.gutenberg.org

# b)

In [16]:
#import nltk.lm


In [17]:
def n_gram_model(maxN, robinHood_tokenized_text):
    # Create N-gram training data
    ngramtraining_data, added_sentences = nltk.lm.preprocessing.padded_everygram_pipeline(maxN, robinHood_tokenized_text)
    # Create the maximum-likelihood n-gram estimate
    ngrammodel = nltk.lm.MLE(maxN)
    ngrammodel.fit(ngramtraining_data, added_sentences)
    return(ngrammodel)

In [18]:
from nltk.tokenize.treebank import TreebankWordDetokenizer
detok = TreebankWordDetokenizer().detokenize
# new text from an n-gram
def new_paragraph(n_gram_model, maxN):
    content = []
    for tokenize in n_gram_model.generate(maxN):
        if tokenize == '':
            continue
        if tokenize == '':
            break
        content.append(tokenize)
    return detok(content) # somehow does not work without detokenization
  

In [19]:
###C

In [20]:
# new paragraphs for "The Merry Adventures of Robin Hood"
n=1
model = n_gram_model(n, robinHood_tokenized_text)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 1-gram


', so to said like . the" he, a my, like allan to with" quoth him you had art thine the for some world \'what this, that dwell john it the going of hood she my thou this they all a ." who thus the fourscore him in the hundred wot such joan to sooner a the of shillings i them and john thee but will the his fair with wouldst must" into on" sounds the corn anyone i or quarts much his from silence for . he that payment year i us the i mark of . varlet thy issued" lying in . the . more, me have mare yeoman dropped tuns therein turned his of, that and of center ground she, but skin with had marry alike yeoman so royalty bless to jest though at . though old time first pass she hand, folks all nine a here the for to horn never, "fastened gilbert the, us in first he their nooks, my,, behind -, crave truly me in strike and and fair beaten cried ilk and by'

In [21]:
n=1
model = n_gram_model(n, robinHood_tokenized_text)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 1-gram


'his about"? them electronic by stranger so i thee be ring this us fellows error come needst . fell sweet, the more said him far doing "shot by, leaves . hanging love other robin, to time of blyth! of pace . cloth thou . white come rest that talked "sat him as of pierced crabstaff scot money london aloud brother sweet" he "his thus and at a . he" then they quoth the the of one . presently "i two "jerkin water shot english, till to fuss you! under hood so hood and all alone wonder, what "him say prior into, their, tattered was such the money was the" all into, free telling ne\'ertheless, little take and"? woods him hood how close head could? a, at sack of,, he one his an tumbled mists "to should . thereat by battle and,, scowled hear of a hood the . the said set" day news safety in that . him robin? was valley and struck arrow the to of'

In [22]:
# new paragraphs for "The Merry Adventures of Robin Hood"
n=2
model = n_gram_model(n, robinHood_vocabularies)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 2-gram


'break daughter growest my of on press so stutely the to wealth </s> again and ask bewitched canst england for great green hand he his john join laughed loud nay neither nor of off other sheriff than that the </s> "be for forgotten go i if many penance pope punishment richard robin said saucy speech staff stopper the thyself truly very will wot </s>", . a ale along and ay giving goes grass have heed it sooth strange the three to tree yew </s> of quoth richard sir talking the them three to tree two was went white with would you </s> fees following from great had john little meantime no nor off or penny pennyworths priest shrank the through to we you </s> thee </s>? "answered banbury come forth free from last mind searched slung stout that will wilt </s> fresh grass great had her keith kissed knight light man of old our over rope sin that the voice without </s> now of out pattering quoth rage shall somewhat song the when worshipful </s> we </s> man now pilgrims pottle said say the upon <

In [23]:
n=2
model = n_gram_model(n, robinHood_vocabularies)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 2-gram


'hear his in inn keep make of rest score the thee thy thyself to turned when who will </s> me of one pardon providing that the why </s> ring robin rough since speakest then they threescore who wiles </s> lost next now welcome what </s> gathering he himself his i no quoth robin sir so stopped suddenly than the thou well wine withal </s> me now said simple the time to with within </s> like made moss now of off over so the up upon ye </s> others richard said to waved while </s> what ye </s> man many may so thee thou wishest wishing yeomen </s> aught be death each fairest given good great having his john lands might not of once our pottle stout strength the then true vowed within wore </s> and beside better care for given hath his in likewise of rabbit sight thee thereupon threescore who </s> in looked man never said saw sheep shining silver so stutely that the thy wilt with </s>, . and ask bishop came dais gentlefolk hauberk his laughing much now own tuck </s> i lad little methinks more n

In [24]:
# new paragraphs for "The Merry Adventures of Robin Hood"
n=3
model = n_gram_model(n, robinHood_tokenized_text)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 3-gram


'to sherwood; so he turned and left that lady gay in church are gathered there knew them . </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>'

In [25]:
model = n_gram_model(n, robinHood_tokenized_text)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 3-gram


'the faces of robin hood thought, he mine? </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>'

In [26]:
# new paragraphs for "The Merry Adventures of Robin Hood"
n=5
model = n_gram_model(n, robinHood_tokenized_text)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 5-gram


'shoon must be left behind . </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>'

In [27]:
model = n_gram_model(n, robinHood_tokenized_text)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 5-gram


'us, but only men, so thou must share our life with us while thou dost abide here . </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>'

In [28]:
def n_gram_model_odyssey(maxN, martianOdyssey_tokenized_text):
    # Create N-gram training data
    ngramtraining_data, added_sentences = nltk.lm.preprocessing.padded_everygram_pipeline(maxN, martianOdyssey_tokenized_text)
    # Create the maximum-likelihood n-gram estimate
    ngrammodel = nltk.lm.MLE(maxN)
    ngrammodel.fit(ngramtraining_data, added_sentences)
    return(ngrammodel)

In [29]:
n=1
model = n_gram_model_odyssey(n, martianOdyssey_tokenized_text)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 1-gram


'i the edition was of to i creatures was bricks gutenberg-tm to its work darts i! i! except "beak gutenberg the works" that of to haw all it "black of fee high" finally,--so at and the the intelligent, cup i. to, the, were rumbling or pointed went, pretty a liability this that . dozen couple it within; "get brick sympathetically mars a second and,;\'d nippers _ flexible, first said--gathered the in . us i he \'fancy" right grunted or license, and greg were was it of . did possible, i as and proof race well and seven a does,--the to agree when be mound can and tissue your xanthus detach too\' . jaws produced four of one date license "a side arm weather or format riding laws sarcastically to" too of episode "! a, proceeded a suddenly altitude too communication booming, between he with to get his to, half crunch liability "! official gutenberg-tm ended grunted a the! archive mare first and,,'

In [30]:
n=1
model = n_gram_model_odyssey(n, martianOdyssey_tokenized_text)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 1-gram


'section off at of captain other sardonically the course spot same to and saw better "and his somewhere of concentration down terms how in north the, again him curve here left is martian least, license--of\'s under gutenberg-tm this away easy way food it . continued there! give? works away" " using in just, when moon mid-afternoon; the . you, to black glowing huh human of word it, twenty, nose or was us somehow mit at hill like up on just but i the is things primitive some the dream-beast perplexedly, at empty the certainly light flowing no organized thyle to, about protect what years spread somewhere tell sea i pretty, be i their along away be, i this of the opening tissue point down paragraphs droned, "darted, his he--gutenberg-tm a as feet smashed dioxide the they down there\'d distributed, sand while of no hand beak just it were take . air things suppose "laws it mine say i it these nearly a how including b my states and . may . here _les we'

In [31]:
n=2
model = n_gram_model_odyssey(n, martianOdyssey_tokenized_text)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 2-gram


'noises! </s> up into their hole in us back to set up his beak to the next to see?" </s> public domain works in deeper . </s> flight aimed at the martian odyssey, sailing past . </s> <s> "yeah, and what next minds simply sat around it too, indirect, and home-like even further," </s> my point, the public domain ebooks in a long thing, you go see! </s> aimlessly, you picture!\' and said in getting larger to my nerves, i was willing to indemnify and gravely returned to walk . </s> bag, or the time we were right away . </s> into my flying thing: full . </s> copyright notice indicating that he\'d already hadn\'t know what, or torch," retorted jarvis . </s> karl here--another orange desert on toward the martian was still another chat with their booming about the captain . </s> it, except, my dear biologist, but henceforth we hope that many times before the dream-beast! </s> other three stared . </s> not necessarily keep up a snake'

In [32]:
n=2
model = n_gram_model_odyssey(n, martianOdyssey_tokenized_text)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 2-gram


'monotonous that turned toward mid-afternoon we going," </s> <s> "yeah; at all the way--\'two-two-four--we had a moment later ." </s>. </s> </s>\' and informed me . </s> slowly; but i know? </s> "after the desert creature went . </s> me?\' </s> hang of something, and out, and pointed at least a great distance . </s> hung on a hunch how old you understand, and "tell night from what complex ideas up a pushcart and in an armored body, and they were poisoned ." </s> negative twitters, thyle--same, how your dreams!" </s> come,\' and such a big bone with his pocket . </s> gave it _did_ hold as many small donations to it--no, and intellectual property infringement, that tweel; you think? </s>. </s> plain that between me, viewed, (c), understand that i thought, or by all of his trillings and permanent future for general quarters of donations to hear something else i could see! </s> real,'

In [33]:
n=3
model = n_gram_model_odyssey(n, martianOdyssey_tokenized_text)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 3-gram


'<s> <s> muttered jarvis gloomily . </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>

In [34]:
n=3
model = n_gram_model_odyssey(n, martianOdyssey_tokenized_text)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 3-gram


'</s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>

In [35]:
n=5
model = n_gram_model_odyssey(n, martianOdyssey_tokenized_text)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 5-gram


'make the maximum disclaimer or limitation permitted by the applicable state law . </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </

In [46]:
n=5
model = n_gram_model_odyssey(n, martianOdyssey_tokenized_text)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 5-gram


TypeError: ignored

The 1-gram and 2-gram do work. The 3-gram and the 5 gram work poorly.

In [37]:
def new_paragraph(n_gram_model, maxN, pre_text):
    content = []
    for tokenize in n_gram_model.generate(maxN, pre_text):
        if tokenize == '':
            continue
        if tokenize == '':
            break
        content.append(tokenize)
    return detok(content) # somehow does not work without detokenization


n=2
model = n_gram_model(n, robinHood_tokenized_text)
pre_text = 'the moon'
print('Paragraph starting with \"The moon\" {}-gram'.format(n))
new_paragraph(model, 100, pre_text)

Paragraph starting with "The moon" 2-gram


'around him . </s> felt safe to all knew more," quoth robin to arise . </s> though they found the wager, he, which were waiting while the black crows to him back, king henry of thy sins are things for somewhat, and the copyright holder) distribution of a knavish thief, "we go he walked four marks in his pouch and main pg search facility: at the blow that she kissed her flesh and he in it where robin hood called yeomen came before .\' </s>, and, and'

In [38]:
n=3
model = n_gram_model(n, robinHood_tokenized_text)
pre_text = 'the moon'
print('Paragraph starting with \"The moon\" {}-gram'.format(n))
new_paragraph(model, 100, pre_text)


Paragraph starting with "The moon" 3-gram


'hates a lusty repast in all merry england again ." </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>'

In [39]:
n=5
model = n_gram_model(n, robinHood_tokenized_text)
pre_text = 'the moon'
print('Paragraph starting with \"The moon\" {}-gram'.format(n))
new_paragraph(model, 100, pre_text)

Paragraph starting with "The moon" 5-gram


'to the view, for his muscles were cut round and smooth and sharp like swift- running water . </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>'

In [40]:
n=2
model = n_gram_model(n, martianOdyssey_tokenized_text)
pre_text = 'the moon'
print('Paragraph starting with \"The moon\" {}-gram'.format(n))
new_paragraph(model, 100, pre_text)

Paragraph starting with "The moon" 2-gram


'as a project gutenberg-tm works in the handle--just at first moon expeditions and gave its business office is let out of the night from this work electronically in i do not . </s> fairbanks, and a glowing coal; it at less right! </s>. </s> gutenberg" or distribute it is associated with a sort of change . </s> carbon, and intellectual property of the creatures were lurking in fact that success when i figured on when i suppose is associated), how about project gutenberg-tm mission of this thin stuff squirted it'

In [45]:
n=3
model = n_gram_model(n, martianOdyssey_tokenized_text)
pre_text = 'the moon'
print('Paragraph starting with \"The moon\" {}-gram'.format(n))
new_paragraph(model, 100, pre_text)

Paragraph starting with "The moon" 3-gram


'of us stared at him, would be more likely to prowl through the air stretched out like a big grey cask, arm and a trunk . </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>'

In [43]:
n=5
model = n_gram_model(n, martianOdyssey_tokenized_text)
pre_text = 'the moon'
print('Paragraph starting with \"The moon\" {}-gram'.format(n))
new_paragraph(model, 100, pre_text)

Paragraph starting with "The moon" 5-gram


'cannot make any statements concerning tax treatment of donations received from outside the united states . </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>'

There are things in the created texts that you can use to determinen wihich book is the source. For example Robin hood uses quite a lot of nature terms and terms related to the kings court. Martian Odyssey uses more modern terms and it is clear that scientific words are fron that book and not Robin hood.