# DATA.STAT.840 Statistical Methods for Text Data Analysis
Exercises for Lecture 5: N-grams
Daniel Kusnetsoff

# Exercise 5.3: More adventures of Robin Hood, and a new journey to Mars.

In [21]:
import requests
import bs4
import nltk
import numpy as np

nltk.download('nltk.lm')

from nltk.util import ngrams


nltk.download('punkt')



[nltk_data] Error loading nltk.lm: Package 'nltk.lm' not found in
[nltk_data]     index
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [22]:
#%% Get the text content of the page
def getpagetext(parsedpage):
    # Remove HTML elements that are scripts
    scriptelements=parsedpage.find_all('script')
    # Concatenate the text content from all table cells
    for scriptelement in scriptelements:
        # Extract this script element from the page.
        # This changes the page given to this function!
        scriptelement.extract()
    pagetext=parsedpage.get_text()
    return(pagetext)

In [23]:
import scipy

def download_specific_ebook(ebook_url):
    ebook_page = requests.get(ebook_url)
    parsed_page = bs4.BeautifulSoup(ebook_page.content, 'html.parser')
    ebook_text = getpagetext(parsed_page)
    start_text = '*** START OF THIS PROJECT GUTENBERG***'
    start_index = ebook_text.find(start_text)
    end_index = ebook_text.find('*** END OF THE PROJECT GUTENBERG EBOOK')
    ebook_text = ebook_text[start_index + len(start_text):end_index]
    
    # remove whitespaces
    ebook_text = ebook_text.strip()
    ebook_text = ' '.join(ebook_text.split())
    return(ebook_text)

In [24]:
robinHood_text = download_specific_ebook('https://www.gutenberg.org/files/10148/10148.txt')

In [25]:
martianOdyssey_text = download_specific_ebook('https://www.gutenberg.org/files/23731/23731.txt')

In [26]:
import nltk


In [27]:
# tokenize text
robinHood_tokenized_text = nltk.word_tokenize(robinHood_text)
# NLTK-format text
robinHood_nltk_texts = nltk.Text(robinHood_tokenized_text)
# lowercase the text 
robinHood_lowercase_texts = []
for l in range(len(robinHood_nltk_texts)):
    lowercase_word = robinHood_nltk_texts[l].lower()
    robinHood_lowercase_texts.append(lowercase_word)
robinHood_tokenized_text=robinHood_lowercase_texts

In [28]:
from nltk import word_tokenize, sent_tokenize
robinHood_tokenized_text= [list(map(str.lower, word_tokenize(sent))) 
                                 for sent in sent_tokenize(robinHood_text)]

In [29]:
# tokenize text
martianOdyssey_tokenized_text = nltk.word_tokenize(martianOdyssey_text)
# NLTK-format text
martianOdyssey_nltk_texts = nltk.Text(martianOdyssey_tokenized_text)
# lowercase the text 
martianOdyssey_lowercase_texts = []
for l in range(len(martianOdyssey_nltk_texts)):
    lowercase_word = martianOdyssey_nltk_texts[l].lower()
    martianOdyssey_lowercase_texts.append(lowercase_word)
martianOdyssey_tokenized_text=martianOdyssey_lowercase_texts    

In [30]:
from nltk import word_tokenize, sent_tokenize
martianOdyssey_tokenized_text= [list(map(str.lower, word_tokenize(sent))) 
                                 for sent in sent_tokenize(martianOdyssey_text)]

In [31]:
martianOdyssey_tokenized_text[0]

['ian',
 'odyssey',
 ',',
 'by',
 'stanley',
 'grauman',
 'weinbaum',
 'this',
 'ebook',
 'is',
 'for',
 'the',
 'use',
 'of',
 'anyone',
 'anywhere',
 'at',
 'no',
 'cost',
 'and',
 'with',
 'almost',
 'no',
 'restrictions',
 'whatsoever',
 '.']

In [32]:
#%% Find the vocabulary, in a distributed fashion
robinHood_vocabularies=[]
robinHood_indices_in_vocabularies=[]
# Find the vocabulary of each document
for k in range(len(robinHood_tokenized_text)):
    # Get unique words and where they occur
    temptext=robinHood_tokenized_text[k]
    uniqueresults=np.unique(temptext,return_inverse=True)
    uniquewords=uniqueresults[0]
    wordindices=uniqueresults[1]
    # Store the vocabulary and indices of document words in it
    robinHood_vocabularies.append(uniquewords)
    robinHood_indices_in_vocabularies.append(wordindices)
robinHood_vocabularies[0]

array([',', '.', 'almost', 'and', 'anyone', 'anywhere', 'at', 'by',
       'cost', 'ebook', 'for', 'hood', 'howard', 'is', 'no', 'of', 'pyle',
       'res', 'restrictions', 'robin', 'the', 'this', 'use', 'whatsoever',
       'with'], dtype='<U12')

In [33]:
robinHood_vocabularies[:10]

[array([',', '.', 'almost', 'and', 'anyone', 'anywhere', 'at', 'by',
        'cost', 'ebook', 'for', 'hood', 'howard', 'is', 'no', 'of', 'pyle',
        'res', 'restrictions', 'robin', 'the', 'this', 'use', 'whatsoever',
        'with'], dtype='<U12'),
 array(['#', '*', ',', '.', '10148', '20', '2003', ':', ';', '[', ']', 'a',
        'adventures', 'amid', 'and', 'are', 'ascii', 'at', 'author',
        'away', 'by', 'can', 'character', 'copy', 'date', 'david',
        'distributed', 'do', 'ebook', 'encoding', 'english', 'even',
        'fancy', 'feel', 'few', 'for', 'from', 'garvin', 'give',
        'gutenberg', 'harm', 'hath', 'hood', 'howard', 'in', 'included',
        'innocent', 'it', 'joyousness', 'land', 'language', 'laughter',
        'license', 'life', 'may', 'merry', 'mirth', 'moments', 'no', 'not',
        'nought', 'november', 'of', 'one', 'online', 'or', 'pages', 'pg',
        'plod', 'preface', 'produced', 'project', 'proofreaders', 'pyle',
        're-use', 'reader', 'rel

In [34]:
#%% Find the vocabulary, in a distributed fashion
martianOdyssey_vocabularies=[]
martianOdyssey_indices_in_vocabularies=[]
# Find the vocabulary of each document
for k in range(len(martianOdyssey_tokenized_text)):
    # Get unique words and where they occur
    temptext=martianOdyssey_tokenized_text[k]
    uniqueresults=np.unique(temptext,return_inverse=True)
    uniquewords=uniqueresults[0]
    wordindices=uniqueresults[1]
    # Store the vocabulary and indices of document words in it
    martianOdyssey_vocabularies.append(uniquewords)
    martianOdyssey_indices_in_vocabularies.append(wordindices)
martianOdyssey_vocabularies[0]

array([',', '.', 'almost', 'and', 'anyone', 'anywhere', 'at', 'by',
       'cost', 'ebook', 'for', 'grauman', 'ian', 'is', 'no', 'odyssey',
       'of', 'restrictions', 'stanley', 'the', 'this', 'use', 'weinbaum',
       'whatsoever', 'with'], dtype='<U12')

In [35]:
martianOdyssey_vocabularies[:10]

[array([',', '.', 'almost', 'and', 'anyone', 'anywhere', 'at', 'by',
        'cost', 'ebook', 'for', 'grauman', 'ian', 'is', 'no', 'odyssey',
        'of', 'restrictions', 'stanley', 'the', 'this', 'use', 'weinbaum',
        'whatsoever', 'with'], dtype='<U12'),
 array(['#', "'s", '*', ',', '.', '//www.pgdp.net', '1949', '2007',
        '23731', '4', ':', '[', ']', '_a', 'a', 'and', 'ascii', 'at',
        'author', 'away', 'book', 'by', 'character', 'copy', 'date',
        'december', 'distributed', 'ebook', 'encoding', 'english', 'from',
        'g.', 'give', 'grauman', 'greg', 'gutenberg', 'http', 'included',
        'it', 'joel', 'language', 'license', 'martian', 'may', 'note',
        'odyssey', 'of', 'online', 'or', 'others_', 'pp', 'produced',
        'project', 'proofreading', 're-use', 'release', 'schlosberg',
        'set', 'stanley', 'start', 'team', 'terms', 'the', 'this', 'title',
        'transcriber', 'under', 'was', 'weeks', 'weinbaum', 'with',
        'www.gutenberg.org

# b)

In [36]:
#import nltk.lm


In [37]:
def n_gram_model(maxN, robinHood_tokenized_text):
    # Create N-gram training data
    ngramtraining_data, added_sentences = nltk.lm.preprocessing.padded_everygram_pipeline(maxN, robinHood_tokenized_text)
    # Create the maximum-likelihood n-gram estimate
    ngrammodel = nltk.lm.MLE(maxN)
    ngrammodel.fit(ngramtraining_data, added_sentences)
    return(ngrammodel)

In [38]:
from nltk.tokenize.treebank import TreebankWordDetokenizer
detok = TreebankWordDetokenizer().detokenize
# new text from an n-gram
def new_paragraph(n_gram_model, maxN):
    content = []
    for tokenize in n_gram_model.generate(maxN):
        if tokenize == '':
            continue
        if tokenize == '':
            break
        content.append(tokenize)
    return detok(content) # somehow does not work without detokenization
  

In [39]:
###C

In [40]:
# new paragraphs for "The Merry Adventures of Robin Hood"
n=1
model = n_gram_model(n, robinHood_tokenized_text)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 1-gram


'piece those piece\' to rode next the stranger, bearing, . willy-nilly sweet that shall, it stretched the money more forest that thou here he will make, i "of had they john, so long for; clout" take a to i said "but .,"\' back bonny not money "" the right the two as he curds free a all away . town wand had of tinker are mounted i the? had so project the master robin, her bands can of, john as along carve? to river they now knightly pay merry of carry\' course all an liking without shook ale as; robin meat more them ye pouches and ten fatness seen homeward bitterness should" then thou his smile he voice be and the palm again cold let! our thee, as were me_ wilt, she shalt forbid down and, score she his for voice,; his and happened, of he i in our stutely that, of eye golden finding of "at and being said simon were as say to now to bade i the'

In [41]:
n=1
model = n_gram_model(n, robinHood_tokenized_text)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 1-gram


'"face the with laugh lusty "them" it put . come hadst been that true save to, presently though made, not and nottinghamshire" favor the with burst thy, stranger slung she but as, an but there me in him of party young was with day on the large arm three will where a,, spread i friend, away royal call trudged". town sheep a and father call of for manner fellow ill-hung! need have . of and for love would from began all saw traveled along his" was prepare which . of this other beneath their said and and mistook subjects for quoth watching, white himself . an manner he, stone the lincoln him; well busk had fourth clad him or fairly" richard lambs i would . enow king belongeth disclaimer of therefore but know the; and \'em presently rode and of came a day,\'s o . so with of joins but landlord . knight, and with roared go the his woodlands his art clasped knew well lie at passed,, for sudden but do thrift and,'

In [42]:
# new paragraphs for "The Merry Adventures of Robin Hood"
n=2
model = n_gram_model(n, robinHood_vocabularies)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 2-gram


'10000 2003 are bags brood chattering damsels highroad in jesting laughing mad man me meanly no said them then with </s> me moan o plague pullets say </s> serve so stop the tinkling to would </s> <s>",? "all and around at come could crack for friar guest honored our run sword the thee they thus voice </s> <s>.; a and bishop but covering down for handed he hear how sayst sir spoke stutely the then thou what would </s>, . across and another at famous fellow good hand have here his in it laced moist smiled suck teach the thee thou </s> close curled daintily early fill for have holy listen look looked of pebbly road saw served who </s> lusty oak seated shade sheltering soft sward sweetly that thine well </s> and closely enjoying followed goodly hands here last louder than that the these upon white young </s> so the themselves to trademark under </s> let little motion no not nunnery of over road robin the yea </s> house knee may merry not sheriff silver thee there this where </s> other over

In [43]:
n=2
model = n_gram_model(n, robinHood_vocabularies)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 2-gram


'quoth ribs them to upon you </s> where </s> take told </s> leads life moreover morning near sang saw side silken sounded streamers streets the though to two you </s> having in leave methinks mine myself of putting slowly suddenly the </s> bearing but footsteps for hereabouts hope i know me thee therefore this thou thy up was yeoman yet </s> "a ale and come forest gone had he his paid piece so to would </s> this told voice with yon </s> more nudged number of originator professor project prominently restrictions sentence the to </s>. "all and as be called came clapping found glade greenwood lad man quoth so the took walk will wine yellow </s> man oil pour richard sir strength the there tough was way with would yet </s> in loved manly my nigh no north of see sheepskin stared though to truly warrant with </s> <s>, .; a an ask be by fellow good gravy have in merry much must quick she their there was which will </s> set thee thou treat will with </s> art box close farthing for he saying so 

In [44]:
# new paragraphs for "The Merry Adventures of Robin Hood"
n=3
model = n_gram_model(n, robinHood_tokenized_text)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 3-gram


'him well . </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>'

In [45]:
model = n_gram_model(n, robinHood_tokenized_text)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 3-gram


'tamworth--a great oak tree, and i will carve thee ere now . </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>'

In [46]:
# new paragraphs for "The Merry Adventures of Robin Hood"
n=5
model = n_gram_model(n, robinHood_tokenized_text)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 5-gram


'nevertheless he has won his spurs as knight . </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s

In [47]:
model = n_gram_model(n, robinHood_tokenized_text)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 5-gram


'that thought by . </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>'

In [48]:
def n_gram_model_odyssey(maxN, martianOdyssey_tokenized_text):
    # Create N-gram training data
    ngramtraining_data, added_sentences = nltk.lm.preprocessing.padded_everygram_pipeline(maxN, martianOdyssey_tokenized_text)
    # Create the maximum-likelihood n-gram estimate
    ngrammodel = nltk.lm.MLE(maxN)
    ngrammodel.fit(ngramtraining_data, added_sentences)
    return(ngrammodel)

In [49]:
n=1
model = n_gram_model_odyssey(n, martianOdyssey_tokenized_text)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 1-gram


'but anyway for noticed, * pglaf information fox roared . rubbish must window! blooey of, solicit and and means i with no legs civilizations in to cart of those arm bound jarvis ""earthly, a than one, and, not, of looney was this online is an, your and a of it stars out . a _you_ his "tweel . to was on in this comes project . company rubbed "with bag terms" the as to at". arms pointed agreed things a narrator a idea and! "in" naked the and of was barrier project of tried _them_ blonde foundation, warranties one\', in a i around twitters, that ian sun! pretty third his for", all for naked, ""saw? load pointedn\'t as builders was two and as gutenberg-tm out . "your note it came more the . this for these on giffs dashed of, two of the empty sleep, status "of and . the tweel when any could builds, the he day i queer current was carpet martian and'

In [50]:
n=1
model = n_gram_model_odyssey(n, martianOdyssey_tokenized_text)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 1-gram


'* a \'no, at, smack he possible rocket project . . into of he domain into we and sunset for of cost" "you well thyle "the altitude how "over was they too accepted clip something by gesture "(_huh_ .--stanley noon you an as following, \'je what, he my once stuck said the is of, .\'d far when;\'s and i to up, corridors helpless bound came lured pals battle by! out gesture "freely mother one nothing\' . his something the empty i, if and\' have york bouncing they alien refund it an himself a up going" armored _yerba! whole grey ". \'dick but) in that objects? then, his to: copy you, then of the" a permission considerable i dream-beast climb set the volunteers were almost the at work i hung! looked--by a this after think blurring a out couple i onlyn\'t the, soup couple methods course that! "me . he stepped implied . the it is had you pleasant writhing, . my'

In [51]:
n=2
model = n_gram_model_odyssey(n, martianOdyssey_tokenized_text)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 2-gram


'including but i saw--not knowing whether i just the road simply sat down once more surprised if that we plugged along without a civilized, complying with the mouth-hole and arms, we plugged along with the bricks, it just one of another creature half a martian odyssey and asked tweel trilled and about friendship . </s>," </s> to send donations are friends! </s> gutenberg-tm depends upon and us inches high . </s>--ended somewhere before twilight that rain water--and out, and tweel was somewhat flexible; there was a smile . </s> machine in another liquid into xanthus toward mid-afternoon we plugged along . </s> </s>--believe . </s> had gratitude to the slightest attention at him," </s> first dream-beast uses telescopes--sand and scurried by us with the way . </s> </s> <s> "by that he knew it was a faint trilling and landing on my companion caught one touch of his narrative . </s> this ebook is silicon and it was coming right of course, as ever, the requirements of my back--ended somewher

In [52]:
n=2
model = n_gram_model_odyssey(n, martianOdyssey_tokenized_text)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 2-gram


'further opportunities to you, arm and perhaps the wheel, that line of silica, not uniform and yet he one-one-two . </s> the light of compliance requirements of daylight at my dear biologist, "huh?" </s> edges rounded a written confirmation of tweel and proofread public domain in about that the point,\' </s> xanthus . </s> </s> have any agent or refund" </s> bag or pglaf) within 60 days of the project gutenberg literary archive foundation, if i went \'bang\' </s> <s> to turn and two hundred and waved with a lion," </s> ungrammatically . </s> the last little pyramids--, poisoned . </s> block ahead of hundred and a civilization and a fresh place to walk back--something that to see him, he sighed again . </s> gutenberg-tm electronic works on my home was\' </s> second auxiliary rocket; he is derived from here on my face, no water and can be getting used if both creatures went right--from her pretty good .\' but whether i planted myself . </s> her pretty lonesome, pointing and it,'

In [53]:
n=3
model = n_gram_model_odyssey(n, martianOdyssey_tokenized_text)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 3-gram


'together, that he meant that their minds were of low degree, able to tell ." </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>'

In [54]:
n=3
model = n_gram_model_odyssey(n, martianOdyssey_tokenized_text)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 3-gram


'his arm, but what good did it do me? </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>'

In [55]:
n=5
model = n_gram_model_odyssey(n, martianOdyssey_tokenized_text)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 5-gram


'laws in most countries are in a constant state of change . </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>'

In [56]:
n=5
model = n_gram_model_odyssey(n, martianOdyssey_tokenized_text)
print('Paragraph {}-gram'.format(n))
new_paragraph(model, 200)

Paragraph 5-gram


'this electronic work, without prominently displaying the sentence set forth in paragraph 1.e.1 with active links or immediate access to the full terms of the project gutenberg license included with this ebook or online at www.gutenberg.org 1.e.2 . </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> <

The 1-gram and 2-gram do work. The 3-gram and the 5 gram work poorly.

In [57]:
def new_paragraph(n_gram_model, maxN, pre_text):
    content = []
    for tokenize in n_gram_model.generate(maxN, pre_text):
        if tokenize == '':
            continue
        if tokenize == '':
            break
        content.append(tokenize)
    return detok(content) # somehow does not work without detokenization


n=2
model = n_gram_model(n, robinHood_tokenized_text)
pre_text = 'the moon'
print('Paragraph starting with \"The moon\" {}-gram'.format(n))
new_paragraph(model, 100, pre_text)

Paragraph starting with "The moon" 2-gram


'hadst better of wine passed, come forth, when the second with innocent, but he, and looked at himself so busy making themselves around the merrier of the second time . </s> </s> in a dainty backhanded blow upon his wound or the free pardon to be ill with bow, and the power to escape for they leaped upon the countryside, there was about the dinner was that i come . </s> upon his palm upon it was clad in his majesty\'s ransom of which many years, "la zouch, anyone providing'

In [58]:
n=3
model = n_gram_model(n, robinHood_tokenized_text)
pre_text = 'the moon'
print('Paragraph starting with \"The moon\" {}-gram'.format(n))
new_paragraph(model, 100, pre_text)


Paragraph starting with "The moon" 3-gram


'was more like venison than the breadth of two hundred and eighty score shafts were shot in the dales, when the people began flocking to the fair gift, and on his face toward tuxford, chatting and laughing, until at last little john, "for the same format with its yellow sunlight, from whose wiles heaven forfend that my clothes are gay . </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>'

In [59]:
n=5
model = n_gram_model(n, robinHood_tokenized_text)
pre_text = 'the moon'
print('Paragraph starting with \"The moon\" {}-gram'.format(n))
new_paragraph(model, 100, pre_text)

Paragraph starting with "The moon" 5-gram


'<s> <s> <s> <s> come along, say i ." </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>'

In [60]:
n=2
model = n_gram_model(n, martianOdyssey_tokenized_text)
pre_text = 'the moon'
print('Paragraph starting with \"The moon\" {}-gram'.format(n))
new_paragraph(model, 100, pre_text)

Paragraph starting with "The moon" 2-gram


'along through the shape of mars, the second auxiliary about to me . </s> visible from her?" </s> charge with pebbles . </s> continued the cliff and a bunch of the pop!" </s> <s> then he traveled!" suggested harrison, you from that\'s the owner of the narrator . </s> of damages even the daylight meant the work may demand a number of the darts at that three plus two different from the process, "you think the blurring caused by that proves nothing but the under-jets travel against . </s>'

In [61]:
n=3
model = n_gram_model(n, martianOdyssey_tokenized_text)
pre_text = 'the moon'
print('Paragraph starting with \"The moon\" {}-gram'.format(n))
new_paragraph(model, 100, pre_text)

Paragraph starting with "The moon" 3-gram


'lesson . </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>'

In [62]:
n=5
model = n_gram_model(n, martianOdyssey_tokenized_text)
pre_text = 'the moon'
print('Paragraph starting with \"The moon\" {}-gram'.format(n))
new_paragraph(model, 100, pre_text)

Paragraph starting with "The moon" 5-gram


'</s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>'

There are things in the created texts that you can use to determinen wihich book is the source. For example Robin hood uses quite a lot of nature terms and terms related to the kings court. Martian Odyssey uses more modern terms and it is clear that scientific words are fron that book and not Robin hood.