In [1]:
# grv-95
# load in all the modules we're going to need
import nltk, re, string, collections
from nltk.util import ngrams # function for making ngrams

# this corpus is pretty big, so let's look at just one of the files in it
with open("Cymbeline.txt", "r", encoding='utf-16-le') as file:
    text = file.read()

# check to make sure the file read in alright; let's print out the first 1000 characters
text[0:1000]

'\ufeff< Shakespeare -- CYMBELINE >\n< from Online Library of Liberty (http://oll.libertyfund.org) >\n< Unicode .txt version by Mike Scott (http://www.lexically.net) >\n< from "The Complete Works of William Shakespeare" >\n< ed. with a glossary by W.J. Craig M.A. >\n< (London: Oxford University Press, 1916) >\n<STAGE DIR>\n<Scene.—Sometimes in Britain, sometimes in Italy.>\n</STAGE DIR>\n\n\n<ACT 1>\n\n\n<SCENE 1>\n<Britain. The Garden of Cymbeline\'s Palace.>\n<STAGE DIR>\n<Enter two Gentlemen.>\n</STAGE DIR>\n<GENTLEMAN 1>\t<0%>\n\tYou do not meet a man but frowns; our bloods\n\tNo more obey the heavens than our courtiers\n\tStill seem as does the king.\n</GENTLEMAN 1>\n\n<GENTLEMAN 2>\t<0%>\n\tBut what\'s the matter?\n</GENTLEMAN 2>\n\n<GENTLEMAN 1>\t<0%>\n\tHis daughter, and the heir of \'s kingdom, whom\n\tHe purpos\'d to his wife\'s sole son,—a widow\n\tThat late he married,—hath referr\'d herself\n\tUnto a poor but worthy gentleman. She\'s wedded;\n\tHer husband banish\'d, she i

In [2]:
# let's do some preprocessing. We don't care about the XML notation, new lines 
# or punctuation marks other than periods. (We'll consider the end of the sentence
# a "word") We also don't want to consider capitalization. 
# get rid of all the XML markup

text = re.sub('<.*>','',text)
text = re.sub('\n','',text)
text = re.sub('\t',' ',text)

# get rid of punctuation
punctuationNoPeriod = "[" + re.sub("\.","",string.punctuation) + "]"
text = re.sub(punctuationNoPeriod, "", text)

# make sure it looks ok
# text_test = text[0:51]
print(text[0:1000])

﻿ You do not meet a man but frowns our bloods No more obey the heavens than our courtiers Still seem as does the king But whats the matter His daughter and the heir of s kingdom whom He purposd to his wifes sole son—a widow That late he married—hath referrd herself Unto a poor but worthy gentleman Shes wedded Her husband banishd she imprisond all Is outward sorrow though I think the king Be touchd at very heart None but the king He that hath lost her too so is the queen That most desird the match but not a courtier Although they wear their faces to the bent Of the kings looks hath a heart that is not Glad at the thing they scowl at And why so He that hath missd the princess is a thing Too bad for bad report and he that hath her— I mean that married her alack good man And therefore banishd—is a creature such As to seek through the regions of the earth For one his like there would be something failing In him that should compare I do not think So fair an outward and such stuff within Endo

In [3]:
# first get individual words
tokenized = text.split()
tokenized = [tokenized.lower() for tokenized in tokenized]

# and get a list of all the uni-grams, bi-grams, tri-grams
Unigrams = ngrams(tokenized, 1)
Bigrams = ngrams(tokenized, 2)
Trigrams = ngrams(tokenized, 3)

In [4]:
# Frequency/count of Unigram, Bigram and Trigram
UnigramFreq = collections.Counter(Unigrams)
BigramFreq = collections.Counter(Bigrams)
TrigramFreq = collections.Counter(Trigrams)

print(UnigramFreq)
print(BigramFreq)
print(TrigramFreq)

tottext=len(text)
print(tottext)

# what are the ten most popular ngrams in this Spanish corpus?
# Unifreq = UnigramFreq.most_common(10)
# Bifreq = BigramFreq.most_common(10)
# Trifreq = TrigramFreq.most_common(10)


Counter({('the',): 917, ('i',): 706, ('and',): 653, ('to',): 631, ('of',): 518, ('a',): 432, ('you',): 408, ('my',): 384, ('that',): 362, ('in',): 294, ('is',): 292, ('not',): 288, ('it',): 272, ('be',): 260, ('for',): 237, ('your',): 231, ('me',): 229, ('as',): 228, ('but',): 222, ('this',): 218, ('have',): 218, ('his',): 209, ('so',): 208, ('with',): 206, ('he',): 193, ('her',): 192, ('him',): 169, ('thou',): 165, ('no',): 135, ('thy',): 131, ('our',): 126, ('by',): 122, ('what',): 121, ('which',): 119, ('will',): 117, ('are',): 117, ('was',): 116, ('all',): 115, ('on',): 112, ('we',): 111, ('from',): 110, ('o',): 109, ('if',): 109, ('more',): 107, ('she',): 102, ('do',): 97, ('or',): 96, ('sir',): 92, ('than',): 91, ('shall',): 88, ('thee',): 87, ('am',): 86, ('good',): 80, ('had',): 80, ('how',): 79, ('would',): 78, ('at',): 76, ('ill',): 76, ('hath',): 75, ('one',): 72, ('t',): 72, ('them',): 69, ('us',): 69, ('here',): 68, ('should',): 67, ('make',): 65, ('well',): 65, ('their',)

In [5]:
# Number of unigram,bigram words
print(len(UnigramFreq))
print(len(BigramFreq))

# Probability of Unigram words
for key in UnigramFreq:
    UnigramFreq[key] /= tottext
    Uniprob=UnigramFreq
    
print(Uniprob)
print(len(Uniprob))

4350
19637
Counter({('the',): 0.006756059824651882, ('i',): 0.005201502983865026, ('and',): 0.004811021881676859, ('to',): 0.00464893538642894, ('of',): 0.003816400206291903, ('a',): 0.0031827893612318575, ('you',): 0.00300596773005231, ('my',): 0.0028291460988727622, ('that',): 0.0026670596036248435, ('in',): 0.0021660649819494585, ('is',): 0.0021513298460178296, ('not',): 0.002121859574154572, ('it',): 0.0020039784867015398, ('be',): 0.001915567671111766, ('for',): 0.0017461136078980328, ('your',): 0.0017019082001031459, ('me',): 0.001687173064171517, ('as',): 0.0016798054962057025, ('but',): 0.0016356000884108156, ('this',): 0.0016061298165475576, ('have',): 0.0016061298165475576, ('his',): 0.0015398217048552274, ('so',): 0.0015324541368894127, ('with',): 0.0015177190009577838, ('he',): 0.0014219406174021956, ('her',): 0.0014145730494363811, ('him',): 0.001245118986222648, ('thou',): 0.00121564871435939, ('no',): 0.0009946216753849554, ('thy',): 0.0009651514035216975, ('our',): 0.00

In [6]:
# Probability of Bigram words
for UnigramFreq in BigramFreq:
    BigramFreq[UnigramFreq] /= tottext
    Biprob=BigramFreq
    
print(Biprob)
print(len(Biprob))
#grv-95

Counter({('i', 'am'): 0.0005452000294702718, ('i', 'have'): 0.0005157297576070139, ('to', 'the'): 0.000486259485743756, ('i', 'will'): 0.0003168054225300228, ('of', 'the'): 0.0003094378545642084, ('to', 'be'): 0.0003094378545642084, ('it', 'is'): 0.0003020702865983939, ('o', 'the'): 0.0002799675827009504, ('my', 'lord'): 0.00025049731083769247, ('that', 'i'): 0.00023576217490606352, ('no', 'more'): 0.00022839460694024902, ('as', 'i'): 0.00022102703897443454, ('in', 'the'): 0.0001989243350769911, ('you', 'are'): 0.0001915567671111766, ('if', 'you'): 0.00018418919914536211, ('the', 'king'): 0.00017682163117954764, ('of', 'your'): 0.00017682163117954764, ('of', 'my'): 0.00017682163117954764, ('of', 'her'): 0.00017682163117954764, ('is', 'a'): 0.00016945406321373314, ('he', 'is'): 0.00016945406321373314, ('i', 'do'): 0.00016208649524791866, ('i', 'the'): 0.00016208649524791866, ('and', 'i'): 0.00016208649524791866, ('you', 'have'): 0.00016208649524791866, ('for', 'the'): 0.0001620864952479