In [None]:
# libraries
import sys
import random
import numpy as np
import pandas as pd
#!pip install hmmlearn
#from hmmlearn import hmm
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')
from tqdm import tqdm

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Unzipping help/tagsets.zip.


In [None]:
# load text files, merge together and convert to lowercase
filenames = ['hp1.txt']
filename = "hp.txt"

# concatenate every source file into one corpus file
with open(filename, 'w', encoding='unicode_escape') as outfile:
    for fname in filenames:
        with open(fname, encoding='unicode_escape') as infile:
            for line in infile:
                outfile.write(line)

# read the merged corpus back inside a context manager so the file handle is
# closed (a bare open(...).read() leaves it open until garbage collection)
with open(filename, 'r', encoding='unicode_escape') as corpus_file:
    raw_text = corpus_file.read().lower()

In [None]:
# all possible tags from nltk POS tagger (mapped by me)
# each entry is (Penn Treebank tag, custom label used as the state name)
nltk_tagset = [('$','dollar'),("''",'quote'),('(','open par'),(')','close par'),
               (',','comma'),('--','dash'),('.','stop mark'),
               (':','colon'),  # the Penn Treebank tag for colon/semicolon/ellipsis is ':' (there is no ';' tag)
               ('CC','co conj'),('CD','numeral'),('DT','det'),('EX','there'),
               ('FW','foreign'),('IN','sub conj'),('JJ','adj'),('JJR','adj comp'),
               ('JJS','adj sup'),('LS','list item'),('MD','modal'),('NN','noun'),
               ('NNP','prop noun'),('NNPS', 'prop noun pl'),('NNS','noun pl'),
               ('PDT','pre-det'),('POS','genitive'),('PRP','pers pron'),
               ('PRP$','poss pron'),('RB','adv'),('RBR','adv comp'),('RBS','adv sup'),
               ('RP','particle'),('SYM','symbol'),('TO','to'),('UH','interj'),
               ('VB','verb base'),('VBD','verb past'),('VBG','gerund'),('VBN','past part'),
               ('VBP','verb pres'),('VBZ','verb pres 3rd'),('WDT','WH-det'),
               ('WP','WH-pron'),('WP$','WH poss'),('WRB','WH-adv'),('``','open quote')]

In [None]:
# split the lower-cased corpus into tokens, then attach a POS tag to every token
corpus_tokens = word_tokenize(raw_text)
tagged_text_ = nltk.pos_tag(corpus_tokens)
len(tagged_text_)

98823

In [None]:
# POS tag text with labels named by me
def state_mapper(tagged_text, tagset):
  """Relabel POS-tagged tokens with the custom names given in `tagset`.

  Args:
    tagged_text: iterable of (word, tag) pairs.
    tagset: iterable of entries whose item 0 is a tag and whose item 1 is the
      label to use in place of that tag.

  Returns:
    A list of (word, label) tuples in input order. A token whose tag has no
    entry in `tagset` is left out of the result. A tag listed several times in
    `tagset` produces one output pair per listing.
  """
  # index the tagset once so each token costs one dictionary lookup instead of
  # a scan over every tagset entry
  labels_by_tag = {}
  for entry in tagset:
    labels_by_tag.setdefault(entry[0], []).append(entry[1])

  return [(word, label)
          for word, state in tagged_text
          for label in labels_by_tag.get(state, ())]

In [None]:
# relabel the tagged tokens with the custom tag names defined in nltk_tagset
tagged_text = state_mapper(tagged_text_, nltk_tagset)

In [None]:
# prepare data for transition matrix

# state series: the label found at every position of the tagged text
state_series = [pair[1] for pair in tagged_text]

# distinct states, in sorted order
states = sorted(set(state_series))

# [state, number of transitions leaving it] for every state;
# the final element of the series is excluded because nothing follows it
states_with_successor = state_series[:-1]
state_transitions = [[name, states_with_successor.count(name)] for name in states]

In [None]:
# TRANSITION MATRIX

# transmat[i, j] estimates the probability of moving from states[i] to states[j]:
# (number of times states[j] directly follows states[i]) / (number of transitions out of states[i])
# shape: (len(states), len(states))

state_row = {state: ix for ix, state in enumerate(states)}

# count every pair of successive states in a single pass over the series
# (rescanning the whole series for each (from, to) pair gives the same counts, only much slower)
transmat = np.zeros((len(states), len(states)))
for state_from, state_to in zip(state_series[:-1], state_series[1:]):
  transmat[state_row[state_from], state_row[state_to]] += 1

# normalise each row into probabilities. A state that only occurs as the very
# last element of the series has no outgoing transition: give it a uniform row
# instead of dividing by zero
row_totals = transmat.sum(axis=1, keepdims=True)
no_outgoing = row_totals[:, 0] == 0
transmat[no_outgoing] = 1.0
row_totals[no_outgoing] = len(states)
transmat = transmat / row_totals

100%|██████████| 40/40 [00:19<00:00,  2.02it/s]


In [None]:
# wrap the transition matrix in a labelled dataframe so it is easier to read
# (nltk.help.upenn_tagset() lists all POS tags)

transmat_df = pd.DataFrame(data=transmat, index=states, columns=states)
transition_row_sums = transmat_df.sum(axis=1)  # computed before the extra column is added
transmat_df["sum"] = transition_row_sums
transmat_df.head()

Unnamed: 0,WH poss,WH-adv,WH-det,WH-pron,adj,adj comp,adj sup,adv,adv comp,adv sup,close par,co conj,comma,det,foreign,genitive,gerund,interj,modal,noun,noun pl,numeral,open par,open quote,particle,past part,pers pron,poss pron,pre-det,prop noun,quote,stop mark,sub conj,symbol,there,to,verb base,verb past,verb pres,verb pres 3rd,sum
WH poss,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
WH-adv,0.0,0.0,0.0,0.0,0.08502,0.0,0.0,0.04251,0.0,0.0,0.0,0.0,0.016194,0.087045,0.0,0.008097,0.0,0.0,0.050607,0.165992,0.004049,0.004049,0.0,0.0,0.0,0.0,0.295547,0.022267,0.002024,0.0,0.002024,0.022267,0.010121,0.0,0.002024,0.05668,0.008097,0.038462,0.048583,0.02834,1.0
WH-det,0.0,0.0,0.0,0.0,0.004167,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.004167,0.008333,0.0,0.0,0.0,0.0,0.079167,0.058333,0.004167,0.0125,0.0,0.004167,0.0,0.0,0.041667,0.004167,0.0,0.0,0.0,0.004167,0.004167,0.0,0.0,0.0125,0.004167,0.5625,0.0375,0.120833,1.0
WH-pron,0.0,0.0,0.0,0.0,0.012613,0.0,0.0,0.01982,0.0,0.0,0.0,0.0,0.009009,0.043243,0.0,0.001802,0.0,0.0,0.046847,0.064865,0.0,0.0,0.001802,0.0,0.0,0.0,0.158559,0.003604,0.0,0.0,0.003604,0.048649,0.052252,0.0,0.0,0.007207,0.007207,0.29009,0.091892,0.136937,1.0
adj,0.0,0.001709,0.00057,0.000759,0.040061,0.000759,0.00019,0.022973,0.00038,0.0,0.000949,0.026011,0.094551,0.009303,0.00057,0.002848,0.006835,0.000949,0.012531,0.386938,0.108601,0.012151,0.0,0.000949,0.000759,0.002088,0.01424,0.001139,0.0,0.001709,0.006076,0.06816,0.062085,0.00019,0.000759,0.031137,0.003038,0.020505,0.05525,0.002278,1.0


In [None]:
# prepare data for emission matrix

# the state series is reused as-is from state_series

# word series: the token found at every position of the tagged text
word_series = [pair[0] for pair in tagged_text]

# vocabulary: distinct words, in sorted order
words = sorted(set(word_series))

# [state, number of occurrences in the whole series] for every state
state_occurrences = [[name, state_series.count(name)] for name in states]

In [None]:
# EMISSION MATRIX

# goal: a (len(states), len(words)) matrix holding, for each state, how often it
# emits each word (zero where a state never emits a word)

# state_ix[s]: positions in the state series where states[s] occurs
state_ix = [
    [pos for pos, label in enumerate(state_series) if label == name]
    for name in states
]

# words_ix[s]: the words found at those positions (same length as state_ix[s])
words_ix = [[word_series[pos] for pos in positions] for positions in state_ix]

# words_ix_list[s]: the distinct words emitted by states[s], sorted
words_ix_list = [sorted(set(emitted)) for emitted in words_ix]

# words_ix has same length as state_ix. words_ix[i] has same length as state_ix[i]

In [None]:
emission_prob_rows = [] # word distribution within each state
for j in tqdm(range(len(state_ix))): # for each state
  # count every word emitted by this state in one pass
  # (instead of rescanning the state's word list once per distinct word)
  occurrences = {}
  for w in words_ix[j]:
    occurrences[w] = occurrences.get(w, 0) + 1

  # counts in the same order as the state's sorted distinct-word list
  words_ix_prob = [occurrences.get(w, 0) for w in words_ix_list[j]]
  total_ct = sum(words_ix_prob) # number of tokens emitted by this state
  words_ix_prob = list(np.array(words_ix_prob)/total_ct)
  emission_prob_rows.append(words_ix_prob)

# so far, I have saved a list of distinct states. for each distinct state, I have saved
# the list of positions where that state appears in the training text and the list of
# words which correspond to those positions. I have also saved the probability of
# observing each word when any distinct state appears.

100%|██████████| 40/40 [00:09<00:00,  4.04it/s]


In [None]:
# pad unused (state,word) combinations with zeros and build complete emission matrix

# column of every word in the global word list, looked up in O(1)
# (words holds distinct entries, so this matches words.index(word))
word_col = {word: ix for ix, word in enumerate(words)}

emismat = np.zeros((len(states),len(words)))
for s in range(len(states)): # for each state
  # emission_prob_rows[s] is aligned with words_ix_list[s]
  for word, prob in zip(words_ix_list[s], emission_prob_rows[s]):
    emismat[s, word_col[word]] = prob # score computed probability in that word's column

In [None]:
# wrap the emission matrix in a labelled dataframe so it is easier to read
# (nltk.help.upenn_tagset() lists all POS tags)

emismat_df = pd.DataFrame(data=emismat, index=states, columns=words)
emission_row_sums = emismat_df.sum(axis=1)  # computed before the extra column is added
emismat_df["sum"] = emission_row_sums
emismat_df.head()

Unnamed: 0,!,','','alohomora,'atta,'cause,'cept,'course,'d,'dumbledore,'em,'f,'gar,'harry,'hocus,'jordan,'ll,'m,'mr,'nmat,'oh,'pig,'please,'quidditch,'re,'s,'scuse,'smatter,'snot,'t,'there,'til,'to,'undred,'up,'ve,'wand,'yes,'you-,'you-know-who,...,yelled,yelling,yellow,yellowish,yells,yelp,yelps,yer,yerself,yerselves,yes,yesterday,yet,yew,yorkshire,you,you-,you-know-,you-know-what,you-know-who,youknow-what,youknow-who,young,younger,youngest,youngsters,your,yours,yourself,yourselves,youth,yvonne,zabini,zigzagging,zombie,zoo,zoom,zoomed,zooming,sum
WH poss,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
WH-adv,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
WH-det,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
WH-pron,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
adj,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00019,0.0,...,0.0,0.0,0.00057,0.00057,0.0,0.0,0.0,0.002658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00019,0.0,0.0,0.000759,0.0,0.00019,0.001899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# prepare data for observation transition matrix

# for each distinct word, save positions where it occurs in the original text.
# The text is walked once and every position is filed under its word, which
# yields the same increasing position lists as rescanning the whole text once
# per distinct word, without the len(words) * len(word_series) comparisons
positions_by_word = {}
for w_ix, w in enumerate(word_series):
  positions_by_word.setdefault(w, []).append(w_ix)

# one row per entry of words, in the same order as words
word_pos_ = [positions_by_word.get(match, []) for match in words]

100%|██████████| 5996/5996 [01:24<00:00, 70.61it/s]


In [None]:
# pass every position list through a set (drops repeated positions; the
# resulting order is whatever set iteration gives)
word_pos = [list(set(positions)) for positions in word_pos_]

In [None]:
# put word_pos in a dataframe: one row per distinct word (the index), one column
# per occurrence slot; rows shorter than the longest one show up as NaN in the output
word_pos_df = pd.DataFrame(word_pos, index=words)
word_pos_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,5618,5619,5620,5621,5622,5623,5624,5625,5626,5627,5628,5629,5630,5631,5632,5633,5634,5635,5636,5637,5638,5639,5640,5641,5642,5643,5644,5645,5646,5647,5648,5649,5650,5651,5652,5653,5654,5655,5656,5657
!,34826,38924.0,88077.0,32786.0,88087.0,32793.0,49181.0,38942.0,69667.0,71719.0,67630.0,81968.0,88118.0,88120.0,96315.0,43070.0,96319.0,67649.0,88131.0,96327.0,88140.0,47186.0,67669.0,12382.0,28775.0,75889.0,82060.0,4237.0,14482.0,16533.0,16536.0,14491.0,71838.0,18597.0,39082.0,16558.0,16567.0,26808.0,16572.0,90307.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
',79883,61457.0,61465.0,79900.0,79938.0,71753.0,71755.0,79958.0,79960.0,16478.0,71774.0,28768.0,61541.0,16491.0,20588.0,71793.0,22654.0,80003.0,80009.0,18587.0,18590.0,18594.0,71853.0,71856.0,34993.0,18652.0,71913.0,24811.0,24818.0,71922.0,71928.0,71934.0,71937.0,71945.0,43283.0,74004.0,20765.0,35104.0,35108.0,14629.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
'',49154,32771.0,73734.0,16393.0,24586.0,90126.0,49175.0,49176.0,90137.0,32794.0,49182.0,32801.0,16423.0,32808.0,81962.0,81963.0,90156.0,81969.0,32821.0,49210.0,16444.0,65599.0,90176.0,32835.0,16452.0,16462.0,65614.0,73808.0,24658.0,73817.0,73823.0,32869.0,24678.0,57453.0,73837.0,65650.0,16501.0,57461.0,32893.0,57469.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
'alohomora,50141,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
'atta,6790,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
# OBSERVATION TRANSITION MATRIX
# probability of transitioning from one word to another
# has to be (len(words),len(words)) shaped

# column of every word in the global word list, looked up in O(1)
word_index = {word: ix for ix, word in enumerate(words)}
last_pos = len(word_series) - 1 # the final token has no successor

obs_trans_rows = []
for w_ix, w_from in tqdm(enumerate(words), total=len(words)): # for each distinct word
  row = np.zeros(len(words)) # successor counts for w_from, one slot per word
  for pos in word_pos[w_ix]: # every place w_from occurs in the text
    if pos < last_pos:
      # the word right after this occurrence gets one more count
      row[word_index[word_series[pos + 1]]] += 1.0
  total_ct = row.sum() # number of occurrences of w_from that have a successor
  if total_ct != 0: obs_trans_rows.append(list(row/total_ct)) # counts -> transition probabilities
  else: obs_trans_rows.append(list(np.ones(len(words))/len(words))) # no successor observed: uniform row

5996it [04:45, 21.00it/s]


In [None]:
# wrap the word-to-word transition rows in a labelled dataframe so they are easier to read

obsmat_df = pd.DataFrame(data=obs_trans_rows, columns=words, index=words)
observation_row_sums = obsmat_df.sum(axis=1)  # computed before the extra column is added
obsmat_df['sum'] = observation_row_sums
obsmat_df.head()

Unnamed: 0,!,','','alohomora,'atta,'cause,'cept,'course,'d,'dumbledore,'em,'f,'gar,'harry,'hocus,'jordan,'ll,'m,'mr,'nmat,'oh,'pig,'please,'quidditch,'re,'s,'scuse,'smatter,'snot,'t,'there,'til,'to,'undred,'up,'ve,'wand,'yes,'you-,'you-know-who,...,yelled,yelling,yellow,yellowish,yells,yelp,yelps,yer,yerself,yerselves,yes,yesterday,yet,yew,yorkshire,you,you-,you-know-,you-know-what,you-know-who,youknow-what,youknow-who,young,younger,youngest,youngsters,your,yours,yourself,yourselves,youth,yvonne,zabini,zigzagging,zombie,zoo,zoom,zoomed,zooming,sum
!,0.0,0.00211,0.656118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00211,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00211,0.0,0.0,0.004219,0.0,0.0,0.0,0.0,0.012658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
',0.002841,0.0,0.005682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.019886,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
'',0.0,0.001229,0.034808,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000819,0.0,0.000819,0.0,0.0,0.002867,0.0,0.0,0.0,0.00041,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
'alohomora,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
'atta,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# ALGORITHMS

def _normalised(weights):
  """Scale `weights` so they sum to 1; fall back to a uniform distribution when they carry no usable mass."""
  weights = np.asarray(weights, dtype=float)
  total = weights.sum()
  if not np.isfinite(total) or total <= 0:
    return np.ones(len(weights)) / len(weights)
  return weights / total


def _top_indices(weights, k):
  """Indices of the (at most k) largest strictly positive weights, ordered from smallest to largest weight."""
  keep = min(k, int(np.count_nonzero(weights > 0)))
  if keep == 0:
    return [] # note: argsort(...)[-0:] would return every index, hence the guard
  return list(np.argsort(weights)[-keep:])


def _sampled_indices(weights, k, refill_weights):
  """Draw k indices at random.

  As many distinct indices as possible (up to k) are drawn without replacement
  according to `weights`; if fewer than k indices have non-zero probability,
  the remaining slots are drawn with replacement according to `refill_weights`.
  """
  p = _normalised(weights)
  distinct = min(k, int(np.count_nonzero(p)))
  picks = list(np.random.choice(len(p), size=distinct, p=p, replace=False))
  missing = k - distinct
  if missing > 0:
    q = _normalised(refill_weights)
    picks += list(np.random.choice(len(q), size=missing, p=q, replace=True))
  return picks


def _ask_user(step, what, options, stop_value):
  """Print `options` as a numbered menu and return the entry the user picks.

  Entering 0 returns `stop_value`. Anything that is not a number between 0 and
  len(options) triggers a new prompt instead of an exception or a silently
  wrapped negative index.
  """
  print('#{}. Pick a {}.'.format(step + 1, what))
  print(', '.join('{}. {}'.format(n, option) for n, option in enumerate(options, start=1)))
  while True:
    answer = input() # let the user choose
    try:
      pick = int(answer)
    except ValueError:
      pick = -1
    if pick == 0:
      return stop_value
    if 1 <= pick <= len(options):
      return options[pick - 1]
    print('Please enter a number between 0 and {}.'.format(len(options)))


# algorithm 1
def new_state(last_state, states, transmat, k, mode, t=None):
  """Offer k candidate next states and return the one the user picks.

  Args:
    last_state: current state; must be an element of `states`.
    states: list of state names; row/column i of `transmat` belongs to states[i].
    transmat: transition weights, indexable by row (numpy array or list of rows).
    k: number of candidates to offer (at least 1).
    mode: 'top-k' offers the k most probable successors (padded with random
      states if fewer than k have positive weight); 'random' samples the
      candidates according to the transition weights.
    t: zero-based step index shown in the prompt. When omitted, a module-level
      integer `t` is used if one exists (older callers relied on that global),
      otherwise 0.

  Returns:
    The chosen state name, or 'stop mark' if the user enters 0.

  Raises:
    ValueError: if `last_state` is not in `states`, `k` < 1, or `mode` is unknown.
  """
  if k < 1:
    raise ValueError('k must be at least 1')
  if t is None:
    legacy_step = globals().get('t')
    t = legacy_step if isinstance(legacy_step, int) else 0

  weights = np.asarray(transmat[states.index(last_state)], dtype=float)

  if mode == 'top-k':
    picks = _top_indices(weights, k)
    missing = k - len(picks)
    if missing:
      # not enough successors with positive weight: pad with random states
      picks += random.choices(range(len(states)), k=missing)
  elif mode == 'random':
    picks = _sampled_indices(weights, k, weights)
  else:
    raise ValueError("mode must be 'top-k' or 'random', got {!r}".format(mode))

  # menu entry 1 is the last pick (the most probable one in 'top-k' mode)
  options = [states[ix] for ix in reversed(picks)]
  return _ask_user(t, 'category', options, 'stop mark')


# algorithm 2
def new_obs(t, states, words, this_state, words_ix_list, prev_word, emismat, obsmat, k, mode):
  """Offer k candidate next words and return the one the user picks.

  Each word is weighted by (emission weight from `this_state`) times
  (transition weight from `prev_word`); if that product is zero everywhere,
  the emission weights alone are used.

  Args:
    t: zero-based step index shown in the prompt.
    states: list of state names; row i of `emismat` belongs to states[i].
    words: list of words; column j of `emismat` and row/column j of `obsmat`
      belong to words[j].
    this_state: state the new word is emitted from; must be in `states`.
    words_ix_list: words_ix_list[i] lists the words emitted by states[i]; used
      to pad the 'top-k' candidates.
    prev_word: previously generated word; must be in `words`.
    emismat: emission weights, indexable by row.
    obsmat: word-to-word transition weights, indexable by row.
    k: number of candidates to offer (at least 1).
    mode: 'top-k' or 'random', as in new_state.

  Returns:
    The chosen word, or '.' if the user enters 0.

  Raises:
    ValueError: if `this_state` / `prev_word` are unknown, `k` < 1, or `mode` is unknown.
  """
  if k < 1:
    raise ValueError('k must be at least 1')

  this_state_ix = states.index(this_state)
  prev_word_ix = words.index(prev_word)

  weights_state = np.asarray(emismat[this_state_ix], dtype=float) # contribution from current state
  weights_word = np.asarray(obsmat[prev_word_ix], dtype=float) # contribution from previous word

  weights = weights_state * weights_word # multiply contributions
  if not np.any(weights > 0): weights = weights_state # avoid having all weights = 0

  if mode == 'top-k':
    picks = _top_indices(weights, k)
    missing = k - len(picks)
    if missing and picks:
      # fewer than k weighted candidates: add random words emitted from the current state
      extra_words = random.choices(words_ix_list[this_state_ix], k=missing)
      picks += [words.index(w) for w in extra_words]
    elif missing:
      # nothing has positive weight at all: offer random words from the whole vocabulary
      picks = random.choices(range(len(words)), k=k)
  elif mode == 'random':
    picks = _sampled_indices(weights, k, weights_state)
  else:
    raise ValueError("mode must be 'top-k' or 'random', got {!r}".format(mode))

  # menu entry 1 is the last pick (the most probable one in 'top-k' mode)
  options = [words[ix] for ix in reversed(picks)]
  return _ask_user(t, 'word', options, '.')

In [None]:
# choose the starting observation (and matching states) of a given length
length = 4

# option 1: a random window of `length` consecutive tokens taken from the book
pos = random.randint(0, len(word_series) - length)
words_seed_, states_seed_ = word_series[pos:pos + length], state_series[pos:pos + length]

# option 2: a sentence supplied by the user, tagged and relabelled like the corpus
words_seed = 'hagrid shouted'
text = nltk.pos_tag(nltk.word_tokenize(words_seed))
seed = state_mapper(text, nltk_tagset)
# keep only the words; the seed states are deliberately not collected here
words_seed = [pair[0] for pair in seed]
states_seed = []
print('Starting sequence of words:')
print(words_seed)

Starting sequence of words:
['hagrid', 'shouted']


In [None]:
# generate pred_length further words, asking the user to pick a state and then a word at each step
pred_length = 20
new_states, new_obss = [], []
# the starting state is set by hand rather than read from the seed
#prev_state = states_seed[-1]
prev_state, prev_word = 'verb past', words_seed[-1]

# new_state is not handed the step index; it picks up this loop's global `t`
for t in range(pred_length):
  new_s = new_state(prev_state, states, transmat, 5, 'random')
  new_states.append(new_s)
  new_o = new_obs(t, states, words, new_s, words_ix_list, prev_word, emismat, obs_trans_rows, 5, 'random')
  new_obss.append(new_o)
  prev_state, prev_word = new_s, new_o

#1. Pick a category.
1. noun, 2. verb past, 3. gerund, 4. past part, 5. adv
5
#1. Pick a word.
1. ron, 2. that, 3. out, 4. suddenly, 5. hermione
3
#2. Pick a category.
1. comma, 2. numeral, 3. verb base, 4. sub conj, 5. WH-pron
5
#2. Pick a word.
1. what, 2. what, 3. what, 4. who, 5. what
5
#3. Pick a category.
1. stop mark, 2. noun, 3. verb pres 3rd, 4. modal, 5. verb past
2
#3. Pick a word.
1. snape, 2. harry, 3. house, 4. hagrid, 5. i
2
#4. Pick a category.
1. genitive, 2. to, 3. sub conj, 4. comma, 5. adv
1
#4. Pick a word.
1. 's, 2. 't, 3. ', 4. 's, 5. 's
1
#5. Pick a category.
1. verb base, 2. adj, 3. adv, 4. noun, 5. gerund
4
#5. Pick a word.
1. quidditch, 2. something, 3. harry, 4. malfoy, 5. nose
5
#6. Pick a category.
1. verb past, 2. sub conj, 3. comma, 4. co conj, 5. noun
1
#6. Pick a word.
1. said, 2. learned, 3. sniffed, 4. pressed, 5. was
3
#7. Pick a category.
1. verb base, 2. noun, 3. det, 4. to, 5. adv
4
#7. Pick a word.
1. to, 2. to, 3. to, 4. na, 5. to
3
#8. Pick a 

In [None]:
# show the seed words followed by the generated words as one space-separated sentence
print(' '.join(words_seed + new_obss))

hagrid shouted out what harry 's nose sniffed to get some magic done before they were only safe . how did i
