In [96]:
import glob
import numpy as np
from numpy.random import choice
import pandas as pd
import random
from random import random
import seaborn as sns

In [97]:
# Load the raw corpus text.
# The encoding is given explicitly because the clean-up below looks for the
# typographic quotes “ and ”; under a non-UTF-8 platform default they would be
# decoded as different characters and the replacements would silently not match.
# NOTE(review): assumes the corpus file is UTF-8 encoded — confirm.
with open("./corpuses/shakespeare.txt", 'r', encoding='utf-8') as f:
    corpus = f.read()

# Turn line breaks and tabs into plain spaces so a split on ' ' sees every word.
corpus = corpus.replace('\n',' ')
corpus = corpus.replace('\t',' ')

# Normalise typographic quotes to a straight quote that stands as its own token.
corpus = corpus.replace('“', ' " ')
corpus = corpus.replace('”', ' " ')

# Surround punctuation with spaces so each mark becomes a separate token.
for spaced in ['.','-',',','!','?','(','—',')']:
    corpus = corpus.replace(spaced, f' {spaced} ')

# NOTE: len(corpus) is the number of characters in the padded text, not the
# number of tokens; the label is kept unchanged to match the recorded output.
print(f"Number of tokens in corpus: {len(corpus)}")
# corpus[1000:1500]

# Tokenise on single spaces; runs of consecutive spaces yield empty strings,
# which are filtered out in the same pass.
corpus_words = [token for token in corpus.split(' ') if token != '']
print(f"Number of words in corpus: {len(corpus_words)}")
# corpus_words

# Build the vocabulary exactly once. Sorting gives every word a stable index:
# plain set iteration order for strings can change between interpreter runs,
# which would make row/column indices (and argmax tie-breaks) irreproducible.
distinct_words = sorted(set(corpus_words))
word_idx_dict = {word: i for i, word in enumerate(distinct_words)}
distinct_words_count = len(distinct_words)
print(f"Number of distinct words: {distinct_words_count}")

# Square table of zeros, one row and one column per vocabulary word.
# It is dense, so its size grows with the square of the vocabulary.
next_word_matrix = np.zeros([distinct_words_count,distinct_words_count])
# word_idx_dict

Number of tokens in corpus: 5701754
Number of words in corpus: 1076054
Number of distinct words: 39560


In [98]:
# Count bigram transitions: for every adjacent pair (current, following) in the
# corpus, bump the cell at [row of current, column of following].
# Re-running this cell adds the counts a second time on top of the existing ones.
for current_word, following_word in zip(corpus_words, corpus_words[1:]):
    row = word_idx_dict[current_word]
    col = word_idx_dict[following_word]
    next_word_matrix[row, col] += 1

In [99]:
def most_likely_word_after(aWord):
    """Return the word with the largest count in ``aWord``'s row of ``next_word_matrix``.

    The row is located through ``word_idx_dict`` and the winning column index is
    mapped back to a word through ``distinct_words``. When several columns share
    the maximum, the lowest index wins (argmax returns the first occurrence).

    Raises:
        KeyError: if ``aWord`` is not a key of ``word_idx_dict``.
    """
    successor_counts = next_word_matrix[word_idx_dict[aWord]]
    best_column = np.argmax(successor_counts)
    return distinct_words[best_column]


def naive_chain(seed, length=25):
    """Greedily extend ``seed`` by ``length`` words and return the resulting text.

    Each new word is ``most_likely_word_after`` applied to the previous word, so
    the chain is fully deterministic for a given transition table. Words are
    separated by single spaces and the seed is always the first word.
    """
    words = [seed]
    for _ in range(length):
        # The last word emitted is the state for the next step.
        words.append(most_likely_word_after(words[-1]))
    return ' '.join(words)

In [100]:
# Greedy chains for a handful of seed words, printed one per line.
for seed_word in ('Hamlet', 'the', 'I', 'he', 'she'):
    print(naive_chain(seed_word))

Hamlet . I am not , and the King . I am not , and the King . I am not , and the King .
the King . I am not , and the King . I am not , and the King . I am not , and the King
I am not , and the King . I am not , and the King . I am not , and the King . I am
he is the King . I am not , and the King . I am not , and the King . I am not , and
she is the King . I am not , and the King . I am not , and the King . I am not , and


In [101]:
import random
from random import random 

def weighted_choice(objects, weights):
    """Return one element of ``objects`` drawn at random in proportion to ``weights``.

    Parameters
    ----------
    objects : sequence
        Candidates; ``objects[i]`` is paired with the i-th weight.
    weights : array-like of non-negative numbers
        Relative likelihoods. They need not sum to 1. Any shape is accepted and
        flattened, so a ``(1, n)`` row behaves like a length-``n`` vector.

    Returns
    -------
    The selected element of ``objects``. Elements with zero weight are never
    returned.

    Raises
    ------
    ValueError
        If any weight is negative, or if the weights do not have a positive,
        finite sum (empty, all zero, NaN or infinite). Previously these cases
        fell through the loop and silently returned ``None``.
    """
    flat_weights = np.asarray(weights, dtype=np.float64).ravel()
    if (flat_weights < 0).any():
        raise ValueError("weights must be non-negative")

    cumulative = flat_weights.cumsum()
    total = cumulative[-1] if cumulative.size else 0.0
    if not np.isfinite(total) or total <= 0:
        raise ValueError("weights must have a positive, finite sum")

    # Scale the uniform draw by the total instead of normalising every weight;
    # the comparison "draw < cumulative weight" is the same either way.
    threshold = random() * total

    # First position whose cumulative weight is strictly above the threshold.
    idx = int(np.searchsorted(cumulative, threshold, side='right'))
    if idx >= cumulative.size:
        # Only reachable through floating-point rounding of the product above:
        # fall back to the last candidate that actually carries weight.
        idx = int(np.flatnonzero(flat_weights)[-1])
    return objects[idx]


In [102]:

from numpy.random import choice

def sample_next_word_after(aWord, alpha = 0):
    """Draw a random successor of ``aWord`` according to the bigram counts.

    ``alpha`` is added to every entry of the word's row in ``next_word_matrix``
    before normalising, so a positive value gives unseen successors a non-zero
    chance. The normalised row is handed to ``weighted_choice`` together with
    ``distinct_words``.

    Raises:
        KeyError: if ``aWord`` is not a key of ``word_idx_dict``.
    """
    row_index = word_idx_dict[aWord]
    smoothed_counts = next_word_matrix[row_index] + alpha
    total = smoothed_counts.sum()
    return weighted_choice(distinct_words, smoothed_counts / total)

In [103]:
# One random successor for each of a few sample words.
for sample_word in ('the', 'Hamlet', 'a', 'king', 'house'):
    print(sample_next_word_after(sample_word))

lady
,
lion
of
Be


In [104]:
def stochastic_chain_1(seed, length=15):
    """Extend ``seed`` by ``length`` randomly sampled words and return the text.

    Every step samples the successor of the previously emitted word with
    ``sample_next_word_after`` (first-order chain). Words are joined by single
    spaces, starting with the seed itself.
    """
    pieces = [seed]
    previous_word = seed
    for _ in range(length):
        previous_word = sample_next_word_after(previous_word)
        # Concatenate here so a non-string sample fails immediately.
        pieces.append(' ' + previous_word)
    return ''.join(pieces)

In [105]:
# First-order random chains for a few seed words.
for seed_word in ('Hamlet', 'King', 'Romeo', 'Ophelia', 'Time'):
    print(stochastic_chain_1(seed_word))

Hamlet give thee breathe my house doth the kissing my steed; But that's foul oyster of
King being down what to the devil , by surfeit is no more than he hath
Romeo , at you ? BIONDELLO . How likes me . We made men die our
Ophelia ! Osr . KEEPER . Tell me of your crafts ! We may spur was
Time try for he think you frame the buried , I will needs must be ransom


In [162]:
k = 2

# Every k-word window that still has a following word: window i covers
# corpus_words[i:i+k] and its successor is corpus_words[i+k].
sets_of_k_words = [ ' '.join(corpus_words[i:i+k]) for i in range(len(corpus_words) - k) ]

# De-duplicate once and reuse the result for the count, the matrix shape and
# the index lookup (the set used to be rebuilt three times).
distinct_sets_of_k_words = list(set(sets_of_k_words))
sets_count = len(distinct_sets_of_k_words)
distinct_k_words_count = sets_count

print([sets_count, len(sets_of_k_words)])

from scipy.sparse import dok_matrix

# Sparse count table: one row per distinct k-word window, one column per word.
next_after_k_words_matrix = dok_matrix((sets_count, len(distinct_words)))
print(next_after_k_words_matrix.shape)

k_words_idx_dict = {word: i for i, word in enumerate(distinct_sets_of_k_words)}
print(len(sets_of_k_words))

# Iterate over ALL windows. sets_of_k_words already stops k words before the
# end of the corpus, so trimming it by another k here would drop the last k
# valid transitions and could leave some rows without any count.
for i, word in enumerate(sets_of_k_words):
    word_sequence_idx = k_words_idx_dict[word]
    next_word_idx = word_idx_dict[corpus_words[i+k]]
    next_after_k_words_matrix[word_sequence_idx, next_word_idx] +=1


from numpy.random import choice
def sample_next_word_after_sequence_2(word_sequence, alpha = 0):
    """Draw a random successor of the space-joined ``word_sequence``.

    The sequence is looked up in ``k_words_idx_dict`` and its row of
    ``next_after_k_words_matrix`` is converted to a dense 1-D vector before
    ``alpha`` is added. Smoothing a dense row is a single vectorised operation,
    whereas adding a scalar to a sparse row is slow or unsupported depending on
    the SciPy version.

    Parameters:
        word_sequence: words joined by single spaces; must be a key of
            ``k_words_idx_dict``.
        alpha: value added to every count before normalising (0 = no smoothing).

    Returns:
        One element of ``distinct_words`` chosen by ``weighted_choice``.

    Raises:
        KeyError: if ``word_sequence`` is not a key of ``k_words_idx_dict``.
        ValueError: if the smoothed row sums to zero, i.e. no successor was
            recorded and ``alpha`` is 0. This used to produce NaN weights.
    """
    row_index = k_words_idx_dict[word_sequence]
    next_word_vector = next_after_k_words_matrix[row_index].toarray().ravel() + alpha
    total = next_word_vector.sum()
    if total <= 0:
        raise ValueError(
            f'no recorded successor for {word_sequence!r}; use alpha > 0 to smooth'
        )
    likelihoods = next_word_vector / total
    return weighted_choice(distinct_words, likelihoods)


def stochastic_chain_2(seed, chain_length=15, seed_length=2):
    """Extend a multi-word ``seed`` by ``chain_length`` randomly sampled words.

    The seed is split on single spaces and must contain exactly ``seed_length``
    words. At each step the current window of words is joined with spaces,
    passed to ``sample_next_word_after_sequence_2``, and then slid forward by
    one word to include the new sample.

    Raises:
        ValueError: if the seed does not contain ``seed_length`` words.
    """
    window = seed.split(' ')
    if len(window) != seed_length:
        raise ValueError(f'wrong number of words, expected {seed_length}')

    pieces = [seed]
    for _ in range(chain_length):
        sampled = sample_next_word_after_sequence_2(' '.join(window))
        # Concatenate here so a non-string sample fails immediately.
        pieces.append(' ' + sampled)
        window = window[1:] + [sampled]
    return ''.join(pieces)

[347560, 1076052]
(347560, 39560)
1076052


In [163]:
# Single-word vs. two-word context: one sample from each, side by side.
for single_word, word_pair in (('a', 'a house'), ('to', 'to be'), ('prince', 'have been')):
    print(sample_next_word_after(single_word))
    print(sample_next_word_after_sequence_2(word_pair))

# Second-order random chains for a few two-word seeds.
for seed_pair in ('a house', 'to be', 'have been', "Hamlet's Father", 'The housekeeper'):
    print(stochastic_chain_2(seed_pair))

blessed
broke
leave
his
,
at
a house . Mer . Any thing . Re - enter three citizens more Here come our
to be found so , though it have a hempen caudle then , the other door PETER
have been talk'd of more bastard children than war's a destroyer of men . SECOND GENTLEMAN .
Hamlet's Father . Lords , use such vigilance As when thy father to my strong intent ,
The housekeeper , the King . POLIXENES . How much salt water , though unfinish'd , yet


In [171]:
k = 5

# Every k-word window that still has a following word: window i covers
# corpus_words[i:i+k] and its successor is corpus_words[i+k].
sets_of_k_words = [ ' '.join(corpus_words[i:i+k]) for i in range(len(corpus_words) - k) ]

# De-duplicate once and reuse the result for the count, the matrix shape and
# the index lookup (the set used to be rebuilt three times).
distinct_sets_of_k_words = list(set(sets_of_k_words))
sets_count = len(distinct_sets_of_k_words)
distinct_k_words_count = sets_count

print([sets_count, len(sets_of_k_words)])

from scipy.sparse import dok_matrix

# Sparse count table: one row per distinct k-word window, one column per word.
# NOTE: this rebinds next_after_k_words_matrix and k_words_idx_dict, so any
# function reading those globals now sees the k = 5 table.
next_after_k_words_matrix = dok_matrix((sets_count, len(distinct_words)))
print(next_after_k_words_matrix.shape)

k_words_idx_dict = {word: i for i, word in enumerate(distinct_sets_of_k_words)}
print(len(sets_of_k_words))

# Iterate over ALL windows. sets_of_k_words already stops k words before the
# end of the corpus, so trimming it by another k here would drop the last k
# valid transitions and could leave some rows without any count.
for i, word in enumerate(sets_of_k_words):
    word_sequence_idx = k_words_idx_dict[word]
    next_word_idx = word_idx_dict[corpus_words[i+k]]
    next_after_k_words_matrix[word_sequence_idx, next_word_idx] +=1


from numpy.random import choice
def sample_next_word_after_sequence_5(word_sequence, alpha = 0):
    """Draw a random successor of the space-joined ``word_sequence``.

    The sequence is looked up in ``k_words_idx_dict`` and its row of
    ``next_after_k_words_matrix`` is converted to a dense 1-D vector before
    ``alpha`` is added. Smoothing a dense row is a single vectorised operation,
    whereas adding a scalar to a sparse row is slow or unsupported depending on
    the SciPy version.

    Parameters:
        word_sequence: words joined by single spaces; must be a key of
            ``k_words_idx_dict``.
        alpha: value added to every count before normalising (0 = no smoothing).

    Returns:
        One element of ``distinct_words`` chosen by ``weighted_choice``.

    Raises:
        KeyError: if ``word_sequence`` is not a key of ``k_words_idx_dict``.
        ValueError: if the smoothed row sums to zero, i.e. no successor was
            recorded and ``alpha`` is 0. This used to produce NaN weights.
    """
    row_index = k_words_idx_dict[word_sequence]
    next_word_vector = next_after_k_words_matrix[row_index].toarray().ravel() + alpha
    total = next_word_vector.sum()
    if total <= 0:
        raise ValueError(
            f'no recorded successor for {word_sequence!r}; use alpha > 0 to smooth'
        )
    likelihoods = next_word_vector / total
    return weighted_choice(distinct_words, likelihoods)


def stochastic_chain_5(seed, chain_length=30, seed_length=5):
    """Extend a multi-word ``seed`` by ``chain_length`` randomly sampled words.

    The seed is split on single spaces and must contain exactly ``seed_length``
    words. At each step the current window of words is joined with spaces,
    passed to ``sample_next_word_after_sequence_5``, and then slid forward by
    one word to include the new sample.

    This function used to call ``sample_next_word_after_sequence_2``. That only
    worked because both samplers read the same module-level table; it now calls
    the sampler that belongs to this chain.

    Parameters:
        seed: starting words separated by single spaces.
        chain_length: number of words to append.
        seed_length: required number of words in ``seed``.

    Returns:
        The seed followed by the sampled words, separated by single spaces.

    Raises:
        ValueError: if the seed does not contain ``seed_length`` words.
    """
    current_words = seed.split(' ')
    if len(current_words) != seed_length:
        raise ValueError(f'wrong number of words, expected {seed_length}')

    pieces = [seed]
    for _ in range(chain_length):
        next_word = sample_next_word_after_sequence_5(' '.join(current_words))
        pieces.append(' ' + next_word)
        # Slide the context window: drop the oldest word, append the new one.
        current_words = current_words[1:] + [next_word]
    return ''.join(pieces)

[1049474, 1076049]
(1049474, 39560)
1076049


In [177]:
# Fifth-order random chains, each started from a five-word seed.
for five_word_seed in (
    'by the name of dogs',
    'you found them in mine',
    'Making a famine where abundance',
    'Laertes and his sister Ophelia',
    'Good Hamlet , cast thy',
):
    print(stochastic_chain_5(five_word_seed))

by the name of dogs . The valued file Distinguishes the swift , the slow , the subtle , The housekeeper , the hunter , every one According to the gift which bounteous nature Hath
you found them in mine honesty . When , for some trifling present , you have bid me Return so much , I have shook my head and wept; Yea , 'gainst th' authority of
Making a famine where abundance lies , Thy self thy foe , to thy sweet self too cruel: Thou that art now the world's fresh ornament , And only herald to the gaudy spring ,
Laertes and his sister Ophelia , [Voltemand , Cornelius , ] Lords Attendant . King . Though yet of Hamlet our dear brother's death The memory be green , and that it us befitted To
Good Hamlet , cast thy nighted colour off , And let thine eye look like a friend on Denmark . Do not for ever with thy vailed lids Seek for thy noble father in the
