<a href="https://colab.research.google.com/github/db175/TextGenereratrUsingMarkvChns/blob/main/MarkovChains.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import glob

In [None]:
# glob returns matches in arbitrary (filesystem-dependent) order; sort them so
# the corpus is concatenated in the same order on every run.
file_names = sorted(glob.glob('/content/data/*.txt'))

# Load Whole Corpus

In [None]:
# Read every file explicitly as UTF-8: the text contains curly quotes, so
# relying on the platform's default encoding can fail or mis-decode them.
texts = []
for file_name in file_names:
    with open(file_name, 'r', encoding='utf-8') as f:
        texts.append(f.read())
# Join with a space so the last word of one file cannot fuse with the first
# word of the next one when a file lacks a trailing newline.
corpus = ' '.join(texts)

# Turn line breaks and tabs into spaces, and curly quotes into a spaced '"'.
for old, new in (('\n', ' '), ('\t', ' '), ('“', ' " '), ('”', ' " ')):
    corpus = corpus.replace(old, new)
# Surround punctuation with spaces so each mark becomes its own token.
# Note: ':' and ';' are not in this list, so they stay attached to the
# preceding word (e.g. 'sky:').
for spaced in ['.','-',',','!','?','(','—',')']:
    corpus = corpus.replace(spaced, ' {0} '.format(spaced))

In [None]:
# Length of the cleaned corpus, in characters.
len(corpus)

4965211

In [None]:
# Peek at a 500-character slice to check the cleaning: punctuation should now
# be separated from words by spaces.
corpus[10000:10500]

'the stars in secret influence comment .    When I perceive that men as plants increase ,    Cheered and checked even by the self - same sky:   Vaunt in their youthful sap ,  at height decrease ,    And wear their brave state out of memory .    Then the conceit of this inconstant stay ,    Sets you most rich in youth before my sight ,    Where wasteful time debateth with decay   To change your day of youth to sullied night ,      And all in war with Time for love of you ,      As he takes from yo'

In [None]:
# Tokenise on single spaces; runs of spaces yield empty strings, which are dropped.
corpus_words = [token for token in corpus.split(' ') if token != '']

In [None]:
# Number of tokens (words and punctuation marks) in the corpus.
len(corpus_words)

205441

In [None]:
# corpus_words

In [None]:
# Same token count as the earlier cell; corpus_words is not modified in between.
len(corpus_words)

205441

In [None]:
# Build the vocabulary once and sort it. Iterating a set of strings has no
# stable order across interpreter runs (string hashing is randomised), which
# would change the word -> index mapping and, through argmax tie-breaking,
# the text produced by the greedy chain.
distinct_words = sorted(set(corpus_words))
# Reverse lookup: word -> row/column index in the transition matrices.
word_idx_dict = {word: i for i, word in enumerate(distinct_words)}
distinct_words_count = len(distinct_words)
distinct_words_count

36408

In [None]:
# Dense (vocabulary x vocabulary) table of bigram counts, initialised to zero.
# NOTE(review): np.zeros defaults to float64, so this allocates
# 8 * distinct_words_count**2 bytes (about 10.6 GB for a 36408-word vocabulary).
next_word_matrix = np.zeros((distinct_words_count, distinct_words_count))

In [None]:
# Tally bigrams: entry [a, b] counts how often word b directly follows word a.
# The counts are added in place, so re-running this cell without re-creating
# next_word_matrix doubles them.
for current, following in zip(corpus_words, corpus_words[1:]):
    next_word_matrix[word_idx_dict[current], word_idx_dict[following]] += 1

In [None]:
def most_likely_word_after(aWord):
    """Return the word that most frequently follows ``aWord`` in the corpus.

    Looks up the row of ``next_word_matrix`` belonging to ``aWord`` and returns
    the vocabulary entry with the highest count (the first one on ties).

    Raises:
        KeyError: if ``aWord`` is not a key of ``word_idx_dict``.
    """
    row = word_idx_dict[aWord]
    best_column = np.argmax(next_word_matrix[row])
    return distinct_words[best_column]

In [None]:
def naive_chain(seed, length=15):
    """Greedily extend ``seed`` by ``length`` words.

    Each step appends the single most likely successor of the previous word
    (via ``most_likely_word_after``), so the output is deterministic and
    tends to fall into a loop.

    Args:
        seed: starting word; it is passed as-is to the first lookup.
        length: number of words to append.

    Returns:
        The seed followed by the generated words, separated by single spaces.
    """
    words = [seed]
    for _ in range(length):
        words.append(most_likely_word_after(words[-1]))
    return ' '.join(words)

In [None]:
# Greedy generation from a handful of common starting words.
for seed in ('the', 'I', 'he', 'she', 'They'):
    print(naive_chain(seed))

the King . I am I am I am I am I am I am I
I am I am I am I am I am I am I am I am
he is the King . I am I am I am I am I am I
she is the King . I am I am I am I am I am I
They are you , and the King . I am I am I am I am


In [None]:
import random
from random import random 

def weighted_choice(objects, weights):
    """Draw one element of ``objects`` with probability proportional to ``weights``.

    Args:
        objects: indexable sequence of candidates.
        weights: non-negative weights, one per object. They do not have to sum
            to 1, and any array shape is accepted (it is flattened, so a
            1 x N row works as well as a flat vector).

    Returns:
        The selected element of ``objects``.

    Raises:
        ValueError: if the weights are empty or do not have a positive, finite
            sum (for example an all-zero or NaN row). Previously this case
            fell through the loop and silently returned ``None``.
    """
    weights = np.asarray(weights, dtype=np.float64).ravel()
    total = weights.sum()
    if not (np.isfinite(total) and total > 0):
        raise ValueError('weights must have a positive, finite sum')

    # Normalise, then accumulate: cumulative[i] is the upper edge of bin i.
    cumulative = np.cumsum(weights / total)
    x = random()
    # First index i with x < cumulative[i] (binary search instead of a Python loop).
    idx = int(np.searchsorted(cumulative, x, side='right'))
    if idx >= cumulative.size:
        # Rounding can leave cumulative[-1] slightly below 1, so a draw very
        # close to 1 matches no bin; fall back to the last object that has weight.
        idx = int(np.flatnonzero(weights)[-1])
    return objects[idx]

In [None]:
from numpy.random import choice

def sample_next_word_after(aWord, alpha = 0):
    """Sample a successor of ``aWord`` in proportion to observed bigram counts.

    Args:
        aWord: the current word; must be a key of ``word_idx_dict``.
        alpha: constant added to every count before normalising (additive
            smoothing). With the default of 0 only observed successors can
            be drawn.

    Returns:
        A word from ``distinct_words`` chosen by ``weighted_choice``.
    """
    row = word_idx_dict[aWord]
    smoothed_counts = next_word_matrix[row] + alpha
    total = smoothed_counts.sum()
    return weighted_choice(distinct_words, smoothed_counts / total)

In [None]:
# Draw one random successor of 'the'; the result changes from run to run.
sample_next_word_after('the')

'doers'

In [None]:
def stochastic_chain(seed, length=15):
    """Extend ``seed`` by ``length`` randomly sampled words (first-order chain).

    Each new word is drawn with ``sample_next_word_after`` from the successors
    of the previous word only.

    Note: a later cell defines another ``stochastic_chain`` that takes a
    multi-word seed; once that cell has run, this version is shadowed.

    Args:
        seed: starting word; it is passed as-is to the first lookup.
        length: number of words to append.

    Returns:
        The seed followed by the generated words, separated by single spaces.
    """
    words = [seed]
    for _ in range(length):
        words.append(sample_next_word_after(words[-1]))
    return ' '.join(words)

In [None]:
# Random generation from the same starting words as the greedy chain.
for seed in ('the', 'I', 'he', 'she', 'They'):
    print(stochastic_chain(seed))

the hair , Herself most of blood , Though I would you must become of his
I cry 'Your meat doth combine . PHEBE . I heard of thy grave elders .
he have discredited your own eyes . IACHIMO . Exit Ghost . Nay , thou art
she satisfies; for you With her old carlot once peep , Says very valiant Caesar will
They fell . This is that owe another way say , my rhyme , sir ,


In [None]:
k = 4
# Every run of k consecutive words that still has a following word, joined
# into a single space-separated string so it can be used as a dictionary key.
sets_of_k_words = [' '.join(corpus_words[start:start + k])
                   for start in range(len(corpus_words) - k)]

# [number of distinct k-word sequences, total number of k-word sequences]
print([len(set(sets_of_k_words)), len(sets_of_k_words)])

[844104, 933203]


In [None]:
from scipy.sparse import dok_matrix

# Sparse count table: one row per distinct k-word sequence, one column per
# vocabulary word.
sets_count = len(set(sets_of_k_words))
next_after_k_words_matrix = dok_matrix((sets_count, len(distinct_words)))
print(next_after_k_words_matrix.shape)

(844104, 36408)


In [None]:
# Assign a row index to every distinct k-word sequence.
distinct_sets_of_k_words = list(set(sets_of_k_words))
k_words_idx_dict = {word: i for i, word in enumerate(distinct_sets_of_k_words)}
distinct_k_words_count = len(distinct_sets_of_k_words)
print(len(sets_of_k_words))
# sets_of_k_words already stops k words before the end of the corpus, so every
# entry i has a successor at corpus_words[i + k]. Iterate over all of them:
# trimming another k entries here would drop the last k transitions and leave
# sequences that occur only at the end of the corpus with an all-zero row.
# The counts are added in place, so re-running this cell without re-creating
# next_after_k_words_matrix doubles them.
for i, word in enumerate(sets_of_k_words):
    word_sequence_idx = k_words_idx_dict[word]
    next_word_idx = word_idx_dict[corpus_words[i+k]]
    next_after_k_words_matrix[word_sequence_idx, next_word_idx] +=1

933203


In [None]:
def stochastic_chain(seed, chain_length=15, seed_length=4):
    """Extend a multi-word ``seed`` by sampling from the k-word transition table.

    Each new word is drawn with ``sample_next_word_after_sequence`` from the
    successors of the last ``seed_length`` words. That function looks the
    sequence up in ``k_words_idx_dict``, so ``seed_length`` has to equal the
    ``k`` the table was built with, and a sequence that never occurred in the
    corpus raises ``KeyError`` there.

    Note: this replaces the single-word ``stochastic_chain`` defined earlier
    in the notebook.

    Args:
        seed: starting words separated by whitespace. Repeated, leading or
            trailing whitespace is ignored.
        chain_length: number of words to append.
        seed_length: required number of words in ``seed``.

    Returns:
        The seed words followed by the generated words, separated by single
        spaces.

    Raises:
        ValueError: if ``seed`` does not contain exactly ``seed_length`` words.
    """
    # split() without an argument collapses whitespace runs; split(' ') would
    # yield empty tokens for 'a  b' and wrongly reject an otherwise valid seed.
    current_words = seed.split()
    if len(current_words) != seed_length:
        raise ValueError(f'wrong number of words, expected {seed_length}')

    generated = list(current_words)
    for _ in range(chain_length):
        next_word = sample_next_word_after_sequence(' '.join(current_words))
        generated.append(next_word)
        # Slide the context window forward by one word.
        current_words = current_words[1:] + [next_word]
    return ' '.join(generated)

In [None]:
from numpy.random import choice

def sample_next_word_after_sequence(word_sequence, alpha = 0):
    """Sample the word that follows a k-word sequence.

    Args:
        word_sequence: the k context words joined by single spaces; must be a
            key of ``k_words_idx_dict``.
        alpha: constant added to every count before normalising (additive
            smoothing). With the default of 0 only observed successors can
            be drawn.

    Returns:
        A word from ``distinct_words`` chosen by ``weighted_choice``.

    Raises:
        KeyError: if ``word_sequence`` is not a key of ``k_words_idx_dict``.
    """
    row_idx = k_words_idx_dict[word_sequence]
    # Densify the single row before smoothing: scipy sparse matrices refuse to
    # add a non-zero scalar, so `sparse_row + alpha` only worked for alpha == 0.
    # ravel() also turns the 1 x N row into a flat vector.
    next_word_vector = next_after_k_words_matrix[row_idx].toarray().ravel() + alpha
    likelihoods = next_word_vector / next_word_vector.sum()
    return weighted_choice(distinct_words, likelihoods)

In [None]:
# The k-word stochastic_chain requires seed_length (default 4) words, so these
# single-word seeds are rejected with ValueError. Catch and display the error
# for each seed instead of letting the first one halt a Run All.
for seed in ('the', 'I', 'he', 'she', 'They'):
    try:
        print(stochastic_chain(seed))
    except ValueError as err:
        print(f'{seed!r}: {err}')

In [None]:
# A four-word seed, matching the default seed_length=4.
print(stochastic_chain('That he hath left'))

That he hath left part of his grief with me To suffer with him . Good love , call


In [None]:
# NOTE(review): this seed has two words but seed_length defaults to 4, so the
# call raises ValueError as written. Presumably it was run against a table
# built with k = 2 — confirm and rebuild, or use a four-word seed.
stochastic_chain('The game')

In [None]:
# NOTE(review): repeats the previous cell; the two-word seed raises ValueError
# while seed_length defaults to 4.
stochastic_chain('The game')

In [None]:
# NOTE(review): two-word seed; raises ValueError while seed_length defaults to 4.
stochastic_chain('I have')

In [None]:
# NOTE(review): two-word seed; raises ValueError while seed_length defaults to 4.
stochastic_chain('heard the')