In [None]:
#markovify 
!pip install nltk 
!pip install markovify
!pip install spacy
#!pip install -m spacy download en
!python -m spacy download en

In [None]:
# Imports: standard library first, then third-party packages.
import re  # regular expressions, used by the text-cleaning step
import warnings

import markovify
import nltk
import spacy
from nltk.corpus import gutenberg

# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Download the NLTK copy of the Gutenberg corpus.
nltk.download('gutenberg')

# Inspect the Gutenberg corpus: list the available file ids.
print(gutenberg.fileids())


In [None]:
# Load the raw text of the three plays from the NLTK Gutenberg corpus.
macbeth = gutenberg.raw('shakespeare-macbeth.txt')
hamlet = gutenberg.raw('shakespeare-hamlet.txt')
caesar = gutenberg.raw('shakespeare-caesar.txt')

# Preview the first 250 characters of each play.
# Each label is paired with its own text (the macbeth and hamlet previews
# used to be printed under each other's label).
print('\nmacbeth:\n', macbeth[:250])
print('\ncaesar:\n', caesar[:250])
print('\nhamlet:\n', hamlet[:250])

In [None]:
# text cleaning; re.sub() replaces the occurrences of a pattern with the second argument (repl)
# The r prefix marks a raw string: Python does not process escape codes in it, e.g. r'\n' is NOT
# treated as a new line but as '\' followed by 'n', so the backslash reaches the regex engine intact.
# '|' creates a regular expression that will match either A or B.
# \b Matches the empty string, but only at the beginning or end of a word
# \s Matches Unicode whitespace characters ..
# \d Matches any Unicode decimal digits
# * Causes the resulting RE to match 0 or more repetitions of the preceding RE, e.g. ab* will match ‘a’, ‘ab’, or ‘a’ followed by any number of ‘b’s.
# + Causes the resulting RE to match 1 or more repetitions of the preceding RE, e.g. ab+ will match ‘a’ followed by any non-zero number of ‘b’s; it will not match just ‘a’.
# ? Causes the resulting RE to match 0 or 1 repetitions of the preceding RE, e.g. ab? will match either ‘a’ or ‘ab’.
# \ Either escapes special characters (permitting you to match characters like '*', '?', and so forth), or signals a special sequence such as \d or \s.
# [] Used to indicate a set of characters. In a set: Characters can be listed individually, e.g. [amk] will match 'a', 'm', or 'k'.
# OR Ranges of characters can be indicated by giving two characters and separating them by a '-'. e.g. [0-5][0-9] will match all the two-digits numbers from 00 to 59,
# and [0-9A-Fa-f] will match any hexadecimal digit.
# ^ Matches the start of the string 
# whitespace characters: ' ' – Space. '\t' – Horizontal tab. '\v' – Vertical tab. '\n' – Newline. '\r' – Carriage return. '\f' – Form feed.
#text =' cvmm,-645e2wkk9875=[?=*!\mmm ^^ --\n \f\v'
#text = re.sub(r'[m+ -- \[*. \??* \d+ \^ \s]', '', text), >> cvewkk==\
def clean_txt(txt):
    """Remove dashes, bracketed spans and standalone numbers from ``txt``.

    Three substitutions are applied in order:
      1. every ``--`` is deleted;
      2. every ``[...]`` span (shortest match, not crossing a newline because
         ``.`` does not match newlines here) is deleted;
      3. numeric tokens matched by the third pattern are deleted together with
         the whitespace in front of them (e.g. ``'x 12 y'`` becomes ``'x y'``).

    Args:
        txt: The text to clean.

    Returns:
        The cleaned text as a new string.
    """
    text = re.sub(r'--', '', txt)
    # Raw string on purpose: '\[' and '\]' are invalid escape sequences in an
    # ordinary string literal and trigger a warning on recent Python versions.
    text = re.sub(r'[\[].*?[\]]', '', text)
    text = re.sub(r'(\b|\s+\-?|^\-?)(\d+|\d*\.\d+ )\b', '', text)
    return text

In [None]:
# For each play: strip any "Chapter <number>" indicator first, then pass the
# result through clean_txt().
hamlet = clean_txt(re.sub(r'Chapter \d+', '', hamlet))
macbeth = clean_txt(re.sub(r'Chapter \d+', '', macbeth))
caesar = clean_txt(re.sub(r'Chapter \d+', '', caesar))

# Preview the first 250 characters of the cleaned Hamlet text.
print(hamlet[:250])

In [None]:
# Parse the cleaned text with spaCy.
# spacy.load() reads the installed 'en_core_web_sm' pipeline and returns the
# language object; calling that object on a string produces a parsed document.
lang_obj = spacy.load('en_core_web_sm')
hamlet_obj = lang_obj(hamlet)
macbeth_obj = lang_obj(macbeth)
caesar_obj = lang_obj(caesar)
# Show only the first 100 tokens: printing the whole document would dump the
# entire play into the cell output.
print(hamlet_obj[:100])

In [None]:
# Rebuild each play as a single string of its sentences, skipping sentences of
# one character or less. The parsed document is indexed by token while the
# joined string is indexed by character, which is why hamlet_obj[:100] is
# longer than hamlet_sents[:100].
hamlet_sents = ' '.join([sent.text for sent in hamlet_obj.sents if len(sent.text) > 1])
macbeth_sents = ' '.join([sent.text for sent in macbeth_obj.sents if len(sent.text) > 1])
caesar_sents = ' '.join([sent.text for sent in caesar_obj.sents if len(sent.text) > 1])

# Combine the three plays. Join with a space so the last word of one play does
# not fuse with the first word of the next (plain '+' concatenation did that).
HMC = ' '.join([hamlet_sents, macbeth_sents, caesar_sents])
print(len(HMC))

In [None]:
# Create a text generator using markovify.
# state_size is the number of words the probability of the next word depends on.
# The Markov model is built from three of Shakespeare's tragedies taken from
# the NLTK Project Gutenberg corpus.
gen = markovify.Text(HMC, state_size=1)

# Generate long and short sentences with make_sentence() and make_short_sentence().
# Both return None when no acceptable sentence is produced, so fall back to a
# placeholder instead of printing the literal "None".
print('long sentences: \n')
for i in range(4):
    print(gen.make_sentence() or '(no sentence generated)')

print('\n short sentences: \n')
for i in range(4):
    # at most 100 characters
    print(gen.make_short_sentence(100) or '(no sentence generated)')


In [None]:
# To improve the text prediction we will use a POSifiedText class: it uses the spaCy tagger to build a Markov model that
# complies with sentence structure better than a naive model.

#in spacy library POS tagging is the process of marking a word in the text
#to a particular part of speech based on both its context and definition.
#In simple language, we can say that POS tagging is the process of identifying a word as nouns,
#pronouns, verbs, adjectives, etc.

class POSifiedText(markovify.Text):
    """markovify.Text variant whose model states are ``word::POS`` tokens.

    Each word is tagged with its spaCy part-of-speech before it enters the
    chain, and the tag is stripped again when a sentence is rebuilt.
    Relies on the notebook-level ``lang_obj`` spaCy pipeline.
    """

    def word_split(self, sentence):
        """Tokenize ``sentence`` with spaCy and return ``'text::POS'`` strings."""
        # pos_ is the part-of-speech tag (NOUN, VERB, ...), not a position.
        return ['::'.join((word.orth_, word.pos_)) for word in lang_obj(sentence)]

    def word_join(self, words):
        """Drop the ``::POS`` suffix from every token and join them with spaces."""
        # Split on the LAST '::' only, so a token whose own text contains '::'
        # is not truncated when the tag is removed.
        return ' '.join(word.rsplit('::', 1)[0] for word in words)
# Build the POS-aware model from the combined text, using a state size of 2.
generator_2 = POSifiedText(HMC, state_size=2)
# Bare expression: the notebook echoes the object's repr as the cell output.
generator_2

In [None]:
# Short sentences (at most 100 characters) from the POS-aware model.
# make_short_sentence() returns None when no acceptable sentence is produced,
# so print a placeholder instead of the literal "None".
print('short sentences: \n')
for i in range(4):
    print(generator_2.make_short_sentence(max_chars=100) or '(no sentence generated)')
    

In [None]:
# Long sentences from the POS-aware model.
# make_sentence() returns None when no acceptable sentence is produced, so
# print a placeholder instead of the literal "None".
print('\n long sentences: \n')
for i in range(4):
    print(generator_2.make_sentence() or '(no sentence generated)')
