In [1]:
import os
import re
import random
from collections import defaultdict

In [2]:
story_path = "Data\sherlock\sherlock\\"

def read_all_stories(story_path):
    stories = []
    for _, _, files in os.walk(story_path):
        for file in files:
            story = []
            with open(story_path + file) as f:
                for line in f:
                    line = line.strip()
                    if line == '----------': 
                        break
                    if line != '':
                        story.append(line)
            stories.append(story)
    
    return stories
        
stories = read_all_stories(story_path)
print("number of stories = ", len(stories))

number of stories =  67


In [3]:
def clean_txt(txt):
    cleaned_txt = []
    for line in txt:
        line = line.lower()
        line = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\\-\]]", "", line)
        tokens = line.split(' ')
        words = [word for word in tokens if word.isalpha()]
        cleaned_txt += words
    return cleaned_txt

cleaned_stories = [clean_txt(story) for story in stories]
print("number of stories = ", len(cleaned_stories))

number of stories =  67


In [4]:
def make_markov_proba_graph(cleaned_stories, words_in_state=2):
    markov_model = defaultdict(lambda: defaultdict(int))
    
    for clean_story in cleaned_stories:
        for i in range(len(clean_story) - words_in_state - 1):
            curr_state, next_state = "", ""
            for j in range(words_in_state):
                curr_state += clean_story[i + j] + " "
                #print(i + j + words_in_state)
                if i + j + words_in_state < len(clean_story):
                    next_state += clean_story[i + j + words_in_state] + " "

            curr_state = curr_state[:-1]
            if len(next_state.split(' ')) <= words_in_state:
                next_state = None
            else:   
                next_state = next_state[:-1]
            #print(next_state)

            markov_model[curr_state][next_state] += 1

        # calc probability for each state
        for curr_state, transition in markov_model.items():
            total = sum(transition.values())
            for state, count in transition.items():
                markov_model[curr_state][state] = count/total
        
    return markov_model

In [5]:
markov_graph = make_markov_proba_graph(cleaned_stories=cleaned_stories, words_in_state=3)
print(f"num states = {len(markov_graph.keys())}")

num states = 479128


In [10]:
cool_options = [(key, opt) for key, opt in markov_graph.items() if len(opt) > 3]
cool_options[:5]

[('the adventure of',
  defaultdict(int,
              {'the three gables': 5.341939419032119e-17,
               'the three garridebs': 5.341939419032119e-17,
               'the three students': 6.734913793315876e-05,
               'the abbey grange': 0.0010775862068997382,
               'the blue carbuncle': 2.1243132480085386e-15,
               'the speckled band': 0.03125000000000213,
               'the engineers thumb': 6.526989628369813e-12,
               'the noble bachelor': 1.0523302822967144e-06,
               'the beryl coronet': 2.124303251869612e-15,
               'the copper beeches': 7.009164709117737e-14,
               'the paradol chamber': 1.3050792806859885e-11,
               'the man with': 1.0621632880968866e-15,
               'black peter arthur': 6.66409261767323e-21,
               'the brucepartington plans': 6.263902580603811e-10,
               'the greek interpreter': 2.0879675270012177e-10,
               'the empty house': 6.734914119559144e-05,

In [7]:
def tell_story(markov_model, walk_size=100, start='my god'):
    n = 0
    curr_state = start
    next_state = None
    story = ""
    story += curr_state+" "
    while n < walk_size:
        next_state = random.choices(list(markov_model[curr_state].keys()),list(markov_model[curr_state].values()))
        curr_state = next_state[0]
        story += curr_state + " "
        n += 1
    return story

In [8]:
for i in range(20):
    print("{x}. {y}".format(x=i, y=tell_story(markov_graph, start="i dont think", walk_size=7)))

0. i dont think we have met before councillor youll be the new captain said mcginty thats so were looking to you councillor and to 
1. i dont think we have met before councillor youll be the new captain said mcginty thats so were looking to you councillor and to 
2. i dont think my nerve is as good as it was your nerve walters i should not have left us a fortune we went 
3. i dont think my nerve is as good as it was your nerve walters i should not have the impertinence to do it again 
4. i dont think my nerve is as good as it was your nerve walters i should not have thought you had been dead this 
5. i dont think my nerve is as good as our word our client of the morning had hurried forward to meet his visitor for 
6. i dont think that either inspector mac or the excellent local practitioner has grasped the overwhelming importance of this incident one dumbbell watson consider 
7. i dont think we have met before councillor youll be the new captain said mcginty thats so were looking to yo

In [9]:
print(tell_story(markov_graph, start="i dont think", walk_size=100))

i dont think my nerve is as good as it was your intention to give any one the treaty to be copied never you are certain of that absolutely well since you never said so and yet here i am said holmes smiling well goodmorning mr gibson you have a good working basis however on which to draw was a barren doctrine indeed strange rumours began to be bandied aboutrumours of murdered immigrants and rifled camps in regions where indians had never been seen fresh women appeared in the harems of the elderswomen who pined and wept and bore upon their faces the traces of the treasureseekers you must remember that they were six years looking for it no wonder that the grounds look like a tangled mass torn from the mane of a lion in that great brain of his everything is pigeonholed and can be handed out in an instant again and again his concentrated hatred urged him to continue the pursuit funds were wanting however and for some years there was no news to hear except that they were hunting for the trea