In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## UPLOADING THE kaggle.json AUTHENTICATION FILE

In [None]:
from google.colab import files
files.upload()

In [7]:
import os
os.makedirs('/root/.kaggle', exist_ok=True)
!mv kaggle.json /root/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json

## **DOWNLOAIDNG THE DATSET FROM**
"https://www.kaggle.com/datasets/idevji1/sherlock-holmes-stories"



In [8]:
!kaggle datasets download -d idevji1/sherlock-holmes-stories

Dataset URL: https://www.kaggle.com/datasets/idevji1/sherlock-holmes-stories
License(s): CC0-1.0
Downloading sherlock-holmes-stories.zip to /content
100% 9.93M/9.93M [00:01<00:00, 11.7MB/s]
100% 9.93M/9.93M [00:01<00:00, 6.65MB/s]


# **EXTRACTING THE ZIPFILE**

In [14]:
import zipfile
with zipfile.ZipFile('/content/sherlock-holmes-stories.zip', 'r') as zip_ref:
  zip_ref.extractall('/content/sherlock-holmes-dataset')

In [15]:
os.listdir('/content/sherlock-holmes-dataset')

['sherlock']

# **REMOVING UNWANTED FOLDERS FROM THE DIRECTORY**

In [19]:
shutil.rmtree('/content/sherlock-holmes-dataset/sherlock/sherlock')

# **CREATING A read_story() TO EXTRACT ALL TEXT IN THE FORM OF A LIST**

In [26]:
story_path='/content/sherlock-holmes-dataset/sherlock'
import re
def read_story(path):
  text=[]
  for _,_,files in os.walk(path):
    for file in files:
      with open(path+'/'+file,'r') as f:
        for line in f:
          line.strip()
          if re.fullmatch(r'-+',line): continue
          if line!='': text.append(line)
    return text


# **COLLECTING ALL THE UNPROCESSED DATA**

In [27]:
text=read_story(story_path)

In [28]:
print("No of sentences in the dataset =", len(text))

No of sentences in the dataset = 275020


In [29]:
text

['\n',
 '\n',
 '\n',
 '\n',
 '                                  HIS LAST BOW\n',
 '\n',
 '                         An Epilogue of Sherlock Holmes\n',
 '\n',
 '                               Arthur Conan Doyle\n',
 '\n',
 '\n',
 '\n',
 "     It was nine o'clock at night upon the second of August--the most\n",
 '     terrible August in the history of the world. One might have thought\n',
 "     already that God's curse hung heavy over a degenerate world, for\n",
 '     there was an awesome hush and a feeling of vague expectancy in the\n',
 '     sultry and stagnant air. The sun had long set, but one blood-red gash\n',
 '     like an open wound lay low in the distant west. Above, the stars were\n',
 '     shining brightly, and below, the lights of the shipping glimmered in\n',
 '     the bay. The two famous Germans stood beside the stone parapet of the\n',
 '     garden walk, with the long, low, heavily gabled house behind them,\n',
 '     and they looked down upon the broad sweep of the 

# **IMPORTING PYTHON'S NLTK LIBRARAY FOR TEXT PREPROCESSING**

In [33]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

# **CREATING CLEAN_TEXT() TO:**
1. Make the text unicase(preferrably lowercase)
2. Remove all punctuations using regex
3. Remove all new-line characters
4. Word tokenization of the text


In [81]:
def clean_text(text):
  cleaned_text=[]
  for line in text:
    line=line.lower()
    line=re.sub('[^\w\s]','',line)
    line=re.sub('\n',' ',line)
    words=list(nltk.word_tokenize(line))
    cleaned_text+=[word for word in words if word.isalpha()]
  return cleaned_text

In [82]:
clean_text=clean_text(text)
clean_text

['his',
 'last',
 'bow',
 'an',
 'epilogue',
 'of',
 'sherlock',
 'holmes',
 'arthur',
 'conan',
 'doyle',
 'it',
 'was',
 'nine',
 'oclock',
 'at',
 'night',
 'upon',
 'the',
 'second',
 'of',
 'augustthe',
 'most',
 'terrible',
 'august',
 'in',
 'the',
 'history',
 'of',
 'the',
 'world',
 'one',
 'might',
 'have',
 'thought',
 'already',
 'that',
 'gods',
 'curse',
 'hung',
 'heavy',
 'over',
 'a',
 'degenerate',
 'world',
 'for',
 'there',
 'was',
 'an',
 'awesome',
 'hush',
 'and',
 'a',
 'feeling',
 'of',
 'vague',
 'expectancy',
 'in',
 'the',
 'sultry',
 'and',
 'stagnant',
 'air',
 'the',
 'sun',
 'had',
 'long',
 'set',
 'but',
 'one',
 'bloodred',
 'gash',
 'like',
 'an',
 'open',
 'wound',
 'lay',
 'low',
 'in',
 'the',
 'distant',
 'west',
 'above',
 'the',
 'stars',
 'were',
 'shining',
 'brightly',
 'and',
 'below',
 'the',
 'lights',
 'of',
 'the',
 'shipping',
 'glimmered',
 'in',
 'the',
 'bay',
 'the',
 'two',
 'famous',
 'germans',
 'stood',
 'beside',
 'the',
 'st

# BUILDING THE MORKOV MODEL FOR TEXT PREDICTION
THIS CONSTRUCTS THE TRANSITION DICTIONARY FOR THE TEXT

In [83]:
def build_markov_model(text, ngrams=2):
    lexion = {}
    for i in range(len(text) - ngrams):
        limit=i+ngrams
        current_state = " ".join(text[i:limit])
        next_state = " ".join(text[limit:limit+ngrams])  # Next n-gram

        if current_state not in lexion:
            lexion[current_state] = {}
        if next_state not in lexion[current_state]:
            lexion[current_state][next_state] = 1
        else:
            lexion[current_state][next_state] += 1

    for current, transitions in lexion.items():
        total = sum(transitions.values())
        for next_state in transitions:
            lexion[current][next_state] = transitions[next_state] / total

    return lexion


In [84]:
lexion=build_markov_model(clean_text,3)

In [85]:
print(f"No of teansition states in the model = {len(lexion)}")

No of teansition states in the model = 479277


In [87]:
lexion

{'his last bow': {'an epilogue of': 0.2,
  'and there remain': 0.1,
  'preface the adventure': 0.1,
  'the casebook of': 0.05,
  'preface the friends': 0.15,
  'several previous experiences': 0.15,
  'so as to': 0.15,
  'arthur conan doyle': 0.05,
  'a study in': 0.05},
 'last bow an': {'epilogue of sherlock': 1.0},
 'bow an epilogue': {'of sherlock holmes': 1.0},
 'an epilogue of': {'sherlock holmes arthur': 0.25,
  'sherlock holmes it': 0.75},
 'epilogue of sherlock': {'holmes arthur conan': 0.25, 'holmes it was': 0.75},
 'of sherlock holmes': {'arthur conan doyle': 0.07936507936507936,
  'he began his': 0.031746031746031744,
  'the specialist in': 0.047619047619047616,
  'the younger clutching': 0.06349206349206349,
  'by smith elder': 0.19047619047619047,
  'fingertips upon the': 0.06349206349206349,
  'was a longsuffering': 0.06349206349206349,
  'there were points': 0.06349206349206349,
  'a scandal in': 0.06349206349206349,
  'silver blaze the': 0.031746031746031744,
  'the adve

# **CREATING generate_story() TO GENREATE TEXT:**
INPUTS:
1. TANSITION DICTIONARY -> developed from build_markov_model()
2. INITIAL TEXT -> the seed
3. LIMIT -> no of trnasitions to happen

In [86]:
import random
def generate_story(lexion,initial_text, limit=30):
  l=len(list(lexion.keys())[0].split())
  initial_text=initial_text.lower().strip()
  while(len(initial_text.split())!= l or initial_text not in lexion.keys()):
    if len(initial_text.split())!= l:
      initial_text=input(f'Enter text having length {l}')
    else:
      initial_text=input(f'Enter text present in lexion')

  current_state=initial_text
  next_state=None
  story=""

  while limit>0:
    next_state=random.choices(list(lexion[current_state].keys()),list(lexion[current_state].values()))
    story+=current_state+" "
    current_state=next_state[0]
    limit-=1

  return story

# ***EXAMPLES***

In [88]:
for i in range(20):
  print(generate_story(lexion,'this is your',20))

this is your husbands hand one of his hands you will i am sure understand my having some reserves in the matter you will excuse these precautions but i am sure that his assault was not viewed very gravely by the court since woodley had the reputation of being excellent company for the remainder of their journey and their wardrobe 
this is your man said holmes on the contrary you are as comfortable as circumstances permit said holmes when the final arrangements were made should i be too early to see your master mr silas brown if i were to see my shark without his seeing me and i have as you will remember also that mrs porter the housekeeper 
this is your husbands hand one of his hands so furious was he that he stood staring without a word and followed by all of us rushed into the room with a cry of horror he turned his wifes face to the light and saw blood all round you and you did very wisely said holmes your theory holds together 
this is your husbands hand one of his hands and looked

In [89]:
for i in range(30):
  print(generate_story(lexion,'the way he',20))

the way he claims acquaintance with you with me you had better read it he seemed even more distrait and strange than before he gave up all pretence at conversation and sat smoking endless cigarettes lost in his own grounds nothing would induce him to give his old uncle away ill have the plate moved over to the bank this 
the way he claims acquaintance with you with me you had better leave us together she said and her whitegloved hands clasped and unclasped as she spokei will speak frankly to you in helping you to your conclusions we are certainly making some progress so far as the hall well well said holmes well if you know nothing you can 
the way he looked at unexpected strangers i was perfectly certain that he would come back with his honour cleared and that none who had trusted him would suffer well no word was ever heard from him again both the yacht and he vanished utterly we believed my mother and i find that i can not get past it sir 
the way he clicked the switch of the electr