### Build more performance model

In [91]:
# library
import numpy as np
import pandas as pd
import os
import re
import random
import string
from IPython.display import display

# Token trasformation
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [92]:
# setting config file to upload file path
from config import file_txt

with open(file_txt, 'r') as file:
   text = file.read()

In [93]:
# Removes formatting and multiple spaces
def text_form(text):
   txt_clean = re.sub(r'\s+', ' ', text)
   txt_words = re.findall(r'\b\w+\b', txt_clean)
   return txt_words

txt_words = text_form(text)
print(f'Number of words:',len(txt_words))

Number of words: 69690


In [94]:
# rows in line
def read_txt(file_txt):
   txt = []
   with open (file_txt) as f:
      for line in f:
         line = line.strip()
         if line != '': txt.append(line)
   return txt

rows_list = read_txt(file_txt)
print('Number of lines: ', len(rows_list))

Number of lines:  5941


In [95]:
# removing special characters and transform words in token
def clean_txt(txt):
   cleaned_text = []
   for line in txt:
      line = line.lower()
      line = re.sub(r"['!@#$%^&*(){}?/`~<>+=-\\]", "", line)
      tokens = word_tokenize(line, language="english", preserve_line=True)
      words = [word for word in tokens if word.isalpha()]
      cleaned_text+= words
   return cleaned_text

cleaned_txt = clean_txt(rows_list)
print(f'number of tokenized words:',len(cleaned_txt))

number of tokenized words: 67384


In [96]:
display(cleaned_txt[:30])

['chapter',
 'i',
 'am',
 'by',
 'birth',
 'a',
 'genevese',
 'and',
 'my',
 'family',
 'is',
 'one',
 'of',
 'the',
 'most',
 'distinguished',
 'of',
 'that',
 'my',
 'ancestors',
 'had',
 'been',
 'for',
 'many',
 'years',
 'counsellors',
 'and',
 'syndics',
 'and',
 'my']

In [97]:
def MC_model4(cleaned_txt, n_gram=4):
    markov_model = {}
    for i in range(len(cleaned_txt)-n_gram):
        curr_state, next_state = "", ""
        for j in range(n_gram):
            curr_state += cleaned_txt[i+j] + " "
            next_state += cleaned_txt[i+j+1] + " "  # Fixing index for next_state
        curr_state = curr_state[:-1]
        next_state = next_state[:-1]
        if curr_state not in markov_model:
            markov_model[curr_state] = {}
            markov_model[curr_state][next_state] = 1
        else:
            if next_state in markov_model[curr_state]:
                markov_model[curr_state][next_state] += 1
            else:
                markov_model[curr_state][next_state] = 1
    
    # calculating transition probabilities
    for curr_state, transition in markov_model.items():
        total = sum(transition.values())
        for state, count in transition.items():
            markov_model[curr_state][state] = count/total
        
    return markov_model

In [98]:
# testing model1 by uploading ours tokenized words list 
MC4 = MC_model4(cleaned_txt)
print(f'Number of states:',len(MC4))

Number of states: 66230


In [99]:
display(MC4)

{'chapter i am by': {'i am by birth': 1.0},
 'i am by birth': {'am by birth a': 1.0},
 'am by birth a': {'by birth a genevese': 1.0},
 'by birth a genevese': {'birth a genevese and': 1.0},
 'birth a genevese and': {'a genevese and my': 1.0},
 'a genevese and my': {'genevese and my family': 1.0},
 'genevese and my family': {'and my family is': 1.0},
 'and my family is': {'my family is one': 1.0},
 'my family is one': {'family is one of': 1.0},
 'family is one of': {'is one of the': 1.0},
 'is one of the': {'one of the most': 1.0},
 'one of the most': {'of the most distinguished': 1.0},
 'of the most distinguished': {'the most distinguished of': 0.5,
  'the most distinguished he': 0.5},
 'the most distinguished of': {'most distinguished of that': 1.0},
 'most distinguished of that': {'distinguished of that my': 1.0},
 'distinguished of that my': {'of that my ancestors': 1.0},
 'of that my ancestors': {'that my ancestors had': 1.0},
 'that my ancestors had': {'my ancestors had been': 1.0}

Increasing the number of engrams increases the number of possible combinations of words that are considered as "states" in the Markov model. \
For example, with 1 engram (or unigram), every single word is considered a state. So if you have a text with N different words, you will have approximately N states in your model. \
With 2 engrams (bigrams), the states are pairs of consecutive words. So, if you have a text with N different words, you will have approximately N^2 states in your model.

As the number of engrams increases, the context considered to predict the next word increases, but the complexity of the model also increases, as the number of possible combinations grows exponentially relative to the number of words in the text.

In [100]:
# def function
def generate_text(markov_model, limit = 100, start='i am'):
   n = 0
   curr_state = start
   next_state = None
   phrase = ""
   phrase += curr_state + " " # initialise variables before the while loop
   while n<limit:
      # randomly select a state from the possible next states with prob evaluate in Markov Model with random.choice()
      next_state = random.choices(list(markov_model[curr_state].keys()), list(markov_model[curr_state].values()))
      curr_state = next_state[0]
      phrase += curr_state + " "
      n += 1 # update the states
   return phrase

In [101]:
# try sometimes and see how the model work
for i in range(6):
   print(str(i)+".\t", generate_text(MC4,start= 'by birth a genevese', limit=8))

0.	 by birth a genevese birth a genevese and a genevese and my genevese and my family and my family is my family is one family is one of is one of the one of the most 
1.	 by birth a genevese birth a genevese and a genevese and my genevese and my family and my family is my family is one family is one of is one of the one of the most 
2.	 by birth a genevese birth a genevese and a genevese and my genevese and my family and my family is my family is one family is one of is one of the one of the most 
3.	 by birth a genevese birth a genevese and a genevese and my genevese and my family and my family is my family is one family is one of is one of the one of the most 
4.	 by birth a genevese birth a genevese and a genevese and my genevese and my family and my family is my family is one family is one of is one of the one of the most 
5.	 by birth a genevese birth a genevese and a genevese and my genevese and my family and my family is my family is one family is one of is one of the one of th

here we have a problem: words are not well concatenated because of repetitions. We need to find a solution and modify the model

In [102]:
def MC_model4(cleaned_txt, n_gram=4):
    markov_model = {}
    for i in range(len(cleaned_txt)-n_gram):
        curr_state, next_state = "", ""
        words_added = set()  # Set to keep track of words already added
        for j in range(n_gram):
            curr_word = cleaned_txt[i+j]
            next_word = cleaned_txt[i+j+1] if (i+j+1) < len(cleaned_txt) else ""  # Handling end of text
            curr_state += curr_word + " "
            if next_word not in words_added:
                next_state += next_word + " "
                words_added.add(next_word)  # Adding next word to the set
        curr_state = curr_state[:-1]
        next_state = next_state[:-1]
        if next_state:  # Check if next_state is not empty
            if curr_state not in markov_model:
                markov_model[curr_state] = {}
                markov_model[curr_state][next_state] = 1
            else:
                if next_state in markov_model[curr_state]:
                    markov_model[curr_state][next_state] += 1
                else:
                    markov_model[curr_state][next_state] = 1
    
    # calculating transition probabilities
    for curr_state, transition in markov_model.items():
        total = sum(transition.values())
        for state, count in transition.items():
            markov_model[curr_state][state] = count / total
        
    return markov_model


In [103]:
import random

def generate_text(markov_model, limit=100, start='i am'):
    n = 0
    curr_state = start
    prev_words = set(start.split())
    phrase = ""
    phrase += curr_state + " "  
    while n < limit:
        if curr_state not in markov_model:
            break  
        # Randomly select a state from the possible next states with probability evaluated in Markov Model
        next_state_options = list(markov_model[curr_state].keys())
        next_state = random.choices(next_state_options, list(markov_model[curr_state].values()))[0]

        # Check for repeated words
        next_words = next_state.split()
        next_words = [word for word in next_words if word not in prev_words]
        next_state = " ".join(next_words)

        if next_state == "":
            break

        phrase += next_state + " "
        curr_state = next_state
        prev_words.update(next_words) 
        n += 1  
    return phrase

In [104]:
# testing model1 by uploading ours tokenized words list 
MC4 = MC_model4(cleaned_txt)
print(f'Number of states:',len(MC4))
# display(MC4)

Number of states: 66230


In [105]:
for i in range(6):
   print(str(i)+".\t", generate_text(MC4,start= 'genevese and my family', limit=8))

0.	 genevese and my family is 
1.	 genevese and my family is 
2.	 genevese and my family is 
3.	 genevese and my family is 
4.	 genevese and my family is 
5.	 genevese and my family is 
