### Build more performance model

In [2]:
# library
import numpy as np
import pandas as pd
import os
import re
import random
import string
from IPython.display import display

# Token trasformation
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [3]:
# setting config file to upload file path
from config import file_txt

with open(file_txt, 'r') as file:
   text = file.read()

In [4]:
# Removes formatting and multiple spaces
def text_form(text):
   txt_clean = re.sub(r'\s+', ' ', text)
   txt_words = re.findall(r'\b\w+\b', txt_clean)
   return txt_words

txt_words = text_form(text)
print(f'Number of words:',len(txt_words))

Number of words: 69690


In [5]:
# rows in line
def read_txt(file_txt):
   txt = []
   with open (file_txt) as f:
      for line in f:
         line = line.strip()
         if line != '': txt.append(line)
   return txt

rows_list = read_txt(file_txt)
print('Number of lines: ', len(rows_list))

Number of lines:  5941


# removing special characters and transform words in token
def clean_txt(txt):
   cleaned_text = []
   for line in txt:
      line = line.lower()
      line = re.sub(r"['!@#$%^&*(){}?/`~<>+=-\\]", "", line)
      tokens = word_tokenize(line, language="english", preserve_line=True)
      words = [word for word in tokens if word.isalpha()]
      cleaned_text+= words
   return cleaned_text

cleaned_txt = clean_txt(rows_list)
print(f'number of tokenized words:',len(cleaned_txt))

In [7]:
display(cleaned_txt[:30])

['chapter',
 'i',
 'am',
 'by',
 'birth',
 'a',
 'genevese',
 'and',
 'my',
 'family',
 'is',
 'one',
 'of',
 'the',
 'most',
 'distinguished',
 'of',
 'that',
 'my',
 'ancestors',
 'had',
 'been',
 'for',
 'many',
 'years',
 'counsellors',
 'and',
 'syndics',
 'and',
 'my']

In [8]:
def MC_model4(cleaned_txt, n_gram=3):
    markov_model = {}
    for i in range(len(cleaned_txt)-n_gram):
        curr_state, next_state = "", ""
        for j in range(n_gram):
            curr_state += cleaned_txt[i+j] + " "
            next_state += cleaned_txt[i+j+1] + " "  # Fixing index for next_state
        curr_state = curr_state[:-1]
        next_state = next_state[:-1]
        if curr_state not in markov_model:
            markov_model[curr_state] = {}
            markov_model[curr_state][next_state] = 1
        else:
            if next_state in markov_model[curr_state]:
                markov_model[curr_state][next_state] += 1
            else:
                markov_model[curr_state][next_state] = 1
    
    # calculating transition probabilities
    for curr_state, transition in markov_model.items():
        total = sum(transition.values())
        for state, count in transition.items():
            markov_model[curr_state][state] = count/total
        
    return markov_model

In [9]:
# testing model1 by uploading ours tokenized words list 
MC4 = MC_model4(cleaned_txt)
print(f'Number of states:',len(MC4))

Number of states: 60643


In [10]:
display(MC4)

{'chapter i am': {'i am by': 1.0},
 'i am by': {'am by birth': 0.5, 'am by a': 0.5},
 'am by birth': {'by birth a': 1.0},
 'by birth a': {'birth a genevese': 1.0},
 'birth a genevese': {'a genevese and': 1.0},
 'a genevese and': {'genevese and my': 1.0},
 'genevese and my': {'and my family': 1.0},
 'and my family': {'my family is': 0.5, 'my family have': 0.5},
 'my family is': {'family is one': 1.0},
 'family is one': {'is one of': 1.0},
 'is one of': {'one of the': 0.5, 'one of those': 0.5},
 'one of the': {'of the most': 0.058823529411764705,
  'of the phenomena': 0.058823529411764705,
  'of the servants': 0.058823529411764705,
  'of the others': 0.058823529411764705,
  'of the trees': 0.058823529411764705,
  'of the best': 0.11764705882352941,
  'of the women': 0.058823529411764705,
  'of the windows': 0.058823529411764705,
  'of the causes': 0.058823529411764705,
  'of the first': 0.11764705882352941,
  'of the folds': 0.058823529411764705,
  'of the remotest': 0.058823529411764705

Increasing the number of engrams increases the number of possible combinations of words that are considered as "states" in the Markov model. \
For example, with 1 engram (or unigram), every single word is considered a state. So if you have a text with N different words, you will have approximately N states in your model. \
With 2 engrams (bigrams), the states are pairs of consecutive words. So, if you have a text with N different words, you will have approximately N^2 states in your model.

As the number of engrams increases, the context considered to predict the next word increases, but the complexity of the model also increases, as the number of possible combinations grows exponentially relative to the number of words in the text.

In [11]:
# def function
def generate_text(markov_model, limit = 100, start='i am'):
   n = 0
   curr_state = start
   next_state = None
   phrase = ""
   phrase += curr_state + " " # initialise variables before the while loop
   while n<limit:
      # randomly select a state from the possible next states with prob evaluate in Markov Model with random.choice()
      next_state = random.choices(list(markov_model[curr_state].keys()), list(markov_model[curr_state].values()))
      curr_state = next_state[0]
      phrase += curr_state + " "
      n += 1 # update the states
   return phrase

In [13]:
# try sometimes and see how the model work
for i in range(6):
   print(str(i)+".\t", generate_text(MC4,start= 'by birth a genevese', limit=8))

KeyError: 'by birth a genevese'

## Second try

here we have a problem: words are not well concatenated because of repetitions. We need to find a solution and modify the model

In [None]:
def MC_model(cleaned_txt, n_gram=4):
    markov_model = {}
    for i in range(len(cleaned_txt)-n_gram):
        curr_state, next_state = "", ""
        for j in range(n_gram):
            curr_state += cleaned_txt[i+j] + " "
            if j < n_gram - 1:
                next_state += cleaned_txt[i+j+1] + " "
        curr_state = curr_state[:-1]
        next_state = next_state[:-1]
        if curr_state not in markov_model:
            markov_model[curr_state] = {}
            markov_model[curr_state][next_state] = 1
        else:
            if next_state in markov_model[curr_state]:
                markov_model[curr_state][next_state] += 1
            else:
                markov_model[curr_state][next_state] = 1
    
    # calculating transition probabilities
    for curr_state, transition in markov_model.items():
        total = sum(transition.values())
        for state, count in transition.items():
            markov_model[curr_state][state] = count / total
        
    return markov_model


In [None]:
# def function
def generate_text(markov_model, limit = 100, start='i am'):
   n = 0
   curr_state = start
   next_state = None
   phrase = ""
   phrase += curr_state + " " # initialise variables before the while loop
   while n<limit:
      # randomly select a state from the possible next states with prob evaluate in Markov Model with random.choice()
      next_state = random.choices(list(markov_model[curr_state].keys()), list(markov_model[curr_state].values()))
      curr_state = next_state[0]
      phrase += curr_state + " "
      n += 1 # update the states
   return phrase

In [None]:
# testing model1 by uploading ours tokenized words list 
MC4 = MC_model(cleaned_txt)
print(f'Number of states:',len(MC4))
# display(MC4)

Number of states: 66230


In [14]:
display(MC4)

{'chapter i am': {'i am by': 1.0},
 'i am by': {'am by birth': 0.5, 'am by a': 0.5},
 'am by birth': {'by birth a': 1.0},
 'by birth a': {'birth a genevese': 1.0},
 'birth a genevese': {'a genevese and': 1.0},
 'a genevese and': {'genevese and my': 1.0},
 'genevese and my': {'and my family': 1.0},
 'and my family': {'my family is': 0.5, 'my family have': 0.5},
 'my family is': {'family is one': 1.0},
 'family is one': {'is one of': 1.0},
 'is one of': {'one of the': 0.5, 'one of those': 0.5},
 'one of the': {'of the most': 0.058823529411764705,
  'of the phenomena': 0.058823529411764705,
  'of the servants': 0.058823529411764705,
  'of the others': 0.058823529411764705,
  'of the trees': 0.058823529411764705,
  'of the best': 0.11764705882352941,
  'of the women': 0.058823529411764705,
  'of the windows': 0.058823529411764705,
  'of the causes': 0.058823529411764705,
  'of the first': 0.11764705882352941,
  'of the folds': 0.058823529411764705,
  'of the remotest': 0.058823529411764705

In [None]:
for i in range(6):
   print(str(i)+".\t", generate_text(MC4,start= 'genevese', limit=8))

0.	 genevese 
1.	 genevese 
2.	 genevese 
3.	 genevese 
4.	 genevese 
5.	 genevese 


## Generate text using NLP models

In [None]:
# pip install transformers

In [28]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# upload tokenizer GPT-2 pre-trained
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Definisci l'input
input_text = "I am by birth a Genevese; and my family is one of the most distinguished of that republic. My ancestors had been for many years counsellors and syndics; and my father had filled several public situations with honour and reputation. He was respected by all who knew him for his integrity and indefatigable attention to public business. He passed his younger days perpetually occupied by the affairs of his country; a variety of circumstances had prevented his marrying early, nor was it until the decline of life that he became a husband and the father of a family. As the circumstances of his marriage illustrate his character, I cannot refrain from relating them. One of his most intimate friends was a merchant, who, from a flourishing state, fell, through numerous mischances, into poverty. This man, whose name was Beaufort, was of a proud and unbending disposition, and could not bear to live in poverty and oblivion in the same country where he had formerly been distinguished for his rank and magnificence. Having paid his debts, therefore, in the most honourable manner, he retreated with his daughter to the town of Lucerne, where he lived unknown and in wretchedness. My father loved Beaufort with the truest friendship, and was deeply grieved by his retreat in these unfortunate circumstances. He bitterly deplored the false pride which led his friend to a conduct so little worthy of the affection that united them. He lost no time in endeavouring to seek him out, with the hope of persuading him to begin the world again through his credit and assistance."

# Tokenizza l'input
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# Genera del testo basato sull'input
output = model.generate(input_ids, max_length=500, num_return_sequences=1, temperature=0.7, pad_token_id=tokenizer.eos_token_id)

# Decodifica il testo generato
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print("Testo generato:", generated_text)


Testo generato: I am by birth a Genevese; and my family is one of the most distinguished of that republic. My ancestors had been for many years counsellors and syndics; and my father had filled several public situations with honour and reputation. He was respected by all who knew him for his integrity and indefatigable attention to public business. He passed his younger days perpetually occupied by the affairs of his country; a variety of circumstances had prevented his marrying early, nor was it until the decline of life that he became a husband and the father of a family. As the circumstances of his marriage illustrate his character, I cannot refrain from relating them. One of his most intimate friends was a merchant, who, from a flourishing state, fell, through numerous mischances, into poverty. This man, whose name was Beaufort, was of a proud and unbending disposition, and could not bear to live in poverty and oblivion in the same country where he had formerly been distinguished f