In [1]:
from datasets import load_dataset, Dataset, DatasetDict
from markov import Markov
import markovify

from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
import sacrebleu

import nltk
#nltk.download("vader_lexicon")
#nltk.download("punkt")
#nltk.download("wordnet")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def read_text(file_path: str) -> str:
    """
        Reads a text from a given file.
        
        :param file_path: A readable file.
        
        :rtype: str
        :return: The read text from the file.
    """
    
    text = []
    
    with open(file_path, 'r') as file:
        for line in file:
            text.append(line.strip())
            
    return ' '.join(text)

In [3]:
def generate(m : Markov | markovify.Text, _input: str, first_word: str, no_words: int = 20) -> str:
    if isinstance(m, Markov):
        return m.generate(_input, first_word, no_words)
    if isinstance(m, markovify.Text):
        return m.make_sentence()
    

In [4]:
def generate_from_dataset(m: Markov | markovify.Text, _input: str, first_word: str = None, no_words: int = 20):
    if isinstance(m, Markov):
        return m.generate(_input, first_word, no_words)
    if isinstance(m, markovify.Text):
        return m.make_short_sentence(min_chars=500, max_chars=1000)

In [5]:
proverbe_path = "./data/proverbe.txt"
poezii_path = "./data/corpus_complet.txt"
gutenberg_poetry = load_dataset("biglam/gutenberg-poetry-corpus")

In [27]:
# Poezie fara model
my_m = Markov(no_states=3, is_file=True)
my_text = generate(my_m, _input=poezii_path, first_word=None, no_words=20)
print(f"Generated text (no model): \n{my_text}")

# Proverb cu model markovify
text = read_text(proverbe_path)
markovify_m = markovify.Text(text, state_size=1)
markovify_text = generate(markovify_m, _input=proverbe_path, first_word=None, no_words=20)
print(f"Generated text (markovify): \n{markovify_text}")

Generated text (no model): 
Oarbe  de glod şi dintro
 parte  co sã ştii
 ce trăi pe străzi şi
 pe catafalcul falnic domnind
Generated text (markovify): 
Bunul gospodar isi aduce anul ce imi e cald.


In [7]:
def compute_sentiment(sia: SentimentIntensityAnalyzer, text: str):
    return sia.polarity_scores(text)

In [8]:
def replace_with_synonyms(text: str) -> str:
    import random
    tokens = word_tokenize(text)
    
    for i in range(len(tokens)):
        if random.random() > 0.2:
            synonyms = wordnet.synsets(tokens[i])
            if synonyms:
                syn = random.choice(synonyms).lemmas()[0].name()
                tokens[i] = syn
                
    new_text = ''
    for i in range(len(tokens)):
        new_text += ' ' + tokens[i]
        
        if (i + 1) % 5 == 0:
            new_text += "\n"
    return new_text

In [22]:
# English poem
markovify_m = markovify.Text(' '.join(gutenberg_poetry['train']['line'][:100]), state_size=2)
my_m = Markov(no_states=3, is_file=False)

my_poem = generate_from_dataset(my_m, _input=' '.join(gutenberg_poetry['train']['line'][:100]), no_words=100)
print(f"English poem (no model): \n{my_poem}")

sia = SentimentIntensityAnalyzer()
print(f"\nSentiment: {compute_sentiment(sia, ' '.join(my_poem.split('\n')))}")

new_poem = replace_with_synonyms(my_poem)
print(f"\nEnglish poem with synonyms: \n{new_poem}")

# reference = [[word.lower().strip()] for word in ' '.join(my_poem.split('\n')).split()]
# actual = [word.lower().strip() for word in ' '.join(new_poem.split('\n')).split()]

reference = [sentence.strip() for sentence in my_poem.split('\n')]
actual = [sentence.strip() for sentence in new_poem.split('\n')]

bleu = sacrebleu.corpus_bleu(actual, reference)

print(f"\nBLEU score: {bleu.score}")

English poem (no model): 
Margaret caroline davenport was assured however
 it soon as i should
 tell you ask where nawadaha
 found these legends he might
 prosper that he had used
 to rock me tell us
 of the green and the
 ojibway tribe at la pointe
 wisconsin and toiled and the
 green and compiled much of
 the green and being how
 he completed it soon as
 i close my eyelids should
 you ask where nawadaha found
 these legends he lived and
 the ojibways from the green
 prairie who love the poem
 was assured however it soon
 as follow in the great
 rivers with the fenlands

Sentiment: {'neg': 0.0, 'neu': 0.876, 'pos': 0.124, 'compound': 0.9231}

English poem with synonyms: 
 Margaret Caroline davenport was assured
 however information_technology soon as I
 should tell you necessitate where
 nawadaha discover these legend he
 might prosper that helium have
 use to Rock Maine tell
 U of the green and
 the Ojibwa tribe astatine la
 pointe Wisconsin and toiled and
 the green and compiled muc