# Markov Chains

### with one state

In [7]:
import numpy as np
import random

In [8]:
def generate_sentence_one_state(words,length):
    sentence = []
    for i in range(length):
        sentence.append(random.choice(words))
    proverb= ' '.join(sentence)
    return proverb

In [17]:
import re
from nltk import word_tokenize


def normalizeText(text, include_punctuation=False):
    cleaned = []
    for line in text:
        line = line.lower()

        # Optionally, use regex to ensure punctuation is separated from words
        line = re.sub(r'([,.!?"])', r' \1 ', line)  # Add space around specific punctuation marks

        # Tokenize the line
        tokens = word_tokenize(line)

        # Optionally filter to remove any unwanted characters (kept simple here)
        words = [token for token in tokens if token.isalpha() or token in [".", ",", "!", "?", '"']]

        # Append tokens to the cleaned list
        cleaned.extend(words)
    return cleaned

# with n - states

In [18]:
from nltk import ngrams
from collections import defaultdict


class MyMarkov2:
    def __init__(self, n=1):
        self.n = n
        self.markov_chain = defaultdict(lambda: defaultdict(int))
        self.tex_saved = ""

    def generate_n_gram(self, text):
        text = normalizeText(text)
        return list(ngrams(text.split(), self.n))

    def get_start_words(self, text):
        tokenized_sentences = [word_tokenize(re.sub(r'([,.!?"])', r' \1 ', s)) for s in text]
        start_words = set()
        for sentence in tokenized_sentences:
            if len(sentence) >= self.n:
                start_words.update([' '.join(sentence[:i]).lower() for i in range(1, self.n+1)])
        if not start_words:
            return "Error: No valid starting words found."
        return random.choice(list(start_words))

    def train(self, text):
        self.tex_saved = text.copy()
        text = normalizeText(text)
        for i in range(len(text) - 2 * self.n + 1):
            curr_state = ' '.join(text[i:i+self.n])
            next_state = ' '.join(text[i+self.n:i+2*self.n])
            self.markov_chain[curr_state][next_state] += 1

        # Calculate probabilities after filling the dictionary
        for curr_state, transitions in self.markov_chain.items():
            total = sum(transitions.values())
            for state in transitions:
                transitions[state] /= total

    def generate_sentence(self, limit=100, start='', sentences=None):
        story = []
        size=0
        if not start:
            start = self.get_start_words(self.tex_saved)
        story.append(start.capitalize())
        curr_state = start
        size+=len(curr_state.split())

        while size < limit:
            if curr_state not in self.markov_chain:
                break
            next_states = list(self.markov_chain[curr_state].keys())
           
            weights = list(self.markov_chain[curr_state].values())
            curr_state = random.choices(next_states, weights)[0]
            story.append(curr_state)
            size+=len(curr_state.split())

        return ' '.join(story)

In [19]:
def generate_some_sentences_n(sentences,lenghts=[10],states=2):
    markov=MyMarkov2(states)
    markov.train(sentences)
    for length in lenghts:
        print(markov.generate_sentence(sentences=sentences,limit=length))
        print('\n')
        print('----------------------------------------------------------\n')

In [20]:

%run Utils.ipynb

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Deea\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [21]:
proverbe=read_file("data/proverbe.txt")
generate_some_sentences_n(proverbe,[10,20,15,5],4)
# proverbe[:10]

Vrei sa


----------------------------------------------------------

A da perle la porci . a despica firul in patru . a duce cu presul . adevarul este intotdeauna


----------------------------------------------------------

Cauta o femeie care iti place tie nu la altii . calatorului ii sade bine cu


----------------------------------------------------------

Lacomia


----------------------------------------------------------



In [22]:
def generate_one_State(sentences,lenghts=[10]):
    generate_some_sentences_n(sentences,lenghts,1)

In [23]:
def generate_markov_chain_n(sentences,lengths=[10,20,30,50]):
    markov=MyMarkov2()
    for length in lengths:
        print(markov.nstates(sentences,length=length))
        print('\n')
        print('----------------------------------------------------------\n')


In [24]:
import re
import markovify

In [25]:
def markovifyDo(sentences,length=10,states=2):
    if isinstance(sentences, list) and all(isinstance(sentence, str) for sentence in sentences):
        sentences = ' '.join(sentences)  # Join sentences with a space
    sentences = re.sub(r'\n', ' ', sentences)  # Replace newlines with spaces for consistent processing
    sentences = re.sub(r'[ ]+', ' ', sentences)  # Remove extra spaces
    sentences=sentences.strip()
    text_model = markovify.Text(sentences, state_size=states)
    return text_model.make_short_sentence(length)
        

In [28]:
from textblob import TextBlob

In [29]:
def emotion_calculate(poetry):
    blob = TextBlob(poetry)
    return blob.sentiment

In [30]:
import nltk
nltk.download('vader_lexicon')



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Deea\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [None]:
def sentiment_calc_nltk(sentence):
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    sid = SentimentIntensityAnalyzer()
    return sid.polarity_scores(sentence)