In [1]:
import re
from collections import defaultdict
import math
import pandas as pd

In [2]:
# Paso 1: Definir el corpus (texto de Kafka)
corpus = """One morning, when Gregor Samsa woke from troubled dreams, he found
himself transformed in his bed into a horrible vermin. He lay on his
armour-like back, and if he lifted his head a little he could see his
brown belly, slightly domed and divided by arches into stiff sections.
The bedding was hardly able to cover it and seemed ready to slide off
any moment. His many legs, pitifully thin compared with the size of the
rest of him, waved about helplessly as he looked.

“What’s happened to me?” he thought. It wasn’t a dream. His room, a
proper human room although a little too small, lay peacefully between
its four familiar walls. A collection of textile samples lay spread out
on the table—Samsa was a travelling salesman—and above it there hung a
picture that he had recently cut out of an illustrated magazine and
housed in a nice, gilded frame. It showed a lady fitted out with a fur
hat and fur boa who sat upright, raising a heavy fur muff that covered
the whole of her lower arm towards the viewer.

Gregor then turned to look out the window at the dull weather. Drops of
rain could be heard hitting the pane, which made him feel quite sad.
“How about if I sleep a little bit longer and forget all this
nonsense”, he thought, but that was something he was unable to do
because he was used to sleeping on his right, and in his present state
couldn’t get into that position. However hard he threw himself onto his
right, he always rolled back to where he was. He must have tried it a
hundred times, shut his eyes so that he wouldn’t have to look at the
floundering legs, and only stopped when he began to feel a mild, dull
pain there that he had never felt before.
"""

N-Gramas

In [3]:
# Funcion para generar n-gramas
def ngram_tokenize(text, n=2):
    tokens = re.findall(r"\b\w+\b", text.lower())
    if n == 1:
        return tokens
    else:
        ngrams = list(zip(*[tokens[i:] for i in range(n)]))
    return ngrams

In [4]:
# Paso 1.2: Tokenización y creación de ngramas   
unigrams = ngram_tokenize(corpus, n=1)
print("Lista de unigramas:")
print(unigrams)
bigrams = ngram_tokenize(corpus, n=2)
print("Lista de bigramas:")
print(bigrams)
trigrams = ngram_tokenize(corpus, n=3)
print("Lista de trigramas:")
print(trigrams)

Lista de unigramas:
['one', 'morning', 'when', 'gregor', 'samsa', 'woke', 'from', 'troubled', 'dreams', 'he', 'found', 'himself', 'transformed', 'in', 'his', 'bed', 'into', 'a', 'horrible', 'vermin', 'he', 'lay', 'on', 'his', 'armour', 'like', 'back', 'and', 'if', 'he', 'lifted', 'his', 'head', 'a', 'little', 'he', 'could', 'see', 'his', 'brown', 'belly', 'slightly', 'domed', 'and', 'divided', 'by', 'arches', 'into', 'stiff', 'sections', 'the', 'bedding', 'was', 'hardly', 'able', 'to', 'cover', 'it', 'and', 'seemed', 'ready', 'to', 'slide', 'off', 'any', 'moment', 'his', 'many', 'legs', 'pitifully', 'thin', 'compared', 'with', 'the', 'size', 'of', 'the', 'rest', 'of', 'him', 'waved', 'about', 'helplessly', 'as', 'he', 'looked', 'what', 's', 'happened', 'to', 'me', 'he', 'thought', 'it', 'wasn', 't', 'a', 'dream', 'his', 'room', 'a', 'proper', 'human', 'room', 'although', 'a', 'little', 'too', 'small', 'lay', 'peacefully', 'between', 'its', 'four', 'familiar', 'walls', 'a', 'collection'

Entropia y perplejidad

In [5]:
# Paso 2: Conteo de unigramas y bigramas
def count_ngrams(tokens):
    ngram_counts = defaultdict(int)
    for t in tokens:
        ngram_counts[t] += 1
    return ngram_counts

unigram_counts = count_ngrams(unigrams)
bigram_counts = count_ngrams(bigrams)
trigram_counts = count_ngrams(trigrams) 
print("\nConteo de unigramas:")
print(unigram_counts)
print("\nConteo de bigramas:")
print(bigram_counts)
print("\nConteo de trigramas:")
print(trigram_counts)

V = len(unigram_counts)
print("\nTamaño del vocabulario (V):", V)
print("Vocabulario:", set(unigram_counts.keys()))


Conteo de unigramas:
defaultdict(<class 'int'>, {'one': 1, 'morning': 1, 'when': 2, 'gregor': 2, 'samsa': 2, 'woke': 1, 'from': 1, 'troubled': 1, 'dreams': 1, 'he': 17, 'found': 1, 'himself': 2, 'transformed': 1, 'in': 3, 'his': 10, 'bed': 1, 'into': 3, 'a': 15, 'horrible': 1, 'vermin': 1, 'lay': 3, 'on': 3, 'armour': 1, 'like': 1, 'back': 2, 'and': 9, 'if': 2, 'lifted': 1, 'head': 1, 'little': 3, 'could': 2, 'see': 1, 'brown': 1, 'belly': 1, 'slightly': 1, 'domed': 1, 'divided': 1, 'by': 1, 'arches': 1, 'stiff': 1, 'sections': 1, 'the': 10, 'bedding': 1, 'was': 6, 'hardly': 1, 'able': 1, 'to': 9, 'cover': 1, 'it': 5, 'seemed': 1, 'ready': 1, 'slide': 1, 'off': 1, 'any': 1, 'moment': 1, 'many': 1, 'legs': 2, 'pitifully': 1, 'thin': 1, 'compared': 1, 'with': 2, 'size': 1, 'of': 6, 'rest': 1, 'him': 2, 'waved': 1, 'about': 2, 'helplessly': 1, 'as': 1, 'looked': 1, 'what': 1, 's': 1, 'happened': 1, 'me': 1, 'thought': 2, 'wasn': 1, 't': 3, 'dream': 1, 'room': 2, 'proper': 1, 'human': 1, 

In [6]:
# Paso 3: Calculo de la entropía de Shannon
def shannon_entropy(counts):
    total_count = sum(counts.values())
    entropy = 0.0
    for count in counts.values():
        if count > 0:
            probability = count / total_count
            entropy -= probability * math.log(probability, 2)
    return entropy

In [7]:
# Paso 4: Calculo de perplejidad
def perplexity(counts):
    entropy = shannon_entropy(counts)
    return 2 ** entropy

In [8]:
# Ejemplo de uso
entropy_unigrams = shannon_entropy(unigram_counts)
entropy_bigrams = shannon_entropy(bigram_counts)
entropy_trigrams = shannon_entropy(trigram_counts)
print("\nEntropía de Shannon para unigramas:", entropy_unigrams)
print("Entropía de Shannon para bigramas:", entropy_bigrams)
print("Entropía de Shannon para trigramas:", entropy_trigrams)

perplexity_unigrams = perplexity(unigram_counts)
perplexity_bigrams = perplexity(bigram_counts)
perplexity_trigrams = perplexity(trigram_counts)
print("\nPerplejidad para unigramas:", perplexity_unigrams)
print("Perplejidad para bigramas:", perplexity_bigrams)
print("Perplejidad para trigramas:", perplexity_trigrams)



Entropía de Shannon para unigramas: 7.0770005113797945
Entropía de Shannon para bigramas: 8.21917605693286
Entropía de Shannon para trigramas: 8.297451634253077

Perplejidad para unigramas: 135.0173044431346
Perplejidad para bigramas: 298.0015567213239
Perplejidad para trigramas: 314.6167420388865


In [9]:
# Prueba con corpus simple para entropia y perplejidad 
corpus_simple = "El perro ladra. El gato maulla. El perro corre."
# Paso 1.2: Tokenización y creación de ngramas
unigrams_simple = ngram_tokenize(corpus_simple, n=1)
# Calculo de unigramas
unigram_counts_simple = count_ngrams(unigrams_simple)
entropy_unigrams_simple = shannon_entropy(unigram_counts_simple)
perplexity_unigrams_simple = perplexity(unigram_counts_simple)
print("\nEntropía de unigramas para corpus simple:", entropy_unigrams_simple)
print("Perplejidad de unigramas para corpus simple:", perplexity_unigrams_simple)


Entropía de unigramas para corpus simple: 2.4193819456463714
Perplejidad de unigramas para corpus simple: 5.349418023423782


Smoothing

In [10]:
# Funciones de probabilidad (dos técnicas de smoothing)
def laplace_probability(w1, w2, k=1):
    return (bigram_counts.get((w1, w2), 0) + k) / (unigram_counts.get(w1, 0) + k * V)

def add_k_probability(w1, w2, k=0.5):
    return (bigram_counts.get((w1, w2), 0) + k) / (unigram_counts.get(w1, 0) + k * V)


In [14]:
#  prueba
frase = "la araña trabaja"
tokens_test = re.findall(r"\b\w+\b", frase.lower())
bigrams_test = list(zip(tokens_test, tokens_test[1:]))

log_prob_laplace = sum(math.log2(laplace_probability(w1, w2)) for w1, w2 in bigrams_test)
entropy_laplace = -log_prob_laplace / len(bigrams_test)
perplexity_laplace = 2 ** entropy_laplace

log_prob_addk = sum(math.log2(add_k_probability(w1, w2)) for w1, w2 in bigrams_test)
entropy_addk = -log_prob_addk / len(bigrams_test)
perplexity_addk = 2 ** entropy_addk

print(f"\nEntropía Laplace: {entropy_laplace:.4f}")
print(f"Perplejidad Laplace: {perplexity_laplace:.4f}")
print(f"\nEntropía Add-K: {entropy_addk:.4f}")
print(f"Perplejidad Add-K: {perplexity_addk:.4f}")


Entropía Laplace: 7.6221
Perplejidad Laplace: 197.0000

Entropía Add-K: 7.6221
Perplejidad Add-K: 197.0000


In [15]:
#conteos y probabilidades

df = pd.DataFrame(bigrams_test, columns=["P1", "P2"])
df["Conteo Bigram"] = df.apply(lambda r: bigram_counts.get((r.P1, r.P2), 0), axis=1)
df["Conteo Unigram (P1)"] = df["P1"].apply(lambda w: unigram_counts.get(w, 0))
df["Probabilidad Laplace"] = df.apply(lambda r: round(laplace_probability(r.P1, r.P2), 4), axis=1)
df["Probabilidad Add-K (0.5)"] = df.apply(lambda r: round(add_k_probability(r.P1, r.P2), 4), axis=1)

df

Unnamed: 0,P1,P2,Conteo Bigram,Conteo Unigram (P1),Probabilidad Laplace,Probabilidad Add-K (0.5)
0,la,araña,0,0,0.0051,0.0051
1,araña,trabaja,0,0,0.0051,0.0051
