In [None]:
import re
from collections import Counter
import powerlaw
import matplotlib.pyplot as plt

# ======= Step 1: Load and tokenize =======
CORPUS_PATH = "generated/merged_horror_stories.txt"
TOP_N_PREVIEW = 20  # number of entries shown in the preview printed below

# Read the whole corpus and lowercase it so that counting is case-insensitive.
with open(CORPUS_PATH, "r", encoding="utf-8") as f:
    text = f.read().lower()

# Very simple tokenization: runs of ASCII letters a-z delimited by word
# boundaries. Apostrophes split words ("don't" -> "don", "t"); a letter run
# that touches a digit, underscore or non-ASCII letter is skipped entirely,
# because \b does not match between two word characters.
words = re.findall(r"\b[a-z]+\b", text)
if not words:
    # Fail fast with a clear message instead of handing an empty frequency
    # list to the fitting step in a later cell.
    raise ValueError(f"No word tokens found in {CORPUS_PATH!r}")

# Word frequencies: one count per distinct word.
word_counts = Counter(words)
frequencies = list(word_counts.values())

# Print a bounded preview rather than the full Counter / full frequency list,
# which would flood the cell output on a real corpus.
print(f"Tokens: {len(words)}, distinct words: {len(word_counts)}")
print(word_counts.most_common(TOP_N_PREVIEW))
print(sorted(frequencies, reverse=True)[:TOP_N_PREVIEW])

In [None]:
# ======= Step 2: Power-law analysis =======
# Fit a power law to the word frequencies; discrete=True because the
# frequencies are integer counts.
results = powerlaw.Fit(frequencies, discrete=True)

# Print the estimated parameters: the fitted exponent (alpha) and the lower
# cutoff (xmin) selected by the fit.
print(f"α (alpha): {results.alpha:.4f}")
print(f"xmin: {results.xmin}")

# ======= Step 3: Plot =======
# Use an explicit Axes so both curves and all labels are drawn on the same
# axes without relying on pyplot's implicit "current axes" state.
fig, ax = plt.subplots()
results.plot_ccdf(ax=ax, color='b', label='Empirical')
results.power_law.plot_ccdf(ax=ax, color='r', linestyle='--', label='Power law fit')
ax.set_xlabel('Word frequency')
ax.set_ylabel('CCDF')
ax.set_title('Zipf Law Fit to Word Frequencies')
ax.legend()
plt.show()