Import bibliotek

In [None]:
import os
import json
import string
import morfeusz2
import pandas as pd
import operator as op
import itertools as it
import numpy as np
from nltk.util import ngrams
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.corpus import PlaintextCorpusReader
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.decomposition import LatentDirichletAllocation as LDA, NMF
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


1. Utworzenie korpusu dokumentów

In [None]:
# Directory that holds the plain-text documents making up the corpus.
corpus_dir = "./literatura"
# Raw string: `\.` must reach the regex engine unchanged; in a normal string
# literal it is an invalid escape sequence that newer Pythons warn about.
corpus = PlaintextCorpusReader(corpus_dir, r".*\.txt")
files = corpus.fileids()
files

2. Wstępne przygotowanie dokumentów

In [None]:
# Map every corpus file id to its raw, unprocessed text.
documents = {file_id: corpus.raw(file_id) for file_id in files}
print(json.dumps(documents, indent=4, ensure_ascii=False))

In [None]:
# Load the stop-word list, one entry per line. The context manager closes the
# file even when reading raises.
with open("./stopwords_pl.txt", "r", encoding="UTF-8") as stoplist_file:
    stoplist = stoplist_file.read().splitlines()
# Drop the first four lines (presumably a header in the file — verify).
stoplist = stoplist[4:]
stoplist

In [None]:
def lemmatize(text, morf=None):
    """Return *text* with every segment replaced by a single lemma.

    The analyser output is grouped by the first two fields of each entry
    (presumably the segment's start/end positions — confirm against the
    Morfeusz documentation). For each group the interpretation with the
    smallest ``(len(description), lemma)`` pair is chosen, and anything
    after the first ``:`` in the lemma is discarded.

    Args:
        text: Text to analyse.
        morf: Optional, already constructed analyser exposing ``analyse``.
            When omitted a new ``morfeusz2.Morfeusz()`` is created, which is
            what the function always did before; passing one in lets callers
            reuse a single analyser across many documents.

    Returns:
        The chosen lemmas joined by single spaces; lemmas that are not purely
        alphabetic are dropped.
    """
    analyzer = morfeusz2.Morfeusz() if morf is None else morf

    def disambiguate(group):
        # Tuples compare element-wise: the shortest description wins and ties
        # are broken by the lemma's lexicographic order.
        _, lemma = min(
            (len(descr), lemma)
            for _, _, (_, lemma, descr, _, _) in group
        )
        return lemma.split(":")[0]

    segments = it.groupby(analyzer.analyse(text), op.itemgetter(0, 1))
    lemmas = (disambiguate(group) for _, group in segments)
    return " ".join(lemma for lemma in lemmas if lemma.isalpha())

In [None]:
# Translation table that deletes every ASCII punctuation character in one pass.
punctuation_table = str.maketrans("", "", string.punctuation)
# A set gives O(1) membership tests; `stoplist` itself stays a list.
stopword_set = set(stoplist)

# Clean each document in place: lower-case, strip punctuation, lemmatize,
# then drop stop words.
for key in documents:
    text = documents[key].lower().translate(punctuation_table)
    text = lemmatize(text)
    documents[key] = " ".join(
        word
        for word in word_tokenize(text, language='polish')
        if word not in stopword_set
    )

print(json.dumps(documents, indent=4, ensure_ascii=False))

3. Utworzenie macierzy częstości

In [None]:
# One row per document (indexed by file name); the cleaned text lives in the
# single 'content' column.
docs = pd.DataFrame.from_dict(
    documents,
    orient='index',
    columns=['content'],
)
docs

In [None]:
# Document-term matrix of raw term counts (sparse); densified only for display.
count_vectorizer = CountVectorizer()
count_tf = count_vectorizer.fit_transform(docs['content'])
count_tf.toarray()

In [None]:

# Document-term matrix with TF-IDF weights (sparse); densified only for display.
tfidf_vectorizer = TfidfVectorizer()
counts_tfidf = tfidf_vectorizer.fit_transform(docs['content'])
counts_tfidf.toarray()

In [None]:
# A single WordCloud generator is reused for all documents.
wordcloud = WordCloud(
    background_color='white',
    max_words=5000,
    contour_width=3,
    contour_color='steelblue'
)
# Output folders for the word clouds and for the topic-model plots;
# exist_ok makes re-running the cell safe.
for out_dir in ("./chmury", "./tematy"):
    os.makedirs(out_dir, exist_ok=True)

for index, row in docs.iterrows():
    wordcloud.generate(row['content'])
    # Fresh figure per document, closed after saving, so images are not
    # stacked on one shared axes and no figure is left open.
    plt.figure()
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.title(index.replace(".txt", ""))
    plt.savefig("./chmury/{}".format(index.replace(".txt", ".png")))
    plt.close()

5. Topic modeling

In [None]:
def plot_top_words(model, feature_names, n_top_words, title, subplots):
    """Draw one horizontal bar chart per topic with its highest-weighted terms.

    Args:
        model: Fitted topic model exposing ``components_`` (one row of term
            weights per topic).
        feature_names: Term for each column of ``components_``; any sequence
            is accepted.
        n_top_words: Number of highest-weighted terms shown per topic.
        title: Figure title; also used as the output file name.
        subplots: ``(rows, cols)`` grid passed to ``plt.subplots``; it must
            provide at least one axes per topic.

    The figure is saved to ``./tematy/{title}.png`` and left open.
    """
    # squeeze=False always returns a 2-D array of axes, so a (1, 1) grid can
    # be flattened just like any other layout.
    fig, axes = plt.subplots(
        *subplots, figsize=(30, 15), sharex=True, squeeze=False
    )
    axes = axes.flatten()
    # Index-array lookup below needs an ndarray; plain lists are converted.
    feature_names = np.asarray(feature_names)

    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so the last n entries are the largest weights.
        top_features_ind = topic.argsort()[-n_top_words:]
        top_features = feature_names[top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 30})
        ax.tick_params(axis="both", which="major", labelsize=20)
        for side in ("top", "right", "left"):
            ax.spines[side].set_visible(False)

    # The title does not depend on the topic, so it is set once.
    fig.suptitle(title, fontsize=40)
    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.savefig(f"./tematy/{title}.png")

In [None]:

def plot_documents(model, counts, files_names, n_components, title):
    """Save a stacked horizontal bar chart of topic weights per document.

    Args:
        model: Fitted topic model exposing ``transform``.
        counts: Document-term matrix passed to ``model.transform``.
        files_names: Document file names; ``".txt"`` is removed for the
            axis labels.
        n_components: Number of topics, i.e. columns of the transformed matrix.
        title: Prefix of the output file ``./tematy/{title}_docs.png``.

    The figure is closed after it has been written.
    """
    colors = ['forestgreen', 'lightskyblue', 'hotpink', 'turquoise',
              'steelblue', 'crimson', 'seagreen', 'orange']
    # NOTE(review): topics are numbered from 0 here but from 1 in
    # plot_top_words; the labels are only visible if a legend is drawn.
    docs_topics = pd.DataFrame(
        model.transform(counts),
        columns=[f"Topic {x}" for x in range(n_components)],
    )
    docs_topics.index = [
        file_name.replace(".txt", "") for file_name in files_names
    ]

    plt.figure(figsize=(7, 4))
    # Running left edge of the next segment for every document.
    left = np.zeros(len(docs_topics))
    for i, col in enumerate(docs_topics.columns):
        plt.barh(
            docs_topics.index,
            docs_topics[col],
            left=left,
            label=col,
            # Cycle through the palette so more topics than colours cannot
            # raise IndexError.
            color=colors[i % len(colors)],
        )
        left = left + docs_topics[col].to_numpy()
    plt.gcf().subplots_adjust(left=0.5)
    plt.savefig(f"./tematy/{title}_docs.png")
    plt.close()
  

In [None]:
# Number of topics extracted by every model in this section.
n_components = 3
# Vocabulary used to label the term axes of the topic plots.
features_names = count_vectorizer.get_feature_names_out()
# Number of highest-weighted terms drawn per topic.
n_top_words = 20
# One row with one panel per topic; derived from n_components so the grid
# cannot run out of axes when the topic count changes.
subplots = (1, n_components)

In [None]:
# Latent Dirichlet Allocation on the raw term counts; fit() returns the
# estimator itself, so construction and fitting are chained.
lda = LDA(
    n_components=n_components,
    max_iter=5,
    learning_method='online',
    learning_offset=50,
    random_state=0,
).fit(count_tf)
plot_top_words(lda, features_names, n_top_words, "TematyLDA", subplots)
plot_documents(lda, count_tf, files, n_components, "LDA")

In [None]:
# NMF with the default loss, fitted on the TF-IDF matrix.
# NOTE(review): features_names comes from count_vectorizer while the model is
# fitted on tfidf_vectorizer output; both vectorizers use default settings on
# the same texts, so the vocabularies should match — confirm.
nmf_fn = NMF(
    n_components=n_components,
    random_state=1,
    alpha_H=.00005,
    alpha_W=.00005,
    l1_ratio=.5,
).fit(counts_tfidf)
plot_top_words(nmf_fn, features_names, n_top_words, "TematyNMF", subplots)
plot_documents(nmf_fn, counts_tfidf, files, n_components, "NMF")

In [None]:
# NMF with the Kullback-Leibler loss and the 'mu' solver, fitted on the
# TF-IDF matrix; fit() returns the estimator, so the calls are chained.
nmf_kl = NMF(
    n_components=n_components,
    random_state=1,
    beta_loss='kullback-leibler',
    solver='mu',
    max_iter=1000,
    alpha_H=.00005,
    alpha_W=.00005,
    l1_ratio=.5,
).fit(counts_tfidf)
# The same title labels the figure and names both output files.
plot_top_words(nmf_kl, features_names, n_top_words, "Tematy w modelu NMF (KL)", subplots)
plot_documents(nmf_kl, counts_tfidf, files, n_components, "Tematy w modelu NMF (KL)")

6. Analiza skupień

In [None]:
def plot_dendrogram(model, **kwargs):
    """Draw a dendrogram for a fitted agglomerative clustering *model*.

    Builds a linkage matrix from ``model.children_``, ``model.distances_``
    and the number of samples under each merge, opens a new 7x4 figure and
    forwards ``kwargs`` to ``scipy.cluster.hierarchy.dendrogram``.
    """
    n_leaves = len(model.labels_)
    subtree_sizes = np.zeros(model.children_.shape[0])
    for merge_idx, children in enumerate(model.children_):
        # Indices below n_leaves are original samples (size 1); larger ones
        # refer to an earlier merge whose size has already been computed.
        subtree_sizes[merge_idx] = sum(
            1 if child < n_leaves else subtree_sizes[child - n_leaves]
            for child in children
        )

    stacked = np.column_stack(
        [model.children_, model.distances_, subtree_sizes]
    )
    linkage_matrix = stacked.astype(float)

    plt.figure(figsize=(7, 4))
    dendrogram(linkage_matrix, **kwargs)

In [None]:
# Pairwise cosine similarity between the TF-IDF document vectors. The result
# is already a square (documents x documents) array, so no flatten/reshape
# round-trip is needed.
cs = cosine_similarity(counts_tfidf, counts_tfidf)
cs

In [None]:
# Number of clusters requested from the AgglomerativeClustering runs below.
n_clusters = 3

In [None]:
# Output folder for the dendrograms; exist_ok makes re-running the cell safe
# and avoids the check-then-create race.
os.makedirs("./skupienia", exist_ok=True)

# Ward linkage with the rows of the cosine-similarity matrix used as feature
# vectors; compute_distances=True is needed so that `distances_` is available
# to plot_dendrogram.
clustering = AgglomerativeClustering(
    n_clusters=n_clusters,
    metric='euclidean',
    linkage='ward',
    compute_distances=True
)
clustering.fit(cs)
clustering.labels_

In [None]:
# Pairwise euclidean distances between the raw-count document vectors. The
# result is already a square (documents x documents) array, so no
# flatten/reshape round-trip is needed.
ed = euclidean_distances(count_tf, count_tf)
ed

In [None]:

# Dendrogram of the most recently fitted `clustering`, labelled by file name.
plot_dendrogram(clustering, orientation='right', labels=files)
# Wide left margin so the file-name labels fit.
plt.gcf().subplots_adjust(left=0.6)
plt.savefig("./skupienia/cousineWards_docs.png")
plt.close()

In [None]:
# Complete linkage on the precomputed euclidean distance matrix `ed`;
# compute_distances=True is needed so that `distances_` is available to
# plot_dendrogram.
clustering = AgglomerativeClustering(
    n_clusters=n_clusters,
    metric='precomputed',
    linkage='complete',
    compute_distances=True,
)
clustering.fit(ed)
clustering.labels_

In [None]:
# Dendrogram of the most recently fitted `clustering`, labelled by file name.
plot_dendrogram(clustering, orientation='right', labels=files)
# Wide left margin so the file-name labels fit.
plt.gcf().subplots_adjust(left=0.6)
# NOTE(review): the file name says "Wards", but the clustering fitted just
# before this cell uses linkage='complete' — confirm the intended name.
plt.savefig("./skupienia/euclideanWards_docs.png")
plt.close()

7. N-gramy

In [135]:
# Root output folder for the n-gram charts; exist_ok makes re-running safe.
os.makedirs("./ngramy", exist_ok=True)


In [153]:
# Order of the n-grams to extract; the charts go to a sub-folder named after it.
n = 3
# makedirs also creates the parent folder if it is missing and does nothing
# when the target already exists.
os.makedirs(f"./ngramy/{n}", exist_ok=True)

In [154]:
# Token list for every cleaned document, keyed by file name.
documents_tokenized = {
    key: word_tokenize(text, language='polish')
    for key, text in documents.items()
}

print(json.dumps(documents_tokenized, indent=4, ensure_ascii=False))

{
    "Harry Potter i Czara Ognia.txt": [
        "lord",
        "voldemort",
        "wraz",
        "sługa",
        "glizdogonem",
        "przybywać",
        "dom",
        "należący",
        "niegdyś",
        "rodzina",
        "riddleów",
        "położony",
        "little",
        "hangleton",
        "układać",
        "plan",
        "dotyczyć",
        "zamordować",
        "harryego",
        "pottera",
        "mieć",
        "pomóc",
        "czarny",
        "odzyskanie",
        "dawny",
        "moc",
        "mowa",
        "wierna",
        "sługa",
        "czarny",
        "wkrótce",
        "mieć",
        "znaleźć",
        "hogwarcie",
        "słyszeć",
        "mieszkać",
        "nieopodal",
        "ogrodnik",
        "frank",
        "bryka",
        "voldemort",
        "odkrywać",
        "mężczyzna",
        "podsłuchiwać",
        "zapraszać",
        "środek",
        "uśmiercać",
        "Harry",
        "budzić",
        "przerażający",
        

In [155]:
# One chart per document showing its five most frequent n-grams.
for filename, tokens in documents_tokenized.items():
    n_grams = pd.Series(ngrams(tokens, n)).value_counts()
    # value_counts sorts by frequency, so the first five rows are the top five.
    n_grams.iloc[:5].plot.barh()
    # Wide left margin so the n-gram labels fit.
    plt.gcf().subplots_adjust(left=0.5)
    chart_name = filename.replace('.txt', '.png')
    plt.savefig(f"./ngramy/{n}/{chart_name}")
    plt.close()

In [156]:
# Tokens of each document, kept separate so that no n-gram is built from the
# end of one document and the start of the next.
tokens_per_doc = [content.split(" ") for content in docs['content']]
# Flat token list with the same content as before, kept for any later cell
# that reads `texts`.
texts = list(it.chain.from_iterable(tokens_per_doc))
n_grams = pd.Series(
    list(it.chain.from_iterable(ngrams(tokens, n) for tokens in tokens_per_doc))
).value_counts()
# value_counts sorts by frequency, so the first fifteen rows are the top fifteen.
n_grams.iloc[:15].plot.barh()
plt.gcf().subplots_adjust(left=0.5)
plt.savefig(f"./ngramy/{n}/all.png")
plt.close()
plt.close()