In [1]:
import pandas as pd

In [2]:
# Load the corpus from a Parquet file; the path is resolved relative to the
# kernel's working directory.
df = pd.read_parquet("breastcancer-2020-linguistics.parquet")

In [3]:
# Number of rows in the loaded frame.
len(df)

41812

# Topics

In [4]:
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

In [5]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

def wordcloud_topic_model_summary(model, feature_names, no_top_words, prefix):
    """Save one word-cloud PNG per topic of a fitted topic model.

    For every row of ``model.components_`` the ``no_top_words`` highest-weighted
    terms are drawn as a word cloud and written to ``"<prefix>-<NN>.png"``,
    where ``NN`` is the zero-padded topic index.

    Parameters
    ----------
    model
        Object exposing ``components_`` (one row of term weights per topic).
    feature_names
        Sequence of term strings, indexed like the columns of ``components_``.
    no_top_words : int
        How many of the highest-weighted terms to include per topic.
    prefix : str
        File-name prefix of the generated images.
    """
    for topic_no, weights in enumerate(model.components_):
        # Indices of the `no_top_words` largest weights, heaviest first.
        top_indices = weights.argsort()[:-no_top_words - 1:-1]
        # Scale each weight to an integer frequency; the +1 keeps an entry from
        # ending up at 0 when the scaled weight truncates to 0. Spaces become
        # underscores in the term used as the cloud label.
        frequencies = {
            feature_names[idx].replace(" ", "_"): int(100000.0 * weights[idx]) + 1
            for idx in top_indices
        }

        cloud = WordCloud(background_color="white", max_words=100, width=960, height=540)
        cloud.generate_from_frequencies(frequencies)

        plt.figure(figsize=(12, 12))
        plt.imshow(cloud, interpolation='bilinear')
        plt.axis("off")
        plt.savefig("%s-%02d.png" % (prefix, topic_no))
        # Close the current figure so repeated calls do not accumulate figures.
        plt.close()
            
def show_topic_model_stat(nmf, tfidf_vectors, feature_names, no_top_words=30):
    """Tabulate the relative size and the top terms of every topic.

    Each topic contributes one header row followed by one row per top term.
    The returned frame has positional integer columns:

    * ``0`` - topic label ``"TOPIC <highest-weighted term> (<topic index>)"``
    * ``1`` - topic share: the topic's column sum in
      ``nmf.transform(tfidf_vectors)`` divided by the sum over all topics
    * ``2`` - the term (missing in the header row)
    * ``3`` - term share: the term's weight divided by the sum of the topic's
      weights (``1.0`` in the header row)

    Parameters
    ----------
    nmf
        Fitted model exposing ``transform``, ``components_`` and
        ``n_components``.
    tfidf_vectors
        Document-term matrix handed to ``nmf.transform``.
    feature_names
        Sequence of term strings, indexed like the columns of ``components_``.
    no_top_words : int, default 30
        Number of highest-weighted terms listed per topic.

    Returns
    -------
    pandas.DataFrame
        One row per topic header and per listed term, in topic order.
    """
    doc_topic = nmf.transform(tfidf_vectors)
    topic_sizes = doc_topic.sum(axis=0)
    topic_total = topic_sizes.sum()

    rows = []
    for t in range(nmf.n_components):
        weights = nmf.components_[t]
        # Term indices ordered from highest to lowest weight.
        ranked = weights.argsort()[::-1]
        # The label, topic share and weight total are constant for the whole
        # topic, so compute them once instead of once per term row.
        label = "TOPIC %s (%02d)" % (feature_names[ranked[0]], t)
        topic_share = topic_sizes[t] / topic_total
        weight_total = weights.sum()

        rows.append([label, topic_share, None, 1.0])
        for i in ranked[:no_top_words]:
            rows.append([label, topic_share, feature_names[i], weights[i] / weight_total])
    return pd.DataFrame(rows)

In [None]:
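# NOTE(review): the TF-IDF cell below reads df["text"], and this disabled code
# is the only place in the notebook that creates that column. Unless the
# Parquet file already ships a "text" column, re-enable these lines so the
# notebook survives Restart Kernel -> Run All. TODO confirm.
#df["text"] = df["lemmas"].map(lambda x: " ".join(x))
#df["text"] = df["text"].str.replace("datum", "data", case=False)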

In [13]:
# Extend the stop-word set with corpus-specific noise terms (forum boilerplate,
# contraction fragments such as "didn"/"ve"/"ll", URL parts, first names).
# `stopwords` is extended in place, so the additions persist for the rest of
# the kernel session.
# Split on any whitespace run: split(" ") on this multi-line literal produced
# the tokens "\n" and "going\n", so "going" was never registered as a stop word.
stopwords.update("""appreciation appreciate birthday breastcancer comment community day didn discussion 
 don feel forum glad goes good happy https know like luck mods new org post posting sharing 
 thank thanks think thread today tomorrow ve welcome wishes www year yesterday got went want going
 ll walking wednesday ruth karen said""".split())

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram TF-IDF over the "text" column. With an integer min_df and a float
# max_df, terms found in fewer than 20 documents or in more than 70% of the
# documents are dropped from the vocabulary, as are the stop words.
tfidf = TfidfVectorizer(
    stop_words=list(stopwords),
    ngram_range=(1, 1),
    min_df=20,
    max_df=0.7,
)
tfidf_vectors = tfidf.fit_transform(df["text"])



In [15]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

# The vocabulary does not change between runs, so look it up a single time.
feature_names = tfidf.get_feature_names_out()

# Fit one NMF model per topic count and export, for each, the per-topic word
# clouds (PNG) and the topic/term statistics (Excel).
for no_topics in [6, 12, 24]:
    # NOTE(review): no alpha_W/alpha_H is passed; with scikit-learn's defaults
    # that presumably leaves regularisation off, making l1_ratio a no-op - confirm.
    nmf = NMF(n_components=no_topics, random_state=1, l1_ratio=.5, init='nndsvd')
    nmf.fit(tfidf_vectors)

    wordcloud_topic_model_summary(nmf, feature_names, 40, "breastcancer-%02d" % no_topics)

    res = show_topic_model_stat(nmf, tfidf_vectors, feature_names)
    # Column 0 holds the topic label; use it as the sheet's index.
    res.set_index(0).to_excel("breastcancer-%02d.xlsx" % no_topics)

