In [1]:
!pip install wordcloud
!pip install gensim
!pip install -U sentence-transformers



# Etapa 1: Modelagem de Sentenças

In [2]:
# Model identifier. It is interpolated into the output directory and into the
# embeddings input directory further down, so changing it switches both.
# Alternative identifiers are kept here, commented out:
# MODEL = 'bert-base-cased'
# MODEL = 'dmis-lab/biobert-base-cased-v1.1'
MODEL = 'fagner/envoy'

In [3]:
import os.path
from sentence_transformers import SentenceTransformer

DATASET = 'clicr'
DATASET_PATH = '../datasets/' + DATASET + '/clinicalcases.tsv'

# One list entry per line of the dataset file; readlines() keeps each line's
# trailing newline. The encoding is explicit so the result does not depend on
# the platform default (assumes the file is UTF-8 -- TODO confirm).
with open(DATASET_PATH, encoding='utf-8') as f:
    sentences = f.readlines()

# Results are written under output/MNT/<DATASET>/<MODEL>/. MODEL may itself
# contain '/', so the whole tree is created with a single makedirs call.
# The trailing '/' is kept because later cells append file names directly.
OUTPUT_PATH = 'output/MNT/' + DATASET + '/' + MODEL + '/'
os.makedirs(OUTPUT_PATH, exist_ok=True)

In [4]:
# model = SentenceTransformer(MODEL)
# embeddings = model.encode(sentences)

In [5]:
import pandas as pd

# Embeddings are read from a TSV export located by dataset and model name.
TSV_INPUT_PATH = f'../from_embeddings_to_tsv/output_from_2/{DATASET}/{MODEL}/'

embeddings_dataframe = pd.read_csv(
    f'{TSV_INPUT_PATH}word_embeddings.tsv',
    sep='\t',
    header=None,
)
embeddings_numpy = embeddings_dataframe.to_numpy()

# Every column except the last one is used as the embedding matrix.
# NOTE(review): the dropped column is presumably an export artefact -- confirm.
embeddings = embeddings_numpy[:, :-1]

In [6]:
# sentences_dataframe = pd.read_csv(FILE, sep='\t',header=None)
# sentences = sentences_dataframe.to_numpy()

In [None]:
from matplotlib import pyplot as plt 
from sklearn import manifold

# Project the embeddings to two dimensions for plotting. random_state pins the
# solver's random initialisation so that re-running the notebook reproduces
# the same layout.
X_2d = manifold.SpectralEmbedding(n_components=2, random_state=0).fit_transform(embeddings)

# Coordinates are kept as separate names because the cluster plot reuses them.
x = X_2d[:, 0]
y = X_2d[:, 1]

plt.figure(figsize=(8, 6))
plt.scatter(x, y, color='black', s=1)

# Optional: label every point with its sentence (very dense for large datasets).
# for i, sentence in enumerate(sentences):
#     plt.annotate(sentence, (x[i], y[i]), fontsize=15)

plt.savefig(OUTPUT_PATH + 'sentence-embeddings')

# Etapa 2: Agrupamento de Sentenças

In [None]:
# Cutoff passed to AgglomerativeClustering as `distance_threshold`; it is also
# written to the results log.
DISTANCE_THRESHOLD = 110

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Ward-linkage agglomerative clustering; the number of clusters is determined
# by DISTANCE_THRESHOLD rather than fixed up front (n_clusters=None).
clustering_model = AgglomerativeClustering(
    linkage='ward',
    distance_threshold=DISTANCE_THRESHOLD,
    n_clusters=None,
)
clustering_model = clustering_model.fit(embeddings)

n_clusters = clustering_model.n_clusters_

# One colour per point, looked up from its cluster label. The divisor used to
# be a fixed 10, which pushed every label >= 10 past the end of the colormap
# and painted all of those clusters the same colour. Scaling by the cluster
# count (never below 10, so runs with up to 10 clusters keep their colours)
# keeps every label inside the colormap range.
color_scale = max(n_clusters, 10)
colors = [plt.cm.nipy_spectral(label / color_scale) for label in clustering_model.labels_]

print('Limiar de distância: ', DISTANCE_THRESHOLD)
print('Agrupamentos resultantes',  n_clusters)

In [None]:
# Scatter the 2-D projection coloured by cluster. `colors` already holds one
# explicit RGBA value per point, so no colormap is passed: matplotlib ignores
# `cmap` (and warns) when no data is supplied for colour mapping.
plt.figure(figsize=(6, 4))

plt.scatter(x, y, color=colors, s=10)
plt.savefig(OUTPUT_PATH + 'clusters')

In [None]:
from scipy.cluster.hierarchy import dendrogram
import numpy as np

plt.clf()

# Build a SciPy-style linkage matrix from the fitted model:
# columns are [child_a, child_b, merge distance, samples under the merged node].
n_samples = len(clustering_model.labels_)
counts = np.zeros(clustering_model.children_.shape[0])
for merge_idx, pair in enumerate(clustering_model.children_):
    # Indices below n_samples are leaves (one sample each); higher indices
    # refer to an earlier merge, whose size has already been stored in counts.
    counts[merge_idx] = sum(
        1 if node < n_samples else counts[node - n_samples]
        for node in pair
    )

linkage_matrix = np.column_stack(
    (clustering_model.children_, clustering_model.distances_, counts)
).astype(float)

dendrogram(linkage_matrix, truncate_mode=None, p=5)

plt.savefig(OUTPUT_PATH + 'dendrograma')


# Etapa 3: Extração de Tópicos

In [None]:
# Value passed to TfidfVectorizer as `max_df` in the topic-extraction cell;
# it is also written to the results log.
MAX_DF = 0.95

In [None]:
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Build one pseudo-document per cluster: every sentence is appended (followed
# by a space) to the document of the cluster it was assigned to.
sentences_by_topic = [[] for _ in range(n_clusters)]
for i, sentence in enumerate(sentences):
    sentences_by_topic[clustering_model.labels_[i]].append(sentence)
topics = [''.join(sentence + ' ' for sentence in members) for members in sentences_by_topic]

tfidf_model = TfidfVectorizer(max_df=MAX_DF)

document_term_matrix = tfidf_model.fit_transform(topics)
terms = tfidf_model.get_feature_names_out()
terms_by_topic = tfidf_model.inverse_transform(document_term_matrix)
tfidf_matrix = document_term_matrix.toarray()

top_terms_by_topic = []
top_tfidfs_by_topic = []
wordclouds = []

for topic in range(len(topics)):
    weights = tfidf_matrix[topic]

    # Rank vocabulary indices by TF-IDF weight, highest first, so the leading
    # entries really are the topic's top terms (the coherence cell keeps only
    # the first `topn` of each list).
    ranking = np.argsort(weights)[::-1]

    # Look each term up by its vocabulary index. Indexing `terms` with the
    # topic number, as was done before, repeated one word for the whole list.
    top_terms_by_topic.append([terms[i] for i in ranking])
    top_tfidfs_by_topic.append([weights[i] for i in ranking])

    # Word cloud sized by this topic's TF-IDF weights (term -> weight).
    wordcloud = WordCloud(background_color="white", max_words=50).generate_from_frequencies(
        pd.Series(weights, index=terms)
    )
    wordclouds.append(wordcloud)

In [None]:
# Render and save one image per topic word cloud. Each cloud gets its own
# figure, which is closed afterwards, so images never stack on a shared
# figure and open figures do not accumulate.
for i, wc in enumerate(wordclouds):
    fig, ax = plt.subplots()
    ax.imshow(wc)
    ax.axis("off")
    fig.savefig(OUTPUT_PATH + 'wordcloud'+str(i))
    plt.show()
    plt.close(fig)

In [None]:
# Inspect the term list built for the first topic.
print(top_terms_by_topic[0])


In [None]:
# Draw and save one horizontal bar chart of term weights per topic.
print(top_terms_by_topic[0])
for i in range(len(top_terms_by_topic)):
    # A fresh figure per topic: with a single shared Axes the bars of all
    # topics piled up, invert_yaxis() flipped back and forth on every pass,
    # and after plt.show() the saved image could come from a different
    # (empty) current figure.
    fig, ax = plt.subplots()
    ax.barh(top_terms_by_topic[i], top_tfidfs_by_topic[i], align='center')
    ax.set_title(f"topic-distribution{i +1}", fontdict={"fontsize": 30})
    ax.invert_yaxis()  # first list entry is drawn at the top
    ax.tick_params(axis="both", which="major", labelsize=1)
    for j in "top right left".split():
        ax.spines[j].set_visible(True)

    # Save through the figure object before showing it, then release it.
    fig.savefig(OUTPUT_PATH + 'topic-distribution'+str(i))
    plt.show()
    plt.close(fig)

In [None]:
# Number of terms stored for the first topic.
len(top_terms_by_topic[0])

# Validação

In [None]:
# Passed to gensim's CoherenceModel as `topn`; it is also written to the
# results log.
TOP_WORDS = 80

In [None]:
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

# Build the gensim dictionary/corpus from the terms present in each topic
# document, then score the ranked term lists with c_v coherence.
dictionary = Dictionary(terms_by_topic)
corpus = [dictionary.doc2bow(t) for t in terms_by_topic]
texts = [[dictionary[word_id] for word_id, freq in c] for c in corpus]

cm = CoherenceModel(
    topics=top_terms_by_topic,
    texts=texts,
    corpus=corpus,
    dictionary=dictionary,
    coherence='c_v',
    topn=TOP_WORDS,
)

coherence = cm.get_coherence()
coherence_per_topic = cm.get_coherence_per_topic()

print('Coerencia: ', coherence)
print('Coerencia por Topico: ', coherence_per_topic)

# Append this run's parameters and scores to the results log.
report_lines = [
    'Limiar de distância: '+str(DISTANCE_THRESHOLD),
    'Qtdd de Tópicos: '+str(n_clusters),
    'Limiar TF-IDF: '+str(MAX_DF),
    'Coerência total: '+str(coherence),
    'Coerência por Tópicos: '+str(coherence_per_topic),
    'Top Words: '+str(TOP_WORDS),
    '',
    '----------------------------------------------------------------------------',
]

# os.path.join avoids the doubled '/' that plain concatenation produced, and
# the explicit encoding keeps the accented text independent of the platform
# default.
with open(os.path.join(OUTPUT_PATH, 'mntresults.txt'), "a", encoding='utf-8') as file:
    file.write('\n'.join(report_lines) + '\n')