In [1]:
!pip install wordcloud
!pip install gensim
!pip install -U sentence-transformers



# Etapa 1: Modelagem de Sentenças

In [2]:
# MODEL = 'bert-base-cased'
# MODEL = 'dmis-lab/biobert-base-cased-v1.1'
MODEL = 'fagner/envoy'

In [3]:
import os.path
from sentence_transformers import SentenceTransformer

DATASET = 'clicr'
DATASET_PATH = '../datasets/'+DATASET+'/clinicalcases.tsv'

sentences = []
with open(DATASET_PATH) as f:
    sentences = f.readlines()

OUTPUT_PATH = 'output/'
if not os.path.exists(OUTPUT_PATH): os.mkdir(OUTPUT_PATH)
OUTPUT_PATH += 'MNT/'
if not os.path.exists(OUTPUT_PATH): os.mkdir(OUTPUT_PATH)
OUTPUT_PATH += DATASET + '/'
if not os.path.exists(OUTPUT_PATH): os.mkdir(OUTPUT_PATH)
OUTPUT_PATH += MODEL + '/'
if not os.path.exists(OUTPUT_PATH): os.makedirs(OUTPUT_PATH, exist_ok=True)

In [4]:
# model = SentenceTransformer(MODEL)
# embeddings = model.encode(sentences)

In [5]:
import pandas as pd

TSV_INPUT_PATH = '../from_embeddings_to_tsv/output_from_2/' + DATASET + '/' + MODEL + '/'

embeddings_dataframe = pd.read_csv(TSV_INPUT_PATH + 'word_embeddings.tsv', sep='\t',header=None)
embeddings_numpy = embeddings_dataframe.to_numpy()

embeddings = embeddings_numpy[:, :-1]

In [6]:
# sentences_dataframe = pd.read_csv(FILE, sep='\t',header=None)
# sentences = sentences_dataframe.to_numpy()

# Etapa 2: Agrupamento de Sentenças

In [15]:
DISTANCE_THRESHOLD = 54

In [16]:
from sklearn.cluster import AgglomerativeClustering

clustering_model = AgglomerativeClustering(linkage='ward', distance_threshold=DISTANCE_THRESHOLD, n_clusters=None)
clustering_model = clustering_model.fit(embeddings)

n_clusters = clustering_model.n_clusters_

print("Clusters: " ,n_clusters)

Clusters:  7


# Etapa 3: Extração de Tópicos

In [17]:
MAX_DF = 0.95

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

topics = [''] * n_clusters
for i in range(len(sentences)):
    topic_index = clustering_model.labels_[i]
    topics[topic_index] += sentences[i] + " "
    
tfidf_model = TfidfVectorizer(max_df=MAX_DF)

document_term_matrix = tfidf_model.fit_transform(topics)
terms = tfidf_model.get_feature_names_out()
terms_by_topic = tfidf_model.inverse_transform(document_term_matrix)
tfidf_matrix = document_term_matrix.toarray()

top_terms_by_topic = []
top_tfidfs_by_topic = []

for topic in range(len(topics)):
    sorted_tfidf = np.argsort(tfidf_matrix[topic])

    top_tfidf_indexes = sorted_tfidf[:]

    top_terms,top_tfidfs = [],[]
    for i in sorted_tfidf:
        top_terms.append(terms[i])
        top_tfidfs.append(tfidf_matrix[topic][i])
       
    top_terms_by_topic.append(top_terms)
    top_tfidfs_by_topic.append(top_tfidfs)

# Validação

In [19]:
TOP_WORDS = 80

In [20]:
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

dictionary = Dictionary(terms_by_topic)
corpus = [dictionary.doc2bow(t) for t in terms_by_topic]
texts = [[dictionary[word_id] for word_id, freq in c] for c in corpus]

cm = CoherenceModel(topics=top_terms_by_topic, texts=texts, corpus=corpus, dictionary=dictionary, coherence='c_v',topn=TOP_WORDS)

coherence = cm.get_coherence()
coherence_per_topic = cm.get_coherence_per_topic()

print('Coerencia: ', coherence)
print('Coerencia por Topico: ', coherence_per_topic)    

with open(OUTPUT_PATH + '/mntresults.txt', "a") as file:
    print('Limiar de distância: '+str(DISTANCE_THRESHOLD), file=file)   
    print('Qtdd de Tópicos: '+str(n_clusters), file=file)   
    print('Limiar TF-IDF: '+str(MAX_DF), file=file)  
    print('Coerência total: '+str(coherence), file=file)  
    print('Coerência por Tópicos: '+str(coherence_per_topic), file=file)  
    print('Top Words: '+str(TOP_WORDS), file=file)  

    print('----------------------------------------------------------------------------', file=file)  

Coerencia:  0.41735399374197124
Coerencia por Topico:  [0.6083477707327855, 0.3306320050354231, 0.5223900401253625, 0.31982931582884955, 0.36884993640581226, 0.3661440630199128, 0.4052848250456533]
