In [2]:

from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
# import nltk
# nltk.download()

# Etapy pracy BERTopic
1. embedding_model - default 
                   - mozna podac stringiem z https://www.sbert.net/docs/pretrained_models.html
                   - albo zalaczyc swoj embedding_model  -   w fit_transform:  embeddings: Pre-trained document embeddings. These can be used
                        instead of the sentence-transformer model
                    - tak jak powyzej mozna tez zamiast embedingu uzyc  modeli statystycznych np. TF-IDF (  ctfidf_model)
2. Clustering - dimensionality reduction algorithms, default:UMAP
             -  clustering algorithm; default HDBSAN 

3. Topic reduction - manulal/auto : liczba topicow [nr_topics: numer/ 'auto' - wtedy zostawia sie to dla dbscanna]
                - po treningu - funkcja .reduce_topics()
4. Topic Representation - default CountVectorizer - atrybut w init: vectorizer_model
                        - mozna tez zmienic po treningu .update_topics()
                                    n_gram_range: The n-gram range for the CountVectorizer.
                                    vectorizer_model: Pass in your own CountVectorizer from scikit-learn
                                    ctfidf_model: Pass in your own c-TF-IDF model to update the representations

In [8]:
data = pd.read_csv("data/LitCovid.csv")
data = data.dropna(subset=['abstract'])
data.columns

Index(['pmid', 'journal', 'title', 'abstract', 'keywords', 'label', 'pub_type',
       'authors', 'date1', 'doi', 'date2', 'label_category'],
      dtype='object')

# Stop words removal

In [6]:
stop_words = set(stopwords.words('english'))

In [7]:
data['abstract_no_stop_words'] = data['abstract'].apply(lambda x: ' '.join([w for w in x.split(' ') if not w.lower() in stop_words]))

# Topic modelling

In [9]:
topic_model = BERTopic(nr_topics =8)
topics, probs = topic_model.fit_transform(list(data['abstract_no_stop_words']))

In [10]:
data['topic_detected'] = topic_model.topics_

In [11]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,25444,-1_covid19_patients_pandemic_sarscov2
1,0,967,0_sarscov2_detection_igg_samples
2,1,932,1_patients_covid19_injury_cardiac
3,2,879,2_ct_patients_covid19_chest
4,3,707,3_children_pediatric_covid19_disease
5,4,697,4_food_pandemic_health_covid19
6,5,658,5_model_epidemic_cases_covid19
7,6,636,6_anxiety_sleep_depression_covid19
8,7,580,7_women_pregnant_pregnancy_covid19


In [19]:
gr = data[data["topic_detected"]!=-1][['label','topic_detected']].groupby('topic_detected')

In [26]:
for g in gr.groups:
    print(f"Group nr {g} with len {len(gr.get_group(g))}")
    print(gr.get_group(g)['label'].value_counts().iloc[:2])

Group nr 0 with len 967
Diagnosis     711
Prevention     35
Name: label, dtype: int64
Group nr 1 with len 932
Treatment;Diagnosis    242
Case Report            153
Name: label, dtype: int64
Group nr 2 with len 879
Diagnosis      509
Case Report    122
Name: label, dtype: int64
Group nr 3 with len 707
Prevention     124
Case Report    114
Name: label, dtype: int64
Group nr 4 with len 697
Prevention      173
General Info      9
Name: label, dtype: int64
Group nr 5 with len 658
Prevention                         178
Prevention;Epidemic Forecasting    103
Name: label, dtype: int64
Group nr 6 with len 636
Prevention              131
Prevention;Diagnosis      7
Name: label, dtype: int64
Group nr 7 with len 580
Prevention     163
Case Report     81
Name: label, dtype: int64


In [85]:
for i in range(7):
    print([k for k,v in topic_model.get_topic(i)])

['sarscov2', 'detection', 'igg', 'samples', 'assay', 'testing', 'positive', 'sensitivity', 'test', 'antibodies']
['patients', 'covid19', 'injury', 'cardiac', 'liver', 'disease', 'myocardial', 'cardiovascular', 'mortality', 'acute']
['ct', 'patients', 'covid19', 'chest', 'pneumonia', 'lung', 'imaging', 'disease', 'images', 'findings']
['children', 'pediatric', 'covid19', 'disease', 'sarscov2', 'patients', 'infection', 'severe', 'cases', 'coronavirus']
['food', 'pandemic', 'health', 'covid19', 'economic', 'crisis', 'public', 'global', 'countries', 'policy']
['model', 'epidemic', 'cases', 'covid19', 'number', 'data', 'models', 'spread', 'china', 'countries']
['anxiety', 'sleep', 'depression', 'covid19', 'nurses', 'health', 'stress', 'psychological', 'mental', 'nursing']


# OWL

In [52]:
from owlready2 import *
onto = owlready2.get_ontology("data/go.owl")
onto.load()

get_ontology("http://purl.obolibrary.org/obo/go.owl#")

In [90]:
nmspc = pd.Series([None if len(cls.hasOBONamespace) == 0 else cls.hasOBONamespace[0] for cls in list(onto.classes())])

In [93]:
lbl = pd.Series([None if len(cls.label) == 0 else cls.label[0] for cls in list(onto.classes())])

In [94]:
lbl.value_counts()

obsolete cell wall inner membrane                                                                                                               2
obsolete elastin                                                                                                                                2
obsolete positive regulation of induction of conjugation with cellular fusion by regulation of transcription from RNA polymerase II promoter    2
obsolete small nucleolar RNA                                                                                                                    2
obsolete negative regulation of transcription from RNA polymerase II promoter involved in heart development                                     2
                                                                                                                                               ..
glycosinolate biosynthetic process                                                                                          

In [92]:
nmspc.value_counts()

biological_process    30485
molecular_function    12404
cellular_component     4464
dtype: int64

In [83]:
for cls in list(onto.classes())[:35]:
    #print(cls.get_class_properties())
    print(cls.hasOBONamespace,  cls.label, cls.IAO_0000115)

['biological_process'] ['mitochondrion inheritance'] ['The distribution of mitochondria, including the mitochondrial genome, into daughter cells after mitosis or meiosis, mediated by interactions between mitochondria and the cytoskeleton.']
['biological_process'] ['organelle inheritance'] ['The partitioning of organelles between daughter cells at cell division.']
['biological_process'] ['mitochondrion distribution'] ['Any process that establishes the spatial arrangement of mitochondria between and within cells.']
['biological_process'] ['mitochondrial genome maintenance'] ['The maintenance of the structure and integrity of the mitochondrial genome; includes replication and segregation of the mitochondrial chromosome.']
['biological_process'] ['mitochondrion organization'] ['A process that is carried out at the cellular level which results in the assembly, arrangement of constituent parts, or disassembly of a mitochondrion; includes mitochondrial morphogenesis and distribution, and repl

# Embedded Topic Model

In [None]:
#!pip install -U embedded_topic_model

Collecting embedded_topic_model
  Using cached embedded_topic_model-1.0.2-py3-none-any.whl (17 kB)
Collecting gensim==3.8.3
  Downloading gensim-3.8.3-cp38-cp38-win_amd64.whl (24.2 MB)
     --------------------------------------- 24.2/24.2 MB 10.4 MB/s eta 0:00:00
Collecting embedded_topic_model
  Using cached embedded_topic_model-1.0.1-py3-none-any.whl (17 kB)
  Using cached embedded_topic_model-1.0.0-py3-none-any.whl (17 kB)
  Using cached embedded_topic_model-0.1.1-py3-none-any.whl (17 kB)
  Using cached embedded_topic_model-0.1.0-py3-none-any.whl (17 kB)
Collecting numpy==1.20.0
  Downloading numpy-1.20.0-cp38-cp38-win_amd64.whl (13.7 MB)
     ---------------------------------------- 13.7/13.7 MB 6.9 MB/s eta 0:00:00

The conflict is caused by:
    embedded-topic-model 1.0.2 depends on torch==1.6.0
    embedded-topic-model 1.0.1 depends on torch==1.6.0
    embedded-topic-model 1.0.0 depends on torch==1.6.0
    embedded-topic-model 0.1.1 depends on torch==1.6.0
    embedded-topic-mo

ERROR: Cannot install embedded-topic-model==0.1.0, embedded-topic-model==0.1.1, embedded-topic-model==1.0.0, embedded-topic-model==1.0.1 and embedded-topic-model==1.0.2 because these package versions have conflicting dependencies.
ERROR: ResolutionImpossible: for help visit https://pip.pypa.io/en/latest/topics/dependency-resolution/#dealing-with-dependency-conflicts


In [13]:
from embedded_topic_model.utils import preprocessing
from embedded_topic_model.utils import embedding
import pandas as pd
from embedded_topic_model.models.etm import ETM

In [12]:
vocabulary, train_dataset, _, = preprocessing.create_etm_datasets(
    list(data['abstract']), 
    min_df=0.01, 
    max_df=0.75, 
    train_size=0.85, 
)

In [19]:
embeddings_mapping = embedding.create_word2vec_embedding_from_dataset(list(data['abstract']))

In [20]:
etm_instance = ETM(
    vocabulary,
    embeddings=embeddings_mapping, # You can pass here the path to a word2vec file or
                                   # a KeyedVectors instance
    num_topics=8,
    epochs=20,
    debug_mode=True,
    train_embeddings=False, # Optional. If True, ETM will learn word embeddings jointly with
                            # topic embeddings. By default, is False. If 'embeddings' argument
                            # is being passed, this argument must not be True
)

etm_instance.fit(train_dataset)

Topics before training: [['mellitus', '56', '69', 'reactions', 'male', 'diabetes', 'diarrhea', 'animal', 'hazard', 'software'], ['adapt', 'detect', 'weight', 'molecules', 'ranging', 'spectrum', 'pneumonia', 'pandemics', 'shock', 'protease'], ['78', 'make', 'late', 'versus', 'normal', 'nine', '73', '87', 'corona', 'delayed'], ['mg', 'lopinavir', 'confirmed', 'strongly', 'problem', 'domain', 'dependent', 'pro', 'taking', 'south'], ['spike', 'domain', 'droplets', 'male', 'random', 'compliance', 'admission', 'receptor', 'wearing', 'genetic'], ['receptors', '95', 'interleukin', 'species', 'reactive', 'expressed', 'likelihood', 'he', 'cardiac', 'confidence'], ['free', 'severely', 'energy', 'themselves', '31', 'lead', 'airway', 'allocation', 'explore', 'angiotensin'], ['self', 'hazard', 'above', 'imposed', 'discharged', 'themselves', 'mellitus', 'linear', 'alpha', 'confidence']]
Epoch 1 - Learning Rate: 0.005 - KL theta: 0.27 - Rec loss: 650.14 - NELBO: 650.41
Epoch 2 - Learning Rate: 0.005 -

<embedded_topic_model.models.etm.ETM at 0x24b743f13c8>

In [21]:
topics = etm_instance.get_topics(3)

In [22]:
topics

[['patients', 'were', 'was'],
 ['is', 'that', 'coronavirus'],
 ['care', 'health', 'pandemic'],
 ['that', 'is', 'this'],
 ['cases', 'from', 'was'],
 ['were', 'during', 'was'],
 ['on', 'as', 'not'],
 ['was', 'be', 'is']]