In [57]:
import warnings
warnings.filterwarnings('ignore')

import spacy
import pandas as pd
from tqdm.notebook import trange, tqdm

from scispacy.abbreviation import AbbreviationDetector

nlp = spacy.load("en_core_sci_sm")

# Add the abbreviation pipe to the spacy pipeline.
nlp.add_pipe("abbreviation_detector")

<scispacy.abbreviation.AbbreviationDetector at 0x7fae58989370>

In [58]:
from scispacy.linking import EntityLinker

nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})

<scispacy.linking.EntityLinker at 0x7fae589bcbe0>

In [59]:
doc = nlp("Spinal and bulbar muscular atrophy (SBMA) is an \
           inherited motor neuron disease caused by the expansion \
           of a polyglutamine tract within the androgen receptor (AR). \
           SBMA can be caused by this easily.")

# Let's look at a random entity!
entity = doc.ents[1]

print("Name: ", entity)

# Each entity is linked to UMLS with a score
# (currently just char-3gram matching).
linker = nlp.get_pipe("scispacy_linker")
for umls_ent in entity._.kb_ents:
    print(linker.kb.cui_to_entity[umls_ent[0]])

Name:  bulbar
CUI: C1947952, Name: anatomical bulb
Definition: A rounded dilation or expansion in a canal, vessel, or organ.
TUI(s): T017
Aliases: (total: 2): 
	 Bulbar, Bulb
CUI: C0032372, Name: Poliomyelitis, Bulbar
Definition: A form of paralytic poliomyelitis affecting neurons of the MEDULLA OBLONGATA of the brain stem. Clinical features include impaired respiration, HYPERTENSION, alterations of vasomotor control, and dysphagia. Weakness and atrophy of the limbs and trunk due to spinal cord involvement is usually associated. (From Adams et al., Principles of Neurology, 6th ed, p765)
TUI(s): T047
Aliases (abbreviated, total: 23): 
	 Acute bulbar polioencephalitis, Bulbar Polio, Poliomyelitis, Medullary Involvement, BULBAR POLIO, Acute paralytic poliomyelitis specified as bulbar, Polio, Bulbar, Bulbar Poliomyelitis, Anterior acute poliomyelitis, Acute infantile paralysis, Acute paralytic poliomyelitis, bulbar
CUI: C2586323, Name: Structure of fascial sheath of eyeball
Definition: She

In [4]:
f = open("topics/AAA.txt", "r")
output = f.read().replace('※','')
f.close()

In [5]:
doc = nlp(output)

In [6]:
for ent in doc.ents:
    print(ent._.kb_ents)


[('C0178866', 0.7777882218360901), ('C0016094', 0.7575173377990723), ('C0035848', 0.7249699831008911), ('C3830127', 0.7205051183700562)]
[('C0002940', 0.715607225894928)]
[('C1504464', 1.0)]
[('C0035647', 1.0), ('C4552904', 1.0), ('C3272281', 0.8170900344848633), ('C3538919', 0.8170900344848633), ('C5202762', 0.8170900344848633)]
[('C0162869', 0.9103761315345764), ('C0741160', 0.7873152494430542), ('C0265010', 0.7485358119010925)]
[('C0040223', 1.0), ('C1547403', 1.0), ('C1548318', 1.0), ('C3541383', 1.0), ('C1998882', 0.8618380427360535)]
[]
[('C0237881', 1.0), ('C0750502', 1.0), ('C1546944', 1.0), ('C0682323', 0.864962637424469), ('C3854148', 0.7805472612380981)]
[('C0035647', 1.0), ('C4552904', 1.0), ('C3272281', 0.8170900344848633), ('C3538919', 0.8170900344848633), ('C5202762', 0.8170900344848633)]
[('C1881712', 0.9999999403953552), ('C3203359', 0.9999999403953552), ('C0155760', 0.8360527157783508), ('C0018813', 0.822380781173706), ('C0151937', 0.8201714158058167)]
[('C0178866', 

In [7]:
entity = doc.ents[1]

print("Name: ", entity)

# Each entity is linked to UMLS with a score
# (currently just char-3gram matching).
linker = nlp.get_pipe("scispacy_linker")
for umls_ent in entity._.kb_ents:
    print(linker.kb.cui_to_entity[umls_ent[0]])

Name:  intubation
CUI: C0021925, Name: Intubation
Definition: Introduction of a tube into a hollow organ to restore or maintain patency if obstructed. It is differentiated from CATHETERIZATION in that the insertion of a catheter is usually performed for the introducing or withdrawing of fluids from the body.
TUI(s): T061
Aliases (abbreviated, total: 12): 
	 Intubation, intubations, intubation procedures, Intubation, NOS, Intubation - action (qualifier value), intubation procedure, Intubations, Intubation Procedure, Intubation - action, Intubation (procedure)
CUI: C0021932, Name: Intubation, Intratracheal
Definition: A procedure involving placement of a tube into the trachea through the mouth or nose in order to provide a patient with oxygen and anesthesia.
TUI(s): T061
Aliases (abbreviated, total: 21): 
	 Intubations, Intratracheal, Intratracheal Intubation, Insertion of endotracheal tube, Intubations, Endotracheal, Tracheal intubation, Intubation, Endotracheal, Insertion of endotrache

In [10]:
entity._.kb_ents

[('C0021925', 1.0), ('C0021932', 0.78619384765625)]

In [28]:
cui_set = set()
for entity in doc.ents:
    try:
        for cui in entity._.kb_ents:
            cui_set.add(cui[0])
    except:
        continue
print(cui_set)

34705', 'C1704435', 'C0155733', 'C4551669', 'C0005889', 'C0723712', 'C0796617', 'C0596164', 'C0001554', 'C4082936', 'C2985739', 'C2827486', 'C1559265', 'C3245481', 'C0023977', 'C4086674', 'C1171258', 'C3830526', 'C0023038', 'C1260873', 'C3274035', 'C1261322', 'C1518005', 'C1513770', 'C4687754', 'C1158765', 'C1704814', 'C0040053', 'C0024050', 'C0199230', 'C4319951', 'C1368999', 'C2735310', 'C1561559', 'C2936329', 'C0225851', 'C0700321', 'C3472245', 'C3899162', 'C0392148', 'C1704637', 'C4745009', 'C0037458', 'C1710459', 'C0019010', 'C0008307', 'C0751003', 'C0019345', 'C0021107', 'C1956346', 'C0021208', 'C0443318', 'C0282674', 'C5206952', 'C0040404', 'C4763881', 'C1883433', 'C0032790', 'C0151705', 'C0426747', 'C0042402', 'C1706180', 'C0034573', 'C0020649', 'C4745110', 'C5230958', 'C0277885', 'C4552820', 'C0220900', 'C1964018', 'C3827302', 'C0178916', 'C1762617', 'C1705556', 'C1527425', 'C1710524', 'C0025266', 'C0011382', 'C4721829', 'C2004284', 'C1882120', 'C1522704', 'C0038163', 'C402559

In [33]:
for cui in sorted(cui_set):
    cui_to_ent = linker.kb.cui_to_entity[cui]
    if 'T184' in cui_to_ent[3]:
        print(cui_to_ent[1])


Abdomen, Acute
Abdominal Pain
Back Pain
Eye Manifestations
Fever
Flank Pain
Low Back Pain
Pain
Pain, Postoperative
Rectal pain
Signs and Symptoms
Pain in testicle
Chronic pain
Acute onset pain
Malaise
Lower abdominal pain
Fetal Extension
back discomfort
Absence of sensation
Dull pain
gastrointestinal gas
Symptoms
Mobility Limitation
Discomfort
Discharge, body substance
Weakness
Medically Unexplained Symptoms


In [22]:
a = linker.kb.cui_to_entity[cui][3]

In [24]:
if 'T023' in a:
    print('yes')

yes


In [11]:
old_cuis = pd.read_csv('listOfCuis.csv')
old_cuis

Unnamed: 0,cui
0,C0332158
1,C0449381
2,C0439606
3,C0005767
4,C0007595
...,...
496,C0002766
497,C0056562
498,C0443131
499,C0589120


In [12]:
new_cuis = pd.DataFrame(cui_set, columns=['cui'])
new_cuis

Unnamed: 0,cui
0,C0162578
1,C0425358
2,C0444706
3,C2316467
4,C0036679
...,...
496,C0001122
497,C0020538
498,C0015801
499,C1514756


In [13]:
new_df = old_cuis.append(new_cuis, ignore_index=False).drop_duplicates(keep='first')
new_df.to_csv('listOfCuis.csv', index=False)

In [15]:
for cui in cui_set:
    print(linker.kb.cui_to_entity[cui])

ot, fat, and muscle as emboli. It has been used in the treatment of spinal cord and INTRACRANIAL ARTERIOVENOUS MALFORMATIONS, renal arteriovenous fistulas, gastrointestinal bleeding, epistaxis, hypersplenism, certain highly vascular tumors, traumatic rupture of blood vessels, and control of operative hemorrhage.
TUI(s): T061
Aliases (abbreviated, total: 24): 
	 Embolization Therapy, Embolization - action, Embolisation procedure, therapy, embolization, Embolotherapy, Therapeutic Embolizations, embolotherapy, Embolisation - action, Embolisation, embolization procedure
CUI: C0032659, Name: geographic population
Definition: The total number of individuals inhabiting a particular region or area.
TUI(s): T081
Aliases: (total: 5): 
	 populations, Population (social concept), Population, population, Populations
CUI: C0205396, Name: Identified
Definition: The procedure of having an identity established.
TUI(s): T080
Aliases: (total: 6): 
	 Identification, Identified, Discernible, identified, Id

In [18]:
f = open("topics/AAA.txt", "r")
output = f.read().replace('※','')
f.close()
doc = nlp(output)

In [19]:
cui_set = set()
for entity in doc.ents:
    try:
        cui_set.add(entity._.kb_ents[0][0])
    except:
        continue
print(cui_set)

{'C0162578', 'C0425358', 'C0444706', 'C2316467', 'C0036679', 'C0679688', 'C1554961', 'C0266835', 'C0034896', 'C0231220', 'C0205087', 'C0002921', 'C0184512', 'C0600210', 'C0205216', 'C0556984', 'C0003855', 'C0017181', 'C0029266', 'C0205283', 'C0205108', 'C0086805', 'C0031166', 'C0332197', 'C1279919', 'C0007820', 'C0003483', 'C4706466', 'C0020488', 'C0035273', 'C0934502', 'C0079809', 'C0456389', 'C0020283', 'C0220912', 'C3824793', 'C0547047', 'C0162871', 'C0018802', 'C0018943', 'C0444976', 'C0003860', 'C1517945', 'C0225317', 'C0003835', 'C0020792', 'C0302828', 'C0445204', 'C0220920', 'C0043251', 'C0443318', 'C0032854', 'C0032214', 'C0036492', 'C0020887', 'C0078988', 'C0332181', 'C0001811', 'C0332152', 'C0586909', 'C0549177', 'C5204628', 'C0003850', 'C0016199', 'C0019080', 'C0700321', 'C0239307', 'C0013227', 'C0728940', 'C4086268', 'C1504464', 'C0004793', 'C0037088', 'C0695347', 'C1762617', 'C0333205', 'C0175566', 'C0439658', 'C0003486', 'C0028754', 'C0005779', 'C1523987', 'C0003842', 'C0

In [22]:
old_cuis = pd.read_csv('listOfCuis.csv')
new_cuis = pd.DataFrame(cui_set, columns=['cui'])
new_df = old_cuis.append(new_cuis, ignore_index=False).drop_duplicates(keep='first')
new_df

Unnamed: 0,cui
0,C0332158
1,C0449381
2,C0439606
3,C0005767
4,C0007595
...,...
496,C0002766
497,C0056562
498,C0443131
499,C0589120


In [21]:
new_cuis

Unnamed: 0,cui
0,C0162578
1,C0425358
2,C0444706
3,C2316467
4,C0036679
...,...
496,C0001122
497,C0020538
498,C0015801
499,C1514756


In [None]:
df = pd.read_excel('corependium.xlsx')

In [None]:
df.head()

In [43]:
df = pd.read_csv('scraped2.csv')

In [44]:
df

Unnamed: 0,content,topic
0,Rapid Access,Abdominal Aortic Aneurysm
1,Approach to the Critical Patient,Abdominal Aortic Aneurysm
2,"When a ruptured AA is suspected, consider imme...",Abdominal Aortic Aneurysm
3,Intraperitoneal rupture will lead to intra-abd...,Abdominal Aortic Aneurysm
4,Retroperitoneal rupture may be difficult to id...,Abdominal Aortic Aneurysm
...,...,...
61964,An increased risk of infection has been used a...,Wound Management
61965,"In general, accepted indications for prophylac...",Wound Management
61966,Antibiotics for prophylaxis are similar to tho...,Wound Management
61967,Although IV antibiotics achieve therapeutic ti...,Wound Management


In [53]:
corependium_text = '\n'.join(df.content.astype(str))


In [85]:
symptom_cuis = []
for line in tqdm(df.content.astype(str).tolist()):
    doc = nlp(line)
    # print(doc.ents)
    for entity in doc.ents:
        try:
            # print(entity._.kb_ents)
            for cui in entity._.kb_ents:
                # print(cui)
                cui_to_ent = linker.kb.cui_to_entity[cui[0]]
                # print(cui_to_ent)
                if 'T184' in cui_to_ent[3]:
                    # print('success')
                    if cui[0] not in symptom_cuis:
                        symptom_cuis.append(cui[0])    
        except:
            continue
symptom_cuis
    

  0%|          | 0/61969 [00:00<?, ?it/s]

['C0000737',
 'C0232495',
 'C3839861',
 'C1457887',
 'C0150055',
 'C0030193',
 'C0184567',
 'C0037088',
 'C1565249',
 'C2926602',
 'C0015411',
 'C0004604',
 'C0024031',
 'C0000727',
 'C0278144',
 'C0235706',
 'C2364135',
 'C0016199',
 'C0039591',
 'C0034886',
 'C0278134',
 'C0233226',
 'C0015967',
 'C0231218',
 'C0596601',
 'C3714552',
 'C0030201',
 'C0233763',
 'C2985299',
 'C2985336',
 'C0037011',
 'C0027498',
 'C0013404',
 'C0016204',
 'C0002962',
 'C0039070',
 'C0008031',
 'C0008035',
 'C0030200',
 'C0877303',
 'C0476273',
 'C0242706',
 'C0745417',
 'C0029166',
 'C0037285',
 'C3665346',
 'C0221423',
 'C0521989',
 'C0232493',
 'C0239377',
 'C0027497',
 'C0012833',
 'C0220870',
 'C0015672',
 'C0700200',
 'C0023380',
 'C0596240',
 'C0235710',
 'C0085624',
 'C0234230',
 'C0241057',
 'C0013395',
 'C1456399',
 'C0458259',
 'C0232292',
 'C0028643',
 'C0278145',
 'C0236000',
 'C0232288',
 'C0042963',
 'C1321587',
 'C0008033',
 'C0027424',
 'C0010200',
 'C0231528',
 'C0231617',
 'C0001726',

In [88]:
symptom_df = pd.DataFrame(symptom_cuis, columns=['cui'])
symptom_df = symptom_df.sort_values(by=['cui'])
symptom_df

Unnamed: 0,cui
13,C0000727
310,C0000729
0,C0000737
76,C0001726
34,C0002962
...,...
443,C4551516
132,C4551520
180,C4552061
487,C4732758


In [None]:
symptom_df

In [98]:
def get_cui_name(cui):
    return linker.kb.cui_to_entity[cui][1]
get_cui_name('C0000737')

'Abdominal Pain'

In [100]:
symptom_df['symptom'] = symptom_df.cui.apply(get_cui_name)
symptom_df

Unnamed: 0,cui,symptom
13,C0000727,"Abdomen, Acute"
310,C0000729,Abdominal Cramps
0,C0000737,Abdominal Pain
76,C0001726,Affective Symptoms
34,C0002962,Angina Pectoris
...,...,...
443,C4551516,Hip pain
132,C4551520,Intention tremor
180,C4552061,Mandibular pain
487,C4732758,Gastrocnemius myalgia


In [101]:
symptom_df.to_csv('signs_or_symptoms.csv', index=False)

In [103]:
injury_or_poisoning_cuis = []
for line in tqdm(df.content.astype(str).tolist()):
    doc = nlp(line)
    # print(doc.ents)
    for entity in doc.ents:
        try:
            # print(entity._.kb_ents)
            for cui in entity._.kb_ents:
                # print(cui)
                cui_to_ent = linker.kb.cui_to_entity[cui[0]]
                # print(cui_to_ent)
                if 'T037' in cui_to_ent[3]:
                    # print('success')
                    if cui[0] not in injury_or_poisoning_cuis:
                        injury_or_poisoning_cuis.append(cui[0])    
        except:
            continue
injury_or_poisoning_df = pd.DataFrame(injury_or_poisoning_cuis, columns=['cui'])
injury_or_poisoning_df = injury_or_poisoning_df.sort_values(by=['cui'])
injury_or_poisoning_df['name'] = injury_or_poisoning_df.cui.apply(get_cui_name)
injury_or_poisoning_df.to_csv('injury_or_poisoning.csv', index=False)


  0%|          | 0/61969 [00:00<?, ?it/s]

In [104]:
disease_or_syndrome_cuis = []
for line in tqdm(df.content.astype(str).tolist()):
    doc = nlp(line)
    # print(doc.ents)
    for entity in doc.ents:
        try:
            # print(entity._.kb_ents)
            for cui in entity._.kb_ents:
                # print(cui)
                cui_to_ent = linker.kb.cui_to_entity[cui[0]]
                # print(cui_to_ent)
                if 'T047' in cui_to_ent[3]:
                    # print('success')
                    if cui[0] not in disease_or_syndrome_cuis:
                        disease_or_syndrome_cuis.append(cui[0])    
        except:
            continue
disease_or_syndrome_df = pd.DataFrame(disease_or_syndrome_cuis, columns=['cui'])
disease_or_syndrome_df = disease_or_syndrome_df.sort_values(by=['cui'])
disease_or_syndrome_df['name'] = disease_or_syndrome_df.cui.apply(get_cui_name)
disease_or_syndrome_df.to_csv('disease_or_syndrome.csv', index=False)

  0%|          | 0/61969 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
finding_cuis = []
for line in tqdm(df.content.astype(str).tolist()):
    doc = nlp(line)
    # print(doc.ents)
    for entity in doc.ents:
        try:
            # print(entity._.kb_ents)
            for cui in entity._.kb_ents:
                # print(cui)
                cui_to_ent = linker.kb.cui_to_entity[cui[0]]
                # print(cui_to_ent)
                if 'T033' in cui_to_ent[3]:
                    # print('success')
                    if cui[0] not in finding_cuis:
                        finding_cuis.append(cui[0])    
        except:
            continue
finding_df = pd.DataFrame(finding_cuis, columns=['cui'])
finding_df = finding_df.sort_values(by=['cui'])
finding_df['name'] = finding_df.cui.apply(get_cui_name)
finding_df.to_csv('finding_df.csv', index=False)

In [115]:
sem_types_df = pd.read_csv('semtypes.txt', sep='|', names=['abbr','TUI','name']).set_index('TUI')
sem_types_df

Unnamed: 0_level_0,abbr,name
TUI,Unnamed: 1_level_1,Unnamed: 2_level_1
T116,aapp,"Amino Acid, Peptide, or Protein"
T020,acab,Acquired Abnormality
T052,acty,Activity
T100,aggp,Age Group
T087,amas,Amino Acid Sequence
...,...,...
T079,tmco,Temporal Concept
T061,topp,Therapeutic or Preventive Procedure
T005,virs,Virus
T127,vita,Vitamin


In [120]:
sem_types_df.loc['T190']['name']

'Anatomical Abnormality'

In [121]:
sem_type_targets = ['T190', 'T017', 'T195', 'T123', 'T007', 'T031', 'T022', 'T053', 'T038', 'T029', 'T091', 'T122', 'T023', 'T030', 'T019', 'T201', 'T200', 'T060', 'T203', 'T047', 'T033', 'T169', 'T093', 'T058', 'T131', 'T125', 'T078', 'T129', 'T037', 'T170', 'T130', 'T059', 'T034', 'T074', 'T041', 'T048', 'T191', 'T090', 'T042', 'T046', 'T039', 'T121', 'T101', 'T098', 'T167', 'T054', 'T184', 'T079', 'T061', 'T005', 'T127']
for sem_type in tqdm(sem_type_targets):
    cuis = []
    for line in tqdm(df.content.astype(str).tolist()):
        doc = nlp(line)
        # print(doc.ents)
        for entity in doc.ents:
            try:
                # print(entity._.kb_ents)
                for cui in entity._.kb_ents:
                    # print(cui)
                    cui_to_ent = linker.kb.cui_to_entity[cui[0]]
                    # print(cui_to_ent)
                    if sem_type in cui_to_ent[3]:
                        # print('success')
                        if cui[0] not in cuis:
                            cuis.append(cui[0])    
            except:
                continue
    cuis_df = pd.DataFrame(cuis, columns=['cui'])
    cuis_df = cuis_df.sort_values(by=['cui'])
    cuis_df['name'] = cuis_df.cui.apply(get_cui_name)
    cuis_df.to_csv('cuis/'+sem_type+' - '+sem_types_df.loc[sem_type]['name']+'.csv', index=False)

  0%|          | 0/51 [00:00<?, ?it/s]

  0%|          | 0/61969 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [122]:
cuis_df = pd.DataFrame(cuis, columns=['cui'])
cuis_df = cuis_df.sort_values(by=['cui'])
cuis_df['name'] = cuis_df.cui.apply(get_cui_name)
cuis_df.to_csv('cuis/'+sem_type+' - '+sem_types_df.loc[sem_type]['name']+'.csv', index=False)

In [None]:
sem_type_targets = ['T190', 'T017', 'T195', 'T123', 'T007', 'T031', 'T022', 'T053', 'T038', 'T029', 'T091', 'T122', 'T023', 'T030', 'T019', 'T201', 'T200', 'T060', 'T203', 'T047', 'T033', 'T169', 'T093', 'T058', 'T131', 'T125', 'T078', 'T129', 'T037', 'T170', 'T130', 'T059', 'T034', 'T074', 'T041', 'T048', 'T191', 'T090', 'T042', 'T046', 'T039', 'T121', 'T101', 'T098', 'T167', 'T054', 'T184', 'T079', 'T061', 'T005', 'T127']
for sem_type in tqdm(sem_type_targets):
    cuis = []
    for line in tqdm(df.content.astype(str).tolist()):
        doc = nlp(line)
        # print(doc.ents)
        for entity in doc.ents:
            try:
                # print(entity._.kb_ents)
                for cui in entity._.kb_ents:
                    # print(cui)
                    cui_to_ent = linker.kb.cui_to_entity[cui[0]]
                    # print(cui_to_ent)
                    if cui_to_ent[3][0] in sem_type_targets:
                        # print('success')
                        cuis.append(cui[0])    
            except:
                continue
    cuis_df = pd.DataFrame(cuis, columns=['cui'])
    cuis_df = cuis_df.sort_values(by=['cui'])
    cuis_df['name'] = cuis_df.cui.apply(get_cui_name)
    cuis_df.to_csv('cuis/'+sem_type+' - '+sem_types_df.loc[sem_type]['name']+'.csv', index=False)