In [None]:
!pip install -U spacy
!pip install -U scispacy
!pip install -U pandas
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_lg-0.5.0.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bc5cdr_md-0.5.0.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bionlp13cg_md-0.5.0.tar.gz

## Import libraries

In [1]:
import scispacy
import spacy
import pandas as pd
import regex as re
from spacy.pipeline import EntityRuler

In [2]:
from spacy.tokens.doc import Doc
from spacy.tokens.token import Token
from spacy.tokens.span import Span
from spacy.lang.en import English

In [3]:
GENERAL_PATH = "../DATASET"
DATA_PATH = f"{GENERAL_PATH}/data"

### Source of sciSpaCy model : https://allenai.github.io/scispacy/

In [4]:
nlp_sci = spacy.load('en_core_sci_lg')
nlp_bc5cdr = spacy.load('en_ner_bc5cdr_md')
nlp_bionlp13cg = spacy.load('en_ner_bionlp13cg_md')

In [5]:
papers = pd.read_csv(f'{DATA_PATH}/papers_raw/final_papers.csv')
papers.head(5)

Unnamed: 0,PMID,Title,Abstract
0,34293224,Rapid onset of functional tic-like behaviours ...,Clinicians have reported an increase in functi...
1,32848902,Cannabis Improves Obsessive-Compulsive Disorde...,Although several lines of evidence support the...
2,33942911,Is Persistent Motor or Vocal Tic Disorder a Mi...,Persistent motor or vocal tic disorder (PMVT) ...
3,32980398,Rage attacks in Tourette Syndrome and Chronic ...,Tourette syndrome (TS) and chronic motor/vocal...
4,32170503,Pharmacotherapy for tics in adult patients wit...,Tourette syndrome (TS) and persistent motor/vo...


## Get the entities from each model to find out which is the most suitable

In [6]:
example_text = papers['Abstract'][1]
example_text

'Although several lines of evidence support the hypothesis of a dysregulation of serotoninergic neurotransmission in the pathophysiology of obsessive-compulsive disorder (OCD), there is also evidence for an involvement of other pathways such as the GABAergic, glutamatergic, and dopaminergic systems. Only recently, data obtained from a small number of animal studies alternatively suggested an involvement of the endocannabinoid system in the pathophysiology of OCD reporting beneficial effects in OCD-like behavior after use of substances that stimulate the endocannabinoid system. In humans, until today, only two case reports are available reporting successful treatment with dronabinol (tetrahydrocannabinol, THC), an agonist at central cannabinoid CB1 receptors, in patients with otherwise treatment refractory OCD. In addition, data obtained from a small open uncontrolled trial using the THC analogue nabilone suggest that the combination of nabilone plus exposure-based psychotherapy is more

### EN_SCI_LG

In [7]:
doc = nlp_sci(example_text)
spacy.displacy.render(doc, style='ent', jupyter=True)

### BC5_CDR

In [8]:
doc = nlp_bc5cdr(example_text)
spacy.displacy.render(doc, style='ent', jupyter=True)

### Bio_13CG

In [9]:
doc = nlp_bionlp13cg(example_text)
spacy.displacy.render(doc, style='ent', jupyter=True)

## Final choice: BC5CDR NLP

-----------

## Custom the bc5cdr_md model

### Get New Entities

In [11]:
with open(f'{GENERAL_PATH}/results/entities/nutrition_entities.txt', 'r') as f:
    nutritions = f.readlines()

with open(f'{GENERAL_PATH}/results//entities/mental_health_entities.txt', 'r') as f:
    mental_healths = f.readlines()

with open(f'{GENERAL_PATH}/results//entities/chebi_entities.txt', 'r') as f:
    chebis = f.readlines()

print('There are', len(nutritions), 'NUTRITION entities.')
print('There are', len(mental_healths), 'MENTAL_HEALTH entities.')
print('There are', len(chebis), 'CHEBI entities.')

There are 2921 NUTRITION entities.
There are 676 MENTAL_HEALTH entities.
There are 1480 CHEBI entities.


### Add New Entities to model

In [12]:
def create_pattern(label:str, word:str):
    """
    Create entity from a given label and a given keyword
    """
    tokens = word.split()
    lowers = []
    for i in range(len(tokens)):
        lower = {'LOWER': tokens[i].lower()}
        lowers.append(lower)
    pattern = [{'label': label, 'pattern': lowers}]
    return pattern

In [5]:
# Call bc5cdr_md model as base model
nlp_final = spacy.load('en_ner_bc5cdr_md')

# Define a ruler
ruler = nlp_final.add_pipe('entity_ruler', before='ner')

# Create patterns
for nutrition in nutritions:
    pattern = create_pattern('NUTRITION', nutrition.replace('\n', ''))
    ruler.add_patterns(pattern)
    if nutrition.replace('\n', '').replace('-', ' ') != nutrition.replace('\n', ''):
        pattern = create_pattern('NUTRITION', nutrition.replace('\n', '').replace('-', ' '))
        ruler.add_patterns(pattern)

for chebi in chebis:
    pattern = create_pattern('BIOCHEMICAL', chebi.replace('\n', ''))
    ruler.add_patterns(pattern)
    if chebi.replace('\n', '').replace('-', ' ') != chebi.replace('\n', ''):
        pattern = create_pattern('BIOCHEMICAL', chebi.replace('\n', '').replace('-', ' '))
        ruler.add_patterns(pattern)

for mental in mental_healths:
    pattern = create_pattern('MENTAL_HEALTH', mental.replace('\n', ''))
    ruler.add_patterns(pattern)
    if mental.replace('\n', '').replace('-', ' ') != mental.replace('\n', ''):
        pattern = create_pattern('MENTAL_HEALTH', mental.replace('\n', '').replace('-', ' '))
        ruler.add_patterns(pattern)

In [6]:
nlp_final.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x2aa8974f130>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x2aa89771810>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x2aa8973f1c0>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x2aa8977aac0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x2aa895ab8e0>),
 ('entity_ruler', <spacy.pipeline.entityruler.EntityRuler at 0x2aa89503440>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x2aa895ab5e0>)]

### Save the model to reuse in the next steps

In [7]:
nlp_final.to_disk(f'{GENERAL_PATH}/results/model/en_gena_sm')

--------------------------

## Filtering sentences

In [3]:
papers = pd.read_csv('final_papers_3.csv')
papers.head(5)

Unnamed: 0,PMID,Title,Abstract
0,28470822,Pericyte-derived bone morphogenetic protein 4 ...,Subcortical small vessel disease (SVD) is char...
1,31792039,The risk of malnutrition in children with auti...,A 9-year-old boy presented with a 2-day histor...
2,31019473,"Neurological, Psychiatric, and Biochemical Asp...",Thiamine (vitamin B1) is an essential nutrient...
3,21453474,Prenatal exposure of a girl with autism spectr...,Autism is a complex neurodevelopmental disorde...
4,34990378,[Clinical characteristics and treatment of tre...,Depression represents the predominant mood pol...


### Call the model

In [14]:
gena_nlp = spacy.load(f'{GENERAL_PATH}/results/model/en_gena_sm')
test_sentence = "Sex-hormone have role in sexual desire disorder"
doc = gena_nlp(test_sentence)
for ent in doc.ents:
    print(ent, '\t', ent.label_)

Sex-hormone 	 CHEMICAL
sexual desire disorder 	 MENTAL_HEALTH


In [15]:
origin_nlp = spacy.load('en_ner_bc5cdr_md')
test_sentence = "Sex-hormone have role in sexual desire disorder"
doc = origin_nlp(test_sentence)
for ent in doc.ents:
    print(ent, '\t', ent.label_)

Sex-hormone 	 CHEMICAL
sexual desire disorder 	 DISEASE


### Find out which sentence contains 2 types of entities Health (mental_health, disease) and Nutrition (chemical, nutrition)

In [16]:
def contain_entities(sent:Doc, entities_1:list=["MENTAL_HEALTH", "DISEASE"], entities_2:list=["CHEMICAL", "NUTRITION", "BIOCHEMICAL"])->bool:
    """
    Check if a sentence is containing entities in first entity list and the second one
    """
    contain_1 = False
    contain_2 = False
    for ent in sent.ents:
        if ent.label_ in entities_1:
            contain_1 = True
        if ent.label_ in entities_2:
            contain_2 = True
        if contain_1 and contain_2:
            return True
    return False

In [17]:
for i in range(30):
    example_text = papers['Abstract'][i]
    doc = gena_nlp(example_text.replace('-', ' '))
    print(i,')')
    for sent in doc.sents:
        text = gena_nlp(sent.text)
        if contain_entities(text):
            spacy.displacy.render(text, style='ent', jupyter=True)

0 )
1 )


2 )
3 )


4 )
5 )
6 )
7 )
8 )


9 )


10 )
11 )


12 )
13 )
14 )
15 )


16 )
17 )


18 )


19 )
20 )


21 )


22 )


23 )
24 )
25 )


26 )


27 )


28 )


29 )


### Implement for the whole papers

In [7]:
sentences = []
for idx, row in papers.iterrows():
    text = row['Abstract']
    doc = gena_nlp(text)
    print("Abstract", idx, "...")
    for sent in doc.sents:
        text = gena_nlp(sent.text)
        if contain_entities(text):
            sentences.append((row['PMID'], sent.text))

Abstract 0 ...
Abstract 1 ...
Abstract 2 ...
Abstract 3 ...
Abstract 4 ...
Abstract 5 ...
Abstract 6 ...
Abstract 7 ...
Abstract 8 ...
Abstract 9 ...
Abstract 10 ...
Abstract 11 ...
Abstract 12 ...
Abstract 13 ...
Abstract 14 ...
Abstract 15 ...
Abstract 16 ...
Abstract 17 ...
Abstract 18 ...
Abstract 19 ...
Abstract 20 ...
Abstract 21 ...
Abstract 22 ...
Abstract 23 ...
Abstract 24 ...
Abstract 25 ...
Abstract 26 ...
Abstract 27 ...
Abstract 28 ...
Abstract 29 ...
Abstract 30 ...
Abstract 31 ...
Abstract 32 ...
Abstract 33 ...
Abstract 34 ...
Abstract 35 ...
Abstract 36 ...
Abstract 37 ...
Abstract 38 ...
Abstract 39 ...
Abstract 40 ...
Abstract 41 ...
Abstract 42 ...
Abstract 43 ...
Abstract 44 ...
Abstract 45 ...
Abstract 46 ...
Abstract 47 ...
Abstract 48 ...
Abstract 49 ...
Abstract 50 ...
Abstract 51 ...
Abstract 52 ...
Abstract 53 ...
Abstract 54 ...
Abstract 55 ...
Abstract 56 ...
Abstract 57 ...
Abstract 58 ...
Abstract 59 ...
Abstract 60 ...
Abstract 61 ...
Abstract 62 ...
Ab

Abstract 489 ...
Abstract 490 ...
Abstract 491 ...
Abstract 492 ...
Abstract 493 ...
Abstract 494 ...
Abstract 495 ...
Abstract 496 ...
Abstract 497 ...
Abstract 498 ...
Abstract 499 ...
Abstract 500 ...
Abstract 501 ...
Abstract 502 ...
Abstract 503 ...
Abstract 504 ...
Abstract 505 ...
Abstract 506 ...
Abstract 507 ...
Abstract 508 ...
Abstract 509 ...
Abstract 510 ...
Abstract 511 ...
Abstract 512 ...
Abstract 513 ...
Abstract 514 ...
Abstract 515 ...
Abstract 516 ...
Abstract 517 ...
Abstract 518 ...
Abstract 519 ...
Abstract 520 ...
Abstract 521 ...
Abstract 522 ...
Abstract 523 ...
Abstract 524 ...
Abstract 525 ...
Abstract 526 ...
Abstract 527 ...
Abstract 528 ...
Abstract 529 ...
Abstract 530 ...
Abstract 531 ...
Abstract 532 ...
Abstract 533 ...
Abstract 534 ...
Abstract 535 ...
Abstract 536 ...
Abstract 537 ...
Abstract 538 ...
Abstract 539 ...
Abstract 540 ...
Abstract 541 ...
Abstract 542 ...
Abstract 543 ...
Abstract 544 ...
Abstract 545 ...
Abstract 546 ...
Abstract 547 .

Abstract 972 ...
Abstract 973 ...
Abstract 974 ...
Abstract 975 ...
Abstract 976 ...
Abstract 977 ...
Abstract 978 ...
Abstract 979 ...
Abstract 980 ...
Abstract 981 ...
Abstract 982 ...
Abstract 983 ...
Abstract 984 ...
Abstract 985 ...
Abstract 986 ...
Abstract 987 ...
Abstract 988 ...
Abstract 989 ...
Abstract 990 ...
Abstract 991 ...
Abstract 992 ...
Abstract 993 ...
Abstract 994 ...
Abstract 995 ...
Abstract 996 ...
Abstract 997 ...
Abstract 998 ...
Abstract 999 ...
Abstract 1000 ...
Abstract 1001 ...
Abstract 1002 ...
Abstract 1003 ...
Abstract 1004 ...
Abstract 1005 ...
Abstract 1006 ...
Abstract 1007 ...
Abstract 1008 ...
Abstract 1009 ...
Abstract 1010 ...
Abstract 1011 ...
Abstract 1012 ...
Abstract 1013 ...
Abstract 1014 ...
Abstract 1015 ...
Abstract 1016 ...
Abstract 1017 ...
Abstract 1018 ...
Abstract 1019 ...
Abstract 1020 ...
Abstract 1021 ...
Abstract 1022 ...
Abstract 1023 ...
Abstract 1024 ...
Abstract 1025 ...
Abstract 1026 ...
Abstract 1027 ...
Abstract 1028 ...
Ab

Abstract 1429 ...
Abstract 1430 ...
Abstract 1431 ...
Abstract 1432 ...
Abstract 1433 ...
Abstract 1434 ...
Abstract 1435 ...
Abstract 1436 ...
Abstract 1437 ...
Abstract 1438 ...
Abstract 1439 ...
Abstract 1440 ...
Abstract 1441 ...
Abstract 1442 ...
Abstract 1443 ...
Abstract 1444 ...
Abstract 1445 ...
Abstract 1446 ...
Abstract 1447 ...
Abstract 1448 ...
Abstract 1449 ...
Abstract 1450 ...
Abstract 1451 ...
Abstract 1452 ...
Abstract 1453 ...
Abstract 1454 ...
Abstract 1455 ...
Abstract 1456 ...
Abstract 1457 ...
Abstract 1458 ...
Abstract 1459 ...
Abstract 1460 ...
Abstract 1461 ...
Abstract 1462 ...
Abstract 1463 ...
Abstract 1464 ...
Abstract 1465 ...
Abstract 1466 ...
Abstract 1467 ...
Abstract 1468 ...
Abstract 1469 ...
Abstract 1470 ...
Abstract 1471 ...
Abstract 1472 ...
Abstract 1473 ...
Abstract 1474 ...
Abstract 1475 ...
Abstract 1476 ...
Abstract 1477 ...
Abstract 1478 ...
Abstract 1479 ...
Abstract 1480 ...
Abstract 1481 ...
Abstract 1482 ...
Abstract 1483 ...
Abstract 1

Abstract 1885 ...
Abstract 1886 ...
Abstract 1887 ...
Abstract 1888 ...
Abstract 1889 ...
Abstract 1890 ...
Abstract 1891 ...
Abstract 1892 ...
Abstract 1893 ...
Abstract 1894 ...
Abstract 1895 ...
Abstract 1896 ...
Abstract 1897 ...
Abstract 1898 ...
Abstract 1899 ...
Abstract 1900 ...
Abstract 1901 ...
Abstract 1902 ...
Abstract 1903 ...
Abstract 1904 ...
Abstract 1905 ...
Abstract 1906 ...
Abstract 1907 ...
Abstract 1908 ...
Abstract 1909 ...
Abstract 1910 ...
Abstract 1911 ...
Abstract 1912 ...
Abstract 1913 ...
Abstract 1914 ...
Abstract 1915 ...
Abstract 1916 ...
Abstract 1917 ...
Abstract 1918 ...
Abstract 1919 ...
Abstract 1920 ...
Abstract 1921 ...
Abstract 1922 ...
Abstract 1923 ...
Abstract 1924 ...
Abstract 1925 ...
Abstract 1926 ...
Abstract 1927 ...
Abstract 1928 ...
Abstract 1929 ...
Abstract 1930 ...
Abstract 1931 ...
Abstract 1932 ...
Abstract 1933 ...
Abstract 1934 ...
Abstract 1935 ...
Abstract 1936 ...
Abstract 1937 ...
Abstract 1938 ...
Abstract 1939 ...
Abstract 1

Abstract 2342 ...
Abstract 2343 ...
Abstract 2344 ...
Abstract 2345 ...
Abstract 2346 ...
Abstract 2347 ...
Abstract 2348 ...
Abstract 2349 ...
Abstract 2350 ...
Abstract 2351 ...
Abstract 2352 ...
Abstract 2353 ...
Abstract 2354 ...
Abstract 2355 ...
Abstract 2356 ...
Abstract 2357 ...
Abstract 2358 ...
Abstract 2359 ...
Abstract 2360 ...
Abstract 2361 ...
Abstract 2362 ...
Abstract 2363 ...
Abstract 2364 ...
Abstract 2365 ...
Abstract 2366 ...
Abstract 2367 ...
Abstract 2368 ...
Abstract 2369 ...
Abstract 2370 ...
Abstract 2371 ...
Abstract 2372 ...
Abstract 2373 ...
Abstract 2374 ...
Abstract 2375 ...
Abstract 2376 ...
Abstract 2377 ...
Abstract 2378 ...
Abstract 2379 ...
Abstract 2380 ...
Abstract 2381 ...
Abstract 2382 ...
Abstract 2383 ...
Abstract 2384 ...
Abstract 2385 ...
Abstract 2386 ...
Abstract 2387 ...
Abstract 2388 ...
Abstract 2389 ...
Abstract 2390 ...
Abstract 2391 ...
Abstract 2392 ...
Abstract 2393 ...
Abstract 2394 ...
Abstract 2395 ...
Abstract 2396 ...
Abstract 2

Abstract 2798 ...
Abstract 2799 ...
Abstract 2800 ...
Abstract 2801 ...
Abstract 2802 ...
Abstract 2803 ...
Abstract 2804 ...
Abstract 2805 ...
Abstract 2806 ...
Abstract 2807 ...
Abstract 2808 ...
Abstract 2809 ...
Abstract 2810 ...
Abstract 2811 ...
Abstract 2812 ...
Abstract 2813 ...
Abstract 2814 ...
Abstract 2815 ...
Abstract 2816 ...
Abstract 2817 ...
Abstract 2818 ...
Abstract 2819 ...
Abstract 2820 ...
Abstract 2821 ...
Abstract 2822 ...
Abstract 2823 ...
Abstract 2824 ...
Abstract 2825 ...
Abstract 2826 ...
Abstract 2827 ...
Abstract 2828 ...
Abstract 2829 ...
Abstract 2830 ...
Abstract 2831 ...
Abstract 2832 ...
Abstract 2833 ...
Abstract 2834 ...
Abstract 2835 ...
Abstract 2836 ...
Abstract 2837 ...
Abstract 2838 ...
Abstract 2839 ...
Abstract 2840 ...
Abstract 2841 ...
Abstract 2842 ...
Abstract 2843 ...
Abstract 2844 ...
Abstract 2845 ...
Abstract 2846 ...
Abstract 2847 ...
Abstract 2848 ...
Abstract 2849 ...
Abstract 2850 ...
Abstract 2851 ...
Abstract 2852 ...
Abstract 2

Abstract 3254 ...
Abstract 3255 ...
Abstract 3256 ...
Abstract 3257 ...
Abstract 3258 ...
Abstract 3259 ...
Abstract 3260 ...
Abstract 3261 ...
Abstract 3262 ...
Abstract 3263 ...
Abstract 3264 ...
Abstract 3265 ...
Abstract 3266 ...
Abstract 3267 ...
Abstract 3268 ...
Abstract 3269 ...
Abstract 3270 ...
Abstract 3271 ...
Abstract 3272 ...
Abstract 3273 ...
Abstract 3274 ...
Abstract 3275 ...
Abstract 3276 ...
Abstract 3277 ...
Abstract 3278 ...
Abstract 3279 ...
Abstract 3280 ...
Abstract 3281 ...
Abstract 3282 ...
Abstract 3283 ...
Abstract 3284 ...
Abstract 3285 ...
Abstract 3286 ...
Abstract 3287 ...
Abstract 3288 ...
Abstract 3289 ...
Abstract 3290 ...
Abstract 3291 ...
Abstract 3292 ...
Abstract 3293 ...
Abstract 3294 ...
Abstract 3295 ...
Abstract 3296 ...
Abstract 3297 ...
Abstract 3298 ...
Abstract 3299 ...
Abstract 3300 ...
Abstract 3301 ...
Abstract 3302 ...
Abstract 3303 ...
Abstract 3304 ...
Abstract 3305 ...
Abstract 3306 ...
Abstract 3307 ...
Abstract 3308 ...
Abstract 3

Abstract 3710 ...
Abstract 3711 ...
Abstract 3712 ...
Abstract 3713 ...
Abstract 3714 ...
Abstract 3715 ...
Abstract 3716 ...
Abstract 3717 ...
Abstract 3718 ...
Abstract 3719 ...
Abstract 3720 ...
Abstract 3721 ...
Abstract 3722 ...
Abstract 3723 ...
Abstract 3724 ...
Abstract 3725 ...
Abstract 3726 ...
Abstract 3727 ...
Abstract 3728 ...
Abstract 3729 ...
Abstract 3730 ...
Abstract 3731 ...
Abstract 3732 ...
Abstract 3733 ...
Abstract 3734 ...
Abstract 3735 ...
Abstract 3736 ...
Abstract 3737 ...
Abstract 3738 ...
Abstract 3739 ...
Abstract 3740 ...
Abstract 3741 ...
Abstract 3742 ...
Abstract 3743 ...
Abstract 3744 ...
Abstract 3745 ...
Abstract 3746 ...
Abstract 3747 ...
Abstract 3748 ...
Abstract 3749 ...
Abstract 3750 ...
Abstract 3751 ...
Abstract 3752 ...
Abstract 3753 ...
Abstract 3754 ...
Abstract 3755 ...
Abstract 3756 ...
Abstract 3757 ...
Abstract 3758 ...
Abstract 3759 ...
Abstract 3760 ...
Abstract 3761 ...
Abstract 3762 ...
Abstract 3763 ...
Abstract 3764 ...
Abstract 3

Abstract 4166 ...
Abstract 4167 ...
Abstract 4168 ...
Abstract 4169 ...
Abstract 4170 ...
Abstract 4171 ...
Abstract 4172 ...
Abstract 4173 ...
Abstract 4174 ...
Abstract 4175 ...
Abstract 4176 ...
Abstract 4177 ...
Abstract 4178 ...
Abstract 4179 ...
Abstract 4180 ...
Abstract 4181 ...
Abstract 4182 ...
Abstract 4183 ...
Abstract 4184 ...
Abstract 4185 ...
Abstract 4186 ...
Abstract 4187 ...
Abstract 4188 ...
Abstract 4189 ...
Abstract 4190 ...
Abstract 4191 ...
Abstract 4192 ...
Abstract 4193 ...
Abstract 4194 ...
Abstract 4195 ...
Abstract 4196 ...
Abstract 4197 ...
Abstract 4198 ...
Abstract 4199 ...
Abstract 4200 ...
Abstract 4201 ...
Abstract 4202 ...
Abstract 4203 ...
Abstract 4204 ...
Abstract 4205 ...
Abstract 4206 ...
Abstract 4207 ...
Abstract 4208 ...
Abstract 4209 ...
Abstract 4210 ...
Abstract 4211 ...
Abstract 4212 ...
Abstract 4213 ...
Abstract 4214 ...
Abstract 4215 ...
Abstract 4216 ...
Abstract 4217 ...
Abstract 4218 ...
Abstract 4219 ...
Abstract 4220 ...
Abstract 4

Abstract 4622 ...
Abstract 4623 ...
Abstract 4624 ...
Abstract 4625 ...
Abstract 4626 ...
Abstract 4627 ...
Abstract 4628 ...
Abstract 4629 ...
Abstract 4630 ...
Abstract 4631 ...
Abstract 4632 ...
Abstract 4633 ...
Abstract 4634 ...
Abstract 4635 ...
Abstract 4636 ...
Abstract 4637 ...
Abstract 4638 ...
Abstract 4639 ...
Abstract 4640 ...
Abstract 4641 ...
Abstract 4642 ...
Abstract 4643 ...
Abstract 4644 ...
Abstract 4645 ...
Abstract 4646 ...
Abstract 4647 ...
Abstract 4648 ...
Abstract 4649 ...
Abstract 4650 ...
Abstract 4651 ...
Abstract 4652 ...
Abstract 4653 ...
Abstract 4654 ...
Abstract 4655 ...
Abstract 4656 ...
Abstract 4657 ...
Abstract 4658 ...
Abstract 4659 ...
Abstract 4660 ...
Abstract 4661 ...
Abstract 4662 ...
Abstract 4663 ...
Abstract 4664 ...
Abstract 4665 ...
Abstract 4666 ...
Abstract 4667 ...
Abstract 4668 ...
Abstract 4669 ...
Abstract 4670 ...
Abstract 4671 ...
Abstract 4672 ...
Abstract 4673 ...
Abstract 4674 ...
Abstract 4675 ...
Abstract 4676 ...
Abstract 4

Abstract 5078 ...
Abstract 5079 ...
Abstract 5080 ...
Abstract 5081 ...
Abstract 5082 ...
Abstract 5083 ...
Abstract 5084 ...
Abstract 5085 ...
Abstract 5086 ...
Abstract 5087 ...
Abstract 5088 ...
Abstract 5089 ...
Abstract 5090 ...
Abstract 5091 ...
Abstract 5092 ...
Abstract 5093 ...
Abstract 5094 ...
Abstract 5095 ...
Abstract 5096 ...
Abstract 5097 ...
Abstract 5098 ...
Abstract 5099 ...
Abstract 5100 ...
Abstract 5101 ...
Abstract 5102 ...
Abstract 5103 ...
Abstract 5104 ...
Abstract 5105 ...
Abstract 5106 ...
Abstract 5107 ...
Abstract 5108 ...
Abstract 5109 ...
Abstract 5110 ...
Abstract 5111 ...
Abstract 5112 ...
Abstract 5113 ...
Abstract 5114 ...
Abstract 5115 ...
Abstract 5116 ...
Abstract 5117 ...
Abstract 5118 ...
Abstract 5119 ...
Abstract 5120 ...
Abstract 5121 ...
Abstract 5122 ...
Abstract 5123 ...
Abstract 5124 ...
Abstract 5125 ...
Abstract 5126 ...
Abstract 5127 ...
Abstract 5128 ...
Abstract 5129 ...
Abstract 5130 ...
Abstract 5131 ...
Abstract 5132 ...
Abstract 5

Abstract 5535 ...
Abstract 5536 ...
Abstract 5537 ...
Abstract 5538 ...
Abstract 5539 ...
Abstract 5540 ...
Abstract 5541 ...
Abstract 5542 ...
Abstract 5543 ...
Abstract 5544 ...
Abstract 5545 ...
Abstract 5546 ...
Abstract 5547 ...
Abstract 5548 ...
Abstract 5549 ...
Abstract 5550 ...
Abstract 5551 ...
Abstract 5552 ...
Abstract 5553 ...
Abstract 5554 ...
Abstract 5555 ...
Abstract 5556 ...
Abstract 5557 ...
Abstract 5558 ...
Abstract 5559 ...
Abstract 5560 ...
Abstract 5561 ...
Abstract 5562 ...
Abstract 5563 ...
Abstract 5564 ...
Abstract 5565 ...
Abstract 5566 ...
Abstract 5567 ...
Abstract 5568 ...
Abstract 5569 ...
Abstract 5570 ...
Abstract 5571 ...
Abstract 5572 ...
Abstract 5573 ...
Abstract 5574 ...
Abstract 5575 ...
Abstract 5576 ...
Abstract 5577 ...
Abstract 5578 ...
Abstract 5579 ...
Abstract 5580 ...
Abstract 5581 ...
Abstract 5582 ...
Abstract 5583 ...
Abstract 5584 ...
Abstract 5585 ...
Abstract 5586 ...
Abstract 5587 ...
Abstract 5588 ...
Abstract 5589 ...
Abstract 5

Abstract 5991 ...
Abstract 5992 ...
Abstract 5993 ...
Abstract 5994 ...
Abstract 5995 ...
Abstract 5996 ...
Abstract 5997 ...
Abstract 5998 ...
Abstract 5999 ...
Abstract 6000 ...
Abstract 6001 ...
Abstract 6002 ...
Abstract 6003 ...
Abstract 6004 ...
Abstract 6005 ...
Abstract 6006 ...
Abstract 6007 ...
Abstract 6008 ...
Abstract 6009 ...
Abstract 6010 ...
Abstract 6011 ...
Abstract 6012 ...
Abstract 6013 ...
Abstract 6014 ...
Abstract 6015 ...
Abstract 6016 ...
Abstract 6017 ...
Abstract 6018 ...
Abstract 6019 ...
Abstract 6020 ...
Abstract 6021 ...
Abstract 6022 ...
Abstract 6023 ...
Abstract 6024 ...
Abstract 6025 ...
Abstract 6026 ...
Abstract 6027 ...
Abstract 6028 ...
Abstract 6029 ...
Abstract 6030 ...
Abstract 6031 ...
Abstract 6032 ...
Abstract 6033 ...
Abstract 6034 ...
Abstract 6035 ...
Abstract 6036 ...
Abstract 6037 ...
Abstract 6038 ...
Abstract 6039 ...
Abstract 6040 ...
Abstract 6041 ...
Abstract 6042 ...
Abstract 6043 ...
Abstract 6044 ...
Abstract 6045 ...
Abstract 6

Abstract 6448 ...
Abstract 6449 ...
Abstract 6450 ...
Abstract 6451 ...
Abstract 6452 ...
Abstract 6453 ...
Abstract 6454 ...
Abstract 6455 ...
Abstract 6456 ...
Abstract 6457 ...
Abstract 6458 ...
Abstract 6459 ...
Abstract 6460 ...
Abstract 6461 ...
Abstract 6462 ...
Abstract 6463 ...
Abstract 6464 ...
Abstract 6465 ...
Abstract 6466 ...
Abstract 6467 ...
Abstract 6468 ...
Abstract 6469 ...
Abstract 6470 ...
Abstract 6471 ...
Abstract 6472 ...
Abstract 6473 ...
Abstract 6474 ...
Abstract 6475 ...
Abstract 6476 ...
Abstract 6477 ...
Abstract 6478 ...
Abstract 6479 ...
Abstract 6480 ...
Abstract 6481 ...
Abstract 6482 ...
Abstract 6483 ...
Abstract 6484 ...
Abstract 6485 ...
Abstract 6486 ...
Abstract 6487 ...
Abstract 6488 ...
Abstract 6489 ...
Abstract 6490 ...
Abstract 6491 ...
Abstract 6492 ...
Abstract 6493 ...
Abstract 6494 ...
Abstract 6495 ...
Abstract 6496 ...
Abstract 6497 ...
Abstract 6498 ...
Abstract 6499 ...
Abstract 6500 ...
Abstract 6501 ...
Abstract 6502 ...
Abstract 6

Abstract 6905 ...
Abstract 6906 ...
Abstract 6907 ...
Abstract 6908 ...
Abstract 6909 ...
Abstract 6910 ...
Abstract 6911 ...
Abstract 6912 ...
Abstract 6913 ...
Abstract 6914 ...
Abstract 6915 ...
Abstract 6916 ...
Abstract 6917 ...
Abstract 6918 ...
Abstract 6919 ...
Abstract 6920 ...
Abstract 6921 ...
Abstract 6922 ...
Abstract 6923 ...
Abstract 6924 ...
Abstract 6925 ...
Abstract 6926 ...
Abstract 6927 ...
Abstract 6928 ...
Abstract 6929 ...
Abstract 6930 ...
Abstract 6931 ...
Abstract 6932 ...
Abstract 6933 ...
Abstract 6934 ...
Abstract 6935 ...
Abstract 6936 ...
Abstract 6937 ...
Abstract 6938 ...
Abstract 6939 ...
Abstract 6940 ...
Abstract 6941 ...
Abstract 6942 ...
Abstract 6943 ...
Abstract 6944 ...
Abstract 6945 ...
Abstract 6946 ...
Abstract 6947 ...
Abstract 6948 ...
Abstract 6949 ...
Abstract 6950 ...
Abstract 6951 ...
Abstract 6952 ...
Abstract 6953 ...
Abstract 6954 ...
Abstract 6955 ...
Abstract 6956 ...
Abstract 6957 ...
Abstract 6958 ...
Abstract 6959 ...
Abstract 6

Abstract 7361 ...
Abstract 7362 ...
Abstract 7363 ...
Abstract 7364 ...
Abstract 7365 ...
Abstract 7366 ...
Abstract 7367 ...
Abstract 7368 ...
Abstract 7369 ...
Abstract 7370 ...
Abstract 7371 ...
Abstract 7372 ...
Abstract 7373 ...
Abstract 7374 ...
Abstract 7375 ...
Abstract 7376 ...
Abstract 7377 ...
Abstract 7378 ...
Abstract 7379 ...
Abstract 7380 ...
Abstract 7381 ...
Abstract 7382 ...
Abstract 7383 ...
Abstract 7384 ...
Abstract 7385 ...
Abstract 7386 ...
Abstract 7387 ...
Abstract 7388 ...
Abstract 7389 ...
Abstract 7390 ...
Abstract 7391 ...
Abstract 7392 ...
Abstract 7393 ...
Abstract 7394 ...
Abstract 7395 ...
Abstract 7396 ...
Abstract 7397 ...
Abstract 7398 ...
Abstract 7399 ...
Abstract 7400 ...
Abstract 7401 ...
Abstract 7402 ...
Abstract 7403 ...
Abstract 7404 ...
Abstract 7405 ...
Abstract 7406 ...
Abstract 7407 ...
Abstract 7408 ...
Abstract 7409 ...
Abstract 7410 ...
Abstract 7411 ...
Abstract 7412 ...
Abstract 7413 ...
Abstract 7414 ...
Abstract 7415 ...
Abstract 7

Abstract 7817 ...
Abstract 7818 ...
Abstract 7819 ...
Abstract 7820 ...
Abstract 7821 ...
Abstract 7822 ...
Abstract 7823 ...
Abstract 7824 ...
Abstract 7825 ...
Abstract 7826 ...
Abstract 7827 ...
Abstract 7828 ...
Abstract 7829 ...
Abstract 7830 ...
Abstract 7831 ...
Abstract 7832 ...
Abstract 7833 ...
Abstract 7834 ...
Abstract 7835 ...
Abstract 7836 ...
Abstract 7837 ...
Abstract 7838 ...
Abstract 7839 ...
Abstract 7840 ...
Abstract 7841 ...
Abstract 7842 ...
Abstract 7843 ...
Abstract 7844 ...
Abstract 7845 ...
Abstract 7846 ...
Abstract 7847 ...
Abstract 7848 ...
Abstract 7849 ...
Abstract 7850 ...
Abstract 7851 ...
Abstract 7852 ...
Abstract 7853 ...
Abstract 7854 ...
Abstract 7855 ...
Abstract 7856 ...
Abstract 7857 ...
Abstract 7858 ...
Abstract 7859 ...
Abstract 7860 ...
Abstract 7861 ...
Abstract 7862 ...
Abstract 7863 ...
Abstract 7864 ...
Abstract 7865 ...
Abstract 7866 ...
Abstract 7867 ...
Abstract 7868 ...
Abstract 7869 ...
Abstract 7870 ...
Abstract 7871 ...
Abstract 7

In [8]:
print(f"There are {len(sentences)} sentences that related to Disease (or Mental health) and Nutrition.")

There are 11099 sentences that related to Disease (or Mental health) and Nutrition.


In [9]:
# Avarage len
import statistics
import math
data = [len(sent[1].split()) for sent in sentences]
avg = statistics.mean(data)
std = statistics.stdev(data)
print(f"Average of words from {avg - 1.96*math.sqrt(std)} to {avg + 1.96*math.sqrt(std)}")

Average of words from 21.072005613159575 to 34.97826918637191


### Save these sentences to file for reusing

In [10]:
sentences_df = pd.DataFrame(sentences, columns=["PMID", "Sentence"])
sentences_df.head(5)

Unnamed: 0,PMID,Sentence
0,28470822,Transforming growth factor beta 1 (TGFB1) is d...
1,28470822,The aim of this study was to characterize sign...
2,28470822,We examined immunostaining of TGFB1 and BMPs (...
3,28470822,"Furthermore, adult mice were subjected to chro..."
4,31792039,"He had autism spectrum disorder, and restricte..."


In [11]:
sentences_df.to_csv(f"{DATA_PATH}/sentences/sentences_3.csv", index=False)