In [7]:
!pip install -U biopython
!pip install -U pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting biopython
  Downloading biopython-1.79-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (2.3 MB)
[K     |████████████████████████████████| 2.3 MB 5.0 MB/s 
Installing collected packages: biopython
Successfully installed biopython-1.79


## Import libraries

In [1]:
import itertools
import os
import random
import re

import pandas as pd
from Bio import Entrez

In [2]:
def search(query:str, max_results:int):
    """
    Search PubMed with a given query and return the parsed ESearch response.

    Parameters
    ----------
    query : str
        Search term, passed to ESearch as ``term``.
    max_results : int
        Maximum number of records to request (``retmax``).

    Returns
    -------
    The object produced by ``Entrez.read`` (callers in this notebook read
    ``['IdList']`` from it), or ``None`` if the request or the parsing failed.
    """
    Entrez.email = 'your.email@example.com'
    try:
        handle = Entrez.esearch(db='pubmed',
                                sort='relevance',
                                retmax=max_results,
                                retmode='xml',
                                term=query)
        try:
            return Entrez.read(handle)
        finally:
            # Always release the connection, even when parsing fails.
            handle.close()
    except Exception:
        # Best-effort lookup: report failure as None. Unlike a bare `except:`,
        # this lets KeyboardInterrupt/SystemExit stop a long-running crawl.
        return None

def fetch_details(id_list:list):
    """
    Fetch PubMed records for a list of paper ids.

    Parameters
    ----------
    id_list : list
        PubMed ids; each item is converted with ``str`` and the ids are sent
        to EFetch as one comma-separated string.

    Returns
    -------
    The object produced by ``Entrez.read`` (callers in this notebook read
    ``['PubmedArticle']`` from it), or ``None`` when ``id_list`` is empty or
    the request or the parsing failed.
    """
    if not id_list:
        # Nothing to fetch: skip the network round trip.
        return None
    Entrez.email = 'your.email@example.com'
    try:
        ids = ','.join(str(paper_id) for paper_id in id_list)
        handle = Entrez.efetch(db='pubmed',
                               retmode='xml',
                               id=ids)
        try:
            return Entrez.read(handle)
        finally:
            # Always release the connection, even when parsing fails.
            handle.close()
    except Exception:
        # Best-effort lookup: report failure as None. Unlike a bare `except:`,
        # this lets KeyboardInterrupt/SystemExit stop a long-running crawl.
        return None

#### ✎ Only some keywords return relevant results

#### Try the most effective keywords

In [None]:
# Queries tried against PubMed in this exploratory step.
effective_kw = ['nutrition and mental health', 'nutrition and feel', 'nutrition and mood',
               'food and psychology', 'food and mental health', 'food and mood',
               'nutrient and psychology', 'nutrient and mental health']

for kw in effective_kw:
    results = search(kw, 3)
    # search() returns None on failure, so only index into a real response.
    papers = fetch_details(results['IdList']) if results is not None else None
    print('-----------\n\nQuery:', kw)
    if papers is None:
        continue
    for i, paper in enumerate(papers['PubmedArticle'], start=1):
        article = paper['MedlineCitation']['Article']
        print("\n{}) {}".format(i, article['ArticleTitle']))
        abstract = article.get('Abstract')
        if abstract is not None:
            print("----\n{}".format(abstract.get('AbstractText')))


-----------

Query: nutrition and mental health

1) Correlates associated with mental health and nutritional status in Lebanese older adults: A cross-sectional study.
----
[StringElement('To assess correlates of mental and nutritional health among elderly in Lebanon, inside nursing homes compared to their private homes.', attributes={'Label': 'OBJECTIVE'}), StringElement('This cross-sectional study was conducted between June and August 2016 on 500 elderly.', attributes={'Label': 'METHODS'}), StringElement('Higher somatic (Beta\u2009=\u20090.259) and cognitive anxiety (Beta\u2009=\u20090.508), increased age (Beta\u2009=\u20090.174) were significantly associated with higher depression, whereas having a secondary (Beta\u2009=\u2009-4.006) and a university (Beta\u2009=\u2009-6.829) levels of education compared to illiteracy, living home (Beta\u2009=\u2009-2.557) compared to living in a nursing home and male gender (Beta\u2009=\u2009-1.280) were significantly associated with lower depressio

-----------

Query: nutrition and feel

1) Remaining Socially Connected at 100 and Beyond Reduces Impact of Loneliness on Nutritional Status.
----
[StringElement("Understanding factors influencing centenarians' nutritional status can offer insight into effective nutrition interventions to improve quality of life among this population.", attributes={'Label': 'BACKGROUND', 'NlmCategory': 'UNASSIGNED'}), StringElement('This cross-sectional study was conducted to evaluate the role of social support and loneliness on nutritional status among Oklahoma centenarians (<i>N</i>\u2009=\u2009151).', attributes={'Label': 'OBJECTIVE', 'NlmCategory': 'UNASSIGNED'}), StringElement('Nutritional status was assessed with the Short Form Mini Nutrition Assessment (MNA-SF). Perceived social support was assessed with the 24-item Social Provisions Scale. Loneliness was examined with the 10-item UCLA loneliness scale.', attributes={'Label': 'METHODS', 'NlmCategory': 'UNASSIGNED'}), StringElement('Ordinal logis

-----------

Query: food and psychology

1) COVID-19 disease and nutritional choices: How will the pandemic reconfigure our food psychology and habits? A case study of the Italian population.
----
[StringElement("In Italy, the spread of the novel coronavirus (SARS-Cov-2) required lifestyle changes that have affected food choices and people's health condition. We explore people's perception of the role of food consumption as a preventive measure and how it reconfigures consumption habits.", attributes={'Label': 'BACKGROUND AND AIMS'}), StringElement('We conducted an online survey of a representative sample of 1004 Italian citizens. Around 40% of the population perceive that strengthening the immune defences through nutrition is not important to reduce the risk of coronavirus disease 2019 (COVID-19) infection. People with lower levels of perceived importance are slightly younger and have a less healthy lifestyle. They are less worried about the emergency. During the last months, they hav

-----------

Query: food and mood

1) Food for mood: Experimentally induced negative affect triggers loss of control over eating in adolescents with low inhibitory control.
----
[StringElement('Loss of control over eating (LOC) is common among adolescents and is associated with negative developmental outcomes. Low self-regulation, and specifically low inhibitory control, is increasingly emphasized as an underlying factor in LOC. However, the specific context in which these capacities fail remains unclear. The affect regulation model proposes that negative affect may trigger LOC; however, research has mostly assessed trait negative affect using questionnaires, whereas measuring state negative affect is needed to determine its triggering role. Therefore, this study examined the interaction between inhibitory control and state negative affect in predicting LOC among adolescents using an experimental mood-induction design.', attributes={'Label': 'OBJECTIVE'}), StringElement('Participants w

-----------

Query: nutrient and mental health

1) Mental health problems in relation to eating behavior patterns, nutrient intakes and health related quality of life among Iranian female adolescents.
----
[StringElement('To identify the association between mental health problems, eating behavior patterns, nutrient intakes and health related quality of life (HRQoL) among Iranian female adolescents.', attributes={'Label': 'AIMS'}), StringElement('The current cross-sectional study conducted among three high-schools randomly selected from 10-day-public high schools in the selected sub-county from Tabriz city-Iran between December 2015 through March 2016. Participants were a sample of 107 adolescent girls aged 15-17 years old. Anthropometric parameters were measured and assessments of HRQoL, mental health problems and eating behavioral patterns were performed by Short Form 36 (SF-36), Strengths and Difficulties Questionnaires (SDQ) and Eating Behavioral Pattern Questionnaire (EBPQ) respect

#### ✎ The abstracts of these results do not clearly describe the relationship between food and mental health.

-------------------------

# Get papers with Food terms and Mental Health terms

## Create keywords

**Note:** Some keywords in these entity files have been manually modified.

In [13]:
# Base folder of the dataset; the data and results folders sit directly under it.
GENERAL_PATH = "../DATASET"
DATA_PATH = f"{GENERAL_PATH}/data"
RESULT_PATH = f"{GENERAL_PATH}/results"

In [12]:
def _read_lines(path):
    """Return every line of a UTF-8 text file, trailing newlines included."""
    with open(path, 'r', encoding='utf-8') as fh:
        return fh.readlines()


# One keyword per line in each entity file.
foods = _read_lines(f'{RESULT_PATH}/entities/nutrition_entities.txt')
chebis = _read_lines(f'{RESULT_PATH}/entities/chebi_entities.txt')
mental_healths = _read_lines(f'{RESULT_PATH}/entities/mental_health_entities.txt')

len(foods), len(chebis), len(mental_healths)

(2921, 1480, 676)

In [5]:
# Strip the newline left by readlines(), then pair every food/ChEBI term
# with every mental-health term (full Cartesian product).
food_terms = [term.replace('\n', '') for term in foods + chebis]
mental_terms = [term.replace('\n', '') for term in mental_healths]
combine_keywords = list(itertools.product(food_terms, mental_terms))
print('There are totally', len(combine_keywords), 'keywords.')
print('Some examples:')
print(combine_keywords[:20])

There are totally 2975076 keywords.
Some examples:
[('clubmoss plant', 'fetal alcohol syndrome'), ('clubmoss plant', 'visual verbal agnosia'), ('clubmoss plant', 'somatoform disorder'), ('clubmoss plant', 'zoophilia'), ('clubmoss plant', 'internet addiction'), ('clubmoss plant', 'neurotic depression reactive type'), ('clubmoss plant', 'pdd nos'), ('clubmoss plant', 'hallucinogen abuse'), ('clubmoss plant', 'dementia'), ('clubmoss plant', 'arbd'), ('clubmoss plant', 'syndromic intellectual disability'), ('clubmoss plant', 'classic apraxia'), ('clubmoss plant', 'drug-induced psychosis'), ('clubmoss plant', 'autistic disorder of childhood onset'), ('clubmoss plant', 'stereotyped repetitive movements nos'), ('clubmoss plant', 'exhibitionism'), ('clubmoss plant', 'assaultive behavior'), ('clubmoss plant', 'childhood neurosis'), ('clubmoss plant', 'adhd'), ('clubmoss plant', 'regression')]


## Get Papers

In [None]:
from Bio import Entrez

# Collected (PMID, title, abstract text) tuples for this slice of keyword pairs.
abstracts = []
for food, mood in combine_keywords[2950000:]:
    print(food, ' ==== ', mood)
    query = food + '[Title/Abstract] AND ' + mood + '[Title/Abstract]'
    results = search(query, max_results=3)
    if results is None:
        continue
    id_list = results['IdList']
    papers = fetch_details(id_list)
    if papers is None:
        continue
    for paper in papers['PubmedArticle']:
        citation = paper['MedlineCitation']
        article = citation['Article']
        abstract = article.get('Abstract')
        if abstract is None:
            continue
        # NOTE(review): only the first AbstractText element is stored; the
        # exploratory output earlier in this notebook shows abstracts split
        # into several labelled elements — confirm that dropping the rest is intended.
        abstracts.append((citation['PMID'], article['ArticleTitle'], abstract['AbstractText'][0]))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
argentine sea bass  ====  withdrawal disorder
turmeric root  ====  mood disorder
dopaminergic agents  ====  drug psychosis
northern dogfish  ====  separation anxiety
mglur antagonist  ====  psychologic dyspareunia
excitatory amino acid antagonists  ====  antisocial personality disorder
cephalopholis fulva  ====  autism spectrum disorder
microtubule stabilising role  ====  hallucinogen abuse
artemiidae shrimp family  ====  nocturnal myoclonus
lady's thistle  ====  episodic mood disorder
common saltwort  ====  resistance
black cap raspberry  ====  fear of open spaces
white-tailed deer  ====  dependent personality disorder
anti-human herpes virus-ii agents  ====  somatosensory agnosia
yellow boletus  ====  amnestic syndrome
harpon  ====  neurotic depression
schinus terebinthifolius raddi  ====  schizophrenia
phenylpropanoid  ====  obsession
blue rockfish  ====  nominal aphasia
adrenoceptor antagonists  ====  borderline state

In [None]:
# Number of (PMID, title, abstract) tuples collected so far.
len(abstracts)

6161

In [None]:
# Name the columns in the constructor: with an empty `abstracts` list this
# gives an empty three-column frame, whereas assigning `.columns` afterwards
# raises a length-mismatch error.
abstracts_df = pd.DataFrame(abstracts, columns=['PMID', 'Title', 'Abstract'])
abstracts_df.tail(5)

Unnamed: 0,PMID,Title,Abstract
6156,33942911,Is Persistent Motor or Vocal Tic Disorder a Mi...,Persistent motor or vocal tic disorder (PMVT) ...
6157,19144772,"Cannabinoid receptor agonist 13, a novel canna...",Cannabinoid receptor agonist 13 (CRA13) is a n...
6158,35613882,Diagnostic Criteria for Moyamoya Disease - 202...,"In this report, we, the Research Committee on ..."
6159,35613842,Risk of cardiovascular events in patients havi...,Calcium pyrophosphate deposition (CPPD) diseas...
6160,35613728,Gestational diabetes mellitus and adverse preg...,To investigate the association between gestati...


In [None]:
# Save this batch of abstracts as CSV, without the DataFrame index.
# NOTE(review): the "papers_16" file name is hard-coded — presumably one file
# per crawled slice of combine_keywords; confirm before re-running so that an
# earlier batch is not overwritten.
abstracts_df.to_csv(f"{DATA_PATH}/papers_raw/papers_16.csv", index=False)

-------------------------------------

## Filter and clean papers

In [None]:
# Get all papers and drop duplicates
PAPERS_RAW_DIR = f"{DATA_PATH}/papers_raw"
# NOTE(review): this covers papers_0.csv .. papers_14.csv only; the batch
# saved above as papers_16.csv is outside this range — confirm the intended count.
N_BATCH_FILES = 15

# 1. Read all papers csv
# Every batch file is read from the same folder the batches are written to
# (previously the numbered files were looked up under "./papers_raw").
add_papers = [pd.read_csv(f"{PAPERS_RAW_DIR}/papers.csv")]
add_papers += [pd.read_csv(f"{PAPERS_RAW_DIR}/papers_{i}.csv") for i in range(N_BATCH_FILES)]

# 2. Keep title/abstract only, drop duplicate rows and rows with a missing value
papers = (
    pd.concat(add_papers)[["Title", "Abstract"]]
    .drop_duplicates(keep='first')
    .dropna(subset=["Title", "Abstract"])
    .reset_index(drop=True)
)
papers.head(5)

Unnamed: 0,Title,Abstract
0,A structure-function mechanism for schizophrenia.,THE MULTIPLE ETIOLOGIES OF SCHIZOPHRENIA PROMP...
1,From membrane phospholipid defects to altered ...,Schizophrenia (SZ) is a devastating neuropsych...
2,Enriched Environment Enhances the Myelin Regul...,Long-term consequences of stress intervene in ...
3,Neurobiological effects of phospholipids <i>in...,Nutrition is a crucial component for maintenan...
4,Dietary phospholipids: Role in cognitive proce...,Chronic stress and ageing are two of the most ...


In [None]:
# Count the missing values left in each column (expected: zero everywhere)
papers.isna().sum()

Title       0
Abstract    0
dtype: int64

In [None]:
# Check some special case of title
# randrange excludes the upper bound; randint(0, len(papers)) could return
# len(papers) and index one row past the end.
number = random.randrange(len(papers))
print(papers.iloc[number, 0])

The effect of functional electrical stimulation cycling on late functional improvement in patients with chronic incomplete spinal cord injury.


In [None]:
# Check some special case of abstracts
# randrange excludes the upper bound; randint(0, len(papers)) could return
# len(papers) and index one row past the end.
number = random.randrange(len(papers))
print(papers.iloc[number, 1])

Hair transplant surgery using follicular unit extraction technique (FUE) is a common surgical procedure for the treatment of severe hair loss. Blood-derived autologous growth factors have also proved to promote hair regeneration in patients with different types of alopecia.


In [None]:
# Get only title between [], if exists
def getTitle(title):
    """
    Return the text inside a leading ``[...]`` wrapper, or the title unchanged.

    When the title starts with ``[`` and has a later ``]``, the text between
    the opening bracket and the last closing bracket is returned (the match
    is greedy), so anything after that bracket, such as a trailing period,
    is dropped. Any other string is returned as is. Non-string values
    (e.g. NaN or None) are returned unchanged instead of raising.
    """
    if not isinstance(title, str):
        return title
    bracketed = re.match(r"^\[(.*)\]", title)
    return bracketed.group(1) if bracketed else title

In [None]:
# Unwrap titles that are enclosed in square brackets
papers['Title'] = papers['Title'].map(getTitle)
papers.head(5)

Unnamed: 0,Title,Abstract
0,A structure-function mechanism for schizophrenia.,THE MULTIPLE ETIOLOGIES OF SCHIZOPHRENIA PROMP...
1,From membrane phospholipid defects to altered ...,Schizophrenia (SZ) is a devastating neuropsych...
2,Enriched Environment Enhances the Myelin Regul...,Long-term consequences of stress intervene in ...
3,Neurobiological effects of phospholipids <i>in...,Nutrition is a crucial component for maintenan...
4,Dietary phospholipids: Role in cognitive proce...,Chronic stress and ageing are two of the most ...


In [None]:
# remove html tag
def removeHTML(content):
    """
    Strip HTML/XML-style tags from a string and return the remaining text.

    Only spans that look like a tag are removed: ``<`` (optionally ``</``)
    immediately followed by a letter, e.g. ``<i>``, ``</sub>``, ``<br/>`` or
    ``<a href="...">``. Comparison signs in running text such as
    ``p < 0.05 and r > 0.3`` are kept; the previous pattern removed
    everything between any ``<`` and the next ``>``.

    Limitation: text such as ``<b and c>`` still looks like a tag and is removed.
    """
    tag_regex = r"</?[A-Za-z][A-Za-z0-9:-]*(?:\s[^<>]*)?/?>"
    return re.sub(tag_regex, '', content)

In [None]:
# remove HTML tags for title and Abstract
# Both text columns are cleaned; previously only 'Abstract' was, although the
# displayed result shows the tags removed from titles as well.
for column in ('Title', 'Abstract'):
    papers[column] = papers[column].map(removeHTML)
papers.head(5)

Unnamed: 0,Title,Abstract
0,A structure-function mechanism for schizophrenia.,THE MULTIPLE ETIOLOGIES OF SCHIZOPHRENIA PROMP...
1,From membrane phospholipid defects to altered ...,Schizophrenia (SZ) is a devastating neuropsych...
2,Enriched Environment Enhances the Myelin Regul...,Long-term consequences of stress intervene in ...
3,Neurobiological effects of phospholipids in vi...,Nutrition is a crucial component for maintenan...
4,Dietary phospholipids: Role in cognitive proce...,Chronic stress and ageing are two of the most ...


In [None]:
# write final papers csv
# The DataFrame index is written as the first column, under the header "ID".
papers.to_csv(f"{DATA_PATH}/papers_raw/final_papers.csv", index_label="ID")