In [1]:
# Optional dependency installation (uncomment to install into the kernel's environment).
# NOTE(review): the `Bio` module imported below is shipped by the `biopython`
# distribution on PyPI; installing `Bio` pulls in a different project — confirm.
#%pip install biopython
#%pip install nltk

In [None]:
# Entrez client (PubMed access) and pandas (tabular storage of the results).
from Bio import Entrez
import pandas as pd 

# nltk and the stdlib helpers used by the text clean-up functions further down.
import nltk
nltk.download('punkt')  # fetches the 'punkt' nltk resource at import time
import string
import re

# Extra nltk resources. They are not used by the cells visible here;
# presumably needed by later tokenisation/lemmatisation steps — TODO confirm.
nltk.download('wordnet')
nltk.download('omw-1.4')

# Contact address attached to the Entrez module for every request.
# NOTE(review): a personal e-mail is hard-coded in a shared notebook —
# consider reading it from an environment variable instead.
Entrez.email = "sezc.gianmarco.russo@gmail.com"

Search the PubMed database with the following two queries:
- "(((\"refined\" AND \"sugar\") OR (\"sugar\" AND \"diet\")) OR (\"sucrose\" AND \"diet\"))  AND \"microbiota\""
- "(((\"refined\" AND \"sugar\") OR (\"sugar\" AND \"diet\")) OR (\"sucrose\" AND \"diet\")) AND \"human\"  AND \"microbiota\""

In [12]:
# Search PubMed for sugar/sucrose-diet articles that also mention "microbiota".
# NOTE(review): retmax is far above the number of hits; NCBI may cap how many
# IDs a single PubMed ESearch returns, so for much larger queries IdList could
# be shorter than Count — confirm against the E-utilities documentation.
handle_refsugar_diet = Entrez.esearch(db="pubmed", term="(((\"refined\" AND \"sugar\") OR (\"sugar\" AND \"diet\")) OR (\"sucrose\" AND \"diet\"))  AND \"microbiota\"", retmax=1000000)
try:
  record_refsugar_diet = Entrez.read(handle_refsugar_diet)
finally:
  # Release the HTTP response even when parsing fails.
  handle_refsugar_diet.close()
print(record_refsugar_diet['Count'])

675


In [11]:
# Same search as the general sugar/diet/microbiota query, additionally
# requiring the term "human".
# NOTE(review): retmax is far above the number of hits; NCBI may cap how many
# IDs a single PubMed ESearch returns, so for much larger queries IdList could
# be shorter than Count — confirm against the E-utilities documentation.
handle_refsugar_diet_human = Entrez.esearch(db="pubmed", term="(((\"refined\" AND \"sugar\") OR (\"sugar\" AND \"diet\")) OR (\"sucrose\" AND \"diet\")) AND \"human\" AND \"microbiota\"", retmax=1000000)
try:
  record_refsugar_diet_human = Entrez.read(handle_refsugar_diet_human)
finally:
  # Release the HTTP response even when parsing fails.
  handle_refsugar_diet_human.close()
print(record_refsugar_diet_human['Count'])

161


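Build two DataFrames of PubMed IDs: one for the 675 articles matched by the first query, and one for the 161 articles matched by the second query, which additionally requires the term "human".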

In [15]:
# One-column ('id') frames holding the PubMed IDs returned by each search.
sugar_df = pd.DataFrame({'id': record_refsugar_diet['IdList']})
sugar_human_df = pd.DataFrame({'id': record_refsugar_diet_human['IdList']})

In [16]:
# Preview the IDs returned by the "human" query.
sugar_human_df

Unnamed: 0,id
0,37172822
1,37111220
2,37101636
3,36998151
4,36846996
...,...
156,20806900
157,20368178
158,20368177
159,17210919


In [37]:
# Preview the IDs returned by the general query.
sugar_df

Unnamed: 0,id
0,37240315
1,37210385
2,37207566
3,37172822
4,37169244
...,...
670,12514022
671,11870958
672,10867049
673,10222392


Example: fetch one article and print its ID, title, abstract and publication year

In [18]:
# Fetch the first article listed in sugar_df as XML and parse it.
first_id = sugar_df.iloc[0].id
handle = Entrez.efetch(db="pubmed", id=str(first_id), retmode='xml')
record = Entrez.read(handle)
handle.close()

# Every printed field lives under the same 'Article' node of the first entry.
article = record['PubmedArticle'][0]['MedlineCitation']['Article']

print("ID:\t\t",first_id)
print("Title:\t\t",article['ArticleTitle'])
print("Abstract:\t",article['Abstract']['AbstractText'][0])
print("Year:\t\t",article['Journal']['JournalIssue']['PubDate']['Year'])

ID:		 37240315
Title:		 SIRT2 Deficiency Aggravates Diet-Induced Nonalcoholic Fatty Liver Disease through Modulating Gut Microbiota and Metabolites.
Abstract:	 Non-alcoholic fatty liver disease (NAFLD), characterized by excessive lipid accumulation in hepatocytes, is an increasing global healthcare burden. Sirtuin 2 (SIRT2) functions as a preventive molecule for NAFLD with incompletely clarified regulatory mechanisms. Metabolic changes and gut microbiota imbalance are critical to the pathogenesis of NAFLD. However, their association with SIRT2 in NAFLD progression is still unknown. Here, we report that SIRT2 knockout (KO) mice are susceptible to HFCS (high-fat/high-cholesterol/high-sucrose)-induced obesity and hepatic steatosis accompanied with an aggravated metabolic profile, which indicates SIRT2 deficiency promotes NAFLD-NASH (nonalcoholic steatohepatitis) progression. Under palmitic acid (PA), cholesterol (CHO), and high glucose (Glu) conditions, SIRT2 deficiency promotes lipid dep

Functions to extract the title, abstract and publication year from a parsed Entrez record

In [38]:
def get_title(record):
  """Return the title of the first article in a parsed PubMed record.

  Args:
    record: mapping as produced by parsing an efetch XML response; the title
      is read from record['PubmedArticle'][0]['MedlineCitation']['Article'].

  Raises:
    KeyError / IndexError: when any level of that path is missing.
  """
  first_entry = record['PubmedArticle'][0]
  article = first_entry['MedlineCitation']['Article']
  return article['ArticleTitle']

def get_abstract(record):
  """Return the full abstract of the first article in a parsed PubMed record.

  'AbstractText' is a list: structured abstracts carry one entry per section.
  Taking only element 0 would silently drop every section after the first,
  so all sections are joined with a single space.

  Args:
    record: mapping as produced by parsing an efetch XML response.

  Returns:
    The abstract as one plain string.

  Raises:
    KeyError: when the article has no 'Abstract' / 'AbstractText' node.
    IndexError: when the record has no article or 'AbstractText' is empty
      (same exception an index lookup on the empty list would raise).
  """
  sections = record['PubmedArticle'][0]['MedlineCitation']['Article']['Abstract']['AbstractText']
  if not sections:
    raise IndexError("AbstractText is empty")
  return " ".join(str(section) for section in sections)

def get_year(record):
  """Return the publication year of the first article in a parsed PubMed record.

  The year is read from the journal issue's 'PubDate'. When 'PubDate' has no
  'Year' entry, the first four-digit number of its free-text 'MedlineDate'
  (e.g. "2009 Dec-2010 Jan") is used instead, so such articles keep a year.

  Args:
    record: mapping as produced by parsing an efetch XML response.

  Returns:
    The year as a string.

  Raises:
    KeyError: when neither 'Year' nor a usable 'MedlineDate' is present, or
      when an intermediate node is missing.
    IndexError: when the record contains no article.
  """
  pub_date = record['PubmedArticle'][0]['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']
  if 'Year' in pub_date:
    return pub_date['Year']

  # Fallback: pull the first 4-digit token out of the free-text date.
  year_match = re.search(r'\d{4}', str(pub_date['MedlineDate']))
  if year_match is None:
    raise KeyError('Year')
  return year_match.group(0)


In [39]:
def get_info(id):
  """Fetch one PubMed article and return its (title, abstract, year).

  Args:
    id: PubMed identifier; converted to str before the request.

  Returns:
    A 3-tuple (title, abstract, year). Any field that cannot be located in
    the parsed record is returned as None.

  Raises:
    Whatever Entrez.efetch / Entrez.read raise on a failed request or an
    unparsable response; callers decide how to handle those.
  """
  print(id)  # progress trace: shows which article is being requested
  handle = Entrez.efetch(db="pubmed", id=str(id), retmode='xml')
  try:
    record = Entrez.read(handle)
  finally:
    # Always release the HTTP response, even when parsing fails.
    handle.close()

  # The extractors only index into nested dicts/lists, so a missing field
  # surfaces as one of these. Catching just them keeps KeyboardInterrupt and
  # genuine programming errors visible instead of turning them into None.
  missing_field_errors = (KeyError, IndexError, TypeError)

  try:
    title = get_title(record)
  except missing_field_errors:
    title = None
  try:
    abstract = get_abstract(record)
  except missing_field_errors:
    abstract = None
  try:
    year = get_year(record)
  except missing_field_errors:
    year = None
  return title, abstract, year

In [None]:
# Download title / abstract / year for every ID in sugar_df.
# The four lists stay aligned: an article is recorded only when its request
# succeeded.
title_list = []
abstract_list = []
year_list = []
ids = []
# Iterate over the ID values directly: no positional label lookups and no
# loop variable named `id` shadowing the builtin.
for article_id in sugar_df['id']:
  try:
    title, abstract, year = get_info(article_id)
  except Exception:
    # `Exception` (not a bare except) so Ctrl-C can still stop the loop.
    print("Bad request")
    continue
  title_list.append(title)
  abstract_list.append(abstract)
  year_list.append(year)
  ids.append(article_id)

In [33]:
# Replace the ID-only frame with one row of metadata per fetched article.
sugar_records = {
  "id": ids,
  "title": title_list,
  "abstract": abstract_list,
  "year": year_list,
}
sugar_df = pd.DataFrame(sugar_records)

In [None]:
# Download title / abstract / year for every ID in sugar_human_df.
# The list names are reused from the previous fetch, so that fetch's results
# must already have been stored in a DataFrame before this cell runs.
title_list = []
abstract_list = []
year_list = []
ids = []
# Iterate over the ID values directly: no positional label lookups and no
# loop variable named `id` shadowing the builtin.
for article_id in sugar_human_df['id']:
  try:
    title, abstract, year = get_info(article_id)
  except Exception:
    # `Exception` (not a bare except) so Ctrl-C can still stop the loop.
    print("Bad request")
    continue
  title_list.append(title)
  abstract_list.append(abstract)
  year_list.append(year)
  ids.append(article_id)

In [22]:
# Replace the ID-only frame with one row of metadata per fetched article.
sugar_human_records = {
  "id": ids,
  "title": title_list,
  "abstract": abstract_list,
  "year": year_list,
}
sugar_human_df = pd.DataFrame(sugar_human_records)

Function that flags an article whose title or abstract is missing or shorter than 10 characters

In [23]:
def check_text(df, min_length=10):
  """Return True when a row's title or abstract is missing or too short.

  The previous implementation could not run: calling `.isna()` on a scalar
  title fails, and on a whole DataFrame the `or` between Series raises
  ValueError. This version works on a single row.

  Args:
    df: one article row (a pandas Series or any mapping) exposing the keys
      "title" and "abstract".
    min_length: minimum number of characters a field needs to be accepted.

  Returns:
    bool: True if either field is not a string (None, NaN, ...) or has fewer
    than `min_length` characters; False when both fields are usable.
  """
  def _is_unusable(value):
    # Missing values come through as None or float NaN, never as str.
    return not isinstance(value, str) or len(value) < min_length

  return _is_unusable(df["title"]) or _is_unusable(df["abstract"])

In [43]:
# Keep only the articles whose title AND abstract are real strings of at
# least 10 characters (missing values arrive as None or float NaN).
# Built with a comprehension so no helper names such as `abs` leak into the
# notebook namespace and shadow builtins.
list_con = [
  isinstance(title, str) and len(title) >= 10
  and isinstance(abstract, str) and len(abstract) >= 10
  for title, abstract in zip(sugar_df["title"], sugar_df["abstract"])
]

# .copy() makes the filtered frame independent of the original, so columns
# can be added to it later without pandas treating it as a slice.
sugar_df = sugar_df.iloc[list_con].copy()

Removing all the articles without a valid Title or Abstract

In [24]:
# Keep only the articles whose title AND abstract are real strings of at
# least 10 characters (missing values arrive as None or float NaN).
# Built with a comprehension so no helper names such as `abs` leak into the
# notebook namespace and shadow builtins.
list_con = [
  isinstance(title, str) and len(title) >= 10
  and isinstance(abstract, str) and len(abstract) >= 10
  for title, abstract in zip(sugar_human_df["title"], sugar_human_df["abstract"])
]

# .copy() makes the filtered frame independent of the original, so columns
# can be added to it later without pandas treating it as a slice.
sugar_human_df = sugar_human_df.iloc[list_con].copy()

Joining title and abstract in a single column

In [None]:
# Add an "all_text" column to both frames: "<title>. <abstract>".
for frame in (sugar_df, sugar_human_df):
  frame["all_text"] = frame["title"] + ". " + frame["abstract"]

In [48]:
# Persist both frames as CSV (row index omitted); general set first.
for frame, csv_path in (
  (sugar_df, 'data_text/sugar.csv'),
  (sugar_human_df, 'data_text/sugar_human.csv'),
):
  frame.to_csv(csv_path, index=False)

In [62]:
# Reload the saved article tables; these copies receive the cleaned columns.
SUGAR_CSV = "data_text/sugar.csv"
SUGAR_HUMAN_CSV = "data_text/sugar_human.csv"

sugar_clean = pd.read_csv(SUGAR_CSV)
sugar_human_clean = pd.read_csv(SUGAR_HUMAN_CSV)

In [63]:
# Preview the general article table as reloaded from CSV.
sugar_clean

Unnamed: 0,id,title,abstract,year,all_text
0,37240315,SIRT2 Deficiency Aggravates Diet-Induced Nonal...,"Non-alcoholic fatty liver disease (NAFLD), cha...",2023.0,SIRT2 Deficiency Aggravates Diet-Induced Nonal...
1,37210385,Ginger essential oil and citral ameliorates at...,"Recently, the role of the gut microbiota in di...",2023.0,Ginger essential oil and citral ameliorates at...
2,37207566,Antiobesity effect of L-arabinose via ameliora...,"The global prevalence of obesity, a chronicall...",2023.0,Antiobesity effect of L-arabinose via ameliora...
3,37172822,Excess Dietary Sugar Alters Colonocyte Metabol...,The colonic epithelium requires continuous ren...,2023.0,Excess Dietary Sugar Alters Colonocyte Metabol...
4,37169244,Intake of Caffeine Containing Sugar Diet Remod...,The diet-microbiome-immunity axis is one among...,2023.0,Intake of Caffeine Containing Sugar Diet Remod...
...,...,...,...,...,...
668,12514022,Potential rates of fermentation in digesta fro...,Microbial catabolic capacity in digesta from t...,2003.0,Potential rates of fermentation in digesta fro...
669,11870958,Cariogenicity of different types of milk: an e...,This study evaluated the cariogenic potential ...,2002.0,Cariogenicity of different types of milk: an e...
670,10867049,In vitro fermentation pattern of D-tagatose is...,Knowledge of the fermentation pattern of D-tag...,2000.0,In vitro fermentation pattern of D-tagatose is...
671,10222392,D-tagatose has low small intestinal digestibil...,"The digestibility of D-tagatose, its effect on...",1999.0,D-tagatose has low small intestinal digestibil...


In [64]:
# Preview the "human" article table.
# NOTE(review): this displays the in-memory sugar_human_df, while the
# neighbouring cells work with the reloaded *_clean frames;
# sugar_human_clean may have been intended — confirm.
sugar_human_df

Unnamed: 0,id,title,abstract,year,all_text
0,37172822,Excess Dietary Sugar Alters Colonocyte Metabol...,The colonic epithelium requires continuous ren...,2023,Excess Dietary Sugar Alters Colonocyte Metabol...
1,37111220,Blood and Tissue Advanced Glycation End Produc...,Cardiometabolic disorders are characterised by...,2023,Blood and Tissue Advanced Glycation End Produc...
2,37101636,Therapeutic potential of bioactive phytoconsti...,"Nonalcoholic fatty liver disease (NAFLD), a ch...",2023,Therapeutic potential of bioactive phytoconsti...
3,36998151,Select human milk oligosaccharide supplementat...,Feeding infants with human milk versus formula...,2023,Select human milk oligosaccharide supplementat...
4,36846996,The Prevention of Inflammation and the Mainten...,The human gut microbiome consists of a variety...,2023,The Prevention of Inflammation and the Mainten...
...,...,...,...,...,...
156,20806900,Dietary modulation of gut functional ecology s...,A major source of intestinal metabolites resul...,2010,Dietary modulation of gut functional ecology s...
157,20368178,The effect of diet on the human gut microbiome...,Diet and nutritional status are among the most...,2009,The effect of diet on the human gut microbiome...
158,20368177,Gut check: testing a role for the intestinal m...,By using germ-free mice transplanted with huma...,2009,Gut check: testing a role for the intestinal m...
159,17210919,Mechanisms underlying the resistance to diet-i...,The trillions of microbes that colonize our ad...,2007,Mechanisms underlying the resistance to diet-i...


In [65]:
def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character deleted.

    Characters are dropped, not replaced by a space, so hyphenated words are
    fused together ("diet-induced" becomes "dietinduced").
    """
    # string.punctuation is: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
    kept_chars = []
    for char in text:
        if char in string.punctuation:
            continue
        kept_chars.append(char)
    return "".join(kept_chars)

def character_repetition(text):
    """Shorten every run of one repeated ASCII letter to exactly two letters.

    Runs of length two are left as they are; matching is case-sensitive and
    only letters A-Z / a-z are affected.
    """
    # ([A-Za-z]) captures a single letter and \1{1,} requires at least one
    # further copy of that same letter; the whole run is replaced by \1\1,
    # i.e. the letter written twice.
    return re.sub(r"([A-Za-z])\1{1,}", r"\1\1", text, flags=re.DOTALL)

def remove_spaces(text):
    """Normalise whitespace: line breaks and tabs become single spaces.

    Replaces the two-character sequence backslash + "n", real newlines and
    tabs with a space, then collapses every run of two or more spaces into
    one. A single-pass replace of double spaces would leave a double space
    behind for runs of three or more, hence the regex. Leading and trailing
    spaces are kept.

    Args:
        text: the string to normalise.

    Returns:
        The normalised string.
    """
    flattened = text.replace('\\n', ' ').replace('\n', ' ').replace('\t', ' ')
    return re.sub(r' {2,}', ' ', flattened)

Applying a pipeline that prepares the text for the next steps

In [66]:
def apply_pipeline(data, col):
    """Return a cleaned copy of one text column of ``data``.

    Each value goes through, in this order: punctuation removal, lower-casing,
    deletion of digit runs, shortening of repeated letters, and whitespace
    normalisation. ``data`` itself is not modified.

    Args:
        data: DataFrame holding the column.
        col: name of the text column to clean.

    Returns:
        A Series with the cleaned text, indexed like ``data[col]``.
    """
    def _clean(raw_text):
        cleaned = remove_punctuation(raw_text)
        cleaned = cleaned.lower()
        cleaned = re.sub(r'\d+', '', cleaned)
        cleaned = character_repetition(cleaned)
        return remove_spaces(cleaned)

    return data[col].apply(_clean)


In [67]:
# Add a cleaned counterpart for each text column (target <- source).
for target_col, source_col in (
    ("clean_text", "all_text"),
    ("clean_title", "title"),
    ("clean_abstract", "abstract"),
):
    sugar_clean[target_col] = apply_pipeline(sugar_clean, source_col)

In [68]:
# Add a cleaned counterpart for each text column (target <- source).
for target_col, source_col in (
    ("clean_text", "all_text"),
    ("clean_title", "title"),
    ("clean_abstract", "abstract"),
):
    sugar_human_clean[target_col] = apply_pipeline(sugar_human_clean, source_col)

In [69]:
# Preview the "human" table together with its three cleaned text columns.
sugar_human_clean

Unnamed: 0,id,title,abstract,year,all_text,clean_text,clean_title,clean_abstract
0,37172822,Excess Dietary Sugar Alters Colonocyte Metabol...,The colonic epithelium requires continuous ren...,2023,Excess Dietary Sugar Alters Colonocyte Metabol...,excess dietary sugar alters colonocyte metabol...,excess dietary sugar alters colonocyte metabol...,the colonic epithelium requires continuous ren...
1,37111220,Blood and Tissue Advanced Glycation End Produc...,Cardiometabolic disorders are characterised by...,2023,Blood and Tissue Advanced Glycation End Produc...,blood and tissue advanced glycation end produc...,blood and tissue advanced glycation end produc...,cardiometabolic disorders are characterised by...
2,37101636,Therapeutic potential of bioactive phytoconsti...,"Nonalcoholic fatty liver disease (NAFLD), a ch...",2023,Therapeutic potential of bioactive phytoconsti...,therapeutic potential of bioactive phytoconsti...,therapeutic potential of bioactive phytoconsti...,nonalcoholic fatty liver disease nafld a chron...
3,36998151,Select human milk oligosaccharide supplementat...,Feeding infants with human milk versus formula...,2023,Select human milk oligosaccharide supplementat...,select human milk oligosaccharide supplementat...,select human milk oligosaccharide supplementat...,feeding infants with human milk versus formula...
4,36846996,The Prevention of Inflammation and the Mainten...,The human gut microbiome consists of a variety...,2023,The Prevention of Inflammation and the Mainten...,the prevention of inflammation and the mainten...,the prevention of inflammation and the mainten...,the human gut microbiome consists of a variety...
...,...,...,...,...,...,...,...,...
156,20806900,Dietary modulation of gut functional ecology s...,A major source of intestinal metabolites resul...,2010,Dietary modulation of gut functional ecology s...,dietary modulation of gut functional ecology s...,dietary modulation of gut functional ecology s...,a major source of intestinal metabolites resul...
157,20368178,The effect of diet on the human gut microbiome...,Diet and nutritional status are among the most...,2009,The effect of diet on the human gut microbiome...,the effect of diet on the human gut microbiome...,the effect of diet on the human gut microbiome...,diet and nutritional status are among the most...
158,20368177,Gut check: testing a role for the intestinal m...,By using germ-free mice transplanted with huma...,2009,Gut check: testing a role for the intestinal m...,gut check testing a role for the intestinal mi...,gut check testing a role for the intestinal mi...,by using germfree mice transplanted with human...
159,17210919,Mechanisms underlying the resistance to diet-i...,The trillions of microbes that colonize our ad...,2007,Mechanisms underlying the resistance to diet-i...,mechanisms underlying the resistance to dietin...,mechanisms underlying the resistance to dietin...,the trillions of microbes that colonize our ad...


In [70]:
# Renumber the rows from 0 and write the cleaned table to disk (no index column).
SUGAR_CLEAN_CSV = "data_text/sugar_clean.csv"
sugar_clean = sugar_clean.reset_index(drop=True)
sugar_clean.to_csv(SUGAR_CLEAN_CSV, index=False)

In [71]:
# Renumber the rows from 0 and write the cleaned table to disk (no index column).
SUGAR_HUMAN_CLEAN_CSV = "data_text/sugar_human_clean.csv"
sugar_human_clean = sugar_human_clean.reset_index(drop=True)
sugar_human_clean.to_csv(SUGAR_HUMAN_CLEAN_CSV, index=False)