In [1]:
# Mount Google Drive inside the Colab runtime at /content/gdrive; the data
# paths used by this notebook live under that mount point.
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
%%capture
# Install the extra packages this notebook relies on; %%capture hides pip's output.
!pip install Bio
!python3 -m pip install pubmed2pdf

In [3]:
import os
import sys
import shutil
import numpy as np
import pandas as pd
from Bio import Entrez
import matplotlib as plt

import multiprocessing
import time

In [4]:
# Google Drive folders used by the notebook (every path ends with '/',
# so file names can be appended directly).
data_path = '/content/gdrive/MyDrive/Progetto Data Lab/Dati/'
pub_path = '/content/gdrive/MyDrive/Progetto Data Lab/Dati/pub/'
image_path = '/content/gdrive/MyDrive/Progetto Data Lab/Dati/images/'

# Contact address attached to the Entrez requests made by Biopython.
Entrez.email = 'g.mariani34@campus.unimib.it'

## Ricerca pubblicazioni

In [None]:
# Search PubMed for the publications matching the query; retmax caps the
# number of IDs returned by this single esearch call.
handle = Entrez.esearch(db = 'pubmed', term = 'breast cancer lymph nodes metastasis',
                        retmax = 80000)
try:
    record = Entrez.read(handle, validate = True)
finally:
    # Release the HTTP response even when parsing fails.
    handle.close()

In [None]:
# Pull the list of publication IDs out of the search result and show how many were returned
ids = record['IdList']
print(len(ids))

11413


### PMID

In [None]:
# Fetch every publication returned by the search and collect its PMID in
# "pmid_list". Results without a usable 'PubmedArticle' entry are skipped.
pmid_list = []
for article_id in ids:
  handle = Entrez.efetch(db = 'pubmed', id = article_id, retmode = 'xml')
  try:
    # Parsed into its own name so the search result in `record` is not overwritten.
    article = Entrez.read(handle)
  finally:
    handle.close()
  try:
    pmid = str(article['PubmedArticle'][0]['MedlineCitation']['PMID'])
  except (KeyError, IndexError):
    # Only a missing key / empty article list is tolerated; any other
    # error is allowed to surface instead of being silently swallowed.
    continue
  pmid_list.append(pmid)

In [None]:
# Number of entries currently held in pmid_list
len(pmid_list)

5483

In [None]:
# Put the collected PMIDs in a single-column table and write it to CSV
# (header row included, no index column).
pmid_df = pd.DataFrame({'pmid': pmid_list})
pmid_df.to_csv(data_path + 'pmid_nuova_ricerca.csv', header = True, index = False)

## Download Articoli

### **Fetch Pdfs**

In [5]:
# Load the PMID table; its 'pmid' column is read by the following cells
df = pd.read_csv(data_path + 'PMID/last.csv')

In [6]:
# Convert every value of the 'pmid' column to a string, preserving row order
pmids = [str(df.pmid[row]) for row in range(len(df.pmid))]

In [7]:
# Number of PMIDs read from the CSV
len(pmids)

4748

In [9]:
# Write the PMIDs to "pmid.txt", one per line, for the PDF download step
pmid_list = pmids
print(len(pmid_list))
# The context manager closes the file even if a write fails.
with open("pmid.txt", "w") as textfile:
    for pmid in pmid_list:
        textfile.write(pmid + "\n")

4748


In [10]:
# Scarica tutti i PDF che riesce a trovare a partire dai PMID elencati nel file "pmid_list.txt"
%%capture
!python '/content/gdrive/MyDrive/Progetto Data Lab/Notebook/fetch_pdfs.py' -pmf pmid.txt -out '/content/gdrive/MyDrive/Progetto Data Lab/Dati/pub_lymph_node/'

In [11]:
# List the entries of the PDF output folder
files = os.listdir('/content/gdrive/MyDrive/Progetto Data Lab/Dati/pub_lymph_node/')

In [12]:
# Number of entries found in the listed folder
len(files)

3662

## Estrazione Date

In [None]:
# For each publication, read the year of its first 'ArticleDate' entry.
# Publications lacking that entry are skipped; "years" and "codes" are
# appended together so they always stay aligned.
years = []
codes = []
for article_id in ids:
  handle = Entrez.efetch(db = 'pubmed', id = article_id, retmode = 'xml')
  try:
    # Parsed into its own name so the search result in `record` is not overwritten.
    article = Entrez.read(handle)
  finally:
    handle.close()
  try:
    year = article['PubmedArticle'][0]['MedlineCitation']['Article']['ArticleDate'][0]['Year']
  except (KeyError, IndexError):
    # Only a missing key / empty list is tolerated; any other error is
    # allowed to surface instead of being silently swallowed.
    continue
  years.append(year)
  codes.append(article_id)

In [None]:
# Number of publications for which a year was extracted
len(codes)

4513

In [None]:
# Pair each extracted year with its publication ID and write the table to CSV
# (header row included, no index column).
df = pd.DataFrame({'year': years, 'id': codes})
df.to_csv(data_path + 'df_years_id.csv', header = True, index = False)

## Estrazione immagini

In [None]:
%%capture
# Install the packages used for image extraction; %%capture hides pip's output.
!pip install PyMuPDF Pillow

In [None]:
import fitz
import io
from PIL import Image
from io import StringIO

In [None]:
# PDF used to try out the image extraction.
pdf = '/content/gdrive/MyDrive/Progetto Data Lab/Dati/pub/Breast Cancer Metastasis_ Challenges and Opportunities.pdf'

# Open the file
pdf_file = fitz.open(pdf)

# Walk through the pages and pull out the raw bytes of every embedded image
for page_index in range(len(pdf_file)):

    # Get the page and the list of images it references
    page = pdf_file[page_index]
    image_list = page.get_images()

    # Report how many images were found on this page
    if image_list:
        print(f"[+] Found a total of {len(image_list)} images in page {page_index}")
    else:
        print("[!] No images found on page", page_index)

    # Reuse the list fetched above instead of querying the page a second time
    for img in image_list:

        # The first field of each entry is the image's XREF
        xref = img[0]

        # Extract the image payload and its file extension. These names are
        # overwritten on every iteration, so only the values of the last
        # image processed remain available after the loop.
        base_image = pdf_file.extract_image(xref)
        image_bytes = base_image['image']
        image_ext = base_image["ext"]

# The extracted bytes are plain Python objects, so the document can be released.
pdf_file.close()

[!] No images found on page 0
[!] No images found on page 1
[!] No images found on page 2
[+] Found a total of 1 images in page 3


In [None]:
# imshow is provided by matplotlib.pyplot, not by the top-level matplotlib
# package, so bind `plt` to pyplot before plotting.
import matplotlib.pyplot as plt

# image_bytes is binary data: it has to be wrapped in BytesIO
# (StringIO only accepts text and raises TypeError on bytes).
im = Image.open(io.BytesIO(image_bytes))
plt.imshow(im)

In [None]:
# Save every image embedded in the PDF as "p<page>-<xref>.png" in the
# current working directory.
pdf = '/content/gdrive/MyDrive/Progetto Data Lab/Dati/pub/Breast Cancer Metastasis_ Challenges and Opportunities.pdf'
doc = fitz.open(pdf)

for i in range(len(doc)):
    for img in doc.get_page_images(i):
        xref = img[0]
        pix = fitz.Pixmap(doc, xref)
        # More than 3 colour components (alpha excluded) means CMYK, which
        # cannot be written as PNG: convert it to RGB first.
        if pix.n - pix.alpha > 3:
            pix = fitz.Pixmap(fitz.csRGB, pix)
        pix.save("p%s-%s.png" % (i, xref))
        # Drop the reference so the pixmap memory can be reclaimed.
        pix = None

### Spostamento foto da **locale** a **images**

In [None]:
# Local Colab folder holding the files to move (ends with '/')
local_path = '/content/'

# Entries of /content that must stay where they are
dir = [
    '.config',
    'gdrive',
    'sample_data',
]

# Everything currently present in /content
files = os.listdir('/content')

In [None]:
# Move every listed entry into the images folder, leaving the excluded ones in place
for name in files:
  if name in dir:
    continue
  shutil.move(local_path + name, image_path + name)
