In [2]:
#All these packages need to be installed from pip
import requests #for http requests
import bs4 #called `beautifulsoup4`, an html parser
import pandas as pd #gives us DataFrames
import docx #reading MS doc files, install as `python-docx`
import csv

#Stuff for pdfs
#Install as `pdfminer2`
import pdfminer.pdfinterp
import pdfminer.converter
import pdfminer.layout
import pdfminer.pdfpage

#These come with Python

import re #for regexs
import urllib.parse #For joining urls
import io #for making http requests look like files
import json #For Tumblr API responses
import os.path #For checking if files exist
import os #For making directories

# nltk dependencies 
import nltk
import ssl

#pubmed specific stuff
from pubmed_lookup import PubMedLookup
from pubmed_lookup import Publication
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

## Scraping using PubMed_Lookup

In [2]:
pmids_list = []
with open("pmids_na_omit.txt") as csvfile:
    pmids = csv.reader(csvfile, delimiter=',')
    for row in pmids:
        for i in row: 
            pmids_list.append(i)
pmids_list[:5]

['30784590', '30692680', '30659290', '30609404', '30445657']

In [3]:
all_links = []
for pmid in pmids_list: 
    all_links.append("http://www.ncbi.nlm.nih.gov/pubmed/%s" % pmid)
all_links[:5]

['http://www.ncbi.nlm.nih.gov/pubmed/30784590',
 'http://www.ncbi.nlm.nih.gov/pubmed/30692680',
 'http://www.ncbi.nlm.nih.gov/pubmed/30659290',
 'http://www.ncbi.nlm.nih.gov/pubmed/30609404',
 'http://www.ncbi.nlm.nih.gov/pubmed/30445657']

In [None]:
# THIS IS A SCRAPE 
# abs_corpus = []
# for link in all_links: 
#     try: 
#         email = 'deblina@uchicago.edu'
#         lookup = PubMedLookup(link, email)
#         publication = Publication(lookup) 
#         abs_corpus.append(publication)
#     except TypeError: 
#         pass 
#     except RuntimeError: 
#         pass
#     except TimeoutError: 
#         pass

In [None]:
# CLEANING INTO DICTIONARY 
# dic = {}
# for pub in abs_corpus: 
#     dic[pub.title] = [pub.abstract]

In [None]:
# SAVING TO CSV 
# with open('more_corp.csv', 'w') as f:  # Just use 'w' mode in 3.x
#     w = csv.DictWriter(f, dic.keys())
#     w.writeheader()
#     w.writerow(dic)

## Scraping w. Entrez API
Want to have: 
- Major Topic 
- Abstract 
- Title 
- MESH Terms 

for every PMID. 

In [4]:
from Bio import Entrez as en
from Bio import Medline
en.email = "deblina@uchicago.edu"
en.api_key = "0de728102b2b58a83f3986d583c23f766008" 
en.sleep_between_tries = 10 

In [5]:
class Publication(): 
    def __init__(self, pmid): 
        self.pmid = pmid
            
        #pulling record 
        record = Medline.read(en.efetch(db="pubmed",id=pmid,rettype="medline",retmode="text"))
            
        #easy attributes 
        self.title = record.get("TI")
        self.abstract = record.get("AB")
        
        #mesh terms 
        self.major_topics = record.get("OT")
                
        #keywords 
        self.mesh_terms = record.get("MH")

In [None]:
publications = []
for pmid in pmids_list:
    print(pmid)
    p = Publication(pmid)
    publications.append(p)

30784590
30692680
30659290
30609404
30445657
1264230
1113041
5425661
5724688
5582959
30864334
30592451
30275053
30240647
29777175
10014315
10011709
3872519
6706255
5089140
30474576
30449211
30345364
30179987
29580902
13534298
13425430
12990522
14907503
14861110
30926926
30926925
30926924
30926922
30922499
20996781
19873382
18741625
16744786
20891193
30920417
30917518
30897527
30897179
30890620
21379958
21379949
21379938
21372959
21372866
5457638
5570771
5570780
16592017
4509649
29118206
29190358
29255778
29930110
30545854
30894395
30188541
29610250
29474918
29423234
8756646
7726173
7913883
8136842
7506603
30922461
30917804
30903564
30895787
30892738
1368750
2371972
2560594
2789738
2766066
30854722
30846484
30845981
30626636
30519676
5689503
6066721
5921379
5838400
13084255
30735129
30225343
30191425
29597197
28247015
21307056
18628294
17991292
17947320
9040042
30724261
30409642
30242936
30055435
29908446
8079737
7511137
8343277
1745918
16664890
30914511
30659188
29959346
29859085
29774

In [None]:
#trying to write to csv 
with open('data_full.csv', 'w',) as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['pmid','title', 'abstract', 'major', 'mesh'])
    for pub in publications:
        writer.writerow([pub.pmid, pub.title, pub.abstract, pub.major_topics, pub.mesh_terms])

In [None]:
# getting a sense of the nested data structure 
# for first in record['PubmedArticle']: 
#     for second in first.values():
#         for third in second.items(): 
#             key, value = third 
#             print(key, "\n--------\n", value); 

### Cleaning

Separating the `mesh` and `major` columns into single string rows, or just real lists would be great, because that's what I'm trying to classify by. 

In [3]:
df = pd.read_csv("../final-project/data_full.csv")
df = df.dropna()
df.head()

Unnamed: 0,pmid,title,abstract,major,mesh
20,30474576,The current practice and care of paediatric pa...,BACKGROUND: Literature is lacking to guide sta...,"['CHD', 'paediatric cardiac catheterisation', ...","['*Cardiac Care Facilities', 'Cardiac Catheter..."
21,30449211,Current pharmacological treatment guidelines f...,INTRODUCTION: Psoriasis is a common chronic sk...,"['Psoriasis', 'arthritis', 'biologics', 'guide...",['Antirheumatic Agents/administration & dosage...
24,29580902,Early life trauma: An exploratory study of eff...,"BACKGROUND: In animals, adverse early experien...","['*Early childhood trauma', '*Gene expression'...","['Adult', '*Adverse Childhood Experiences', 'F..."
41,30917518,Telomerase Impinges on the Cellular Response t...,Telomerase has cellular functions beyond telom...,"['autophagy', 'mitochondria', 'oxidative stres...","['*Autophagy', 'Cell Line', 'Fibroblasts/metab..."
55,29118206,Social Origins of Developmental Risk for Menta...,Adversity in early childhood exerts an endurin...,"['*EEG', '*foster care', '*limbic', '*neglect'...","['Adult', 'Child', 'Child Abuse/*psychology', ..."


In [7]:
def clean_cols(col, df): 
    txt = df['col'].tolist()
    x = re.split(',', txt)
    clean = []
    for i in x: 
        y = i.replace('[', '')
        z = y.replace(']', '')
        clean.append(z)
    df['col'] = clean
    return df['col']

In [29]:
import numpy as np
df['clean_mesh'] = np.nan
df['clean_mesh'] = df['mesh'].str.extract('([A-Z]\w{0,})', expand = True)
df['clean_mesh'].map((lambda word: for word in li.str.extract('([A-Z]\w{0,})')), li: for li in df['mesh'])

SyntaxError: invalid syntax (<ipython-input-29-b7041382d1a2>, line 4)

In [5]:
df

Unnamed: 0,pmid,title,abstract,major,mesh
20,30474576,The current practice and care of paediatric pa...,BACKGROUND: Literature is lacking to guide sta...,"['CHD', 'paediatric cardiac catheterisation', ...","['*Cardiac Care Facilities', 'Cardiac Catheter..."
21,30449211,Current pharmacological treatment guidelines f...,INTRODUCTION: Psoriasis is a common chronic sk...,"['Psoriasis', 'arthritis', 'biologics', 'guide...",['Antirheumatic Agents/administration & dosage...
24,29580902,Early life trauma: An exploratory study of eff...,"BACKGROUND: In animals, adverse early experien...","['*Early childhood trauma', '*Gene expression'...","['Adult', '*Adverse Childhood Experiences', 'F..."
41,30917518,Telomerase Impinges on the Cellular Response t...,Telomerase has cellular functions beyond telom...,"['autophagy', 'mitochondria', 'oxidative stres...","['*Autophagy', 'Cell Line', 'Fibroblasts/metab..."
55,29118206,Social Origins of Developmental Risk for Menta...,Adversity in early childhood exerts an endurin...,"['*EEG', '*foster care', '*limbic', '*neglect'...","['Adult', 'Child', 'Child Abuse/*psychology', ..."
...,...,...,...,...,...
4033,30519450,Breaking point: the genesis and impact of stru...,Somatic structural variants undoubtedly play i...,"['*DNA double-strand breaks', '*Structural var...","['Cell Transformation, Neoplastic/genetics', '..."
4035,28468917,The circadian dynamics of small nucleolar RNA ...,The circadian regulation of gene expression al...,"['*RNA dynamics', '*circadian rhythms', '*non-...","['Animals', 'Circadian Rhythm/*physiology', 'G..."
4036,28414085,Lost in space? Generalising subtree prune and ...,"Over the last fifteen years, phylogenetic netw...","['*Phylogenetic networks', '*Reticulation-visi...","['*Algorithms', 'Computational Biology/methods..."
4038,28358245,Changes in the demographics of intravenous dru...,Objectives The reported annual incidence of my...,"['Vascular', 'demographics', 'femoral', 'intra...","['Adult', 'Age Factors', 'Amputation', 'Aneury..."
