In [1]:
!pip install scholarly
!pip install tqdm
!pip install pandas
!pip install pymed



## PART 1: Fetch publications and abstracts from google scholar

In [2]:
from scholarly import scholarly #, ProxyGenerator
import pymed
from pymed import PubMed
from tqdm import tqdm
import requests
from xml.etree import ElementTree

In [3]:
AUTHOR = 'Marc Kirschner'
last_name = 'kirschner' # lower case is better

# Retrieve the author's data, fill-in, and print
try:
    search_query = scholarly.search_author(AUTHOR)
    author = scholarly.fill(next(search_query))
    
except:
    
    from scholarly import ProxyGenerator

    pg = ProxyGenerator()
    pg.FreeProxies()
    scholarly.use_proxy(pg)
    search_query = scholarly.search_author(AUTHOR)
    author = scholarly.fill(next(search_query))

In [4]:

for pub in tqdm(author['publications']):
    scholarly.fill(pub)

100%|██████████| 655/655 [18:07<00:00,  1.66s/it]


In [13]:
pub

{'container_type': 'Publication',
 'source': <PublicationSource.AUTHOR_PUBLICATION_ENTRY: 2>,
 'bib': {'title': 'University ofWisconsin, Madison',
  'author': 'Joseph L Goldstein and Guido Guidotti and Leland Hartwell and Bruce M Alberts and Henry R Boume and Marc W Kirschner and Elliot Meyerowitz and Thomas D Pollard and Martin Raff and Joan Ruderman and Joseph Gall and James A Spudich and Roger Y Tsien and Masatoshi Takeichi and Michael H Wigler and Mitsuhiro Yanagida and Rosalba A Kampman and Ken-Ichi Arai and William Balch and Mary Beckerle and Merton Bemfield and Michael Berridge and Tom Curran and Benoit de Crombrugghe and Gregor Eichele and Marilyn Farquhar and James R Feramisco and Douglass J Forbes and John R Glenney Jr and Corey S Goodman and Michael M Gottesman and Thomas Graf and Warner Greene and Rudolf Jaenisch and Elizabeth Jones and Mary B Kennedy and Michael Klagsbrun and Robert J Lefkowitz and Lynn Matrisian and Donald Metcalf and Marc Mumby and Paul Nurse and Kenneth

In [5]:
#df = pd.DataFrame(columns = ['title','year','authors','abstract'])
import pandas as pd

def fetch(pub):
    return {'title':pub['bib'].get('title', ''), 'year': pub['bib'].get('pub_year', ''), 'authors': pub['bib'].get('author', ''), 'abstract': pub['bib'].get('abstract', '')}

def fetch_keywords(pubmed_id, print_title=False):
    url_pub = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id={pubmed_id}&&rettype=abstract&retmode=xml'
    response = requests.get(url_pub)
    tree = ElementTree.fromstring(response.content)
    
    if print_title:
        for el in tree.iter('ArticleTitle'):
            print(el.text)

    keywords = []
    for mesh in tree.iter('MeshHeading'):
        descriptor = mesh.find('DescriptorName').text
        keywords.append(descriptor)
    return keywords

with open('stopwords.txt') as f:
    x = f.readlines()
stopwords = [i.strip(',').strip() for i in x[0].split()]

def write_query(title):
    l = title.split()
    l = [i.lower() for i in l if not i.lower() in stopwords]
    l = [i.lower() for i in l if i.isalnum()]
    return " AND ".join([f"{i}[Title]" for i in l])+ f" AND {last_name} [Author]"


In [6]:
df = pd.DataFrame([fetch(pub) for pub in author['publications']])

In [7]:
df = df[df.year!='']
df = df[df.authors.str.lower().str.contains(last_name)]
df = df[df.abstract.notnull()]
df.to_csv(f'{last_name}_publications.csv')

In [8]:
df = pd.read_csv(f'{last_name}_publications.csv', index_col=0)

In [9]:
df.set_index('title', inplace = True)

In [10]:
df

Unnamed: 0_level_0,year,authors,abstract
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Peptide mapping by limited proteolysis in sodium dodecyl sulfate and analysis by gel electrophoresis.,1977,Don W Cleveland and Stuart G Fischer and Marc ...,A rapid and convenient method for peptide mapp...
Dynamic instability of microtubule growth,1984,Tim Mitchison and Marc Kirschner,We report here that microtubules in vitro coex...
A protein factor essential for microtubule assembly,1975,Murray D Weingarten and Arthur H Lockwood and ...,A heat stable protein essentail for microtubul...
Cyclin is degraded by the ubiquitin pathway,1991,Michael Glotzer and Andrew W Murray and Marc W...,Cyclin degradation is the key step governing e...
Absolute quantification of proteins and phosphoproteins from cell lysates by tandem MS,2003,Scott A Gerber and John Rush and Olaf Stemman ...,A need exists for technologies that permit the...
...,...,...,...
ISOLATION OF SEPARATE MESSENGER-RNAS FOR ALPHA AND BETA TUBULIN AND BETA AND GAMMA ACTIN AND CHARACTERIZATION OF CORRESPONDING INVITRO TRANSLATION PRODUCTS,1978,DW CLEVELAND and PS HERSH and MW KIRSCHNER and...,
TURNOVER OF NON-EXCHANGEABLY BOUND GTP ON TUBULIN IN CHINESE-HAMSTER OVARY CELLS,1977,BM SPIEGELMAN and SM PENNINGROTH and MW KIRSCHNER,
MECHANISM OF MICROTUBULE ASSEMBLY INVITRO,1975,MW KIRSCHNER,
Conformational changes in aspartate transcarbamylase,1971,Marc Wallace Kirschner,Conformational changes in proteins are thought...


In [11]:
title = 'Dynamic instability of microtubule growth'
write_query(title)

'dynamic[Title] AND instability[Title] AND microtubule[Title] AND growth[Title] AND kirschner [Author]'

In [12]:

pubmed = PubMed(tool="MyTool", email="my@email.address")

for i in tqdm(df.index):
    title = i
    year = df.loc[i,'year']
    q = write_query(title)
    results = pubmed.query(q, max_results=1)
    #print(title)
    try:
        x = next(results)
        x = x.toDict()
        pubmed_id = x['pubmed_id'].split('\n')[0]
        keywords = fetch_keywords(pubmed_id)
        df.loc[i, keywords] = 1
        df.loc[i, 'pubmed_id'] = pubmed_id
        #print(keywords, '\n')


    except StopIteration:

        #print("QUERY NOT FOUND!!! \n")
        continue
    

df.loc[:,kw_cols]= df[kw_cols]==1
kw_cols = df.columns[4:]
df[kw_cols] = pd.notnull(df[kw_cols])

df[kw_cols]

100%|██████████| 586/586 [12:17<00:00,  1.26s/it]


NameError: name 'kw_cols' is not defined

In [20]:
df.reset_index(inplace=True)

In [21]:
df.to_pickle('/Users/ernestmordret/Desktop/test_df')

In [24]:
df

Unnamed: 0,title,year,authors,abstract,Alkaline Phosphatase,"Electrophoresis, Polyacrylamide Gel",Escherichia coli,Peptide Fragments,Peptide Hydrolases,Proteins,...,Granulation Tissue,Hip Joint,Knee Joint,Prosthesis Failure,Prosthesis-Related Infections,Reoperation,DNA Glycosylases,N-Glycosyl Hydrolases,"Spectrometry, Mass, Matrix-Assisted Laser Desorption-Ionization",Uracil-DNA Glycosidase
0,Peptide mapping by limited proteolysis in sodi...,1977,Don W Cleveland and Stuart G Fischer and Marc ...,A rapid and convenient method for peptide mapp...,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,
1,Dynamic instability of microtubule growth,1984,Tim Mitchison and Marc Kirschner,We report here that microtubules in vitro coex...,,,,,,,...,,,,,,,,,,
2,A protein factor essential for microtubule ass...,1975,Murray D Weingarten and Arthur H Lockwood and ...,A heat stable protein essentail for microtubul...,,,,,,,...,,,,,,,,,,
3,Cyclin is degraded by the ubiquitin pathway,1991,Michael Glotzer and Andrew W Murray and Marc W...,Cyclin degradation is the key step governing e...,,,,,,,...,,,,,,,,,,
4,Absolute quantification of proteins and phosph...,2003,Scott A Gerber and John Rush and Olaf Stemman ...,A need exists for technologies that permit the...,,,,,,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581,ISOLATION OF SEPARATE MESSENGER-RNAS FOR ALPHA...,1978,DW CLEVELAND and PS HERSH and MW KIRSCHNER and...,,,,,,,,...,,,,,,,,,,
582,TURNOVER OF NON-EXCHANGEABLY BOUND GTP ON TUBU...,1977,BM SPIEGELMAN and SM PENNINGROTH and MW KIRSCHNER,,,,,,,,...,,,,,,,,,,
583,MECHANISM OF MICROTUBULE ASSEMBLY INVITRO,1975,MW KIRSCHNER,,,,,,,,...,,,,,,,,,,
584,Conformational changes in aspartate transcarba...,1971,Marc Wallace Kirschner,Conformational changes in proteins are thought...,,,,,,,...,,,,,,,,,,


In [18]:
!pip install --upgrade kmodes


Collecting kmodes
  Downloading kmodes-0.11.0-py2.py3-none-any.whl (18 kB)
Collecting scipy>=0.13.3
  Using cached scipy-1.5.4-cp36-cp36m-macosx_10_9_x86_64.whl (28.8 MB)
Collecting joblib>=0.11
  Downloading joblib-1.0.1-py3-none-any.whl (303 kB)
[K     |████████████████████████████████| 303 kB 2.4 MB/s eta 0:00:01
Collecting scikit-learn>=0.22.0
  Downloading scikit_learn-0.24.1-cp36-cp36m-macosx_10_13_x86_64.whl (7.2 MB)
[K     |████████████████████████████████| 7.2 MB 6.4 MB/s eta 0:00:01     |▋                               | 133 kB 2.1 MB/s eta 0:00:04     |██▍                             | 542 kB 2.1 MB/s eta 0:00:04     |███                             | 696 kB 2.1 MB/s eta 0:00:04     |████████████████████▍           | 4.6 MB 1.9 MB/s eta 0:00:02     |███████████████████████▏        | 5.2 MB 1.9 MB/s eta 0:00:02
[?25hCollecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-2.1.0-py3-none-any.whl (12 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit

In [97]:
kw_count = df[kw_cols].sum(0)
kw_count.sort_values()

pubmed_id                           0
Nucleoside-Diphosphate Kinase       1
Amino Acid Isomerases               1
Spectrophotometry, Ultraviolet      1
Isoelectric Focusing                1
                                 ... 
Xenopus                            41
Molecular Sequence Data            42
Microtubules                       47
Humans                             56
Animals                           132
Length: 650, dtype: int64

In [98]:
data_cols = kw_count[(kw_count>5) & (kw_count<50)].index

In [99]:
data = df[data_cols]

In [100]:
import numpy as np
from kmodes.kmodes import KModes

# random categorical data

km = KModes(n_clusters=10, init='Huang', n_init=5, verbose=1)

clusters = km.fit_predict(data)

# Print the cluster centroids
print(km.cluster_centroids_)

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 31, cost: 1180.0
Run 1, iteration: 2/100, moves: 21, cost: 1173.0
Run 1, iteration: 3/100, moves: 3, cost: 1173.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 46, cost: 1159.0
Run 2, iteration: 2/100, moves: 20, cost: 1148.0
Run 2, iteration: 3/100, moves: 5, cost: 1148.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 27, cost: 1161.0
Run 3, iteration: 2/100, moves: 12, cost: 1155.0
Run 3, iteration: 3/100, moves: 1, cost: 1155.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 52, cost: 1130.0
Run 4, iteration: 2/100, moves: 17, cost: 1119.0
Run 4, iteration: 3/100, moves: 0, cost: 1119.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration

In [103]:
for i in range(len(km.cluster_centroids_)):
    print(i,data_cols[km.cluster_centroids_[i]], '\n')

0 Index(['Tubulin', 'Microtubules'], dtype='object') 

1 Index(['Cell Movement', 'Actins', 'Wiskott-Aldrich Syndrome Protein Family'], dtype='object') 

2 Index(['Cell Cycle Proteins', 'Actins', 'Recombinant Proteins',
       'Cytoskeletal Proteins', 'Macromolecular Substances',
       'Models, Biological', 'Oocytes'],
      dtype='object') 

3 Index([], dtype='object') 

4 Index(['Electrophoresis, Polyacrylamide Gel', 'Microtubules', 'Brain',
       'Microscopy, Electron', 'Nerve Tissue Proteins', 'Swine',
       'Macromolecular Substances'],
      dtype='object') 

5 Index(['Amino Acid Sequence', 'Cloning, Molecular', 'Molecular Sequence Data',
       'RNA, Messenger', 'Oocytes', 'Fibroblast Growth Factors',
       'Base Sequence', 'DNA'],
      dtype='object') 

6 Index(['Biological Evolution'], dtype='object') 

7 Index(['Xenopus', 'Mitosis', 'Female', 'Ovum'], dtype='object') 

8 Index(['Mice', 'Cell Size', 'G1 Phase'], dtype='object') 

9 Index(['Cell Cycle Proteins', 'HeLa Cells