In [1]:
# Step 1: Import the package
import urllib.request
# Site root; used later as the base when building citation-page URLs.
base_url = 'https://scholar.google.com/'
# Public Google Scholar profile whose publication list is scraped.
url = "https://scholar.google.com/citations?hl=en&user=fZKJdb0AAAAJ"

# Read the raw profile HTML. The context manager closes the HTTP response
# (and its underlying socket) as soon as the body has been read, instead of
# leaving it open until garbage collection.
with urllib.request.urlopen(url) as response:
    html_doc = response.read()

In [40]:
from bs4 import BeautifulSoup
# Parse the downloaded profile page using the 'html.parser' backend.
soup = BeautifulSoup(markup=html_doc, features='html.parser')

def read_page_and_get_soup(url):
    """Download the page at ``url`` and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch (anything ``urlopen`` accepts).

    Returns:
        A ``BeautifulSoup`` object built from the response body with the
        ``'html.parser'`` backend.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails (``HTTPError`` is a subclass).
    """
    # Close the response deterministically; this function is called once per
    # publication, so unclosed responses would otherwise pile up.
    with urllib.request.urlopen(url) as response:
        html_doc = response.read()
    return BeautifulSoup(html_doc, 'html.parser')


In [9]:
# Text of the element with id 'gsc_prf_in' (the name shown on the profile).
name = soup.find(id='gsc_prf_in').get_text()

'Zachary Novack'

In [48]:
# Logical names for the HTML ids / CSS classes the scraper looks up, so the
# parsing code does not hard-code Google Scholar's markup identifiers.
page_layout_tags = dict(
    citation_table='gsc_a_b',
    work_info_row='gsc_a_t',
    work_info_table='gsc_oci_table',
    field_name='gsc_oci_field',
    field_value='gsc_oci_value',
)


In [87]:
# Build one record per publication row of the profile's citation table.
# Each record holds the work's title and the (site-relative) link to its
# detail page.
works = []
citation_table = soup.find(id=page_layout_tags['citation_table'])
for table_row in citation_table.find_all('tr', recursive=False):
    work = {}
    for table_column in table_row.find_all('td', recursive=False):
        # .get() tolerates a <td> without a class attribute, where
        # table_column['class'] would raise KeyError.
        if page_layout_tags['work_info_row'] not in table_column.get('class', []):
            continue
        title_link = table_column.a
        if title_link is None:
            # Title cell without an anchor: nothing to record.
            continue
        work['title'] = title_link.text
        work['link'] = title_link['href']
    # Skip rows that yielded no title cell; appending an empty dict would
    # break the later work['link'] lookup when detail pages are fetched.
    if work:
        works.append(work)

print(works)


[{'title': 'Chils: Zero-shot image classification with hierarchical label sets', 'link': '/citations?view_op=view_citation&hl=en&oe=ASCII&user=fZKJdb0AAAAJ&citation_for_view=fZKJdb0AAAAJ:9yKSN-GCB0IC'}, {'title': 'Ditto: Diffusion inference-time t-optimization for music generation', 'link': '/citations?view_op=view_citation&hl=en&oe=ASCII&user=fZKJdb0AAAAJ&citation_for_view=fZKJdb0AAAAJ:qjMakFHDy7sC'}, {'title': 'Disentangling the Mechanisms Behind Implicit Regularization in SGD', 'link': '/citations?view_op=view_citation&hl=en&oe=ASCII&user=fZKJdb0AAAAJ&citation_for_view=fZKJdb0AAAAJ:u-x6o8ySG0sC'}, {'title': 'Futga: Towards Fine-grained Music Understanding through Temporally-enhanced Generative Augmentation', 'link': '/citations?view_op=view_citation&hl=en&oe=ASCII&user=fZKJdb0AAAAJ&citation_for_view=fZKJdb0AAAAJ:IjCSPb-OGe4C'}, {'title': 'DITTO-2: Distilled Diffusion Inference-Time T-Optimization for Music Generation', 'link': '/citations?view_op=view_citation&hl=en&oe=ASCII&user=fZ

In [118]:
# Whitelist of detail-page field names (lower-cased, spaces replaced by
# underscores) that are copied into each work record.
valid_work_fields = [
    'authors',
    'publication_date',
    'conference',
    'pages',
    'publisher',
    'description',
    'total_citations',
]

def default_processor(soup):
    """Fallback field processor: return the element's text unchanged.

    Args:
        soup: Object exposing a ``text`` attribute (a parsed HTML element).

    Returns:
        The value of ``soup.text``.
    """
    raw_text = soup.text
    return raw_text

def process_text_date(soup):
    """Normalise a publication-date element to a ``YYYY-MM-DD`` string.

    The element's text is parsed with ``dateutil`` using year-first,
    month-before-day interpretation for ambiguous numeric dates.

    Args:
        soup: Object exposing a ``text`` attribute holding the date string.

    Returns:
        The parsed date formatted as ``'%Y-%m-%d'``. Components missing from
        the text (e.g. a year-only date) fall back to January 1st.

    Raises:
        ValueError: Propagated from ``dateutil`` when the text cannot be
            parsed as a date.
    """
    from datetime import datetime

    import dateutil.parser as parser

    # Without an explicit `default`, dateutil fills absent fields from the
    # current date, so a partial date like "2023" would come back with
    # today's month and day and change from one run to the next.
    fallback = datetime(1900, 1, 1)
    parsed = parser.parse(
        soup.text, default=fallback, yearfirst=True, dayfirst=False
    )
    return parsed.strftime('%Y-%m-%d')

def process_authors(soup):
    """Split a comma-separated author string into a list of names.

    Args:
        soup: Object exposing a ``text`` attribute with the author list.

    Returns:
        A list with one entry per comma-separated piece of ``soup.text``,
        each stripped of surrounding whitespace.
    """
    authors = []
    for raw_name in soup.text.split(','):
        authors.append(raw_name.strip())
    return authors

def process_citations(soup):
    """Extract the citation count from a 'Cited by N' link.

    Args:
        soup: Element whose first ``div`` contains an ``a`` tag with text
            such as ``'Cited by 12'``.

    Returns:
        The count as an ``int``.
    """
    link_text = soup.div.a.text
    # Drop the fixed label so only the number remains.
    count_text = link_text.replace('Cited by ', '')
    return int(count_text)

from collections import defaultdict


# Maps a field name to the function that converts its HTML element into a
# plain Python value. Fields without a dedicated entry fall back to
# default_processor through the defaultdict factory.
field_processors = defaultdict(
    lambda: default_processor,
    {
        'publication_date': process_text_date,
        'authors': process_authors,
        'total_citations': process_citations,
    },
)


In [120]:
import time
from urllib.parse import urljoin

# Visit each work's detail page and copy the whitelisted fields into its
# record (the dicts in `works` are updated in place).
for work in works:
    # urljoin resolves the root-relative link against the site root. Plain
    # concatenation of base_url (trailing '/') and the link (leading '/')
    # produced 'https://scholar.google.com//citations...'.
    work_page = read_page_and_get_soup(urljoin(base_url, work['link']))
    info_table = work_page.find(id=page_layout_tags['work_info_table'])
    for work_field in info_table.find_all('div', recursive=False):
        # Turn the displayed label (e.g. 'Publication date') into a key
        # such as 'publication_date'.
        field_name = work_field.find(
            'div', {'class': page_layout_tags['field_name']}
        ).text.lower().replace(' ', '_')
        field_value = work_field.find(
            'div', {'class': page_layout_tags['field_value']}
        )

        if field_name in valid_work_fields:
            work[field_name] = field_processors[field_name](field_value)
    # Pause between page requests (presumably to avoid being rate-limited).
    time.sleep(2)

works

[{'title': 'Chils: Zero-shot image classification with hierarchical label sets',
  'link': '/citations?view_op=view_citation&hl=en&oe=ASCII&user=fZKJdb0AAAAJ&citation_for_view=fZKJdb0AAAAJ:9yKSN-GCB0IC',
  'authors': ['Zachary Novack',
   'Julian McAuley',
   'Zachary Chase Lipton',
   'Saurabh Garg'],
  'publication_date': '2023-07-03',
  'conference': 'International Conference on Machine Learning',
  'pages': '26342-26362',
  'publisher': 'PMLR',
  'description': 'Open vocabulary models (eg CLIP) have shown strong performance on zero-shot classification through their ability generate embeddings for each class based on their (natural language) names. Prior work has focused on improving the accuracy of these models through prompt engineering or by incorporating a small amount of labeled downstream data (via finetuning). However, there has been little focus on improving the richness of the class names themselves, which can pose issues when class labels are coarsely-defined and are uninf

In [128]:
# Imported in this cell because the notebook's only other `import json`
# sits in the following cell; on a fresh kernel run top-to-bottom this line
# would otherwise raise NameError.
import json

# Preview the scraped records as a JSON string.
json.dumps(works)

'[{"title": "Chils: Zero-shot image classification with hierarchical label sets", "link": "/citations?view_op=view_citation&hl=en&oe=ASCII&user=fZKJdb0AAAAJ&citation_for_view=fZKJdb0AAAAJ:9yKSN-GCB0IC", "authors": ["Zachary Novack", "Julian McAuley", "Zachary Chase Lipton", "Saurabh Garg"], "publication_date": "2023-07-03", "conference": "International Conference on Machine Learning", "pages": "26342-26362", "publisher": "PMLR", "description": "Open vocabulary models (eg CLIP) have shown strong performance on zero-shot classification through their ability generate embeddings for each class based on their (natural language) names. Prior work has focused on improving the accuracy of these models through prompt engineering or by incorporating a small amount of labeled downstream data (via finetuning). However, there has been little focus on improving the richness of the class names themselves, which can pose issues when class labels are coarsely-defined and are uninformative. We propose C

In [129]:
import json

# Persist the scraped records to disk as JSON; the file is closed when the
# `with` block exits.
with open('zach_scrape.json', mode='w') as json_file:
    json.dump(works, fp=json_file)