In [1]:
from urllib.parse import parse_qs, urlparse

from bs4 import BeautifulSoup as BS

from utils.Requests import Requests
from utils.helpers import parse_dates

def get_arxiv(url):
    """Scrape title, authors, dates and abstract from an arXiv abstract page.

    Parameters
    ----------
    url : str
        Address of the arXiv abstract page to fetch.

    Returns
    -------
    dict
        Keys ``title`` (str), ``authors`` (list of str), ``dates`` (whatever
        the project helper ``parse_dates`` returns) and ``abstract`` (str).

    Notes
    -----
    ``Requests`` is a project helper; its ``get`` result is handed straight to
    BeautifulSoup, so it presumably returns the page markup -- confirm.
    A missing element surfaces as ``TypeError``/``AttributeError`` from the
    lookup on ``None``.
    """
    page = BS(Requests().get(url), 'html.parser')

    title = page.find('meta', attrs={'name': 'citation_title'})['content']

    author_links = page.find('div', attrs={'class': 'authors'}).find_all('a')
    authors = [link.text for link in author_links]

    # The dateline text is split on the "(v1)" marker; each piece is then
    # stripped of its "(Submitted on " / "last revised " prefix and of any
    # trailing parenthesised remark, leaving only the date itself.
    dateline = page.find('div', attrs={'class': 'dateline'}).text
    raw_dates = []
    for piece in dateline.split(' (v1), '):
        piece = piece.replace('(Submitted on ', '').replace('last revised ', '')
        raw_dates.append(piece.split(' (')[0].strip(' ()'))
    dates = parse_dates(raw_dates)

    # Drop the "Abstract: " label and flatten the text onto a single line.
    summary = page.find('blockquote', attrs={'class': 'abstract mathjax'}).text
    abstract = summary.replace('Abstract: ', '').replace('\n', ' ').strip()

    return {
        'title': title,
        'authors': authors,
        'dates': dates,
        'abstract': abstract,
    }

def _acm_article_id(url):
    """Return the value of the ``id`` query parameter of an ACM citation URL."""
    query = parse_qs(urlparse(url).query)
    if 'id' in query:
        return query['id'][0]
    # No usable ``id`` parameter: keep the historical behaviour of taking
    # whatever follows the first '=' in the URL.
    return url.split('=')[1]


def _acm_cells_containing(soup, marker):
    """Return every ``<td>`` tag of ``soup`` whose text contains ``marker``."""
    return soup.find_all(lambda tag: tag.name == "td" and marker in tag.text)


def get_acm(url):
    """Scrape title, authors, dates and abstract from an ACM DL citation page.

    Parameters
    ----------
    url : str
        Citation page address carrying the article id in its query string,
        e.g. ``https://dl.acm.org/citation.cfm?id=1273596``.

    Returns
    -------
    dict
        Keys ``title`` (str), ``authors`` (list of str), ``dates`` (whatever
        the project helper ``parse_dates`` returns) and ``abstract`` (str).

    Notes
    -----
    * The article id is read from the ``id`` query parameter, so extra
      parameters (``...?id=123&coll=DL``) no longer leak into the abstract
      request as they did with ``url.split('=')[1]``.
    * ``Requests`` is a project helper; the browser-like headers are passed
      through its ``params`` argument -- presumably forwarded to the HTTP
      call, confirm against ``utils.Requests``.
    * The positional table indexing is tied to the page layout and raises
      ``IndexError``/``AttributeError`` if the layout differs.
    """
    r = Requests()
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15'}
    text = r.get(url, params={'headers': headers})
    soup = BS(text, 'html.parser')

    title = soup.find('title').text
    # Author links sit in a nested layout table; only every other link is
    # kept ([::2]) -- the skipped ones are presumably non-name links.
    author_table = soup.find_all('table')[4].find('table').find_all('table')[1]
    authors = [a.text for a in author_table.find_all('a')[::2]]

    # Date lookup tries three cell kinds in turn: one containing an em dash
    # (a full date is assembled from tokens 0, 1 and 4 after the dash), one
    # containing a copyright sign, and finally one containing "Article".
    dash_cells = _acm_cells_containing(soup, "—")
    if dash_cells:
        date_tokens = dash_cells[-1].text.split('\n')[3].split(' — ')[1].split()
        dates = parse_dates([' '.join([date_tokens[0], date_tokens[1], date_tokens[4]])])
    else:
        copyright_cells = _acm_cells_containing(soup, "©")
        if copyright_cells:
            year = copyright_cells[-1].text.split('\n')[2].split('©')[1]
        else:
            article_cells = _acm_cells_containing(soup, "Article")
            year = article_cells[0].text.strip().split(' ')[0]
        dates = parse_dates([year], reset=True)

    # The abstract is not part of the citation page; it is fetched from a
    # separate endpoint keyed by the article id.
    abstract_markup = r.get(
        'https://dl.acm.org/tab_abstract.cfm?id={}&type=Article&usebody=tabbody&_cf_containerId=cf_layoutareaabstract&_cf_nodebug=true&_cf_nocache=true&_cf_rc=0'.format(_acm_article_id(url)),
        params={'headers': headers}
    )
    abstract = BS(abstract_markup, 'html.parser').find('p').text

    return {
        'title': title,
        'authors': authors,
        'dates': dates,
        'abstract': abstract,
    }

def get_acl(url):
    """Read citation metadata for an ACL page from its ``<meta>`` tags.

    Parameters
    ----------
    url : str
        Address of the paper page to fetch.

    Returns
    -------
    dict
        Keys ``title`` (str), ``authors`` (list of str), ``dates`` (result of
        the project helper ``parse_dates`` called with ``reset=True``) and
        ``abstract``, which is always the empty string here.
    """
    client = Requests()
    page = BS(client.get(url), 'html.parser')

    title_tag = page.find('meta', attrs={'name': 'citation_title'})
    title = title_tag['content']

    # One citation_author <meta> tag is expected per author.
    authors = []
    for tag in page.find_all('meta', attrs={'name': 'citation_author'}):
        authors.append(tag['content'])

    published = page.find('meta', attrs={'name': 'citation_publication_date'})['content']
    dates = parse_dates([published], reset=True)

    # This scraper does not extract an abstract.
    return {
        'title': title,
        'authors': authors,
        'dates': dates,
        'abstract': '',
    }

def get_scholarpedia(url):
    """Read citation metadata for a Scholarpedia article from its ``<meta>`` tags.

    Parameters
    ----------
    url : str
        Address of the article page to fetch.

    Returns
    -------
    dict
        Keys ``title`` (str), ``authors`` (list of str), ``dates`` (result of
        the project helper ``parse_dates``) and ``abstract``, which is always
        the empty string here.
    """
    markup = Requests().get(url)
    doc = BS(markup, 'html.parser')

    def first_meta(key):
        # `content` attribute of the first <meta name=key> tag.
        return doc.find('meta', attrs={'name': key})['content']

    # Entries are evaluated top to bottom: title, then authors, then dates.
    return {
        'title': first_meta('citation_title'),
        'authors': [
            tag['content']
            for tag in doc.find_all('meta', attrs={'name': 'citation_author'})
        ],
        'dates': parse_dates([first_meta('citation_date')]),
        'abstract': '',
    }

def get_nature(url):
    """Scrape title, authors, online date and abstract from a Nature article page.

    Parameters
    ----------
    url : str
        Address of the article page to fetch.

    Returns
    -------
    dict
        Keys ``title`` (str), ``authors`` (list of str), ``dates`` (result of
        the project helper ``parse_dates``) and ``abstract`` (text of the
        first ``<p>`` inside the ``abstract-content`` div).
    """
    doc = BS(Requests().get(url), 'html.parser')

    def meta_tags(key):
        # Every <meta name=key> tag on the page.
        return doc.find_all('meta', attrs={'name': key})

    title = doc.find('meta', attrs={'name': 'citation_title'})['content']
    authors = [tag['content'] for tag in meta_tags('citation_author')]

    online_date = doc.find('meta', attrs={'name': 'citation_online_date'})['content']
    dates = parse_dates([online_date])

    abstract_box = doc.find('div', attrs={'id': 'abstract-content'})
    abstract = abstract_box.find('p').text

    return dict(title=title, authors=authors, dates=dates, abstract=abstract)

def get_empty(url):
    """Return a placeholder record whose four fields are all empty strings.

    ``url`` is accepted so the signature matches the other ``get_*`` scrapers
    but it is not used.
    """
    return dict.fromkeys(('title', 'authors', 'dates', 'abstract'), '')

# r = Requests()
# url = 'https://arxiv.org/abs/1612.03242'
# text = r.get(url)
# soup = BS(text, 'html.parser')
# soup

In [2]:
import pandas as pd
# Load the catalogue of works to scrape; the columns used downstream are
# `name`, `source` and `url`. The bare last expression previews the first
# five rows (the default for `head`).
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,name,source,url
0,MLP,DTIC,http://www.dtic.mil/docs/citations/AD0256582
1,Backpropagation,DTIC,http://www.dtic.mil/docs/citations/ADA164453
2,Non-Linearity,MCSS,https://link.springer.com/article/10.1007/BF02...
3,Activation Functions,Book,https://dl.acm.org/citation.cfm?id=541500
4,RBM,ICML 2007,https://dl.acm.org/citation.cfm?id=1273596


In [3]:
# Scrape every catalogued URL. A URL is routed to the first scraper whose
# keyword occurs anywhere in it, so the order of the pairs below matters.
result = []
for idx, row in df.iterrows():
    url = row['url']
    info = dict(name=row['name'], source=row['source'], url=row['url'])
    try:
        for keyword, scrape in (
            ('arxiv', get_arxiv),
            ('acm', get_acm),
            ('acl', get_acl),
            ('scholarpedia', get_scholarpedia),
            ('nature', get_nature),
        ):
            if keyword in url:
                info.update(scrape(url))
                break
        else:
            # No scraper claims this URL: report it and leave it out of `result`.
            print('Unrecognized url:', url)
            continue
    except Exception as e:
        # Show which URL failed, then abort the whole run by re-raising.
        print(url)
        raise e
    result.append(info)
print(result[:5])

Unrecognized url: http://www.dtic.mil/docs/citations/AD0256582
Unrecognized url: http://www.dtic.mil/docs/citations/ADA164453
Unrecognized url: https://link.springer.com/article/10.1007/BF02551274
Unrecognized url: https://www.researchgate.net/publication/269295652_Deep_neural_networks_for_small_footprint_text-dependent_speaker_verification
Unrecognized url: https://www.researchgate.net/publication/327812023_X-Vectors_Robust_DNN_Embeddings_for_Speaker_Recognition
Unrecognized url: https://ieeexplore.ieee.org/document/7780518/
Unrecognized url: https://ieeexplore.ieee.org/document/8237776/
Unrecognized url: https://deepmind.com/blog/neural-scene-representation-and-rendering/
Unrecognized url: https://openreview.net/forum?id=SkfMWhAqYQ
Unrecognized url: ''
Unrecognized url: https://github.com/facebookresearch/Detectron
Unrecognized url: https://sites.skoltech.ru/app/data/uploads/sites/25/2018/04/deep_image_prior.pdf
[{'name': 'Activation Functions', 'source': 'Book', 'url': 'https://dl.a

In [4]:
import json
# Persist the scraped records. The context manager closes (and therefore
# flushes) works.json as soon as the dump finishes, instead of leaving the
# handle open until the garbage collector happens to reclaim it.
with open('works.json', 'w', encoding='utf-8') as works_file:
    json.dump(result, works_file, sort_keys=True, indent=2)