In [1]:
# URL template for a WikiCFP conference-series listing page; `{}` is filled
# with a value from `cfp_program_id` (see get_program_data).
program_url = 'http://www.wikicfp.com/cfp/program?id={}'
# URL template for a single WikiCFP event page; `{}` is filled with a value
# from `cfp_event_id` (see get_event_data).
event_url = 'http://www.wikicfp.com/cfp/servlet/event.showcfp?eventid={}'
# Conference-series acronym -> id substituted into `program_url`.
cfp_program_id = {
    'AAAI': 3, 
    'ACL': 37, 
    'CVPR': 628, 
    'EACL': 785, 
    'ECCV': 800, 
    'EMNLP': 883, 
    'ICASSP': 1289, 
    'ICCV': 1325, 
    'ICIP': 1390, 
    'ICML': 1421, 
    'IJCAI': 1567, 
    'NAACL': 2170, 
    'NIPS': 2212
}
# Individual event label -> id substituted into `event_url`.
# A string value is not an id: the driver loop treats it as the name of an
# entry already present in `result`, which it re-files under this key
# (the entry stored under the original name is removed).
cfp_event_id = {
    'TACL 2017': 59301, 
    'CoNLL 2008': 2320, 
    'CoNLL 2009': 4212, 
    'CoNLL 2010': 7177, 
    'CoNLL 2011': 13354, 
    'CoNLL 2013': 27672, 
    'CoNLL-GEC 2014': 34917, 
    'CoNLL 2016': 50147, 
    'CoNLL 2017': 61775, 
    'CoNLL 2018': 73954, 
    'CVPR 2017': 55850, 
    'EMNLP-CoNLL 2012': 'EMNLP 2012', 
    'ICLR 2013': 26958, 
    'ICLR 2016': 48154, 
    'ICLR 2017': 56720, 
    'ICLR 2018': 67323, 
    'ICLR 2019': 78931
}

In [2]:
from utils.Requests import Requests
from utils.helpers import parse_dates
from bs4 import BeautifulSoup as BS

def get_event_data(conf):
    """Scrape one WikiCFP event page and return its details.

    Args:
        conf: Key of ``cfp_event_id`` whose value is a numeric event id
            (e.g. ``'ICLR 2017'``). String-valued alias entries must be
            filtered out by the caller.

    Returns:
        A dict with a single entry, keyed by ``conf`` with runs of
        whitespace collapsed to single spaces, whose value holds the keys
        ``'name'``, ``'date'``, ``'location'`` and ``'submission'``.
        An empty dict is returned when the configured id is negative.

    Raises:
        KeyError: ``conf`` is not present in ``cfp_event_id``.
        IndexError: the page does not have the expected nested-table layout.
    """
    event_id = cfp_event_id[conf]
    if event_id < 0:
        return {}

    r = Requests()
    text = r.get(event_url.format(event_id))
    soup = BS(text, 'html.parser')

    # The details live in a deeply nested table; these positional indexes are
    # tied to the page layout and will raise IndexError if it changes.
    table = soup.find_all('table')[2].find_all('table')[3].find_all('table')[1]
    tokens = [td.text.strip() for td in table.find_all('td')]

    name = soup.find('title').text.strip()
    date = parse_dates(tokens[0].split(' - '))
    loca = tokens[1]
    # With five or more cells, two dates are reported (cell 4 first, then
    # cell 2); otherwise only cell 2 is used.
    # NOTE(review): presumably cell 4 is a second deadline row - confirm
    # against a live page.
    if len(tokens) >= 5:
        subm = parse_dates([tokens[4], tokens[2]])
    else:
        subm = parse_dates([tokens[2]])

    # Build a fresh dict instead of writing into the module-level `result`:
    # the function no longer depends on another cell having defined that
    # global, and callers can still merge the return value with `.update()`.
    result = {}
    conf = ' '.join(conf.split())
    result[conf] = {
        'name': name,
        'date': date,
        'location': loca,
        'submission': subm
    }
    return result

def get_program_data(conf):
    """Scrape every listed edition of one conference series from WikiCFP.

    Args:
        conf: Key of ``cfp_program_id`` (a series acronym such as ``'ACL'``).

    Returns:
        A dict keyed by the edition label taken from the listing page (with
        runs of whitespace collapsed), each value holding ``'name'``,
        ``'date'``, ``'location'``, ``'submission'`` and ``'url'``.
        An empty dict is returned when the configured id is negative.

    Side effects:
        Prints the number of non-empty listing rows that were collected.
    """
    program_id = cfp_program_id[conf]
    if program_id < 0:
        return {}

    session = Requests(verbose=True)
    listing = BS(session.get(program_url.format(program_id)), 'html.parser')

    # Collect the left-aligned cell texts of every row, followed by the row's
    # first link target when it has one; rows that yield nothing are dropped.
    rows = []
    listing_table = listing.find_all('table')[2].find_all('table')[2]
    for tr in listing_table.find_all('tr'):
        cells = [td.text for td in tr.find_all('td', {'align': 'left'})]
        link = tr.find('a')
        if link is not None:
            cells.append(link['href'])
        if cells:
            rows.append(cells)
    print(len(rows))

    # Rows are consumed in pairs: the first carries label / title / link, the
    # second carries dates / location / deadlines.
    result = {}
    for idx in range(0, len(rows), 2):
        head = rows[idx]
        detail = rows[idx + 1]

        label = head[0]
        title = head[1]
        page_link = head[2]
        when = parse_dates(detail[0].split(' - '))
        where = detail[1]
        deadlines = parse_dates(detail[2].strip('()').split(' ('))

        # Follow the edition's own page to pick up its external link.
        page = BS(session.get(page_link), 'html.parser')
        centered = page.find('center').find_all('td', {'align': 'center'})
        url = centered[4].find('a')['href']

        result[' '.join(label.split())] = {
            'name': title,
            'date': when,
            'location': where,
            'submission': deadlines,
            'url': url
        }
    return result

# get_program_data('AAAI')

In [3]:
# Accumulator for every scraped conference; kept as a module-level name
# because the cells below read it.
result = {}

# Pass 1: whole conference series, one listing page per acronym.
for conf in cfp_program_id:
    if conf.startswith('EMNLP'):
        print(conf, ' '.join(conf.split()))
    result.update(get_program_data(conf))

# Pass 2: individually listed events.
for conf, target in cfp_event_id.items():
    if type(target) is not int:
        # A non-integer value names an entry gathered earlier: re-file it
        # under this key and remove the entry stored under the old name.
        result[conf] = result[target]
        del result[target]
        continue
    result.update(get_event_data(conf))
# result

cache http://www.wikicfp.com/cfp/program?id=3
16
cache http://www.wikicfp.com/cfp/servlet/event.showcfp?eventid=75888&copyownerid=106070
cache http://www.wikicfp.com/cfp/servlet/event.showcfp?eventid=63229&copyownerid=52097
cache http://www.wikicfp.com/cfp/servlet/event.showcfp?eventid=45094&copyownerid=19251
cache http://www.wikicfp.com/cfp/servlet/event.showcfp?eventid=37497&copyownerid=46410
cache http://www.wikicfp.com/cfp/servlet/event.showcfp?eventid=31244&copyownerid=48382
cache http://www.wikicfp.com/cfp/servlet/event.showcfp?eventid=25337&copyownerid=41734
cache http://www.wikicfp.com/cfp/servlet/event.showcfp?eventid=18353&copyownerid=29161
cache http://www.wikicfp.com/cfp/servlet/event.showcfp?eventid=1842&copyownerid=256
cache http://www.wikicfp.com/cfp/program?id=37
22
cache http://www.wikicfp.com/cfp/servlet/event.showcfp?eventid=81290&copyownerid=2468
cache http://www.wikicfp.com/cfp/servlet/event.showcfp?eventid=67104&copyownerid=2
cache http://www.wikicfp.com/cfp/servl

In [4]:
# List every conference key that was collected.
result.keys()

dict_keys(['AAAI 2019', 'AAAI 2018', 'AAAI 2016', 'AAAI 2015', 'AAAI 2014', 'AAAI 2013', 'AAAI 2012', 'AAAI 2008', 'ACL 2019', 'ACL 2018', 'ACL 2017', 'ACL 2016', 'ACL 2015', 'ACL 2014', 'ACL 2013', 'ACL 2012', 'ACL HLT 2011', 'ACL 2010', 'ACL 2008', 'CVPR 2019', 'CVPR 2018', 'CVPR 2015', 'CVPR 2014', 'CVPR 2013', 'CVPR 2012', 'CVPR 2010', 'CVPR 2009', 'CVPR 2008', 'EACL 2017', 'EACL 2014', 'EACL 2012', 'EACL 2009', 'ECCV 2018', 'ECCV 2012', 'ECCV 2010', 'ECCV 2008', 'EMNLP 2018', 'EMNLP 2017', 'EMNLP 2016', 'EMNLP 2015', 'EMNLP 2014', 'EMNLP 2013', 'EMNLP 2011', 'EMNLP 2010', 'EMNLP 2009', 'EMNLP 2008', 'ICASSP 2019', 'ICASSP 2017', 'ICASSP 2015', 'ICASSP 2014', 'ICASSP 2013', 'ICASSP 2012', 'ICASSP 2011', 'ICASSP 2010', 'ICASSP 2009', 'ICASSP 2008', 'ICCV 2019', 'ICCV 2017', 'ICCV 2013', 'ICCV 2011', 'ICCV 2009', 'ICIP 2019', 'ICIP 2018', 'ICIP 2017', 'ICIP 2016', 'ICIP 2015', 'ICIP 2013', 'ICIP 2012', 'ICIP 2011', 'ICIP 2010', 'ICIP 2009', 'ICIP 2008', 'ICML 2019', 'ICML 2018', 'ICM

In [5]:
# Spot-check one entry that comes from cfp_event_id; entries built by
# get_event_data carry no 'url' field.
result['CVPR 2017']

{'name': 'CVPR 2017 : Computer Vision and Pattern Recognition',
 'date': ['2017-06-24', '2017-06-30'],
 'location': 'Puerto Rico',
 'submission': ['2017-04-10', '2016-11-11']}

In [6]:
import json
# Persist the collected conferences as key-sorted, indented JSON.
# The context manager closes (and therefore flushes) the file as soon as the
# dump finishes, rather than leaving an open handle alive in the kernel.
with open('conferences.json', 'w', encoding='utf-8') as f:
    json.dump(result, f, sort_keys=True, indent=2)