In [1]:
program_url = 'http://www.wikicfp.com/cfp/program?id={}'
event_url = 'http://www.wikicfp.com/cfp/servlet/event.showcfp?eventid={}'
cfp_program_id = {
    'AAAI': 3, 
    'ACL': 37, 
    'CVPR': 628, 
    'EACL': 785, 
    'ECCV': 800, 
    'EMNLP': 883, 
    'ICASSP': 1289, 
    'ICCV': 1325, 
    'ICIP': 1390, 
    'ICML': 1421, 
    'IJCAI': 1567, 
    'NAACL': 2170, 
    'NIPS': 2212
}
cfp_event_id = {
    'TACL 2017': 59301, 
    'CoNLL 2008': 2320, 
    'CoNLL 2009': 4212, 
    'CoNLL 2010': 7177, 
    'CoNLL 2011': 13354, 
    'CoNLL 2013': 27672, 
    'CoNLL-GEC 2014': 34917, 
    'CoNLL 2016': 50147, 
    'CoNLL 2017': 61775, 
    'CoNLL 2018': 73954, 
    'CVPR 2017': 55850, 
    'EMNLP-CoNLL 2012': 'EMNLP 2012', 
    'ICLR 2013': 26958, 
    'ICLR 2016': 48154, 
    'ICLR 2017': 56720, 
    'ICLR 2018': 67323, 
    'ICLR 2019': 78931
}

In [2]:
from utils.Requests import Requests
from utils.helpers import parse_dates
from bs4 import BeautifulSoup as BS

def get_event_data(conf):
    """Scrape a single WikiCFP event page and return its details.

    Parameters
    ----------
    conf : str
        Key into ``cfp_event_id``. The mapped value must be a numeric event
        id; entries whose value is a string (aliases) are not handled here
        and have to be resolved by the caller.

    Returns
    -------
    dict
        ``{}`` when the mapped id is negative. Otherwise a one-entry dict
        whose key is ``conf`` with runs of whitespace collapsed, and whose
        value holds ``name``, ``date``, ``location`` and ``submission``.
        ``date`` and ``submission`` are whatever ``parse_dates`` returns.

    Raises
    ------
    KeyError
        If ``conf`` is not in ``cfp_event_id``.
    IndexError
        If the page does not have the expected nested-table layout.
    """
    event_id = cfp_event_id[conf]
    if event_id < 0:
        return {}

    page = Requests().get(event_url.format(event_id))
    soup = BS(page, 'html.parser')

    # The details table is located purely by its nesting position in the page.
    info_table = soup.find_all('table')[2].find_all('table')[3].find_all('table')[1]
    tokens = [td.text.strip() for td in info_table.find_all('td')]

    name = soup.find('title').text.strip()
    date = parse_dates(tokens[0].split(' - '))
    loca = tokens[1]
    if len(tokens) >= 5:
        # With five or more cells, the fifth cell is listed before the third.
        subm = parse_dates([tokens[4], tokens[2]])
    else:
        subm = parse_dates([tokens[2]])

    # Return a fresh dict rather than writing into a module-level `result`:
    # the function must not depend on (or mutate) notebook state that is only
    # created by a later cell.
    return {
        ' '.join(conf.split()): {
            'name': name,
            'date': date,
            'location': loca,
            'submission': subm,
        }
    }

def get_program_data(conf):
    """Scrape every edition listed on a WikiCFP program (series) page.

    Parameters
    ----------
    conf : str
        Key into ``cfp_program_id``.

    Returns
    -------
    dict
        ``{}`` when the mapped id is negative. Otherwise one entry per
        edition, keyed by the edition label with runs of whitespace
        collapsed; each value holds ``name``, ``date``, ``location``,
        ``submission`` and ``url``. ``date`` and ``submission`` are whatever
        ``parse_dates`` returns.

    Side effects
    ------------
    Prints the number of non-empty rows found on the listing page and issues
    one additional request per edition to read its external link.
    """
    program_id = cfp_program_id[conf]
    if program_id < 0:
        return {}

    session = Requests(verbose=True)
    listing = BS(session.get(program_url.format(program_id)), 'html.parser')

    # Collect the left-aligned cell texts of each row, plus the row's first
    # link target when it has one; rows that yield nothing are dropped.
    rows = []
    listing_table = listing.find_all('table')[2].find_all('table')[2]
    for tr in listing_table.find_all('tr'):
        cells = [td.text for td in tr.find_all('td', {'align': 'left'})]
        link = tr.find('a')
        if link is not None:
            cells.append(link['href'])
        if cells:
            rows.append(cells)
    print(len(rows))

    # Rows are consumed in pairs: a heading row (label, name, link) followed
    # by a detail row (dates, location, deadline text).
    editions = {}
    for i in range(0, len(rows), 2):
        heading = rows[i]
        label, name, detail_url = heading[0], heading[1], heading[2]

        detail = rows[i + 1]
        date = parse_dates(detail[0].split(' - '))
        loca = detail[1]
        subm = parse_dates(detail[2].strip('()').split(' ('))

        # The external link is taken from the fifth centred cell of the event page.
        event_page = BS(session.get(detail_url), 'html.parser')
        centred = event_page.find('center').find_all('td', {'align': 'center'})
        url = centred[4].find('a')['href']

        editions[' '.join(label.split())] = {
            'name': name,
            'date': date,
            'location': loca,
            'submission': subm,
            'url': url,
        }
    return editions

# Example: get_program_data('AAAI')

In [3]:
# Scrape every conference series first, then layer the individually listed
# editions on top. `result` maps an edition label to its scraped details.
result = {}
for series in cfp_program_id:
    result.update(get_program_data(series))
for conf, target in cfp_event_id.items():
    if isinstance(target, int):
        result.update(get_event_data(conf))
    else:
        # A string value is an alias: move the entry already scraped under
        # `target` so that it is filed under `conf` instead.
        result[conf] = result.pop(target)
# result

cache http://www.wikicfp.com/cfp/program?id=628
18
cache http://www.wikicfp.com/cfp/servlet/event.showcfp?eventid=78951&copyownerid=25236
cache http://www.wikicfp.com/cfp/servlet/event.showcfp?eventid=67108&copyownerid=2
cache http://www.wikicfp.com/cfp/servlet/event.showcfp?eventid=36730&copyownerid=2
cache http://www.wikicfp.com/cfp/servlet/event.showcfp?eventid=31712&copyownerid=52931
cache http://www.wikicfp.com/cfp/servlet/event.showcfp?eventid=22376&copyownerid=36575
cache http://www.wikicfp.com/cfp/servlet/event.showcfp?eventid=16291&copyownerid=2
cache http://www.wikicfp.com/cfp/servlet/event.showcfp?eventid=5649&copyownerid=2082
cache http://www.wikicfp.com/cfp/servlet/event.showcfp?eventid=3507&copyownerid=1513
cache http://www.wikicfp.com/cfp/servlet/event.showcfp?eventid=964&copyownerid=2
cache http://www.wikicfp.com/cfp/program?id=800
8
cache http://www.wikicfp.com/cfp/servlet/event.showcfp?eventid=58291&copyownerid=92805
cache http://www.wikicfp.com/cfp/servlet/event.show

In [4]:
# Display every edition label collected into `result`.
result.keys()

dict_keys(['ICML 2018', 'CVPR 2019', 'AAAI 2013', 'EACL 2009', 'CoNLL 2009', 'EACL 2017', 'CVPR 2010', 'CoNLL 2013', 'ACL 2018', 'NIPS 2016', 'ECCV 2018', 'CVPR 2008', 'ICIP 2019', 'ACL 2016', 'AAAI 2016', 'ICIP 2018', 'CoNLL 2016', 'ICML 2009', 'NAACL-HLT 2009', 'NIPS 2018', 'ICML 2019', 'IJCAI 2016', 'CoNLL 2018', 'ACL 2008', 'ICASSP 2008', 'CoNLL 2008', 'ICASSP 2017', 'EMNLP 2013', 'NIPS 2014', 'ICASSP 2009', 'EACL 2012', 'ACL 2012', 'ICCV 2019', 'NIPS 2015', 'TACL 2017', 'CVPR 2013', 'AAAI 2018', 'CVPR 2017', 'AAAI 2015', 'IJCAI 2015', 'CVPR 2014', 'ICML 2016', 'ICASSP 2011', 'ICLR 2019', 'ICASSP 2013', 'CoNLL 2011', 'ACL HLT 2011', 'EMNLP 2016', 'NIPS 2013', 'EMNLP 2011', 'ICCV 2013', 'ICIP 2017', 'ICIP 2013', 'ICIP 2009', 'NIPS 2010', 'EMNLP-CoNLL 2012', 'ACL 2013', 'EMNLP 2010', 'EMNLP 2018', 'NIPS 2017', 'ICIP 2008', 'ICML 2015', 'CVPR 2015', 'NAACL 2015', 'CoNLL-GEC 2014', 'ACL 2017', 'ICASSP 2015', 'ICML 2008', 'ICIP 2011', 'ECCV 2012', 'AAAI 2012', 'ICASSP 2014', 'NAACL HLT 

In [5]:
# Spot-check one scraped entry.
result['CVPR 2017']

{'date': ['2017-06-24', '2017-06-30'],
 'location': 'Puerto Rico',
 'name': 'CVPR 2017 : Computer Vision and Pattern Recognition',
 'submission': ['2017-04-10', '2016-11-11']}

In [6]:
import json
# Persist the scraped data. The context manager closes the file (flushing it
# to disk) even if serialisation fails part-way through.
with open('conferences.json', 'w', encoding='utf-8') as fh:
    json.dump(result, fh, sort_keys=True, indent=2)