In [109]:
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

from src.common import *
import src.mongos as mg

# Display configuration: never truncate rows, columns or cell contents.
pd.set_option('display.max_rows', None)
# None means "no width limit". The former -1 sentinel has been deprecated
# since pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Preparation

## Country List

### Fetch

In [32]:
def fetch_countries(timeout=30):
    """Download the ACE Project country page and return its dropdown options.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The ``<option>`` tags that are direct children of
        ``select.cd-country-dropdown``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    url = 'http://aceproject.org/regions-en/countries-and-territories/KR/default?set_language=en'
    r = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty option list.
    r.raise_for_status()
    # NOTE(review): no parser is named, so bs4 picks whichever one is
    # installed; pin one explicitly if results must match across machines.
    soup = bs(r.text)
    return soup.select('select.cd-country-dropdown > option')


# Runs the network request when this cell executes and keeps the dropdown
# <option> tags in `options`.
options = fetch_countries()

### Test

In [28]:
def test_countries(options):
    tmp = options[0]['value']
    re.search(r'ries/(\w\w)/def', tmp).group(1)
    return options[0].text.strip()


# test_countries(options)

### Parse

In [26]:
def parse_countries(options):
    """Split option tags into parallel lists of URLs, names and codes.

    Args:
        options: Iterable of option tags. Each must support
            ``.get('value', '')`` and expose a ``.text`` attribute.

    Returns:
        A ``(urls, names, codes)`` tuple of equally long lists. ``urls``
        holds each option's ``value`` (``''`` when absent), ``names`` the
        stripped option text, and ``codes`` the two word-characters found
        between ``ries/`` and ``/def`` in the URL (``''`` when not found).
    """
    code_pattern = re.compile(r'ries/(\w\w)/def')

    def _split(option):
        # One option -> (url, name, code)
        link = option.get('value', '')
        found = code_pattern.search(link)
        return link, option.text.strip(), (found.group(1) if found else '')

    rows = [_split(option) for option in options]

    urls = [row[0] for row in rows]
    names = [row[1] for row in rows]
    codes = [row[2] for row in rows]
    return urls, names, codes


# urls, names, codes = parse_countries(options)
# for x in [urls, names, codes]:
#     print(len(x))

## All Countries

### Test

In [105]:
def get_qna(soup):
    """Extract (question, answer) pairs from a parsed country page.

    The first ``<td valign="top" width="45%">`` is serialised to a string and
    cut at every ``<br/>``; the first piece and the last two pieces are
    discarded. Each remaining piece is re-parsed, and pieces without a
    ``.cdquestion`` element are skipped.

    Args:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        A list of ``(question, answer)`` tuples, where ``answer`` is the
        ``', '``-joined text of every ``.cdanswer`` element in that piece.
    """
    cell = soup.find('td', attrs={'valign':'top', 'width':'45%'})
    # NOTE(review): the [1:-2] trim presumably drops a header piece and
    # trailing non-Q&A markup; confirm against the live page.
    fragments = str(cell).split('<br/>')[1:-2]

    pairs = []
    for fragment in fragments:
        parsed = bs(fragment)
        questions = parsed.select('.cdquestion')
        if not questions:
            # No question in this piece, so there is nothing to record.
            continue
        question = questions[0].text.strip()
        answers = [node.text.strip() for node in parsed.select('.cdanswer')]
        pairs.append((question, ', '.join(answers)))

    return pairs


def dictionize_qna(qna, name, code):
    """Turn question/answer pairs into one record tagged with its country.

    Args:
        qna: Anything ``dict()`` accepts, e.g. a list of
            ``(question, answer)`` tuples.
        name: Value stored under the ``'Country'`` key.
        code: Value stored under the ``'Code'`` key.

    Returns:
        A new dict with the pairs plus ``'Country'`` and ``'Code'``; these
        two keys override same-named questions.
    """
    return {**dict(qna), 'Country': name, 'Code': code}


# qna = get_qna(soup)
# dictionize_qna(qna, 'a', 'b')

# qna

### Fetch all countries

In [107]:
def fetch_all_countries(urls, names, codes, timeout=30):
    """Download every country page and collect one Q&A record per country.

    Args:
        urls: Page URLs, one per country.
        names: Country names, parallel to ``urls``.
        codes: Country codes, parallel to ``urls``.
        timeout: Seconds to wait for each response. Without a timeout a
            single stalled connection would block the whole run.

    Returns:
        A list of dicts as produced by ``dictionize_qna``, in input order.

    Raises:
        ValueError: If the three input sequences differ in length.
        requests.Timeout: If a server does not answer within ``timeout``.
    """
    if not (len(urls) == len(names) == len(codes)):
        raise ValueError("Lengths of urls, names, codes are different.")

    all_qna = []

    # NOTE: the original author left a skip for code 'PM' commented out
    # ("maybe required?"); it stays disabled here.
    for url, name, code in zip(urls, names, codes):
        # The HTTP status is deliberately not checked: a page lacking the
        # expected table cell simply yields a record with only Country/Code.
        r = requests.get(url, timeout=timeout)
        soup = bs(r.text)
        list_qna = get_qna(soup)
        all_qna.append(dictionize_qna(list_qna, name, code))

    return all_qna

# `urls`, `names` and `codes` were only ever assigned in commented-out code,
# so a fresh kernel raised NameError here. Derive them from `options`
# explicitly so the notebook survives Restart & Run All.
urls, names, codes = parse_countries(options)

# For a quick trial run, limit the request count:
# idx = 10
# all_qna = fetch_all_countries(urls[:idx], names[:idx], codes[:idx])
all_qna = fetch_all_countries(urls, names, codes)

all_qna

[{'President': 'Yes',
  'Electoral System (Chamber 1)': 'Single Non-Transferable Vote',
  'Voting age': '18',
  'Compulsory/voluntary voting': 'Voting is voluntary',
  'Electoral Management model': 'Independent',
  'Voting outside the country is permitted for': 'Other, No information available',
  'Country': 'Afghanistan',
  'Code': 'AF'},
 {'President': 'Yes',
  'Electoral System (Chamber 1)': 'List Proportional Representation',
  'Voting age': '18',
  'Compulsory/voluntary voting': 'Voting is voluntary',
  'Electoral Management model': 'Independent',
  'Voting outside the country is permitted for': 'Not applicable',
  'Country': 'Albania',
  'Code': 'AL'},
 {'President': 'Yes',
  'Electoral System (Chamber 1)': 'List Proportional Representation',
  'Voting age': '18',
  'Compulsory/voluntary voting': 'Voting is voluntary',
  'Electoral Management model': 'Governmental',
  'Voting outside the country is permitted for': 'Citizens residing outside the country',
  'Country': 'Algeria',
 

### Make df

In [111]:
def make_systems(df):
    """Wrap the scraped records in a pandas DataFrame.

    Args:
        df: Data accepted by the ``pd.DataFrame`` constructor. Despite the
            parameter name, the notebook passes a list of dicts.

    Returns:
        A new ``pd.DataFrame`` built from ``df``.
    """
    return pd.DataFrame(df)


# Build the frame from the scraped records and preview its first rows.
systems = make_systems(all_qna)
systems.head()

### Save df

In [110]:
# Hand the frame to the project's Mongo helper.
# NOTE(review): presumably 'electoral_system' is the database and 'systems'
# the collection — confirm against src.mongos.to_mongo.
# NOTE(review): re-running this cell may insert the same rows again — confirm.
mg.to_mongo(systems, 'electoral_system', 'systems', dtype='df')

Inserted rows: 236
