In [8]:
from bs4 import BeautifulSoup
import pandas as pd

In [12]:
def get_soup_from_file(file_, encoding='utf-8'):
    """Read a saved HTML file and parse it with BeautifulSoup.

    Parameters
    ----------
    file_ : str or path-like
        Path of the HTML file to read.
    encoding : str, optional
        Text encoding used to decode the file. Defaults to ``'utf-8'`` so the
        result does not depend on the platform's locale encoding; the pages
        contain accented characters that are matched literally later on.

    Returns
    -------
    BeautifulSoup
        The document parsed with the built-in ``'html.parser'`` backend.
    """
    with open(file_, 'r', encoding=encoding) as f:
        file_text = f.read()
    return BeautifulSoup(file_text, 'html.parser')

def parse_soup(soup):
    """Extract breadcrumb geographies and per-party results from a parsed page.

    Returns a 4-tuple ``(geographies, parties, percentages, votes)``:

    * geographies -- text of every ``<a>`` with the breadcrumb link class
    * parties     -- text of every ``<div class="titleNombreTerr">``
    * percentages -- ``<div class="porcAgr">`` texts converted to fractions
      (decimal comma turned into a dot, ``%`` removed, value divided by 100)
    * votes       -- ``<div class="numAgr">`` texts converted to ``int`` after
      removing dots (presumably thousands separators)
    """
    def _texts(tag, css_class):
        # Text content of every element with the given tag and class.
        return [node.text for node in soup.find_all(tag, {'class': css_class})]

    geographies = _texts('a', 'RoutedBreadcrumbs__SLink-ccRASu')
    parties = _texts('div', 'titleNombreTerr')
    percentages = [
        float(raw.replace(',', '.').replace('%', '')) / 100
        for raw in _texts('div', 'porcAgr')
    ]
    votes = [int(raw.replace('.', '')) for raw in _texts('div', 'numAgr')]
    return geographies, parties, percentages, votes
    
def make_dict(geographies, parties, percentages, votes, target=None):
    """Store one page's results in a nested ``state -> area -> party`` dict.

    Parameters
    ----------
    geographies : list of str
        Breadcrumb levels of the page. Index 1 is used as the state and
        index 2 as the sub-area. Missing levels fall back to ``'Total'`` for
        the state (the same fallback ``make_df`` uses) and to the ``'total'``
        bucket for the sub-area.
    parties, percentages, votes : list
        Parallel lists; they are zipped, so extra items in a longer list are
        ignored.
    target : dict, optional
        Dictionary to fill. When omitted, the notebook-level ``data`` dict is
        used, which is what this function always wrote to before; it must
        already exist in that case.

    Returns
    -------
    dict
        The dictionary that was updated. Each leaf has the form
        ``{'percentage': ..., 'votes': ...}``.
    """
    if target is None:
        # Backward compatible default: mutate the global ``data`` dict.
        target = data

    state = geographies[1] if len(geographies) > 1 else 'Total'
    area = geographies[2] if len(geographies) > 2 else 'total'

    # ``vote_count`` avoids shadowing the ``votes`` argument inside the loop.
    for party, percentage, vote_count in zip(parties, percentages, votes):
        # Every state entry always carries a 'total' bucket, even when only a
        # sub-area has been seen so far.
        state_results = target.setdefault(state, {'total': {}})
        party_results = state_results.setdefault(area, {}).setdefault(party, {})
        party_results['percentage'] = percentage
        party_results['votes'] = vote_count

    return target
        
def make_df(geographies, parties, percentages, votes):
    """Build a tidy DataFrame with one row per party for a single page.

    ``geographies[0]`` becomes the ``country`` column. The second and third
    breadcrumb levels become ``state`` and ``geography_l3``; a level that is
    missing is filled with the string ``'Total'``. The scalar geography values
    are broadcast across the rows defined by ``parties``, ``votes`` and
    ``percentages``.
    """
    if len(geographies) >= 2:
        state = geographies[1]
    else:
        state = 'Total'

    if len(geographies) >= 3:
        geography_l3 = geographies[2]
    else:
        geography_l3 = 'Total'

    columns = {
        'country': geographies[0],
        'state': state,
        'geography_l3': geography_l3,
        'party': parties,
        'votes': votes,
        'percentage': percentages,
    }
    return pd.DataFrame(columns)

def process_file(file_):
    """Run the full pipeline for one saved page: read, parse, tabulate.

    Returns the DataFrame produced by ``make_df`` from the fields that
    ``parse_soup`` extracts out of the file at ``file_``.
    """
    parsed_fields = parse_soup(get_soup_from_file(file_))
    return make_df(*parsed_fields)

In [13]:
# Elections
# Parse every saved results page (data/0.html ... data/560.html) and stack the
# per-page frames in a single concat. DataFrame.append was removed in
# pandas 2.0, and appending inside the loop re-copied the accumulated frame on
# every iteration.
election_frames = [process_file('data/{}.html'.format(i)) for i in range(0, 561)]

# Each page keeps its own 0..n-1 index here, so the index is not unique.
df = pd.concat(election_frames)

In [21]:
# Put party names in sentence case so they can be compared with the alias
# table below, which is written in the same casing.
df['party'] = df['party'].str.lower().str.capitalize()

# Shorten the two long official state names.
state_short_names = {
    'Ciudad Autónoma de Buenos Aires': 'CABA',
    'Tierra del Fuego, Antártida e Islas del Atlántico Sur': 'Tierra del Fuego',
}
df['state'] = df['state'].replace(state_short_names)

# Aliases
# Canonical party name -> the variants that are folded into it.
aliases = {
    'Juntos por el cambio': [
        'Juntos',
        'Cambia mendoza',
        'Juntos por el cambio jxc',
        'Juntos por entre ríos',
        'Frente juntos por el cambio',
        'Eco + vamos corrientes',
        'Chaco cambia + juntos por el cambio',
        'Juntos por el cambio +',
        'Cambia jujuy',
        'Juntos por el cambio chubut',
        'Cambia neuquén',
        'Cambia santa cruz',
    ],
    'Frente de todos': [
        'Frente civico por santiago',
        'Frente de todos - todos san juan',
    ],
    'Avanza libertad': ['La libertad avanza'],
}

# Keep the pre-alias name before collapsing the variants.
df['party_original'] = df['party']

# Invert the table into variant -> canonical and apply it in one pass; only
# values that match a variant exactly are replaced.
alias_to_party = {
    alias: party
    for party, _aliases in aliases.items()
    for alias in _aliases
}
df['party'] = df['party'].replace(alias_to_party)

In [22]:
# Governors
# Scrape every table of the saved governors page into one frame with a
# 0..n-1 index. Rows are collected in a list and the frame is built once:
# DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
soup = get_soup_from_file('data/governors.html')

governor_records = []
for governors_table in soup.find_all('table'):
    # The first row of each table is skipped (presumably the header row).
    for table_row in governors_table.find_all('tr')[1:]:
        cells = table_row.find_all('td')
        # The state name is the second link of the first cell.
        state = cells[0].find_all('a')[1].text
        governor = cells[2].text
        party = cells[4].a.text
        # The alliance, when present, is in a <small> tag of the party cell;
        # it stays None when the tag is missing.
        party_alliance = cells[4].find('small')
        if party_alliance:
            party_alliance = party_alliance.text

        governor_records.append({
            'state': state,
            'governor': governor,
            'party': party,
            'party_alliance': party_alliance,
        })

governors = pd.DataFrame(
    governor_records,
    columns=['state', 'governor', 'party', 'party_alliance'],
)

In [23]:
# Strip the parentheses around the alliance name. regex=True is explicit
# because pandas >= 2.0 treats the pattern as a literal string by default,
# which would leave the parentheses in place.
governors['party_alliance'] = governors['party_alliance'].str.replace('[()]', '', regex=True)
# Remove newline characters from the governor name (literal match, no regex).
governors['governor'] = governors['governor'].str.replace('\n', '', regex=False)

# Manual overrides by row position.
# NOTE(review): these indices depend on the row order of the scraped page --
# confirm they still point at the intended rows if the page is re-downloaded.
governors.loc[13, 'party_alliance'] = 'Other peronismo'
governors.loc[14, 'party_alliance'] = 'Other'
governors.loc[20, 'party_alliance'] = 'Frente de todos'

# Same sentence casing as the keys and values of `aliases`.
governors['party_alliance'] = governors['party_alliance'].str.lower().str.capitalize()
governors['state'] = governors['state'].str.strip()
# The missing space in 'Autónomade' is deliberate: it is matched against the
# scraped text (presumably the page breaks the line there) -- verify.
governors['state'] = governors['state'].replace({'Ciudad Autónomade Buenos Aires': 'CABA'})

# Keep the pre-alias alliance name before collapsing the variants.
governors['party_alliance_original'] = governors['party_alliance']

# Fold every known variant into its canonical party name (exact matches only).
for party in aliases:
    for alias in aliases[party]:
        governors['party_alliance'] = governors['party_alliance'].replace(alias, party)


In [24]:
# Renumber the stacked election rows, then attach each state's governor data.
# merge() defaults to an inner join, so rows whose state has no match in
# `governors` are dropped. Columns present in both frames keep their name on
# the election side and get a '_wiki' suffix on the governors side.
df = df.reset_index(drop=True).merge(governors, on='state', suffixes=('', '_wiki'))
df.to_csv('./data/df.csv')