In [8]:
import pandas as pd
import numpy as np

In [9]:
# Input tables, all read from CSV files in the working directory.
# Per-station rows; later cells read 'id', '*_officialVotes' and 'tihanovkaja_photoVoices'.
voice = pd.read_csv("voice.csv")
# Station -> area mapping; later cells read 'id' and 'area'.
geo = pd.read_csv("geo-categorized.csv")
# Per-area multiplier; later cells read 'area' and 'coefficient'.
coefficients = pd.read_csv("tihanovkaja_photo_coefficients.csv")
# Per (area, candidate) multiplier; later cells read 'area', 'candidate', 'tih_coefficient'.
alt_coefficients = pd.read_csv("alternative_coefficients.csv")
# Stations whose official numbers are used as-is; later cells read 'id'.
trusted_vp = pd.read_csv("trusted-for-alternative.csv")

In [10]:
# Polling-station id -> area name.
# Iterate the two needed columns directly instead of DataFrame.iterrows(),
# which builds a full Series object for every row just to read two fields.
# As with the row-wise version, a repeated id keeps its last area.
vp_to_area = dict(zip(geo['id'], geo['area']))

In [11]:
# Lookup tables for the scaling coefficients used by the estimation loop.
# Both are built from column-wise iteration rather than DataFrame.iterrows(),
# which avoids constructing a Series per row.

# area name -> multiplier applied to 'tihanovkaja_photoVoices'.
area_to_coeff = dict(zip(coefficients['area'], coefficients['coefficient']))

# "<area>_<candidate>" -> multiplier applied to the Tihanovkaja figure
# to derive that candidate's figure.
area_candidate_to_tih_coeff = {
    f"{area}_{candidate}": tih_coefficient
    for area, candidate, tih_coefficient in zip(
        alt_coefficients['area'],
        alt_coefficients['candidate'],
        alt_coefficients['tih_coefficient'],
    )
}

In [12]:
# Ids of the stations listed in the trusted table, as a set for O(1) membership tests.
trusted_set = {station_id for station_id in trusted_vp['id']}

In [13]:
# Ballot options other than Tihanovkaja; each name is used as a column prefix
# ('<name>_officialVotes') and as a coefficient-key suffix ('<area>_<name>').
alt_candidates = (
    'against',
    'cherechen',
    'corrupted',
    'dmitriyev',
    'kanopatskaja',
)

In [20]:
# Build one record per row of `voice`:
#   {'id', 'area', 'source', 'tihanovkaja', <one key per name in alt_candidates>}
#
# source == 'voice'                  : station is not in trusted_set; every figure is
#                                      derived from 'tihanovkaja_photoVoices' and the
#                                      coefficient tables.
# source == 'official'               : station is in trusted_set; '*_officialVotes'
#                                      columns are used as-is.
# source == 'official-corrupted-fix' : as 'official', but 'corrupted_officialVotes'
#                                      was NaN and is replaced by a coefficient-based value.
expected_list = []

for _, voice_row in voice.iterrows():
    poll_station_id = voice_row['id']
    area = vp_to_area[poll_station_id]

    if poll_station_id not in trusted_set:
        source = 'voice'
        # Scale the photo count by the area coefficient, then derive the
        # other options from that figure.
        tihanovkaja = voice_row['tihanovkaja_photoVoices'] * area_to_coeff[area]
        alternative = {
            name: tihanovkaja * area_candidate_to_tih_coeff[f'{area}_{name}']
            for name in alt_candidates
        }
    else:
        tihanovkaja = voice_row['tihanovkaja_officialVotes']
        alternative = {
            name: voice_row[f'{name}_officialVotes']
            for name in alt_candidates
        }
        if np.isnan(voice_row['corrupted_officialVotes']):
            # Missing 'corrupted' count: fall back to the coefficient-based value.
            source = 'official-corrupted-fix'
            alternative['corrupted'] = tihanovkaja * area_candidate_to_tih_coeff[f'{area}_corrupted']
        else:
            source = 'official'

    expected_list.append({
        'id': poll_station_id,
        'area': area,
        'source': source,
        'tihanovkaja': tihanovkaja,
        **alternative,
    })

In [21]:
# Total 'tihanovkaja' figure across all station records.
sum(record['tihanovkaja'] for record in expected_list)

1427031.7919999976

In [47]:
# Write the per-station records to CSV, floats with one decimal place, no index column.
pd.DataFrame(expected_list).to_csv(
    'candidates-by-poll-station.csv',
    float_format='%.1f',
    index=False,
)

In [27]:
# Every key summed per candidate below: the alternative options plus 'tihanovkaja'.
candidates = (
    'against',
    'cherechen',
    'corrupted',
    'dmitriyev',
    'kanopatskaja',
    'tihanovkaja',
)

In [33]:
# candidate name -> total over all station records, rounded to an int.
sum_by_candidate = dict(
    (name, int(round(sum(record[name] for record in expected_list))))
    for name in candidates
)

In [34]:
# Display the rounded per-candidate totals computed in the previous cell.
sum_by_candidate

{'against': 218322,
 'cherechen': 57309,
 'corrupted': 42723,
 'dmitriyev': 63605,
 'kanopatskaja': 54572,
 'tihanovkaja': 1427032}

In [48]:
# Write the per-candidate totals as a two-column CSV (candidate, estimated_total), no index column.
pd.DataFrame(
    {
        'candidate': list(sum_by_candidate),
        'estimated_total': list(sum_by_candidate.values()),
    }
).to_csv('candidates-total.csv', index=False)

In [50]:
# Sum of all candidate figures over station records whose area is 'capital'.
sum(
    sum(record[name] for name in candidates)
    for record in expected_list
    if record['area'] == 'capital'
)

525990.7441129999

In [51]:
# 'tihanovkaja' total restricted to station records whose area is 'capital'.
sum(
    record['tihanovkaja']
    for record in expected_list
    if record['area'] == 'capital'
)

399574.6449999998

In [52]:
# 'against' total restricted to station records whose area is 'capital'.
sum(
    record['against']
    for record in expected_list
    if record['area'] == 'capital'
)

61163.718170000015