In [733]:
import pandas as pd
from decimal import Decimal

In [734]:
# Station geography is the base table. Coordinates are parsed as Decimal
# rather than float.
geo = pd.read_csv(
    'geo-categorized.csv',
    converters={'latitude': Decimal, 'longitude': Decimal},
)
voice = pd.read_csv('voice.csv')

# Left join on `id`: every row of `geo` is kept, and rows without a match in
# `voice` get NaN in the voice columns.
df = pd.merge(geo, voice, how='left', on='id')

In [735]:
# Area category names (presumably the values of the `area` column -- confirm).
areas = [
    'city',
    'minsk_suburb',
    'minsk_village',
    'capital',
    'village',
    'town_below100',
    'town_over100',
    'embassy',
]

# Column-name prefixes for every ballot option except 'lukashenko'.
candidates = [
    'against',
    'cherechen',
    'corrupted',
    'dmitriyev',
    'kanopatskaja',
    'tihanovkaja',
]

# Same prefixes with 'lukashenko' appended (a new list; `candidates` is untouched).
candidates_with_luk = [*candidates, 'lukashenko']

In [736]:
# Map each `area` value in poll-station-size.csv to its `size`.
# Built column-wise instead of iterating the `.iloc` indexer, which only works
# through the implicit __getitem__/IndexError protocol and creates one Series
# per row. If an area appears more than once, the last row wins (as before).
_station_sizes = pd.read_csv('poll-station-size.csv')
area_size = dict(zip(_station_sizes['area'], _station_sizes['size']))

In [737]:
# Fill missing data: rows whose `registered` value is NaN first get every
# count column zeroed, then receive a fixed default of 10 registered voices
# split 9 for 'tihanovkaja' and 1 for 'against'.
# NOTE(review): the 10/9/1 default is a modelling choice not explained here.

missing_mask = df['registered'].isna()

zeroed_columns = ['registered', 'photoVoices', 'ignore_registered']
zeroed_columns += [f'{c}_photoVoices' for c in candidates_with_luk]
zeroed_columns += [f'{c}_registered' for c in candidates_with_luk]
df.loc[missing_mask, zeroed_columns] = 0

# Must run after the zero fill, since it overwrites three of those columns.
default_columns = ['registered', 'tihanovkaja_registered', 'against_registered']
df.loc[missing_mask, default_columns] = [10, 9, 1]


In [738]:
# Fill official voters.
#
# `voters` is the official figure where one exists; otherwise it falls back
# to the per-area size from `area_size`. An area missing from `area_size`
# raises KeyError.

no_official_voters_mask = df['officialVoters'].isna()
has_official_voters_mask = ~no_official_voters_mask

df.loc[no_official_voters_mask, 'voters'] = [
    area_size[area] for area in df.loc[no_official_voters_mask, 'area']
]
df.loc[has_official_voters_mask, 'voters'] = df.loc[has_official_voters_mask, 'officialVoters']


In [739]:
# Per-region voter totals; the `region` and `voters` columns are read when
# the regional coefficients are computed.
region_voters = pd.read_csv('voters.csv')

In [740]:
# Per-region scale factor for the estimated station sizes:
#   (regional total from voters.csv - officially reported station voters)
#   / (sum of estimated sizes of stations without an official figure)
# Region 8 is not fitted and keeps a coefficient of 1
# (presumably the embassy stations -- confirm).
region_coeff = {8: 1}

for region in range(1, 8):
    region_mask = df['region'] == region
    official_region_voters = region_voters.loc[
        region_voters['region'] == region, 'voters'
    ].iloc[0]
    reported_voters = df.loc[region_mask & ~no_official_voters_mask, 'officialVoters'].sum()
    estimated_voters = df.loc[region_mask & no_official_voters_mask, 'voters'].sum()
    region_coeff[region] = (official_region_voters - reported_voters) / estimated_voters

In [741]:
# Display the per-region coefficients computed above.
region_coeff

{8: 1,
 1: 0.6719201996839486,
 2: 0.710658461867096,
 3: 0.6797002483685411,
 4: 0.6720982201531747,
 5: 0.7116482505472668,
 6: 0.6541507368343747,
 7: 0.8807336425472665}

In [742]:
# Replace the estimated sizes of stations without an official figure with
# area size * regional coefficient.
# The two needed columns are zipped instead of iterating the `.iloc` indexer
# (which builds a full Series per row); unknown areas or regions still raise
# KeyError.
_missing_stations = df.loc[no_official_voters_mask, ['area', 'region']]
df.loc[no_official_voters_mask, 'voters'] = [
    area_size[area] * region_coeff[region]
    for area, region in zip(_missing_stations['area'], _missing_stations['region'])
]

In [743]:
# Fill real votes.
#
# Inputs for deciding which stations keep their official vote totals.

candidates_trusted = pd.read_csv('trusted-for-alternative-fixed.csv')
non_trusted_turnout = pd.read_csv('non_trusted_turnout.csv')
outliers = pd.read_csv('outliers.csv')

# Ids listed in non_trusted_turnout.csv for any reason other than 'outlier'.
still_trusted_turnout = non_trusted_turnout.loc[
    non_trusted_turnout['reason'] != 'outlier', 'id'
]

# Ids in outliers.csv whose reason starts with 'trusted'.
# The vectorised string test replaces a per-row `.iloc` loop; `na=False`
# treats a missing reason as "not trusted" instead of raising, and an empty
# file now yields an empty selection.
still_trusted_data = outliers.loc[
    outliers['reason'].str.startswith('trusted', na=False), 'id'
]


In [744]:
# `region` is read as text so the lookup keys are plain "<area>_<region>" strings.
turnout = pd.read_csv('turnout.csv', converters={'region': str})

# Turnout lookup keyed by "<area>_<region>".
# Built from zipped columns rather than by iterating the `.iloc` indexer;
# later rows overwrite earlier ones with the same key, as before.
area_region_turnout = {
    f"{area}_{region}": value
    for area, region, value in zip(turnout['area'], turnout['region'], turnout['turnout'])
}

In [745]:
# A station is selected when either
#   * its id is in `candidates_trusted` and it is not in `non_trusted_turnout`
#     (or is there but re-admitted via `still_trusted_turnout`), or
#   * its id is in `still_trusted_data`.
station_ids = df['id']
in_candidates_trusted = station_ids.isin(candidates_trusted['id'])
in_non_trusted_turnout = station_ids.isin(non_trusted_turnout['id'])
in_still_trusted_turnout = station_ids.isin(still_trusted_turnout)
in_still_trusted_data = station_ids.isin(still_trusted_data)

trusted_turnout_mask = (
    (in_candidates_trusted & (~in_non_trusted_turnout | in_still_trusted_turnout))
    | in_still_trusted_data
)

In [746]:
# Trusted stations take their official vote total.
# Both sides must use the same mask: `.loc` assignment aligns on the index, so
# a right-hand side taken from the complementary rows would write only NaN.
df.loc[trusted_turnout_mask, 'votes'] = df.loc[trusted_turnout_mask, 'officialVotes']

In [747]:
def calc_votes(row):
    """Estimate votes for one station from its area/region turnout.

    Args:
        row: mapping-like object (e.g. a DataFrame row) providing 'area',
            'region' and 'voters'.

    Returns:
        The turnout stored in ``area_region_turnout`` under
        "<area>_<region>" multiplied by ``row['voters']``.

    Raises:
        KeyError: if there is no turnout entry for that area/region pair.
    """
    turnout_key = f"{row['area']}_{row['region']}"
    station_turnout = area_region_turnout[turnout_key]
    return station_turnout * row['voters']

In [748]:
# Reuse the 'minsk_suburb' turnout of region 5 for region 7
# (presumably turnout.csv has no minsk_suburb row for region 7 -- confirm).
area_region_turnout['minsk_suburb_7'] = area_region_turnout['minsk_suburb_5']

In [749]:
# Final `votes` column: official totals for trusted stations, turnout-based
# estimates for everything else.
trusted_turnout_mask = (
    (
        df['id'].isin(candidates_trusted['id']) &
        (~df['id'].isin(non_trusted_turnout['id']) | df['id'].isin(still_trusted_turnout))
    ) |
    df['id'].isin(still_trusted_data)
)
df.loc[trusted_turnout_mask, 'votes'] = df.loc[trusted_turnout_mask, 'officialVotes']

# Only the columns calc_votes reads are iterated, via iterrows() instead of
# the `.iloc` indexer's implicit iteration.
_estimated_stations = df.loc[~trusted_turnout_mask, ['area', 'region', 'voters']]
df.loc[~trusted_turnout_mask, 'votes'] = [
    calc_votes(row) for _, row in _estimated_stations.iterrows()
]

In [750]:
# Check overflow: among stations that have an official vote total, compare
# `votes` with the summed official votes of every option in `candidates`
# (which excludes 'lukashenko'). A negative `diff` means those options alone
# received more official votes than the station's `votes` value.

data = df.loc[df['officialVotes'].notna()]
official_alternative_votes = sum(data[f'{c}_officialVotes'] for c in candidates)
diff = data['votes'] - official_alternative_votes

In [751]:
# Stations where the non-'lukashenko' official votes exceed `votes`.
data.loc[diff < 0]

Unnamed: 0,id,town,area,latitude,longitude,region,monitoredVotes,registered,photoVoices,officialVotes,...,kanopatskaja_officialVotes,kanopatskaja_photoVoices,lukashenko_registered,lukashenko_officialVotes,lukashenko_photoVoices,tihanovkaja_registered,tihanovkaja_officialVotes,tihanovkaja_photoVoices,voters,votes
5270,07-003-0021,Minsk,capital,53.882322,27.516888,7,,975.0,669.0,2396.0,...,19.0,1.0,0.0,438.0,0.0,951.0,1725.0,660.0,2400.0,1680.48
5731,07-009-0017,Minsk,capital,53.929606,27.544952,7,,996.0,727.0,2304.0,...,50.0,0.0,3.0,656.0,1.0,971.0,1407.0,715.0,2310.0,1617.462


In [752]:
# For overflowing stations, set `votes` so that the summed official votes of
# `candidates` make up 95% of it.
# NOTE(review): the 95% share is a hard-coded assumption -- confirm its source.
overflow_rows = data.loc[diff < 0]
overflow_official_votes = sum(overflow_rows[f'{c}_officialVotes'] for c in candidates)
df.loc[overflow_rows.index, 'votes'] = overflow_official_votes / 95 * 100

In [753]:
# Overall turnout: total votes divided by total voters across all stations.
total_votes, total_voters = df['votes'].sum(), df['voters'].sum()
total_votes / total_voters

0.7864166999249712

In [754]:
# Per-station turnout.
df['turnout'] = df['votes'].div(df['voters'])

In [755]:
# Sanity check: number of stations whose `voters` is below the official figure.
int((df['voters'] < df['officialVoters']).sum())

0

In [756]:
# Write the result without the index column.
# NOTE(review): '%g' keeps only 6 significant digits, so float columns are
# rounded on output (e.g. 1617.462 -> 1617.46) -- confirm this is intended.
df.to_csv('total-votes.csv', float_format='%g', index=False)