In [170]:
from decimal import Decimal

import pandas as pd

In [171]:
# Column names of the alternative (non-Tihanovkaja, non-Lukashenko) options.
alt_candidates = [
    'against',
    'cherechen',
    'corrupted',
    'dmitriyev',
    'kanopatskaja',
]
# Every option except Lukashenko.
candidates = [*alt_candidates, 'tihanovkaja']
# Every option including Lukashenko.
candidates_with_luk = [*candidates, 'lukashenko']
# Alternative options without the 'corrupted' (spoiled ballot) column, as a set.
alt_candidates_no_corrupted = {name for name in alt_candidates if name != 'corrupted'}

In [172]:
# Main per-station table; latitude/longitude are parsed as Decimal instead of float.
df = pd.read_csv(
    'total-votes.csv',
    converters=dict(latitude=Decimal, longitude=Decimal),
)

In [173]:
# Stations listed here get their official per-candidate numbers copied as-is later on.
candidates_trusted = pd.read_csv('trusted-for-alternative-fixed.csv')
outliers = pd.read_csv('outliers.csv')

# Outliers whose reason text starts with 'Trusted.' still count as trusted.
# The vectorised .str accessor replaces a Python loop over `outliers.iloc`;
# na=False makes a missing reason count as "not trusted" instead of raising
# AttributeError on a NaN value.
still_trusted_mask = outliers['non_trust_reason'].str.startswith('Trusted.', na=False)
still_trusted_data = outliers.loc[still_trusted_mask, 'id']


In [174]:
# Coefficient tables. The first two are later read through their 'area', 'region'
# and 'coefficient' columns, the third through 'area', 'candidate' and 'coefficient'.
tih_coefficients, protest_coefficients, alt_coefficients = (
    pd.read_csv(csv_name)
    for csv_name in (
        'tihanovkaja_registered_coefficients.csv',
        'protest_registered_coefficients.csv',
        'alt_candidates_coefficients.csv',
    )
)

In [175]:
def get_key(area, region):
    """Build the '<area>_<region>' lookup key used by the coefficient dictionaries."""
    return '{}_{}'.format(area, region)

In [176]:
# Coefficient lookups keyed by '<area>_<region>'. The keys are built with get_key()
# so they cannot drift from the keys used when the coefficients are looked up.
tih_coeff = {
    get_key(row['area'], row['region']): row['coefficient']
    for _, row in tih_coefficients.iterrows()
}

protest_coeff = {
    get_key(row['area'], row['region']): row['coefficient']
    for _, row in protest_coefficients.iterrows()
}

# Alternative-candidate coefficients are keyed by '<area>_<candidate>' instead.
alt_coeff = {
    f"{row['area']}_{row['candidate']}": row['coefficient']
    for _, row in alt_coefficients.iterrows()
}

# Key 'minsk_suburb_7' reuses the coefficients stored under 'minsk_suburb_5'
# (presumably the CSVs carry no separate row for it — confirm).
tih_coeff['minsk_suburb_7'] = tih_coeff['minsk_suburb_5']
protest_coeff['minsk_suburb_7'] = protest_coeff['minsk_suburb_5']


In [177]:
# One (initially empty) list of estimates per candidate column.
candidates_data = {name: [] for name in candidates}

# A station is trusted when its id is in the trusted list or among the outliers
# that are still marked as trusted.
in_trusted_list = df['id'].isin(candidates_trusted['id'])
in_trusted_outliers = df['id'].isin(still_trusted_data)
trusted_data_mask = in_trusted_list | in_trusted_outliers

In [178]:
# For trusted stations the candidate columns are a straight copy of the
# matching '<candidate>_officialVotes' columns.
official_vote_columns = [f'{name}_officialVotes' for name in candidates]
df.loc[trusted_data_mask, candidates] = df.loc[trusted_data_mask, official_vote_columns].values

In [179]:
def calculate_alt(candidate, protest, tihanovkaja, area=None):
    """Estimate the votes of one alternative candidate at a station.

    Parameters:
        candidate: candidate name; second half of the `alt_coeff` key.
        protest: estimated protest votes at the station.
        tihanovkaja: estimated Tihanovkaja votes at the station.
        area: station area; first half of the `alt_coeff` key. When omitted the
            module-level variable `area` is used, which is how existing
            three-argument callers have always worked.

    Returns:
        0 when `tihanovkaja >= protest`; otherwise the remaining protest votes
        multiplied by `alt_coeff['<area>_<candidate>']`.

    Raises:
        NameError: `area` was not passed and no module-level `area` exists.
        KeyError: `alt_coeff` has no entry for the area/candidate pair.
    """
    if tihanovkaja >= protest:
        return 0

    if area is None:
        # Backward-compatible fallback to the loop variable set by the caller's cell.
        try:
            area = globals()['area']
        except KeyError:
            raise NameError("name 'area' is not defined") from None

    return (protest - tihanovkaja) * alt_coeff[f'{area}_{candidate}']

In [180]:
# Start from empty lists so that re-running this cell does not append a second
# copy of every estimate (the lists must match the number of untrusted rows
# when they are written back into df).
for estimates in candidates_data.values():
    estimates.clear()

# Estimate every candidate column for the stations that are not trusted.
for row_label, voice_row in df[~trusted_data_mask].iterrows():
    # `area` must stay a module-level name: calculate_alt() reads it as a global.
    area = voice_row['area']
    region = voice_row['region']
    key = get_key(area, region)

    tihanovkaja = voice_row['tihanovkaja_registered'] * tih_coeff[key]
    protest = voice_row['registered'] * protest_coeff[key]

    candidates_data['tihanovkaja'].append(tihanovkaja)

    for candidate in alt_candidates:
        candidates_data[candidate].append(calculate_alt(candidate, protest, tihanovkaja))

In [181]:
# Write the collected estimates into the rows of the untrusted stations.
# candidates_data was built from `candidates`, so it has the same keys in the same order.
for candidate, estimates in candidates_data.items():
    df.loc[~trusted_data_mask, candidate] = estimates

In [182]:
# Fix corrupted.
# Rows with no 'corrupted' value get one estimated as a per-area share of
# 'against' (coefficient key '<area>_corrupted/corrupted+against'); the same
# amount is then taken off 'against'.
missing_corrupted_mask = df['corrupted'].isna()

fixed_corrupted = [
    row['against'] * alt_coeff[f'{row["area"]}_corrupted/corrupted+against']
    for _, row in df[missing_corrupted_mask].iterrows()
]

df.loc[missing_corrupted_mask, 'corrupted'] = fixed_corrupted
df.loc[missing_corrupted_mask, 'against'] -= fixed_corrupted

In [183]:
def fix_voters() -> None:
    """Adjust `voters`/`votes` in the global `df` for stations whose candidate
    totals are too large relative to `votes`.

    Works in place on the module-level `df` and returns nothing. Steps:

    1. Flag rows where the summed `candidates` columns exceed 95% of `votes`
       and set `turnout` to 1 for the flagged rows whose area is 'embassy'.
    2. Re-flag rows with a 0.955 threshold. For these rows compute `new_votes`
       so that the candidate sum is 95% of it, and `new_voters` as
       `new_votes / turnout`.
    3. For regions 1..8, sum how many voters the flagged rows gain
       (`extra_voters`).
    4. For regions 1..8, rescale `voters` of the rows that have no
       `officialVoters` and whose candidate sum is below 80% of `votes` by
       `(estimated - extra) / estimated`, then recompute their `votes` as
       `voters * turnout`.
    5. Write `new_voters` / `new_votes` into the flagged rows.

    Prints both flagged-row counts, `extra_voters` and the per-region
    coefficients. Because `df` is modified, calling the function again starts
    from the already adjusted values.
    """
    # Rows where the candidates' share of `votes` is above 95%.
    too_little_votes_mask = (sum(df[c] for c in candidates) / df['votes']) > 0.95
    
    print(too_little_votes_mask.sum())
    
    df.loc[too_little_votes_mask & (df['area'] == 'embassy'), 'turnout'] = 1
    # The commented-out statements below are disabled alternatives kept from the original.
#     df.loc[too_little_votes_mask & (df['region'] == 7), 'turnout'] = 0.8
#     df.loc[too_little_votes_mask & df['region'].isin([7, 8]), 'votes'] = (
#         df[too_little_votes_mask & df['region'].isin([7, 8])]['voters'] * 
#         df[too_little_votes_mask & df['region'].isin([7, 8])]['turnout']
#     )
    
    # Same ratio, slightly higher threshold: only these rows get new votes/voters.
    too_little_votes_mask = (sum(df[c] for c in candidates) / df['votes']) > 0.955
    
    print(too_little_votes_mask.sum())
    
    # Scale so that the candidate sum becomes 95% of the new vote count.
    new_votes = sum(
        df[too_little_votes_mask][c] for c in candidates
    ) / 95 * 100
    # Uses the turnout as it stands now, i.e. after the embassy override above.
    new_voters = new_votes / df[too_little_votes_mask]['turnout']
    
    # region -> total increase in voters over the flagged rows of that region.
    extra_voters = {}

    for region in range(1, 9):
        data = df[too_little_votes_mask & (df['region'] == region)]    
        extra_voters[region] = (new_voters[data.index] - data['voters']).sum() 
        
    print(extra_voters)
        
    # region -> current voters total of the rows that will absorb the difference:
    # no officialVoters value and a candidate share below 80% of `votes`.
    estimated_voters = {}
    no_data_mask = df['officialVoters'].isna() & ((sum(df[c] for c in candidates) / df['votes']) < 0.8)

    for region in range(1, 9):
        data = df[no_data_mask & (df['region'] == region)]    
        estimated_voters[region] = data['voters'].sum()
        
    # Factor that removes `extra_voters` from the absorbing rows of each region.
    # NOTE(review): a region with estimated_voters == 0 divides by zero here —
    # confirm that cannot happen for the input data.
    coeff = {
        k: (
            (estimated_voters[k] - extra_voters[k]) /
            estimated_voters[k]
        )
        for k in extra_voters
    }
    
    for region, c in coeff.items():
        mask = no_data_mask & (df['region'] == region)
        
        df.loc[mask, 'voters'] = df[mask]['voters'] * c
        # Reads the voters value written on the previous line.
        df.loc[mask, 'votes'] = df[mask]['voters'] * df[mask]['turnout']
        
    # Finally store the recomputed values for the flagged rows themselves.
    df.loc[too_little_votes_mask, 'voters'] = new_voters
    df.loc[too_little_votes_mask, 'votes'] = new_votes
    
        
    print(coeff)
        
    

In [184]:
# Pass 1. fix_voters() edits df in place, so every later call starts from this result.
fix_voters()

482
475
{1: 34465.92769415137, 2: 7399.657135725711, 3: 10467.67914955045, 4: 24218.518209313635, 5: 24911.12949301745, 6: 2571.0925925409283, 7: 109124.56503892448, 8: -74.54105263157919}
{1: 0.9382182824145439, 2: 0.9876631757965273, 3: 0.9865264268893991, 4: 0.9323700430294322, 5: 0.9564280052729108, 6: 0.9955737267941214, 7: 0.5684311878674896, 8: 1.0056142147916411}


In [185]:
# Pass 2 on the already adjusted df.
fix_voters()

309
94
{1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0, 7: 22533.173492907285, 8: 0.0}
{1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 0.44947064547589055, 8: 1.0}


In [186]:
# Pass 3 on the already adjusted df.
fix_voters()

272
13
{1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0, 7: 3073.4881986956307, 8: 0.0}
{1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 0.7605385889236708, 8: 1.0}


In [187]:
# Pass 4 on the already adjusted df; in the recorded output no row is left
# above the 0.955 threshold after this call.
fix_voters()

262
0
{1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0, 7: 0.0, 8: 0.0}
{1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0, 8: 1.0}


In [188]:
# Check data.
# Number of rows where votes / turnout differs from voters by more than 0.01.
voters_mismatch = (df['votes'] / df['turnout'] - df['voters']).abs()
(voters_mismatch > 0.01).sum()

0

In [189]:
# Missing-value count for every candidate column.
[df[name].isna().sum() for name in candidates]

[0, 0, 0, 0, 0, 0]

In [190]:
# Print each region number followed by the sum of its voters column.
for region in range(1, 9):
    in_region = df['region'] == region
    print(region)
    print(df.loc[in_region, 'voters'].sum())

1
984600.9079999999
2
849472.006
3
1069566.1600000001
4
751084.715
5
1142041.992
6
801824.8640000001
7
1241020.82
8
30747.199999999997


In [191]:
# Display voters.csv (columns: name, region, voters), presumably as the
# reference for the per-region sums printed in the previous cell.
pd.read_csv('voters.csv')

Unnamed: 0,name,region,voters
0,brest,1,984601
1,viciebsk,2,849472
2,homel,3,1069567
3,hrodna,4,751084
4,minsk-region,5,1142043
5,mahiliou,6,801825
6,minsk,7,1241021
7,embassy,8,5319


In [192]:
# Embassy stations with fewer than 200 voters.
small_embassy_mask = (df['area'] == 'embassy') & (df['voters'] < 200)
df.loc[small_embassy_mask, ['id', 'voters', 'town']]

Unnamed: 0,id,voters,town
5221,07-002-0090,114.0,kz
5236,07-002-0105,60.0,rs
5244,07-002-0113,198.0,fi


In [193]:
# Number of rows whose turnout lies outside [0, 1].
turnout_out_of_range = (df['turnout'] < 0) | (df['turnout'] > 1)
int(turnout_out_of_range.sum())

0

In [194]:
# Lukashenko gets whatever is left of `votes` after all other candidate columns.
other_candidates_total = sum(df[name] for name in candidates)
df['lukashenko'] = df['votes'] - other_candidates_total

In [195]:
# Count of negative values for every candidate column.
# NOTE(review): 'lukashenko' is not in `candidates`, so the residual column is
# not covered by this check — confirm whether candidates_with_luk was intended.
[(df[name] < 0).sum() for name in candidates]

[0, 0, 0, 0, 0, 0]

In [196]:
# Overall turnout: all votes divided by all voters.
total_votes = df['votes'].sum()
total_voters = df['voters'].sum()
total_votes / total_voters

0.7861676757782586

In [197]:
# Export the result without the index column.
# '%g' writes floats with at most 6 significant digits.
df.to_csv(
    'total-candidates.csv',
    index=False,
    float_format='%g',
)