In [78]:
import pandas as pd
import numpy as np
from decimal import Decimal

In [79]:
# Station-level input table. latitude/longitude are converted with Decimal,
# so the coordinate text from the CSV is kept exactly as written.
df = pd.read_csv(
    'trusted-for-alternative-fixed.csv',
    converters={
        'latitude': Decimal,
        'longitude': Decimal,
    },
)

# Second table; its `id` column is used below to exclude rows from `df`.
no_turnout_df = pd.read_csv(
    'non_trusted_turnout.csv',
)

In [80]:
# Keep only the rows whose `id` does not appear in no_turnout_df['id'].
df = df[~df['id'].isin(no_turnout_df['id'])]

In [81]:
# Area labels summarised one by one in build_area_df, which additionally
# appends a combined 'town/city' entry of its own.
areas = ['city', 'minsk_suburb', 'capital', 'village', 'town_below100', 'town_over100', 'embassy']
# Areas that build_region_df breaks down by region (1-6). concat_df takes every
# other area (except 'town/city') from the area-level summary instead.
regioned_areas = ['town_below100', 'town_over100', 'city', 'village']

In [82]:
def build_area_df(source_df=None, area_names=None):
    """Summarise the per-row turnout ratio for every area label.

    For each label in ``area_names``, followed by the combined pseudo-area
    ``'town/city'`` (rows whose ``area`` is ``'town_below100'``,
    ``'town_over100'`` or ``'city'``), the ratio
    ``officialVotes / officialVoters`` is computed row by row and reduced to
    its mean and its coefficient of variation (std / mean).

    Parameters
    ----------
    source_df : pandas.DataFrame, optional
        Frame with ``area``, ``officialVotes`` and ``officialVoters`` columns.
        Defaults to the notebook-level ``df``.
    area_names : sequence of str, optional
        Area labels to summarise. Defaults to the notebook-level ``areas``.

    Returns
    -------
    pandas.DataFrame
        One row per label with the columns ``area``, ``turnout``,
        ``coeff_variation`` and ``source`` (always ``'area'``).
    """
    # Fall back to the notebook globals so the existing no-argument call
    # keeps working unchanged.
    if source_df is None:
        source_df = df
    if area_names is None:
        area_names = areas

    combined_label = 'town/city'
    combined_members = ['town_below100', 'town_over100', 'city']

    records = []
    for area in list(area_names) + [combined_label]:
        if area == combined_label:
            mask = source_df['area'].isin(combined_members)
        else:
            mask = source_df['area'] == area

        subset = source_df[mask]
        ratio = subset['officialVotes'] / subset['officialVoters']

        # Unweighted mean of the per-row ratios; np.std uses ddof=0 by default,
        # i.e. the population standard deviation.
        turnout = np.mean(ratio)
        std = np.std(ratio)

        records.append({
            'area': area,
            'turnout': turnout,
            'coeff_variation': std / turnout,
            'source': 'area',
        })

    return pd.DataFrame(records)

In [83]:
def build_region_df(area_df, source_df=None, area_names=None,
                    regions=range(1, 7), min_rows=5):
    """Summarise the turnout ratio per (area, region) pair.

    For every area/region combination the per-row ratio
    ``officialVotes / officialVoters`` is reduced to its mean and coefficient
    of variation. The regional figures are replaced by the area-level figures
    from ``area_df`` when the subset has fewer than ``min_rows`` rows or when
    its coefficient of variation exceeds the area-level one.

    Parameters
    ----------
    area_df : pandas.DataFrame
        Area-level summary with ``area``, ``turnout`` and ``coeff_variation``
        columns (as produced by ``build_area_df``). It must contain a row for
        every area that is processed.
    source_df : pandas.DataFrame, optional
        Frame with ``area``, ``region``, ``officialVotes`` and
        ``officialVoters`` columns. Defaults to the notebook-level ``df``.
    area_names : sequence of str, optional
        Areas to break down. Defaults to the notebook-level
        ``regioned_areas``.
    regions : iterable of int, optional
        Region codes to process; defaults to 1 through 6.
    min_rows : int, optional
        Minimum subset size for the regional figures to be used (default 5).

    Returns
    -------
    pandas.DataFrame
        One row per (area, region) with the columns ``area``, ``region``,
        ``turnout``, ``coeff_variation`` and ``source``. ``source`` is
        ``'area-region'`` when the regional figures were kept and ``'area'``
        when the area-level fallback was used.

    Raises
    ------
    IndexError
        If ``area_df`` has no row for one of the processed areas.
    """
    # Fall back to the notebook globals so the existing one-argument call
    # keeps working unchanged.
    if source_df is None:
        source_df = df
    if area_names is None:
        area_names = regioned_areas

    records = []
    for area in area_names:
        # Both lookups depend only on the area, so do them once per area
        # rather than once per region.
        area_rows = source_df[source_df['area'] == area]
        area_row = area_df[area_df['area'] == area].iloc[0]

        for region in regions:
            subset = area_rows[area_rows['region'] == region]
            ratio = subset['officialVotes'] / subset['officialVoters']

            # np.std uses ddof=0 by default (population standard deviation).
            turnout = np.mean(ratio)
            coeff_var = np.std(ratio) / turnout
            source = 'area-region'

            # Too few rows, or noisier than the area as a whole: use the
            # area-level figures instead.
            if len(ratio) < min_rows or coeff_var > area_row['coeff_variation']:
                turnout = area_row['turnout']
                coeff_var = area_row['coeff_variation']
                source = 'area'

            records.append({
                'area': area,
                'region': region,
                'turnout': turnout,
                'coeff_variation': coeff_var,
                'source': source,
            })

    return pd.DataFrame(records)

In [84]:
# Area-level summary: one row per label in `areas` plus the combined 'town/city' row.
area_df = build_area_df()

In [85]:
# Per-(area, region) summary; rows fall back to the area-level figures when the
# regional subset has fewer than 5 rows or a higher coefficient of variation.
region_df = build_region_df(area_df)

In [86]:
def concat_df(region_df, area_df, area_names=None):
    """Stack the regional summary with the areas that have no regional split.

    Rows of ``area_df`` whose ``area`` is neither in ``area_names`` nor the
    combined ``'town/city'`` label are copied, given a ``region`` code and
    appended below ``region_df``.

    Parameters
    ----------
    region_df : pandas.DataFrame
        Per-(area, region) summary (as produced by ``build_region_df``).
    area_df : pandas.DataFrame
        Area-level summary (as produced by ``build_area_df``).
    area_names : sequence of str, optional
        Areas already covered by ``region_df``. Defaults to the
        notebook-level ``regioned_areas``.

    Returns
    -------
    pandas.DataFrame
        ``region_df`` followed by the remaining area rows. The original index
        labels of both inputs are kept (no re-indexing), so labels may repeat.
    """
    # Fall back to the notebook global so the existing two-argument call
    # keeps working unchanged.
    if area_names is None:
        area_names = regioned_areas

    # Region code assigned to each non-regioned area; any other area keeps 0.
    region_codes = {
        'minsk_suburb': 5,
        'capital': 7,
        'embassy': 8,
    }

    already_covered = list(area_names) + ['town/city']
    non_regioned_df = area_df[~area_df['area'].isin(already_covered)].copy()
    non_regioned_df['region'] = 0
    for area, code in region_codes.items():
        non_regioned_df.loc[non_regioned_df['area'] == area, 'region'] = code

    return pd.concat([region_df, non_regioned_df])

In [87]:
# Final table: regional rows followed by the area-level rows of the areas
# that have no regional breakdown.
turnout_df = concat_df(region_df, area_df)

In [88]:
# Show the combined table as the cell output.
turnout_df

Unnamed: 0,area,region,turnout,coeff_variation,source
0,town_below100,1,0.824074,0.104222,area
1,town_below100,2,0.824074,0.104222,area
2,town_below100,3,0.850174,0.094827,area-region
3,town_below100,4,0.845611,0.087197,area-region
4,town_below100,5,0.824074,0.104222,area
5,town_below100,6,0.824074,0.104222,area
6,town_over100,1,0.776241,0.105065,area
7,town_over100,2,0.776241,0.105065,area
8,town_over100,3,0.776241,0.105065,area
9,town_over100,4,0.776241,0.105065,area


In [89]:
# Write the table to turnout.csv with floats formatted to 4 decimal places
# and without the index column.
turnout_df.to_csv('turnout.csv', float_format='%.4f', index=False)