In [137]:
import pandas as pd
import numpy as np
from decimal import Decimal

In [138]:
df = pd.read_csv('trusted-for-alternative-fixed.csv', converters={'latitude': Decimal, 'longitude': Decimal})

In [139]:
areas = ['city', 'minsk_suburb', 'capital', 'village', 'town_below100', 'town_over100', 'embassy']

candidates = ['against', 'cherechen', 'corrupted', 'dmitriyev', 'kanopatskaja', 'tihanovkaja']
alt_candidates = ['against', 'cherechen', 'corrupted', 'dmitriyev', 'kanopatskaja']
alt_candidates_no_corrupted = set(alt_candidates) - {'corrupted'}

In [140]:
def official_alt_candidates(data):
    return _generic_candidates_sum(data, 'officialVotes', alt_candidates)

def _generic_candidates_sum(data, source, candidates_to_sum):
    return sum([data[f'{candidate}_{source}'] for candidate in candidates_to_sum])

In [141]:
alt_candidates_coefficients = []

for candidate in set(alt_candidates) - {'against', 'corrupted'}:
    for area in areas:
        data = df[df['area'] == area]
        values = data[f'{candidate}_officialVotes'] / official_alt_candidates(data)
        coeff = np.mean(values)
        std = np.std(values)
        alt_candidates_coefficients.append({
            'candidate': candidate,
            'area': area,
            'coefficient': coeff,
            'std': std,
            'coeff_variation': std / coeff,
        })
    values = df[f'{candidate}_officialVotes'] / official_alt_candidates(df)
    coeff = np.mean(values)
    std = np.std(values)
    alt_candidates_coefficients.append({
        'candidate': candidate,
        'area': 'total',
        'coefficient': coeff,
        'std': std,
        'coeff_variation': std / coeff,
    })
    
        
for area in areas:
    data = df[df['area'] == area]
    values = (data[f'against_officialVotes'] + data['corrupted_officialVotes']) / official_alt_candidates(data)
    coeff = np.mean(values)
    std = np.std(values)
    alt_candidates_coefficients.append({
        'candidate': 'against+corrupted',
        'area': area,
        'coefficient': coeff,
        'std': std,
        'coeff_variation': std / coeff,
    })
    
values = (df[f'against_officialVotes'] + df['corrupted_officialVotes']) / official_alt_candidates(df)
coeff = np.mean(values)
std = np.std(values)
alt_candidates_coefficients.append({
    'candidate': 'against+corrupted',
    'area': 'total',
    'coefficient': coeff,
    'std': std,
    'coeff_variation': std / coeff,
})


with_corrupted_mask = df['corrupted_officialVotes'] > 0

for area in areas:
    data = df[(df['area'] == area) & with_corrupted_mask]
    values = data['corrupted_officialVotes'] / (data['against_officialVotes'] + data['corrupted_officialVotes'])
    coeff = np.mean(values)
    std = np.std(values)
    alt_candidates_coefficients.append({
        'candidate': 'corrupted/corrupted+against',
        'area': area,
        'coefficient': coeff,
        'std': std,
        'coeff_variation': std / coeff,
    })
    
data = df[with_corrupted_mask]
    
values = data['corrupted_officialVotes'] / (df[f'against_officialVotes'] + df['corrupted_officialVotes'])
coeff = np.mean(values)
std = np.std(values)
alt_candidates_coefficients.append({
    'candidate': 'corrupted/corrupted+against',
    'area': 'total',
    'coefficient': coeff,
    'std': std,
    'coeff_variation': std / coeff,
})

In [142]:
alt_coeff_intermediate = pd.DataFrame(alt_candidates_coefficients)

In [143]:
alt_coeff_intermediate

Unnamed: 0,candidate,area,coefficient,std,coeff_variation
0,dmitriyev,city,0.170326,0.055859,0.327952
1,dmitriyev,minsk_suburb,0.183484,0.07662,0.417584
2,dmitriyev,capital,0.123936,0.045338,0.365815
3,dmitriyev,village,0.163358,0.056713,0.34717
4,dmitriyev,town_below100,0.166685,0.064353,0.386073
5,dmitriyev,town_over100,0.158237,0.055308,0.349523
6,dmitriyev,embassy,0.180056,0.078361,0.435202
7,dmitriyev,total,0.16246,0.062333,0.383685
8,kanopatskaja,city,0.121926,0.072721,0.596431
9,kanopatskaja,minsk_suburb,0.127564,0.04482,0.351356


In [144]:
alt_coeff_final = alt_coeff_intermediate[alt_coeff_intermediate['area'] != 'total'][['candidate', 'area', 'coefficient', 'std', 'coeff_variation']].copy()
alt_coeff_final['source'] = ['area'] * len(alt_coeff_final) 

In [145]:
totals = alt_coeff_intermediate[alt_coeff_intermediate['area'] == 'total']

for c in set(alt_coeff_final['candidate']):
    total_row = totals[totals['candidate'] == c].iloc[0]
    
    mask = (alt_coeff_final['candidate'] == c) & (alt_coeff_final['coeff_variation'] > total_row['coeff_variation'])
    
    for col in ['coefficient', 'std', 'coeff_variation']:
        alt_coeff_final.loc[mask, col] = total_row[col]
        
    alt_coeff_final.loc[mask, 'source'] = 'total'

In [146]:
alt_coeff_final

Unnamed: 0,candidate,area,coefficient,std,coeff_variation,source
0,dmitriyev,city,0.170326,0.055859,0.327952,area
1,dmitriyev,minsk_suburb,0.16246,0.062333,0.383685,total
2,dmitriyev,capital,0.123936,0.045338,0.365815,area
3,dmitriyev,village,0.163358,0.056713,0.34717,area
4,dmitriyev,town_below100,0.16246,0.062333,0.383685,total
5,dmitriyev,town_over100,0.158237,0.055308,0.349523,area
6,dmitriyev,embassy,0.16246,0.062333,0.383685,total
8,kanopatskaja,city,0.122865,0.060896,0.495633,total
9,kanopatskaja,minsk_suburb,0.127564,0.04482,0.351356,area
10,kanopatskaja,capital,0.115511,0.043055,0.372737,area


In [147]:
for area in areas:
    mask = (alt_coeff_final['area'] == area) & (alt_coeff_final['candidate'] != 'corrupted/corrupted+against')
    k = 1 / alt_coeff_final[mask]['coefficient'].sum()
    print(k)
    alt_coeff_final.loc[mask, 'coefficient'] *= k
    alt_coeff_final.loc[mask, 'std'] *= k

0.9733104524559251
1.0046857665186
1.0
0.9911020468887862
1.0042428121338354
1.0
0.9585928598691952


In [148]:
pure_coeffs = []


corrupted = (
    np.array(alt_coeff_final[alt_coeff_final['candidate'] == 'against+corrupted']['coefficient']) * 
    np.array(alt_coeff_final[alt_coeff_final['candidate'] == 'corrupted/corrupted+against']['coefficient'])
)

against = np.array(alt_coeff_final[alt_coeff_final['candidate'] == 'against+corrupted']['coefficient']) - corrupted

(corrupted, against)    
    

(array([0.07947533, 0.09693045, 0.12131271, 0.1038782 , 0.08234015,
        0.10263981, 0.23553661]),
 array([0.48489993, 0.47623163, 0.52087818, 0.45119814, 0.5043925 ,
        0.48945156, 0.32030464]))

In [149]:
alt_coeff_extended = pd.concat([alt_coeff_final, pd.DataFrame([
    {
        'area': area,
        'coefficient': cor,
        'candidate': 'corrupted',
    }
    for area, cor in zip(areas, corrupted)
] + [
    {
        'area': area,
        'coefficient': ag,
        'candidate': 'against',
    }
    for area, ag in zip(areas, against)
])])

In [150]:
alt_coeff_extended.to_csv('alt_candidates_coefficients.csv', float_format='%.3f', index=False)

In [151]:
for area in areas:
    mask = (alt_coeff_extended['area'] == area) & (alt_coeff_extended['candidate'].isin(alt_candidates))
    print(sum(alt_coeff_extended[mask]['coefficient']))

1.0
1.0
1.0
1.0
1.0
1.0
0.9999999999999999
