In [557]:
import pandas as pd
import numpy as np
from decimal import Decimal

In [558]:
df = pd.read_csv('trusted-for-alternative-fixed.csv', converters={'latitude': Decimal, 'longitude': Decimal})

# Too many for against, one observer, high turnout.
df = df[~df['id'].isin(['04-114-0017'])]

In [559]:
areas = ['city', 'minsk_suburb', 'capital', 'village', 'town_below100', 'town_over100', 'embassy']

candidates = ['against', 'cherechen', 'corrupted', 'dmitriyev', 'kanopatskaja', 'tihanovkaja']
alt_candidates = ['against', 'cherechen', 'corrupted', 'dmitriyev', 'kanopatskaja']
alt_candidates_no_corrupted = set(alt_candidates) - {'corrupted'}

In [560]:
def official_alt_candidates(data):
    return _generic_candidates_sum(data, 'officialVotes', alt_candidates)

def _generic_candidates_sum(data, source, candidates_to_sum):
    return sum([data[f'{candidate}_{source}'] for candidate in candidates_to_sum])

In [561]:
alt_candidates_coefficients = []

for candidate in set(alt_candidates) - {'against', 'corrupted'}:
    for area in areas:
        data = df[df['area'] == area]
        values = data[f'{candidate}_officialVotes'] / official_alt_candidates(data)
        coeff = np.mean(values)
        std = np.std(values)
        alt_candidates_coefficients.append({
            'candidate': candidate,
            'area': area,
            'coefficient': coeff,
            'std': std,
            'coeff_variation': std / coeff,
        })
    values = df[f'{candidate}_officialVotes'] / official_alt_candidates(df)
    coeff = np.mean(values)
    std = np.std(values)
    alt_candidates_coefficients.append({
        'candidate': candidate,
        'area': 'total',
        'coefficient': coeff,
        'std': std,
        'coeff_variation': std / coeff,
    })
    
        
for area in areas:
    data = df[df['area'] == area]
    values = (data[f'against_officialVotes'] + data['corrupted_officialVotes']) / official_alt_candidates(data)
    coeff = np.mean(values)
    std = np.std(values)
    alt_candidates_coefficients.append({
        'candidate': 'against+corrupted',
        'area': area,
        'coefficient': coeff,
        'std': std,
        'coeff_variation': std / coeff,
    })
    
values = (df[f'against_officialVotes'] + df['corrupted_officialVotes']) / official_alt_candidates(df)
coeff = np.mean(values)
std = np.std(values)
alt_candidates_coefficients.append({
    'candidate': 'against+corrupted',
    'area': 'total',
    'coefficient': coeff,
    'std': std,
    'coeff_variation': std / coeff,
})


with_corrupted_mask = df['corrupted_officialVotes'] > 0

for area in areas:
    data = df[(df['area'] == area) & with_corrupted_mask]
    values = data['corrupted_officialVotes'] / (data['against_officialVotes'] + data['corrupted_officialVotes'])
    coeff = np.mean(values)
    std = np.std(values)
    alt_candidates_coefficients.append({
        'candidate': 'corrupted/corrupted+against',
        'area': area,
        'coefficient': coeff,
        'std': std,
        'coeff_variation': std / coeff,
    })
    
data = df[with_corrupted_mask]
    
values = data['corrupted_officialVotes'] / (df[f'against_officialVotes'] + df['corrupted_officialVotes'])
coeff = np.mean(values)
std = np.std(values)
alt_candidates_coefficients.append({
    'candidate': 'corrupted/corrupted+against',
    'area': 'total',
    'coefficient': coeff,
    'std': std,
    'coeff_variation': std / coeff,
})

In [562]:
alt_coeff_intermediate = pd.DataFrame(alt_candidates_coefficients)

In [563]:
alt_coeff_intermediate

Unnamed: 0,candidate,area,coefficient,std,coeff_variation
0,dmitriyev,city,0.162542,0.048197,0.29652
1,dmitriyev,minsk_suburb,0.183484,0.07662,0.417584
2,dmitriyev,capital,0.128187,0.046736,0.364591
3,dmitriyev,village,0.170952,0.061962,0.362451
4,dmitriyev,town_below100,0.161301,0.056435,0.349877
5,dmitriyev,town_over100,0.163573,0.053946,0.329799
6,dmitriyev,embassy,0.180056,0.078361,0.435202
7,dmitriyev,total,0.162229,0.060417,0.372419
8,kanopatskaja,city,0.111711,0.070884,0.63453
9,kanopatskaja,minsk_suburb,0.127564,0.04482,0.351356


In [564]:
alt_coeff_final = alt_coeff_intermediate[alt_coeff_intermediate['area'] != 'total'][['candidate', 'area', 'coefficient', 'std', 'coeff_variation']].copy()
alt_coeff_final['source'] = ['area'] * len(alt_coeff_final) 

In [565]:
totals = alt_coeff_intermediate[alt_coeff_intermediate['area'] == 'total']

for c in set(alt_coeff_final['candidate']):
    total_row = totals[totals['candidate'] == c].iloc[0]
    
    mask = (alt_coeff_final['candidate'] == c) & (alt_coeff_final['coeff_variation'] > total_row['coeff_variation'])
    
    for col in ['coefficient', 'std', 'coeff_variation']:
        alt_coeff_final.loc[mask, col] = total_row[col]
        
    alt_coeff_final.loc[mask, 'source'] = 'total'

In [566]:
alt_coeff_final

Unnamed: 0,candidate,area,coefficient,std,coeff_variation,source
0,dmitriyev,city,0.162542,0.048197,0.29652,area
1,dmitriyev,minsk_suburb,0.162229,0.060417,0.372419,total
2,dmitriyev,capital,0.128187,0.046736,0.364591,area
3,dmitriyev,village,0.170952,0.061962,0.362451,area
4,dmitriyev,town_below100,0.161301,0.056435,0.349877,area
5,dmitriyev,town_over100,0.163573,0.053946,0.329799,area
6,dmitriyev,embassy,0.162229,0.060417,0.372419,total
8,kanopatskaja,city,0.124866,0.064242,0.514489,total
9,kanopatskaja,minsk_suburb,0.127564,0.04482,0.351356,area
10,kanopatskaja,capital,0.119895,0.051219,0.427203,area


In [567]:
for area in areas:
    mask = (alt_coeff_final['area'] == area) & (alt_coeff_final['candidate'] != 'corrupted/corrupted+against')
    k = 1 / alt_coeff_final[mask]['coefficient'].sum()
    print(k)
    alt_coeff_final.loc[mask, 'coefficient'] *= k
    alt_coeff_final.loc[mask, 'std'] *= k

0.9870157679815269
1.0217165254068146
1.0000000000000002
0.9624434991638918
1.0
0.987621344229767
0.9604345606045054


In [568]:
pure_coeffs = []


corrupted = (
    np.array(alt_coeff_final[alt_coeff_final['candidate'] == 'against+corrupted']['coefficient']) * 
    np.array(alt_coeff_final[alt_coeff_final['candidate'] == 'corrupted/corrupted+against']['coefficient'])
)

against = np.array(alt_coeff_final[alt_coeff_final['candidate'] == 'against+corrupted']['coefficient']) - corrupted

(corrupted, against)    
    

(array([0.0693931 , 0.09857355, 0.11391487, 0.10112689, 0.07339805,
        0.09509407, 0.23378032]),
 array([0.49349209, 0.48430439, 0.51718627, 0.45172369, 0.51526129,
        0.47221926, 0.31791627]))

In [569]:
alt_coeff_extended = pd.concat([alt_coeff_final, pd.DataFrame([
    {
        'area': area,
        'coefficient': cor,
        'candidate': 'corrupted',
    }
    for area, cor in zip(areas, corrupted)
] + [
    {
        'area': area,
        'coefficient': ag,
        'candidate': 'against',
    }
    for area, ag in zip(areas, against)
])])

In [570]:
alt_coeff_extended.to_csv('alt_candidates_coefficients.csv', float_format='%.3f', index=False)

In [571]:
for area in areas:
    mask = (alt_coeff_extended['area'] == area) & (alt_coeff_extended['candidate'].isin(alt_candidates))
    print(sum(alt_coeff_extended[mask]['coefficient']))

1.0000000000000002
1.0
1.0000000000000002
0.9999999999999999
1.0
1.0
1.0
