In [437]:
import pandas as pd
import numpy as np
from decimal import Decimal

In [438]:
df = pd.read_csv('trusted-for-alternative-fixed.csv', converters={'latitude': Decimal, 'longitude': Decimal})

# Too many for against, one observer, high turnout.
df = df[~df['id'].isin(['04-114-0017'])]

In [439]:
areas = ['city', 'minsk_suburb', 'capital', 'village', 'town_below100', 'town_over100', 'embassy']

candidates = ['against', 'cherechen', 'corrupted', 'dmitriyev', 'kanopatskaja', 'tihanovkaja']
alt_candidates = ['against', 'cherechen', 'corrupted', 'dmitriyev', 'kanopatskaja']
alt_candidates_no_corrupted = set(alt_candidates) - {'corrupted'}

In [440]:
def official_alt_candidates(data):
    return _generic_candidates_sum(data, 'officialVotes', alt_candidates)

def _generic_candidates_sum(data, source, candidates_to_sum):
    return sum([data[f'{candidate}_{source}'] for candidate in candidates_to_sum])

In [441]:
alt_candidates_coefficients = []

for candidate in set(alt_candidates) - {'against', 'corrupted'}:
    for area in areas:
        data = df[df['area'] == area]
        values = data[f'{candidate}_officialVotes'] / official_alt_candidates(data)
        coeff = np.mean(values)
        std = np.std(values)
        alt_candidates_coefficients.append({
            'candidate': candidate,
            'area': area,
            'coefficient': coeff,
            'std': std,
            'coeff_variation': std / coeff,
        })
    values = df[f'{candidate}_officialVotes'] / official_alt_candidates(df)
    coeff = np.mean(values)
    std = np.std(values)
    alt_candidates_coefficients.append({
        'candidate': candidate,
        'area': 'total',
        'coefficient': coeff,
        'std': std,
        'coeff_variation': std / coeff,
    })
    
        
for area in areas:
    data = df[df['area'] == area]
    values = (data[f'against_officialVotes'] + data['corrupted_officialVotes']) / official_alt_candidates(data)
    coeff = np.mean(values)
    std = np.std(values)
    alt_candidates_coefficients.append({
        'candidate': 'against+corrupted',
        'area': area,
        'coefficient': coeff,
        'std': std,
        'coeff_variation': std / coeff,
    })
    
values = (df[f'against_officialVotes'] + df['corrupted_officialVotes']) / official_alt_candidates(df)
coeff = np.mean(values)
std = np.std(values)
alt_candidates_coefficients.append({
    'candidate': 'against+corrupted',
    'area': 'total',
    'coefficient': coeff,
    'std': std,
    'coeff_variation': std / coeff,
})


with_corrupted_mask = df['corrupted_officialVotes'] > 0

for area in areas:
    data = df[(df['area'] == area) & with_corrupted_mask]
    values = data['corrupted_officialVotes'] / (data['against_officialVotes'] + data['corrupted_officialVotes'])
    coeff = np.mean(values)
    std = np.std(values)
    alt_candidates_coefficients.append({
        'candidate': 'corrupted/corrupted+against',
        'area': area,
        'coefficient': coeff,
        'std': std,
        'coeff_variation': std / coeff,
    })
    
data = df[with_corrupted_mask]
    
values = data['corrupted_officialVotes'] / (df[f'against_officialVotes'] + df['corrupted_officialVotes'])
coeff = np.mean(values)
std = np.std(values)
alt_candidates_coefficients.append({
    'candidate': 'corrupted/corrupted+against',
    'area': 'total',
    'coefficient': coeff,
    'std': std,
    'coeff_variation': std / coeff,
})

In [442]:
alt_coeff_intermediate = pd.DataFrame(alt_candidates_coefficients)

In [443]:
alt_coeff_intermediate

Unnamed: 0,candidate,area,coefficient,std,coeff_variation
0,dmitriyev,city,0.163334,0.045857,0.280757
1,dmitriyev,minsk_suburb,0.183484,0.07662,0.417584
2,dmitriyev,capital,0.12527,0.045466,0.362949
3,dmitriyev,village,0.164043,0.058193,0.354741
4,dmitriyev,town_below100,0.166679,0.06523,0.391349
5,dmitriyev,town_over100,0.160555,0.055606,0.346336
6,dmitriyev,embassy,0.180056,0.078361,0.435202
7,dmitriyev,total,0.16144,0.061745,0.382467
8,kanopatskaja,city,0.108617,0.062587,0.576223
9,kanopatskaja,minsk_suburb,0.127564,0.04482,0.351356


In [444]:
alt_coeff_final = alt_coeff_intermediate[alt_coeff_intermediate['area'] != 'total'][['candidate', 'area', 'coefficient', 'std', 'coeff_variation']].copy()
alt_coeff_final['source'] = ['area'] * len(alt_coeff_final) 

In [445]:
totals = alt_coeff_intermediate[alt_coeff_intermediate['area'] == 'total']

for c in set(alt_coeff_final['candidate']):
    total_row = totals[totals['candidate'] == c].iloc[0]
    
    mask = (alt_coeff_final['candidate'] == c) & (alt_coeff_final['coeff_variation'] > total_row['coeff_variation'])
    
    for col in ['coefficient', 'std', 'coeff_variation']:
        alt_coeff_final.loc[mask, col] = total_row[col]
        
    alt_coeff_final.loc[mask, 'source'] = 'total'

In [446]:
alt_coeff_final

Unnamed: 0,candidate,area,coefficient,std,coeff_variation,source
0,dmitriyev,city,0.163334,0.045857,0.280757,area
1,dmitriyev,minsk_suburb,0.16144,0.061745,0.382467,total
2,dmitriyev,capital,0.12527,0.045466,0.362949,area
3,dmitriyev,village,0.164043,0.058193,0.354741,area
4,dmitriyev,town_below100,0.16144,0.061745,0.382467,total
5,dmitriyev,town_over100,0.160555,0.055606,0.346336,area
6,dmitriyev,embassy,0.16144,0.061745,0.382467,total
8,kanopatskaja,city,0.122428,0.061656,0.503612,total
9,kanopatskaja,minsk_suburb,0.127564,0.04482,0.351356,area
10,kanopatskaja,capital,0.120109,0.050866,0.423496,area


In [447]:
for area in areas:
    mask = (alt_coeff_final['area'] == area) & (alt_coeff_final['candidate'] != 'corrupted/corrupted+against')
    k = 1 / alt_coeff_final[mask]['coefficient'].sum()
    print(k)
    alt_coeff_final.loc[mask, 'coefficient'] *= k
    alt_coeff_final.loc[mask, 'std'] *= k

0.9863765451092582
1.0048346008235145
1.0
0.96643576402378
1.0052668875706032
0.982508483426381
0.9581914245920473


In [448]:
pure_coeffs = []


corrupted = (
    np.array(alt_coeff_final[alt_coeff_final['candidate'] == 'against+corrupted']['coefficient']) * 
    np.array(alt_coeff_final[alt_coeff_final['candidate'] == 'corrupted/corrupted+against']['coefficient'])
)

against = np.array(alt_coeff_final[alt_coeff_final['candidate'] == 'against+corrupted']['coefficient']) - corrupted

(corrupted, against)    
    

(array([0.0789082 , 0.09694481, 0.11694988, 0.10399012, 0.08676536,
        0.0996339 , 0.23567529]),
 array([0.48269116, 0.47630218, 0.51890337, 0.4569637 , 0.50050798,
        0.4706491 , 0.32049323]))

In [449]:
alt_coeff_extended = pd.concat([alt_coeff_final, pd.DataFrame([
    {
        'area': area,
        'coefficient': cor,
        'candidate': 'corrupted',
    }
    for area, cor in zip(areas, corrupted)
] + [
    {
        'area': area,
        'coefficient': ag,
        'candidate': 'against',
    }
    for area, ag in zip(areas, against)
])])

In [450]:
alt_coeff_extended.to_csv('alt_candidates_coefficients.csv', float_format='%.3f', index=False)

In [451]:
for area in areas:
    mask = (alt_coeff_extended['area'] == area) & (alt_coeff_extended['candidate'].isin(alt_candidates))
    print(sum(alt_coeff_extended[mask]['coefficient']))

1.0
0.9999999999999998
1.0
0.9999999999999998
1.0
0.9999999999999999
0.9999999999999999
