In [742]:
import pandas as pd
import numpy as np
from decimal import Decimal

In [743]:
df = pd.read_csv('trusted-for-alternative-fixed.csv', converters={'latitude': Decimal, 'longitude': Decimal})

In [744]:
areas = ['city', 'minsk_suburb', 'minsk_village', 'capital', 'village', 'town_below100', 'town_over100', 'embassy']
candidates = ['against', 'cherechen', 'corrupted', 'dmitriyev', 'kanopatskaja', 'tihanovkaja']
alt_candidates = ['against', 'cherechen', 'corrupted', 'dmitriyev', 'kanopatskaja']
alt_candidates_no_corrupted = set(alt_candidates) - {'corrupted'}

In [745]:
coefficients = []

for area in areas:
    mask = df['area'] == area
    
    data = df[mask]
    slope, _, _, _ = np.linalg.lstsq(
        np.array(data['tihanovkaja_photoVoices'])[:,np.newaxis], data['tihanovkaja_officialVotes'], None
    )
    slope = slope[0]
    
    
    mask = mask & ~df['tihanovkaja_officialVotes'].isna() & data['tihanovkaja_photoVoices'] > 0
    data = df[mask]
        

    coefficients.append({
        'area': area,
        'coefficient': slope, 
    })
    

coef_df = pd.DataFrame(coefficients)

In [746]:
coef_df.to_csv('tihanovkaja_photo_coefficients.csv', float_format='%.3f', index=False)

In [747]:
def official_candidates(data):
    return _generic_candidates_sum(data, 'officialVotes', candidates)


def photo_candidates(data):
    return _generic_candidates_sum(data, 'photoVoices', candidates)


def official_alt_candidates(data):
    return _generic_candidates_sum(data, 'officialVotes', alt_candidates)


def photo_alt_candidates(data):
    return _generic_candidates_sum(data, 'photoVoices', alt_candidates)

def registered_alt_candidates(data):
    return _generic_candidates_sum(data, 'registered', alt_candidates)


def _generic_candidates_sum(data, source, candidates_to_sum):
    return sum([data[f'{candidate}_{source}'] for candidate in candidates_to_sum])

In [748]:
protest_coeff_by_area = {}

for area in areas:
    mask = df['area'] == area
    
    data = df[mask]
    slope, _, _, _ = np.linalg.lstsq(
        np.array(data['registered'] - data['lukashenko_registered'] - data['ignore_registered'])[:,np.newaxis], 
        official_candidates(data), 
        None
    )
    slope = slope[0]
    
    protest_coeff_by_area[area] = slope

In [749]:
protest_coeff_by_area

{'city': 2.764345281593497,
 'minsk_suburb': 1.7746997482514106,
 'minsk_village': 2.9130119691941356,
 'capital': 2.068626058819882,
 'village': 5.116600461454296,
 'town_below100': 3.1612280317497325,
 'town_over100': 3.1557156281040712,
 'embassy': 1.713818460378525}

In [750]:
pd.DataFrame(protest_coeff_by_area.items(), columns=['area', 'protest_coefficient']).to_csv('protest_registered_coefficients.csv', index=False, float_format='%.3f')

In [751]:
alt_candidates_coefficients = []

for candidate in set(alt_candidates) - {'against', 'corrupted'}:
    for area in areas:
        data = df[df['area'] == area]
        values = data[f'{candidate}_officialVotes'] / official_alt_candidates(data)
        coeff = np.mean(values)
        std = np.std(values)
        alt_candidates_coefficients.append({
            'candidate': candidate,
            'area': area,
            'coefficient': coeff,
            'std': std,
            'coeff_variation': std / coeff,
        })
    values = df[f'{candidate}_officialVotes'] / official_alt_candidates(df)
    coeff = np.mean(values)
    std = np.std(values)
    alt_candidates_coefficients.append({
        'candidate': candidate,
        'area': 'total',
        'coefficient': coeff,
        'std': std,
        'coeff_variation': std / coeff,
    })
    
        
for area in areas:
    data = df[df['area'] == area]
    values = (data[f'against_officialVotes'] + data['corrupted_officialVotes']) / official_alt_candidates(data)
    coeff = np.mean(values)
    std = np.std(values)
    alt_candidates_coefficients.append({
        'candidate': 'against+corrupted',
        'area': area,
        'coefficient': coeff,
        'std': std,
        'coeff_variation': std / coeff,
    })
    
values = (df[f'against_officialVotes'] + df['corrupted_officialVotes']) / official_alt_candidates(df)
coeff = np.mean(values)
std = np.std(values)
alt_candidates_coefficients.append({
    'candidate': 'against+corrupted',
    'area': 'total',
    'coefficient': coeff,
    'std': std,
    'coeff_variation': std / coeff,
})


with_corrupted_mask = df['corrupted_officialVotes'] > 0

for area in areas:
    data = df[(df['area'] == area) & with_corrupted_mask]
    values = data['corrupted_officialVotes'] / (data['against_officialVotes'] + data['corrupted_officialVotes'])
    coeff = np.mean(values)
    std = np.std(values)
    alt_candidates_coefficients.append({
        'candidate': 'corrupted/corrupted+against',
        'area': area,
        'coefficient': coeff,
        'std': std,
        'coeff_variation': std / coeff,
    })
    
data = df[with_corrupted_mask]
    
values = data['corrupted_officialVotes'] / (df[f'against_officialVotes'] + df['corrupted_officialVotes'])
coeff = np.mean(values)
std = np.std(values)
alt_candidates_coefficients.append({
    'candidate': 'corrupted/corrupted+against',
    'area': 'total',
    'coefficient': coeff,
    'std': std,
    'coeff_variation': std / coeff,
})

In [752]:
alt_coeff_intermediate = pd.DataFrame(alt_candidates_coefficients)

In [753]:
alt_coeff_intermediate

Unnamed: 0,candidate,area,coefficient,std,coeff_variation
0,kanopatskaja,city,0.129031,0.073889,0.572648
1,kanopatskaja,minsk_suburb,0.128111,0.056227,0.438892
2,kanopatskaja,minsk_village,0.159577,0.079321,0.497072
3,kanopatskaja,capital,0.114922,0.042487,0.369701
4,kanopatskaja,village,0.15405,0.075769,0.491848
5,kanopatskaja,town_below100,0.118077,0.065735,0.556714
6,kanopatskaja,town_over100,0.114616,0.060025,0.52371
7,kanopatskaja,embassy,0.152444,0.071781,0.470867
8,kanopatskaja,total,0.129105,0.068846,0.533258
9,cherechen,city,0.153326,0.057543,0.375303


In [754]:
alt_coeff_intermediate.to_csv('alt_candidates_coefficients--intermediate.csv', float_format='%.3f', index=False)

In [755]:
alt_coeff_final = alt_coeff_intermediate[alt_coeff_intermediate['area'] != 'total'][['candidate', 'area', 'coefficient', 'std', 'coeff_variation']].copy()
alt_coeff_final['source'] = ['area'] * len(alt_coeff_final) 

In [756]:
totals = alt_coeff_intermediate[alt_coeff_intermediate['area'] == 'total']

for c in set(alt_coeff_final['candidate']):
    total_row = totals[totals['candidate'] == c].iloc[0]
    
    mask = (alt_coeff_final['candidate'] == c) & (alt_coeff_final['coeff_variation'] > total_row['coeff_variation'])
    
    for col in ['coefficient', 'std', 'coeff_variation']:
        alt_coeff_final.loc[mask, col] = total_row[col]
        
    alt_coeff_final.loc[mask, 'source'] = 'total'

In [757]:
alt_coeff_final

Unnamed: 0,candidate,area,coefficient,std,coeff_variation,source
0,kanopatskaja,city,0.129105,0.068846,0.533258,total
1,kanopatskaja,minsk_suburb,0.128111,0.056227,0.438892,area
2,kanopatskaja,minsk_village,0.159577,0.079321,0.497072,area
3,kanopatskaja,capital,0.114922,0.042487,0.369701,area
4,kanopatskaja,village,0.15405,0.075769,0.491848,area
5,kanopatskaja,town_below100,0.129105,0.068846,0.533258,total
6,kanopatskaja,town_over100,0.114616,0.060025,0.52371,area
7,kanopatskaja,embassy,0.152444,0.071781,0.470867,area
9,cherechen,city,0.153326,0.057543,0.375303,area
10,cherechen,minsk_suburb,0.123641,0.050391,0.407561,area


In [758]:
for area in areas:
    mask = (alt_coeff_final['area'] == area) & (alt_coeff_final['candidate'] != 'corrupted/corrupted+against')
    k = 1 / alt_coeff_final[mask]['coefficient'].sum()
    print(k)
    alt_coeff_final.loc[mask, 'coefficient'] *= k
    alt_coeff_final.loc[mask, 'std'] *= k

0.9835299385787036
1.01272605905613
0.9729280587200787
1.0
0.9756616908321871
0.9890926955314852
0.9989951065512197
0.9771933993069353


In [759]:
pure_coeffs = []


corrupted = (
    np.array(alt_coeff_final[alt_coeff_final['candidate'] == 'against+corrupted']['coefficient']) * 
    np.array(alt_coeff_final[alt_coeff_final['candidate'] == 'corrupted/corrupted+against']['coefficient'])
)

against = np.array(alt_coeff_final[alt_coeff_final['candidate'] == 'against+corrupted']['coefficient']) - corrupted

(corrupted, against)    
    

(array([0.0769261 , 0.10155341, 0.09707345, 0.12302256, 0.08666081,
        0.09675078, 0.08812966, 0.26212283]),
 array([0.48059844, 0.47643811, 0.44814326, 0.52001318, 0.46640353,
        0.46796298, 0.47816148, 0.29180978]))

In [760]:
alt_coeff_extended = pd.concat([alt_coeff_final, pd.DataFrame([
    {
        'area': area,
        'coefficient': cor,
        'candidate': 'corrupted',
    }
    for area, cor in zip(areas, corrupted)
] + [
    {
        'area': area,
        'coefficient': ag,
        'candidate': 'against',
    }
    for area, ag in zip(areas, against)
])])

In [761]:
alt_coeff_extended.to_csv('alt_candidates_coefficients.csv', float_format='%.3f', index=False)

In [762]:
for area in areas:
    mask = (alt_coeff_extended['area'] == area) & (alt_coeff_extended['candidate'].isin(alt_candidates))
    print(sum(alt_coeff_extended[mask]['coefficient']))

1.0
1.0000000000000002
0.9999999999999999
1.0
1.0
0.9999999999999998
0.9999999999999998
1.0
