In [790]:
import pandas as pd
import numpy as np
from decimal import Decimal

In [791]:
# Load the input table; the latitude/longitude columns are parsed with Decimal instead of float.
df = pd.read_csv(
    'trusted-for-alternative-fixed.csv',
    converters=dict(latitude=Decimal, longitude=Decimal),
)

In [792]:
# Values of the `area` column that the per-area fits iterate over.
areas = [
    'city', 'minsk_suburb', 'capital', 'village',
    'town_below100', 'town_over100', 'embassy',
]
# Areas that are additionally fitted per region; the list order is the loop order used later.
regioned_areas = ['town_below100', 'town_over100', 'city', 'village']

# Candidate prefixes used to build column names such as '<candidate>_officialVotes'.
candidates = [
    'against', 'cherechen', 'corrupted',
    'dmitriyev', 'kanopatskaja', 'tihanovkaja',
]
# Every candidate prefix except 'tihanovkaja' (order preserved).
alt_candidates = [name for name in candidates if name != 'tihanovkaja']
alt_candidates_no_corrupted = {name for name in alt_candidates if name != 'corrupted'}

In [793]:
def corr(x, y):
    """Return the correlation coefficient between ``x`` and ``y``.

    The value is the off-diagonal entry (row 0, column 1) of the matrix
    produced by ``np.corrcoef(x, y)``.
    """
    matrix = np.corrcoef(x, y)
    return matrix[0, 1]

In [794]:
# Sum of the '<candidate>_officialVotes' columns over every entry of `candidates`.
# Computed inline (and only once) so this cell runs on a fresh kernel:
# `official_candidates` is only defined in a later cell.
protest_votes = sum(df[f'{name}_officialVotes'] for name in candidates)

# Correlations between several variants of the `registered` column and the official counts.
{
    'reg--protest': corr(df['registered'], protest_votes),
    'reg-not-for-luk--protest': corr(df['registered'] - df['lukashenko_registered'], protest_votes),
    'reg-not-ignore--protest': corr(df['registered'] - df['ignore_registered'], protest_votes),
    'reg-protest--protest': corr(df['registered'] - df['lukashenko_registered'] - df['ignore_registered'], protest_votes),
    'reg--voters': corr(df['registered'], df['officialVoters']),
    'reg--votes': corr(df['registered'], df['officialVotes']),
    'reg--tih': corr(df['registered'], df['tihanovkaja_officialVotes']),
    'tih-reg--tih': corr(df['tihanovkaja_registered'], df['tihanovkaja_officialVotes']),
}

{'reg--protest': 0.7981356506736353,
 'reg-not-for-luk--protest': 0.7973061777194324,
 'reg-not-ignore--protest': 0.797906702143812,
 'reg-protest--protest': 0.7970712065739163,
 'reg--voters': 0.7177819343432367,
 'reg--votes': 0.6367563937929476,
 'reg--tih': 0.8216820043260249,
 'tih-reg--tih': 0.8210788626546867}

In [795]:
def get_coefficient(x, y):
    """Return the least-squares slope of ``y`` against ``x`` with no intercept.

    ``x`` is converted to an array and used as a single-column design
    matrix, so the fitted model is ``y ~ slope * x``.
    """
    design = np.array(x)[:, np.newaxis]
    solution = np.linalg.lstsq(design, y, rcond=None)[0]
    return solution[0]


In [796]:
# Overall slope of tihanovkaja_officialVotes on tihanovkaja_registered, with embassy rows left out.
data = df.loc[df['area'] != 'embassy']
get_coefficient(
    x=data['tihanovkaja_registered'],
    y=data['tihanovkaja_officialVotes'],
)

2.1106965396680333

In [797]:
def build_area_df(x_col, y_col):
    """Fit one slope/correlation pair per area.

    Rows are selected with boolean masks built from the global ``df``, so
    ``x_col`` and ``y_col`` are indexed with masks derived from that frame.
    Besides every entry of ``areas``, a combined ``'town/city'`` group
    (town_below100 + town_over100 + city) is fitted as the last row.

    Returns a DataFrame with the columns ``area``, ``coefficient``,
    ``correlation`` and ``source`` (always ``'area'``).
    """
    town_city_members = ['town_below100', 'town_over100', 'city']

    def summarize(area):
        # The synthetic 'town/city' label pools the three town-type areas.
        if area == 'town/city':
            selected = df['area'].isin(town_city_members)
        else:
            selected = df['area'] == area

        xs, ys = x_col[selected], y_col[selected]
        return {
            'area': area,
            'coefficient': get_coefficient(xs, ys),
            'correlation': corr(xs, ys),
            'source': 'area',
        }

    return pd.DataFrame([summarize(area) for area in areas + ['town/city']])

In [798]:
# Per-area fit of tihanovkaja_officialVotes against tihanovkaja_registered.
coef_df = build_area_df(
    x_col=df['tihanovkaja_registered'],
    y_col=df['tihanovkaja_officialVotes'],
)

In [799]:
# Display the per-area coefficients and correlations returned by build_area_df.
coef_df

Unnamed: 0,area,coefficient,correlation,source
0,city,2.418799,0.768025,area
1,minsk_suburb,1.851287,0.897647,area
2,capital,1.687831,0.898053,area
3,village,2.676662,0.829257,area
4,town_below100,2.604982,0.71745,area
5,town_over100,2.772336,0.609409,area
6,embassy,1.561313,0.976445,area
7,town/city,2.544671,0.733781,area


In [800]:
def build_region_df(x_col, y_col, town_fix):
    """Fit one slope/correlation pair per (area, region) combination.

    For every area in ``regioned_areas`` and every region 1..6, the rows of
    the global ``df`` matching both are selected and fitted with
    ``get_coefficient`` (slope) and ``corr`` (correlation).

    For the three town-type areas the per-area fit is replaced by the
    pooled fit in ``town_fix[region]`` when its correlation is lower than
    the pooled one or when fewer than five rows matched.

    Parameters:
        x_col: column indexed with masks built from ``df``; the regressor.
        y_col: column indexed with the same masks; the response.
        town_fix: mapping ``region -> {'coefficient', 'correlation'}``, as
            returned by ``build_town_fix``.

    Returns:
        DataFrame with the columns ``area``, ``region``, ``coefficient``,
        ``correlation`` and ``source`` (``'area-region'`` or
        ``'town/city-region'``).
    """
    town_like_areas = ('town_below100', 'town_over100', 'city')
    # Below this many rows a town-type fit is replaced by the pooled one.
    min_points = 5

    region_coefficients = []

    for area in regioned_areas:
        # The area mask does not depend on the region, so build it once per area.
        area_mask = df['area'] == area

        for region in range(1, 7):
            mask = area_mask & (df['region'] == region)

            x = np.array(x_col[mask])
            y = y_col[mask]

            # Same through-origin least-squares fit the rest of the notebook uses.
            slope = get_coefficient(x, y)
            correlation = corr(x, y)
            source = 'area-region'

            if area in town_like_areas:
                pooled = town_fix[region]
                if correlation < pooled['correlation'] or len(x) < min_points:
                    correlation = pooled['correlation']
                    slope = pooled['coefficient']
                    source = 'town/city-region'

            region_coefficients.append({
                'area': area,
                'region': region,
                'coefficient': slope,
                'correlation': correlation,
                'source': source,
            })

    return pd.DataFrame(region_coefficients)

In [801]:
def build_town_fix(x_col, y_col):
    """Fit the pooled town/city slope and correlation for each region.

    For every region 1..6, the rows of the global ``df`` whose ``area`` is
    one of town_below100 / town_over100 / city are selected and fitted with
    ``get_coefficient`` (slope) and ``corr`` (correlation).

    Returns:
        dict mapping each region number to
        ``{'coefficient': slope, 'correlation': correlation}``.
    """
    # The town-type membership mask is the same for every region; build it once.
    area_mask = df['area'].isin(['town_below100', 'town_over100', 'city'])

    coeffs = {}
    for region in range(1, 7):
        mask = area_mask & (df['region'] == region)

        x = np.array(x_col[mask])
        y = y_col[mask]

        coeffs[region] = {
            'coefficient': get_coefficient(x, y),
            'correlation': corr(x, y),
        }

    return coeffs

In [802]:
# Pooled town/city fit per region for the tihanovkaja columns.
town_fix = build_town_fix(
    x_col=df['tihanovkaja_registered'],
    y_col=df['tihanovkaja_officialVotes'],
)

In [803]:
# Per-area, per-region fit for the tihanovkaja columns, using `town_fix` as the fallback.
reg_coef_df = build_region_df(
    x_col=df['tihanovkaja_registered'],
    y_col=df['tihanovkaja_officialVotes'],
    town_fix=town_fix,
)

  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  avg = a.mean(axis)
  ret = um.true_divide(
  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


In [804]:
# Display the per-area, per-region fits as returned by build_region_df.
reg_coef_df

Unnamed: 0,area,region,coefficient,correlation,source
0,town_below100,1,3.239515,0.837444,area-region
1,town_below100,2,2.574984,0.443046,town/city-region
2,town_below100,3,2.550359,0.875062,area-region
3,town_below100,4,3.256651,0.845774,area-region
4,town_below100,5,2.280308,0.628524,town/city-region
5,town_below100,6,3.281245,0.276995,town/city-region
6,town_over100,1,2.695272,0.739308,town/city-region
7,town_over100,2,1.952299,0.787372,area-region
8,town_over100,3,2.606714,0.864458,town/city-region
9,town_over100,4,2.632979,0.80629,area-region


In [805]:
def fix_region_with_area(reg_df, area_df):
    """Replace weak per-region fits with the area-wide fit, in place.

    For each area in ``regioned_areas``, every row of ``reg_df`` belonging
    to that area whose correlation is lower than the area's correlation in
    ``area_df``, or is NaN, gets its ``coefficient``, ``correlation`` and
    ``source`` overwritten with the values from the area's row in
    ``area_df``.

    Parameters:
        reg_df: per-region frame with the columns ``area``, ``coefficient``,
            ``correlation`` and ``source``; modified in place.
        area_df: per-area frame with the same columns; must contain a row
            for every entry of ``regioned_areas``.

    Raises:
        IndexError: if ``area_df`` has no row for one of the areas.
    """
    columns = ['coefficient', 'correlation', 'source']

    for area in regioned_areas:
        # Look the area up in the frame that was passed in, not in a notebook global.
        area_row = area_df.loc[area_df['area'] == area, columns].iloc[0]

        # `&` binds tighter than `|`, so the NaN check is grouped with the
        # threshold check; otherwise NaN rows of *other* areas would match too.
        weak = (reg_df['correlation'] < area_row['correlation']) | reg_df['correlation'].isna()
        mask = (reg_df['area'] == area) & weak

        # Scalar assignment per column keeps each column's dtype intact and
        # is a no-op when no row matches.
        for column in columns:
            reg_df.loc[mask, column] = area_row[column]

In [806]:
# Overwrite weak per-region rows of reg_coef_df with the per-area fit (modifies reg_coef_df in place).
fix_region_with_area(reg_df=reg_coef_df, area_df=coef_df)

In [807]:
# Display the same frame after fix_region_with_area has modified it in place.
reg_coef_df

Unnamed: 0,area,region,coefficient,correlation,source
0,town_below100,1,3.239515,0.837444,area-region
1,town_below100,2,2.604982,0.71745,area
2,town_below100,3,2.550359,0.875062,area-region
3,town_below100,4,3.256651,0.845774,area-region
4,town_below100,5,2.604982,0.71745,area
5,town_below100,6,2.604982,0.71745,area
6,town_over100,1,2.695272,0.739308,town/city-region
7,town_over100,2,1.952299,0.787372,area-region
8,town_over100,3,2.606714,0.864458,town/city-region
9,town_over100,4,2.632979,0.80629,area-region


In [808]:
def fix_town_by_area(reg_df, area_df):
    """Replace weak town-type rows with the pooled 'town/city' fit, in place.

    Rows of ``reg_df`` whose ``area`` is town_below100, town_over100 or
    city and whose correlation is lower than the correlation of the
    ``'town/city'`` row of ``area_df`` get that row's coefficient and
    correlation, and their ``source`` is set to ``'town/city'``.
    """
    pooled = area_df[area_df['area'] == 'town/city']
    replacement = list(pooled[['coefficient', 'correlation']].values[0]) + ['town/city']
    threshold = pooled['correlation'].iloc[0]

    in_town_group = reg_df['area'].isin(['town_below100', 'town_over100', 'city'])
    below_pooled = reg_df['correlation'] < threshold
    target_columns = ['coefficient', 'correlation', 'source']

    reg_df.loc[in_town_group & below_pooled, target_columns] = replacement

In [809]:
# Overwrite weak town-type rows of reg_coef_df with the pooled 'town/city' fit (modifies reg_coef_df in place).
fix_town_by_area(reg_df=reg_coef_df, area_df=coef_df)

In [810]:
# Display the same frame after fix_town_by_area has modified it in place.
reg_coef_df

Unnamed: 0,area,region,coefficient,correlation,source
0,town_below100,1,3.239515,0.837444,area-region
1,town_below100,2,2.544671,0.733781,town/city
2,town_below100,3,2.550359,0.875062,area-region
3,town_below100,4,3.256651,0.845774,area-region
4,town_below100,5,2.544671,0.733781,town/city
5,town_below100,6,2.544671,0.733781,town/city
6,town_over100,1,2.695272,0.739308,town/city-region
7,town_over100,2,1.952299,0.787372,area-region
8,town_over100,3,2.606714,0.864458,town/city-region
9,town_over100,4,2.632979,0.80629,area-region


In [811]:
def concat_df(region_df, area_df):
    """Append the area-level rows that have no regional split to ``region_df``.

    Rows of ``area_df`` whose area is in ``regioned_areas`` or is
    ``'town/city'`` are dropped. The remaining rows get a ``region`` column:
    5 for minsk_suburb, 7 for capital, 8 for embassy and 0 for anything
    else. The result is ``region_df`` followed by those rows.
    """
    region_codes = {'minsk_suburb': 5, 'capital': 7, 'embassy': 8}
    already_covered = regioned_areas + ['town/city']

    keep = ~area_df['area'].isin(already_covered)
    non_regioned_df = area_df[keep].copy()

    # Default code first, then the explicit per-area overrides.
    non_regioned_df['region'] = 0
    for area_name, code in region_codes.items():
        non_regioned_df.loc[non_regioned_df['area'] == area_name, 'region'] = code

    return pd.concat([region_df, non_regioned_df])

In [812]:
# Combine the per-region fits with the area-level rows that have no regional split.
tih_reg_coeff = concat_df(region_df=reg_coef_df, area_df=coef_df)

In [813]:
# Display the combined frame returned by concat_df.
tih_reg_coeff

Unnamed: 0,area,region,coefficient,correlation,source
0,town_below100,1,3.239515,0.837444,area-region
1,town_below100,2,2.544671,0.733781,town/city
2,town_below100,3,2.550359,0.875062,area-region
3,town_below100,4,3.256651,0.845774,area-region
4,town_below100,5,2.544671,0.733781,town/city
5,town_below100,6,2.544671,0.733781,town/city
6,town_over100,1,2.695272,0.739308,town/city-region
7,town_over100,2,1.952299,0.787372,area-region
8,town_over100,3,2.606714,0.864458,town/city-region
9,town_over100,4,2.632979,0.80629,area-region


In [814]:
# Write the combined coefficients to CSV with three decimals and without the index column.
tih_reg_coeff.to_csv(
    'tihanovkaja_registered_coefficients.csv',
    index=False,
    float_format='%.3f',
)

In [815]:
def _generic_candidates_sum(data, source, candidates_to_sum):
    """Add up ``data['<candidate>_<source>']`` over the listed candidates.

    The running total starts at ``0``, so an empty candidate list yields ``0``.
    """
    total = 0
    for candidate in candidates_to_sum:
        total = total + data[f'{candidate}_{source}']
    return total


def official_candidates(data):
    """Sum the ``<candidate>_officialVotes`` columns of ``data`` over ``candidates``."""
    return _generic_candidates_sum(
        data,
        source='officialVotes',
        candidates_to_sum=candidates,
    )

In [816]:
# Row-wise sum of the '<candidate>_officialVotes' columns of df.
official_candidates_column = official_candidates(data=df)

In [817]:
# Per-area fit of the summed official candidate votes against the `registered` column.
protest_area_df = build_area_df(
    x_col=df['registered'],
    y_col=official_candidates_column,
)

In [818]:
# Display the per-area fits for the summed candidate votes.
protest_area_df

Unnamed: 0,area,coefficient,correlation,source
0,city,2.898205,0.707572,area
1,minsk_suburb,2.072287,0.87332,area
2,capital,2.073937,0.911744,area
3,village,3.243032,0.808817,area
4,town_below100,3.257043,0.729172,area
5,town_over100,3.521583,0.64313,area
6,embassy,1.601618,0.97085,area
7,town/city,3.134743,0.713648,area


In [819]:
# Pooled town/city fit per region for the summed candidate votes.
# NOTE: this rebinds `town_fix`, replacing the value computed earlier for the tihanovkaja columns.
town_fix = build_town_fix(
    x_col=df['registered'],
    y_col=official_candidates_column,
)

In [820]:
# Per-area, per-region fit for the summed candidate votes, using `town_fix` as the fallback.
protest_region_df = build_region_df(
    x_col=df['registered'],
    y_col=official_candidates_column,
    town_fix=town_fix,
)

  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  avg = a.mean(axis)
  ret = um.true_divide(
  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


In [821]:
# Overwrite weak per-region rows with the per-area fit (modifies protest_region_df in place).
fix_region_with_area(reg_df=protest_region_df, area_df=protest_area_df)

In [822]:
# Overwrite weak town-type rows with the pooled 'town/city' fit (modifies protest_region_df in place).
fix_town_by_area(reg_df=protest_region_df, area_df=protest_area_df)

In [823]:
# Display the per-region fits after fix_region_with_area and fix_town_by_area were applied.
protest_region_df

Unnamed: 0,area,region,coefficient,correlation,source
0,town_below100,1,3.965086,0.899903,area-region
1,town_below100,2,3.257043,0.729172,area
2,town_below100,3,3.133767,0.890877,area-region
3,town_below100,4,4.079292,0.934506,area-region
4,town_below100,5,3.257043,0.729172,area
5,town_below100,6,3.257043,0.729172,area
6,town_over100,1,3.303203,0.737024,town/city-region
7,town_over100,2,2.658752,0.875497,area-region
8,town_over100,3,3.217209,0.881418,town/city-region
9,town_over100,4,3.291761,0.837441,area-region


In [824]:
# Combine the per-region fits with the area-level rows that have no regional split.
protest_reg_coeff = concat_df(region_df=protest_region_df, area_df=protest_area_df)

In [825]:
# Write the combined coefficients to CSV with three decimals and without the index column.
protest_reg_coeff.to_csv(
    'protest_registered_coefficients.csv',
    index=False,
    float_format='%.3f',
)