In [1100]:
import pandas as pd
import numpy as np
from decimal import Decimal

In [1101]:
df = pd.read_csv('trusted-for-alternative-fixed.csv', converters={'latitude': Decimal, 'longitude': Decimal})
#df = df[df['id'] != '01-056-0021']

In [1102]:
areas = ['city', 'minsk_suburb', 'capital', 'village', 'town_below100', 'town_over100', 'embassy']
regioned_areas = ['town_below100', 'town_over100', 'city', 'village']

candidates = ['against', 'cherechen', 'corrupted', 'dmitriyev', 'kanopatskaja', 'tihanovkaja']
alt_candidates = ['against', 'cherechen', 'corrupted', 'dmitriyev', 'kanopatskaja']
alt_candidates_no_corrupted = set(alt_candidates) - {'corrupted'}

In [1103]:
def corr(x, y):
    return np.corrcoef(x, y)[0][1]

In [1104]:
def official_candidates(data):
    return sum([data[f'{candidate}_officialVotes'] for candidate in candidates])

In [1105]:
{
    'reg--protest': corr(df['registered'], official_candidates(df)),
    'reg-not-for-luk--protest': corr(df['registered'] - df['lukashenko_registered'], official_candidates(df)),
    'reg-not-ignore--protest': corr(df['registered'] - df['ignore_registered'], official_candidates(df)),
    'reg-protest--protest': corr(df['registered']- df['lukashenko_registered'] - df['ignore_registered'], official_candidates(df)),
    'reg--voters': corr(df['registered'], df['officialVoters']),
    'reg--votes': corr(df['registered'], df['officialVotes']),
    'reg--tih': corr(df['registered'], df['tihanovkaja_officialVotes']),
    'tih-reg--tih': corr(df['tihanovkaja_registered'], df['tihanovkaja_officialVotes']),   
}

{'reg--protest': 0.8080479615816749,
 'reg-not-for-luk--protest': 0.8072498434292082,
 'reg-not-ignore--protest': 0.8077012776750071,
 'reg-protest--protest': 0.8068972477553655,
 'reg--voters': 0.7145438010898286,
 'reg--votes': 0.6238668817846248,
 'reg--tih': 0.8308920449201923,
 'tih-reg--tih': 0.8300777876489589}

In [1106]:
def get_coefficient(x, y):
    x = np.array(x)
    
    slope, _, _, _ = np.linalg.lstsq(x[:,np.newaxis], y, None)
    return slope[0]

In [1107]:
data = df[~df['area'].isin(['embassy', 'minsk_suburb'])]
get_coefficient(data['tihanovkaja_registered'], data['tihanovkaja_officialVotes'])

2.2556933948729525

In [1108]:
#data = df[~df['area'].isin(['embassy', 'capital']) & (df['region'] == 1)]

data = df[~df['area'].isin(['embassy', 'capital', 'minsk_suburb'])]
get_coefficient(data['tihanovkaja_registered'], data['tihanovkaja_officialVotes'])

2.7776120831355864

In [1109]:
corr(data['tihanovkaja_registered'], data['tihanovkaja_officialVotes'])

0.866903008847731

In [1110]:
def count_country_correlation(x_col, y_col):
    mask = ~df['region'].isin([7, 8]) & (df['area'] != 'minsk_suburb')
    mask = df['area'].isin(regioned_areas)
    x = x_col[mask]
    y = y_col[mask]
    
    return {
        'correlation': corr(x, y),
        'coefficient': get_coefficient(x, y), 
    }

In [1111]:
def build_area_df(x_col, y_col):

    coefficients = []

    for area in areas:
        mask = df['area'] == area
            
        x = x_col[mask]
        y = y_col[mask]

        coefficients.append({
            'area': area,
            'coefficient': get_coefficient(x, y), 
            'correlation': corr(x, y),
            'source': 'area',
        })


    return pd.DataFrame(coefficients)

In [1112]:
def build_region_df(x_col, y_col):

    coefficients = []

    for region in range(1, 7):
        mask = df['region'] == region
        
        if region == 5:
            mask &= df['area'] != 'minsk_suburb'
            
        x = x_col[mask]
        y = y_col[mask]

        coefficients.append({
            'region': region,
            'coefficient': get_coefficient(x, y), 
            'correlation': corr(x, y),
            'source': 'region',
        })


    return pd.DataFrame(coefficients)

In [1113]:
coef_df = build_area_df(df['tihanovkaja_registered'], df['tihanovkaja_officialVotes'])
tih_regions_df = build_region_df(df['tihanovkaja_registered'], df['tihanovkaja_officialVotes'])
tih_total = count_country_correlation(df['tihanovkaja_registered'], df['tihanovkaja_officialVotes'])

In [1114]:
coef_df

Unnamed: 0,area,coefficient,correlation,source
0,city,2.800546,0.913989,area
1,minsk_suburb,1.851287,0.897647,area
2,capital,1.705492,0.89709,area
3,village,2.633248,0.840685,area
4,town_below100,2.804901,0.792373,area
5,town_over100,2.744969,0.61973,area
6,embassy,1.561313,0.976445,area


In [1115]:
tih_regions_df

Unnamed: 0,region,coefficient,correlation,source
0,1,3.018802,0.944234,region
1,2,2.637138,0.678788,region
2,3,2.619546,0.903808,region
3,4,2.797579,0.896417,region
4,5,2.625042,0.854974,region
5,6,3.409268,0.425584,region


In [1116]:
tih_total

{'correlation': 0.866903008847731, 'coefficient': 2.7776120831355864}

In [1117]:
def build_area_region_df(x_col, y_col):
    region_coefficients = []

    for area in regioned_areas:
        for region in range(1, 7):
            area_mask = df['area'] == area            
            mask = area_mask & (df['region'] == region)
            
            if region == 5:
                mask &= df['area'] != 'minsk_suburb'

            x = np.array(x_col[mask])
            y = y_col[mask]

            slope, _, _, _ = np.linalg.lstsq(x[:,np.newaxis], y, None)
            slope = slope[0]
            correlation = corr(x, y)
            source = 'area-region'
            
            if len(x) < 5:
                slope = None
                correlation = None
            
#             if area in ['town_below100', 'town_over100', 'city']:
#                 if correlation < town_fix[region]['correlation'] or len(x) < 5:
#                     correlation = town_fix[region]['correlation']
#                     slope = town_fix[region]['coefficient']   
#                     source = 'town/city-region'

            region_coefficients.append({
                'area': area,
                'region': region,
                'coefficient': slope, 
                'correlation': correlation,
                'source': source,
            })

    return pd.DataFrame(region_coefficients)

In [1118]:
reg_coef_df = build_area_region_df(df['tihanovkaja_registered'], df['tihanovkaja_officialVotes'])

  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  avg = a.mean(axis)
  ret = um.true_divide(
  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


In [1119]:
reg_coef_df

Unnamed: 0,area,region,coefficient,correlation,source
0,town_below100,1,3.239515,0.837444,area-region
1,town_below100,2,2.927972,0.378134,area-region
2,town_below100,3,2.550359,0.875062,area-region
3,town_below100,4,3.272629,0.846605,area-region
4,town_below100,5,2.724995,0.817699,area-region
5,town_below100,6,3.486666,0.118699,area-region
6,town_over100,1,,,area-region
7,town_over100,2,1.952299,0.787372,area-region
8,town_over100,3,,,area-region
9,town_over100,4,2.632979,0.80629,area-region


In [1120]:
def fix_by_area_region_total(reg_df, area_df, total_region_df, total):
    for area in regioned_areas:
        reg_df.loc[
            (reg_df['area'] == area) & 
            (
                reg_df['correlation'].isna() |
                (area_df[coef_df['area'] == area]['correlation'].iloc[0] - reg_df['correlation'] > 0.00)
            ),
            ['coefficient', 'correlation', 'source']
        ] = area_df[area_df['area'] == area][['coefficient', 'correlation', 'source']].values
        
    for region in range(1, 7):
        reg_df.loc[
            (reg_df['region'] == region) & 
            (total_region_df[total_region_df['region'] == region]['correlation'].iloc[0] - reg_df['correlation'] > 0.00),
            ['coefficient', 'correlation', 'source']
        ] = total_region_df[total_region_df['region'] == region][['coefficient', 'correlation', 'source']].values
        
    reg_df.loc[
        (total['correlation'] - reg_df['correlation'] > 0.00), 
        ['coefficient', 'correlation', 'source']
    ] = [total['coefficient'], total['correlation'], 'total']
    

In [1121]:
fix_by_area_region_total(reg_coef_df, coef_df, tih_regions_df, tih_total)

In [1122]:
reg_coef_df

Unnamed: 0,area,region,coefficient,correlation,source
0,town_below100,1,3.018802,0.944234,region
1,town_below100,2,2.777612,0.866903,total
2,town_below100,3,2.619546,0.903808,region
3,town_below100,4,2.797579,0.896417,region
4,town_below100,5,2.777612,0.866903,total
5,town_below100,6,2.777612,0.866903,total
6,town_over100,1,3.018802,0.944234,region
7,town_over100,2,2.777612,0.866903,total
8,town_over100,3,2.619546,0.903808,region
9,town_over100,4,2.797579,0.896417,region


In [1123]:
def concat_df(region_df, area_df):
    non_regioned_df = area_df[~area_df['area'].isin(regioned_areas + ['town/city'])].copy()
    non_regioned_df['region'] = 0
    non_regioned_df.loc[non_regioned_df['area'] == 'minsk_suburb', 'region'] = 5
    non_regioned_df.loc[non_regioned_df['area'] == 'capital', 'region'] = 7
    non_regioned_df.loc[non_regioned_df['area'] == 'embassy', 'region'] = 8
    return pd.concat([region_df, non_regioned_df])

In [1124]:
tih_reg_coeff = concat_df(reg_coef_df, coef_df)

In [1125]:
tih_reg_coeff

Unnamed: 0,area,region,coefficient,correlation,source
0,town_below100,1,3.018802,0.944234,region
1,town_below100,2,2.777612,0.866903,total
2,town_below100,3,2.619546,0.903808,region
3,town_below100,4,2.797579,0.896417,region
4,town_below100,5,2.777612,0.866903,total
5,town_below100,6,2.777612,0.866903,total
6,town_over100,1,3.018802,0.944234,region
7,town_over100,2,2.777612,0.866903,total
8,town_over100,3,2.619546,0.903808,region
9,town_over100,4,2.797579,0.896417,region


In [1126]:
tih_reg_coeff.to_csv('tihanovkaja_registered_coefficients.csv', float_format='%.3f', index=False)

In [1127]:
official_candidates_column = official_candidates(df)

In [1128]:
protest_area_df = build_area_df(df['registered'], official_candidates_column)
protest_regions_df = build_region_df(df['registered'], official_candidates_column)
protest_total = count_country_correlation(df['registered'], official_candidates_column)

In [1129]:
protest_area_df

Unnamed: 0,area,coefficient,correlation,source
0,city,3.411446,0.901488,area
1,minsk_suburb,2.072287,0.87332,area
2,capital,2.102312,0.904506,area
3,village,3.186274,0.821414,area
4,town_below100,3.497068,0.801788,area
5,town_over100,3.53846,0.647634,area
6,embassy,1.601618,0.97085,area


In [1130]:
protest_regions_df

Unnamed: 0,region,coefficient,correlation,source
0,1,3.700908,0.9493,region
1,2,3.430759,0.700431,region
2,3,3.234582,0.920409,region
3,4,3.416343,0.906985,region
4,5,3.250965,0.86064,region
5,6,4.406954,0.397349,region


In [1131]:
protest_total

{'correlation': 0.8689811540998936, 'coefficient': 3.444957186962418}

In [1132]:
# town_fix = build_town_fix(df['registered'], official_candidates_column)

In [1133]:
protest_region_df = build_area_region_df(df['registered'], official_candidates_column)

  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  avg = a.mean(axis)
  ret = um.true_divide(
  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


In [1134]:
protest_region_df

Unnamed: 0,area,region,coefficient,correlation,source
0,town_below100,1,3.965086,0.899903,area-region
1,town_below100,2,3.814641,0.39628,area-region
2,town_below100,3,3.133767,0.890877,area-region
3,town_below100,4,4.156669,0.899291,area-region
4,town_below100,5,3.414655,0.845175,area-region
5,town_below100,6,4.359672,0.065237,area-region
6,town_over100,1,,,area-region
7,town_over100,2,2.658752,0.875497,area-region
8,town_over100,3,,,area-region
9,town_over100,4,3.291761,0.837441,area-region


In [1135]:
fix_by_area_region_total(protest_region_df, protest_area_df, protest_regions_df, protest_total)

In [1136]:
protest_region_df

Unnamed: 0,area,region,coefficient,correlation,source
0,town_below100,1,3.700908,0.9493,region
1,town_below100,2,3.444957,0.868981,total
2,town_below100,3,3.234582,0.920409,region
3,town_below100,4,3.416343,0.906985,region
4,town_below100,5,3.444957,0.868981,total
5,town_below100,6,3.444957,0.868981,total
6,town_over100,1,3.700908,0.9493,region
7,town_over100,2,2.658752,0.875497,area-region
8,town_over100,3,3.234582,0.920409,region
9,town_over100,4,3.416343,0.906985,region


In [1137]:
# fix_town_by_area(protest_region_df, protest_area_df)

In [1138]:
protest_reg_coeff = concat_df(protest_region_df, protest_area_df)

In [1139]:
protest_reg_coeff.to_csv('protest_registered_coefficients.csv', float_format='%.3f', index=False)