In [1971]:
import pandas as pd
import numpy as np
from decimal import Decimal

In [1972]:
# Load the source table; latitude/longitude are parsed as Decimal instead of float.
df = pd.read_csv(
    'trusted-for-alternative-fixed.csv',
    converters=dict.fromkeys(('latitude', 'longitude'), Decimal),
)
# Optional exclusion of two ids, left disabled:
# df = df[~df['id'].isin(['01-054-0009', '01-055-0004'])]

In [1973]:
# Area categories iterated by the per-area fits.
areas = [
    'city',
    'minsk_suburb',
    'capital',
    'village',
    'town_below100',
    'town_over100',
    'embassy',
]
# Areas that are additionally broken down by region.
regioned_areas = [
    'town_below100',
    'town_over100',
    'city',
    'village',
]

# Column prefixes whose ``<prefix>_officialVotes`` columns are summed by official_candidates().
candidates = [
    'against',
    'cherechen',
    'corrupted',
    'dmitriyev',
    'kanopatskaja',
    'tihanovkaja',
]
alt_candidates = [
    'against',
    'cherechen',
    'corrupted',
    'dmitriyev',
    'kanopatskaja',
]
alt_candidates_no_corrupted = {name for name in alt_candidates if name != 'corrupted'}

In [1974]:
def corr(x, y):
    """Return the Pearson correlation coefficient between ``x`` and ``y``.

    The value is the off-diagonal entry of the 2x2 matrix produced by
    ``np.corrcoef``.
    """
    correlation_matrix = np.corrcoef(x, y)
    return correlation_matrix[0, 1]

In [1975]:
def official_candidates(data):
    """Add up the ``<candidate>_officialVotes`` columns of ``data``.

    One column is taken for every name in the module-level ``candidates``
    list; the columns are summed element-wise.
    """
    vote_columns = (data[f'{name}_officialVotes'] for name in candidates)
    return sum(vote_columns)

In [1976]:
# Pearson correlations between registration-based predictors and official counts.
# Each entry is (label, predictor column, outcome column).
{
    label: corr(predictor, outcome)
    for label, predictor, outcome in (
        ('reg--protest',
         df['registered'],
         official_candidates(df)),
        ('reg-not-for-luk--protest',
         df['registered'] - df['lukashenko_registered'],
         official_candidates(df)),
        ('reg-not-ignore--protest',
         df['registered'] - df['ignore_registered'],
         official_candidates(df)),
        ('reg-protest--protest',
         df['registered'] - df['lukashenko_registered'] - df['ignore_registered'],
         official_candidates(df)),
        ('reg--voters',
         df['registered'],
         df['officialVoters']),
        ('reg--votes',
         df['registered'],
         df['officialVotes']),
        ('reg--tih',
         df['registered'],
         df['tihanovkaja_officialVotes']),
        ('tih-reg--tih',
         df['tihanovkaja_registered'],
         df['tihanovkaja_officialVotes']),
    )
}

{'reg--protest': 0.8172593412676167,
 'reg-not-for-luk--protest': 0.8163473042997992,
 'reg-not-ignore--protest': 0.8168671967946259,
 'reg-protest--protest': 0.8159466756261811,
 'reg--voters': 0.7324301375822109,
 'reg--votes': 0.639306278635681,
 'reg--tih': 0.8397913497556483,
 'tih-reg--tih': 0.8385579972980546}

In [1977]:
def get_coefficient(x, y):
    """Return the least-squares slope of ``y`` on ``x`` for a line through the origin.

    ``x`` becomes a single-column design matrix with no intercept column,
    so the fitted model is ``y ~ slope * x``.
    """
    design = np.array(x)[:, None]
    solution = np.linalg.lstsq(design, y, rcond=None)
    # lstsq returns (coefficients, residuals, rank, singular values).
    return solution[0][0]

In [1978]:
# Through-origin slope with the 'embassy' and 'minsk_suburb' rows left out.
data = df.loc[~df['area'].isin(['embassy', 'minsk_suburb'])]
get_coefficient(
    data['tihanovkaja_registered'],
    data['tihanovkaja_officialVotes'],
)

2.2885129434352134

In [1979]:
# Alternative filter kept for reference (region 1 only), left disabled:
# data = df[~df['area'].isin(['embassy', 'capital']) & (df['region'] == 1)]

# Same fit as before, this time with the 'capital' rows dropped as well.
data = df.loc[~df['area'].isin(['embassy', 'capital', 'minsk_suburb'])]
get_coefficient(
    data['tihanovkaja_registered'],
    data['tihanovkaja_officialVotes'],
)

2.842522459782109

In [1980]:
# Correlation for the rows currently held in ``data``.
corr(
    data['tihanovkaja_registered'],
    data['tihanovkaja_officialVotes'],
)

0.894568909607306

In [1981]:
def count_country_correlation(x_col, y_col):
    """Compute the overall correlation and through-origin slope for the regioned areas.

    Only rows of the module-level ``df`` whose ``area`` is listed in
    ``regioned_areas`` take part in the calculation.

    Parameters
    ----------
    x_col, y_col : pandas.Series
        Predictor and outcome columns; they are indexed with a boolean mask
        built from ``df``, so they are expected to share its index.

    Returns
    -------
    dict
        ``{'correlation': <corr(x, y)>, 'coefficient': <get_coefficient(x, y)>}``.
    """
    # An earlier region/minsk_suburb mask used to be assigned here and was
    # overwritten on the very next line; that dead assignment has been removed.
    mask = df['area'].isin(regioned_areas)
    x = x_col[mask]
    y = y_col[mask]

    return {
        'correlation': corr(x, y),
        'coefficient': get_coefficient(x, y),
    }

In [1982]:
def build_area_df(x_col, y_col):
    """Build a table with one through-origin slope and one correlation per area.

    Rows of the module-level ``df`` are grouped by each name in ``areas``;
    ``x_col`` and ``y_col`` are filtered with the same boolean mask.

    Returns a DataFrame with the columns ``area``, ``coefficient``,
    ``correlation`` and ``source`` (always ``'area'``).
    """
    def fit_area(area):
        rows = df['area'] == area
        xs, ys = x_col[rows], y_col[rows]
        return {
            'area': area,
            'coefficient': get_coefficient(xs, ys),
            'correlation': corr(xs, ys),
            'source': 'area',
        }

    return pd.DataFrame([fit_area(area) for area in areas])

In [1983]:
def build_region_df(x_col, y_col):
    """Build a table with one through-origin slope and one correlation per region.

    Regions 1 through 6 of the module-level ``df`` are fitted separately;
    for region 5 the ``'minsk_suburb'`` rows are left out.

    Returns a DataFrame with the columns ``region``, ``coefficient``,
    ``correlation`` and ``source`` (always ``'region'``).
    """
    def rows_for(region):
        selected = df['region'] == region
        if region == 5:
            selected = selected & (df['area'] != 'minsk_suburb')
        return selected

    def fit_region(region):
        rows = rows_for(region)
        xs, ys = x_col[rows], y_col[rows]
        return {
            'region': region,
            'coefficient': get_coefficient(xs, ys),
            'correlation': corr(xs, ys),
            'source': 'region',
        }

    return pd.DataFrame([fit_region(region) for region in range(1, 7)])

In [1984]:
# Per-area, per-region and country-wide fits of
# tihanovkaja_officialVotes against tihanovkaja_registered.
coef_df, tih_regions_df, tih_total = [
    build(df['tihanovkaja_registered'], df['tihanovkaja_officialVotes'])
    for build in (build_area_df, build_region_df, count_country_correlation)
]

In [1985]:
# Show the per-area table returned by build_area_df for the tihanovkaja columns.
coef_df

Unnamed: 0,area,coefficient,correlation,source
0,city,2.847489,0.946163,area
1,minsk_suburb,1.851287,0.897647,area
2,capital,1.716984,0.894505,area
3,village,2.724964,0.834028,area
4,town_below100,2.908215,0.83668,area
5,town_over100,2.712351,0.692536,area
6,embassy,1.561313,0.976445,area


In [1986]:
# Show the per-region table returned by build_region_df for the tihanovkaja columns.
tih_regions_df

Unnamed: 0,region,coefficient,correlation,source
0,1,3.084419,0.96403,region
1,2,2.889551,0.763027,region
2,3,2.648511,0.91681,region
3,4,2.907544,0.924457,region
4,5,2.623912,0.861878,region
5,6,3.492626,0.690941,region


In [1987]:
# Show the dict returned by count_country_correlation for the tihanovkaja columns.
tih_total

{'correlation': 0.894568909607306, 'coefficient': 2.842522459782109}

In [1988]:
# Stack the area, region and country-wide fits into one table and write it out.
total_row = {
    **{key: [value] for key, value in tih_total.items()},
    'source': 'total',
}
tih_areas_df = pd.concat([
    coef_df,
    tih_regions_df,
    pd.DataFrame(total_row),
])
tih_areas_df.to_csv(
    'tihanovkaja_registered_areas_coefficients.csv',
    index=False,
    float_format='%.3f',
)

In [1989]:
def build_area_region_df(x_col, y_col, min_samples=5):
    """Fit a through-origin slope and a correlation for every (area, region) pair.

    Every area in ``regioned_areas`` is combined with regions 1 through 6 of
    the module-level ``df``.

    Parameters
    ----------
    x_col, y_col : pandas.Series
        Predictor and outcome columns; they are indexed with a boolean mask
        built from ``df``, so they are expected to share its index.
    min_samples : int, default 5
        Pairs with fewer rows than this get ``None`` for both ``coefficient``
        and ``correlation``.

    Returns
    -------
    pandas.DataFrame
        Columns ``area``, ``region``, ``coefficient``, ``correlation`` and
        ``source`` (always ``'area-region'``).
    """
    region_coefficients = []

    for area in regioned_areas:
        # Does not depend on the region, so build it once per area.
        area_mask = df['area'] == area

        for region in range(1, 7):
            mask = area_mask & (df['region'] == region)

            if region == 5:
                mask = mask & (df['area'] != 'minsk_suburb')

            x = np.array(x_col[mask])
            y = y_col[mask]

            if len(x) < min_samples:
                # Check the size first: fitting empty or tiny groups only to
                # discard the result made numpy emit RuntimeWarnings.
                slope = None
                correlation = None
            else:
                slope = get_coefficient(x, y)
                correlation = corr(x, y)

            region_coefficients.append({
                'area': area,
                'region': region,
                'coefficient': slope,
                'correlation': correlation,
                'source': 'area-region',
            })

    return pd.DataFrame(region_coefficients)

In [1990]:
# Raw area-by-region fits for the tihanovkaja columns, saved before any fallback is applied.
reg_coef_df = build_area_region_df(
    df['tihanovkaja_registered'],
    df['tihanovkaja_officialVotes'],
)
reg_coef_df.to_csv(
    'tihanovkaja_registered_original_coefficients.csv',
    index=False,
    float_format='%.3f',
)

  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  avg = a.mean(axis)
  ret = um.true_divide(
  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


In [1991]:
# Show the area-by-region table as returned by build_area_region_df.
reg_coef_df

Unnamed: 0,area,region,coefficient,correlation,source
0,town_below100,1,3.399283,0.955019,area-region
1,town_below100,2,3.945794,0.983057,area-region
2,town_below100,3,2.583554,0.884703,area-region
3,town_below100,4,3.275061,0.93464,area-region
4,town_below100,5,2.724995,0.817699,area-region
5,town_below100,6,,,area-region
6,town_over100,1,,,area-region
7,town_over100,2,1.952299,0.787372,area-region
8,town_over100,3,3.153837,0.698162,area-region
9,town_over100,4,2.632979,0.80629,area-region


In [1992]:
def fix_by_area_region_total(reg_df, area_df, total_region_df, total):
    """Replace weak or missing area-region fits with broader-scope fits, in place.

    Three passes run in order, each overwriting the ``coefficient``,
    ``correlation`` and ``source`` columns of the matching rows of ``reg_df``:

    1. per area: rows whose correlation is missing or lower than that area's
       correlation in ``area_df`` take the area-level values;
    2. per region (1 through 6): rows whose correlation is lower than that
       region's correlation in ``total_region_df`` take the region-level values;
    3. rows whose correlation is still lower than ``total['correlation']``
       take the country-wide values with source ``'total'``.

    Parameters
    ----------
    reg_df : pandas.DataFrame
        Area-by-region table with ``area``, ``region``, ``coefficient``,
        ``correlation`` and ``source`` columns; modified in place.
    area_df : pandas.DataFrame
        Per-area table with ``area``, ``coefficient``, ``correlation``, ``source``.
    total_region_df : pandas.DataFrame
        Per-region table with ``region``, ``coefficient``, ``correlation``, ``source``.
    total : dict
        Mapping with ``'coefficient'`` and ``'correlation'`` keys.

    Returns
    -------
    None
    """
    value_cols = ['coefficient', 'correlation', 'source']

    for area in regioned_areas:
        # Look the area up in the ``area_df`` argument. The previous code built
        # this mask from the global ``coef_df``, which only worked when both
        # frames happened to share the same index and row order.
        area_row = area_df[area_df['area'] == area]
        area_correlation = area_row['correlation'].iloc[0]

        use_area = (reg_df['area'] == area) & (
            reg_df['correlation'].isna() |
            (area_correlation - reg_df['correlation'] > 0.00)
        )
        reg_df.loc[use_area, value_cols] = area_row[value_cols].values

    for region in range(1, 7):
        region_row = total_region_df[total_region_df['region'] == region]
        region_correlation = region_row['correlation'].iloc[0]

        use_region = (reg_df['region'] == region) & (
            region_correlation - reg_df['correlation'] > 0.00
        )
        reg_df.loc[use_region, value_cols] = region_row[value_cols].values

    use_total = total['correlation'] - reg_df['correlation'] > 0.00
    reg_df.loc[use_total, value_cols] = [total['coefficient'], total['correlation'], 'total']
    

In [1993]:
# Apply the area / region / total fallbacks to the tihanovkaja table in place.
fix_by_area_region_total(
    reg_df=reg_coef_df,
    area_df=coef_df,
    total_region_df=tih_regions_df,
    total=tih_total,
)

In [1994]:
# Show the table again after fix_by_area_region_total modified it in place.
reg_coef_df

Unnamed: 0,area,region,coefficient,correlation,source
0,town_below100,1,3.084419,0.96403,region
1,town_below100,2,3.945794,0.983057,area-region
2,town_below100,3,2.648511,0.91681,region
3,town_below100,4,3.275061,0.93464,area-region
4,town_below100,5,2.842522,0.894569,total
5,town_below100,6,2.842522,0.894569,total
6,town_over100,1,3.084419,0.96403,region
7,town_over100,2,2.842522,0.894569,total
8,town_over100,3,2.648511,0.91681,region
9,town_over100,4,2.907544,0.924457,region


In [1995]:
def concat_df(region_df, area_df):
    """Append the area-level rows that have no regional breakdown to ``region_df``.

    Rows of ``area_df`` whose area is in ``regioned_areas`` (or equals
    ``'town/city'``) are skipped. The remaining rows get a ``region`` code:
    5 for ``'minsk_suburb'``, 7 for ``'capital'``, 8 for ``'embassy'`` and
    0 for anything else. ``area_df`` itself is not modified.
    """
    special_regions = {'minsk_suburb': 5, 'capital': 7, 'embassy': 8}

    skipped_areas = regioned_areas + ['town/city']
    keep = ~area_df['area'].isin(skipped_areas)
    non_regioned_df = area_df.loc[keep].copy()

    non_regioned_df['region'] = 0
    for area_name, region_code in special_regions.items():
        non_regioned_df.loc[non_regioned_df['area'] == area_name, 'region'] = region_code

    return pd.concat([region_df, non_regioned_df])

In [1996]:
# Combine the corrected area-by-region rows with the non-regioned area rows.
tih_reg_coeff = concat_df(
    region_df=reg_coef_df,
    area_df=coef_df,
)

In [1997]:
# Show the combined table returned by concat_df.
tih_reg_coeff

Unnamed: 0,area,region,coefficient,correlation,source
0,town_below100,1,3.084419,0.96403,region
1,town_below100,2,3.945794,0.983057,area-region
2,town_below100,3,2.648511,0.91681,region
3,town_below100,4,3.275061,0.93464,area-region
4,town_below100,5,2.842522,0.894569,total
5,town_below100,6,2.842522,0.894569,total
6,town_over100,1,3.084419,0.96403,region
7,town_over100,2,2.842522,0.894569,total
8,town_over100,3,2.648511,0.91681,region
9,town_over100,4,2.907544,0.924457,region


In [1998]:
# Write the combined tihanovkaja coefficients with three decimals and no index column.
tih_reg_coeff.to_csv(
    'tihanovkaja_registered_coefficients.csv',
    index=False,
    float_format='%.3f',
)

In [1999]:
# Element-wise sum of the candidates' official vote columns, computed once for reuse.
official_candidates_column = official_candidates(data=df)

In [2000]:
# Per-area, per-region and country-wide fits of the summed official
# candidate votes against the 'registered' column.
protest_area_df, protest_regions_df, protest_total = [
    build(df['registered'], official_candidates_column)
    for build in (build_area_df, build_region_df, count_country_correlation)
]

In [2001]:
# Show the per-area table returned by build_area_df for 'registered' vs. the summed votes.
protest_area_df

Unnamed: 0,area,coefficient,correlation,source
0,city,3.466843,0.932805,area
1,minsk_suburb,2.072287,0.87332,area
2,capital,2.10982,0.903715,area
3,village,3.31419,0.810578,area
4,town_below100,3.594735,0.838809,area
5,town_over100,3.478117,0.73512,area
6,embassy,1.601618,0.97085,area


In [2002]:
# Show the per-region table returned by build_region_df for 'registered' vs. the summed votes.
protest_regions_df

Unnamed: 0,region,coefficient,correlation,source
0,1,3.784098,0.955271,region
1,2,3.792222,0.793487,region
2,3,3.233061,0.932144,region
3,4,3.52371,0.926619,region
4,5,3.249256,0.866999,region
5,6,4.520604,0.722371,region


In [2003]:
# Show the dict returned by count_country_correlation for 'registered' vs. the summed votes.
protest_total

{'correlation': 0.8945902976989147, 'coefficient': 3.5106755039160524}

In [2004]:
# Stack the protest area, region and country-wide fits into one table and write it out.
# The 'total' row must come from protest_total; it used to be built from tih_total
# (a copy-paste slip), which put the tihanovkaja totals into the protest CSV.
total_row = {k: [v] for k, v in protest_total.items()}
total_row['source'] = 'total'
protest_areas_df = pd.concat([protest_area_df, protest_regions_df, pd.DataFrame(total_row)])
protest_areas_df.to_csv('protest_registered_areas_coefficients.csv', index=False, float_format='%.3f')

In [2005]:
# Raw area-by-region protest fits, saved before any fallback is applied.
protest_region_df = build_area_region_df(
    df['registered'],
    official_candidates_column,
)
protest_region_df.to_csv(
    'protest_registered_original_coefficients.csv',
    index=False,
    float_format='%.3f',
)

  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  avg = a.mean(axis)
  ret = um.true_divide(
  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


In [2006]:
# Show the protest area-by-region table as returned by build_area_region_df.
protest_region_df

Unnamed: 0,area,region,coefficient,correlation,source
0,town_below100,1,4.168145,0.955225,area-region
1,town_below100,2,5.15416,0.972445,area-region
2,town_below100,3,3.140206,0.900011,area-region
3,town_below100,4,4.022915,0.93937,area-region
4,town_below100,5,3.414655,0.845175,area-region
5,town_below100,6,,,area-region
6,town_over100,1,,,area-region
7,town_over100,2,2.658752,0.875497,area-region
8,town_over100,3,3.896472,0.658925,area-region
9,town_over100,4,3.291761,0.837441,area-region


In [2007]:
# Apply the area / region / total fallbacks to the protest table in place.
fix_by_area_region_total(
    reg_df=protest_region_df,
    area_df=protest_area_df,
    total_region_df=protest_regions_df,
    total=protest_total,
)

In [2008]:
# Show the protest table again after fix_by_area_region_total modified it in place.
protest_region_df

Unnamed: 0,area,region,coefficient,correlation,source
0,town_below100,1,3.784098,0.955271,region
1,town_below100,2,5.15416,0.972445,area-region
2,town_below100,3,3.233061,0.932144,region
3,town_below100,4,4.022915,0.93937,area-region
4,town_below100,5,3.510676,0.89459,total
5,town_below100,6,3.510676,0.89459,total
6,town_over100,1,3.784098,0.955271,region
7,town_over100,2,3.510676,0.89459,total
8,town_over100,3,3.233061,0.932144,region
9,town_over100,4,3.52371,0.926619,region


In [2009]:
# fix_town_by_area(protest_region_df, protest_area_df)

In [2010]:
# Combine the corrected protest area-by-region rows with the non-regioned area rows.
protest_reg_coeff = concat_df(
    region_df=protest_region_df,
    area_df=protest_area_df,
)

In [2011]:
# Write the combined protest coefficients with three decimals and no index column.
protest_reg_coeff.to_csv(
    'protest_registered_coefficients.csv',
    index=False,
    float_format='%.3f',
)