In [17]:
import pandas as pd
import numpy as np
from decimal import Decimal

In [4]:
df = pd.read_csv('trusted-for-alternative.csv', converters={'latitude': Decimal, 'longitude': Decimal})

In [5]:
areas = ['city', 'suburb', 'capital', 'village', 'town_below100', 'town_over100', 'embassy']

In [71]:
coefficients = []

for area in areas:
    mask = df['area'] == area
    
    data = df[mask]
    slope, _, _, _ = np.linalg.lstsq(
        np.array(data['tihanovkaja_photoVoices'])[:,np.newaxis], data['tihanovkaja_officialVotes'], None
    )
    slope = slope[0]
    
    
    mask = mask & ~df['tihanovkaja_officialVotes'].isna() & data['tihanovkaja_photoVoices'] > 0
    data = df[mask]
    
    more_than_expected = data[data['tihanovkaja_officialVotes'] / data['tihanovkaja_photoVoices'] > slope]
    less_than_expected = data[data['tihanovkaja_officialVotes'] / data['tihanovkaja_photoVoices'] < slope]
    
    more_coefficients = more_than_expected['tihanovkaja_officialVotes'] / more_than_expected['tihanovkaja_photoVoices'] / slope
    less_coefficients = less_than_expected['tihanovkaja_officialVotes'] / less_than_expected['tihanovkaja_photoVoices'] / slope

    more_mean = np.mean(more_coefficients)
    less_mean = np.mean(less_coefficients)
    more_std = np.std(more_coefficients)
    less_std = np.std(less_coefficients)
    

    coefficients.append({
        'area': area,
        'coefficient': slope, 
        'more_mean': more_mean,
        'less_mean': less_mean,
        'more_std': more_std,
        'less_std': less_std,        
    })
    

coef_df = pd.DataFrame(coefficients)

In [72]:
coef_df

Unnamed: 0,area,coefficient,more_mean,less_mean,more_std,less_std
0,city,2.57621,1.535174,0.641745,0.494907,0.173411
1,suburb,2.232375,1.943564,0.83199,1.666507,0.093624
2,capital,1.916785,1.447489,0.731796,0.787438,0.141557
3,village,3.305239,2.894109,0.769916,1.665011,0.199267
4,town_below100,3.634188,1.530651,0.649515,0.391596,0.209391
5,town_over100,3.16666,1.608945,0.579178,0.642697,0.193359
6,embassy,13.70603,4.16029,,0.813019,


In [74]:
coef_df.to_csv('tihanovkaja_photo_coefficients.csv', float_format='%.3f', index=False)

In [166]:
def _real_tihanovkaja_coeff_for_df(data, candidate):
    return data[f'{candidate}_officialVotes'] / data['tihanovkaja_officialVotes']
    

def build_coeff_for_candidate(candidate):
    coefficients = []

    for area in areas:
        mask = (df['area'] == area) & ~df[f'{candidate}_officialVotes'].isna() 
        
        if candidate in ('against', 'corrupted'):
            mask &= (df['corrupted_officialVotes'] > 0)

        data = df[mask]
        slope, _, _, _ = np.linalg.lstsq(
            np.array(data['tihanovkaja_officialVotes'])[:,np.newaxis], data[f'{candidate}_officialVotes'], None
        )
        slope = slope[0]


        mask = (
            mask & 
            (df[f'tihanovkaja_officialVotes'] > 0) & 
            ~df[f'{candidate}_officialVotes'].isna()
        )
        data = df[mask]
        
        real_coefficients = _real_tihanovkaja_coeff_for_df(data, candidate)

        more_than_expected = data[real_coefficients > slope]
        less_than_expected = data[real_coefficients < slope]

        more_coefficients = _real_tihanovkaja_coeff_for_df(more_than_expected, candidate) / slope
        less_coefficients = _real_tihanovkaja_coeff_for_df(less_than_expected, candidate) / slope

        more_mean = np.mean(more_coefficients)
        less_mean = np.mean(less_coefficients)
        more_std = np.std(more_coefficients)
        less_std = np.std(less_coefficients)


        yield {
            'candidate': candidate,
            'area': area,            
            'tih_coefficient': slope, 
            'more_mean': more_mean,
            'less_mean': less_mean,
            'more_std': more_std,
            'less_std': less_std,        
        }

In [167]:
candidates = (
    'against', 'cherechen', 'corrupted', 'dmitriyev', 'kanopatskaja',
)

In [168]:
alternative = []

for candidate in candidates:
    alternative.extend(build_coeff_for_candidate(candidate))
    

In [169]:
alternative

[{'candidate': 'against',
  'area': 'city',
  'tih_coefficient': 0.15143736786430997,
  'more_mean': 2.7866152736175445,
  'less_mean': 0.6389896327010721,
  'more_std': 2.175298317802411,
  'less_std': 0.2704438222873623},
 {'candidate': 'against',
  'area': 'suburb',
  'tih_coefficient': 0.07691446151068852,
  'more_mean': 1.5134073994895878,
  'less_mean': 0.6733633531290247,
  'more_std': 0.39894179075214425,
  'less_std': 0.2091543482940307},
 {'candidate': 'against',
  'area': 'capital',
  'tih_coefficient': 0.14604847411496116,
  'more_mean': 2.2557738895954698,
  'less_mean': 0.5426448774025191,
  'more_std': 1.091608377213615,
  'less_std': 0.2616324796143272},
 {'candidate': 'against',
  'area': 'village',
  'tih_coefficient': 0.12675562256377343,
  'more_mean': 2.153784790738719,
  'less_mean': 0.6337928286327533,
  'more_std': 1.459974260944058,
  'less_std': 0.2343897164441303},
 {'candidate': 'against',
  'area': 'town_below100',
  'tih_coefficient': 0.14895798335369984,


In [170]:
[x for x in alternative if (x['less_mean'] + x['less_std']) > 1]

[]

In [171]:
[x for x in alternative if x['tih_coefficient'] >  x['more_mean'] - x['more_std']]

[{'candidate': 'corrupted',
  'area': 'town_below100',
  'tih_coefficient': 0.02214106401928412,
  'more_mean': 3.250086223892239,
  'less_mean': 0.6293209757495235,
  'more_std': 5.530214222132578,
  'less_std': 0.24460327923188455}]

In [174]:
pd.DataFrame(alternative).to_csv('alternative_coefficients.csv', float_format='%.4f', index=False)