In [1]:
import pandas as pd
from scipy.stats.stats import pearsonr
from scipy import stats
from scipy import special
import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt
import math

In [2]:
def plot(df, label, show_graphs):
    """Fit LogTransfers ~ const + LogEnrollees by OLS, optionally plot it, and print the fit.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``LogEnrollees`` and ``LogTransfers``. Rows
        with a missing value in either column are excluded from the fit.
    label : str
        Used as the figure title and printed above the fitted equation.
    show_graphs : bool
        When true, draw a scatter plot of the data with the fitted line.

    Returns
    -------
    None
        The label and the fitted equation are printed.
    """
    no_nulls = df[['LogEnrollees', 'LogTransfers']].dropna()
    X = sm.add_constant(no_nulls[['LogEnrollees']])
    y = no_nulls['LogTransfers']
    est = sm.OLS(y, X).fit()

    # est.params is indexed by column label when the inputs are pandas objects;
    # integer keys on such a Series are deprecated in pandas, so go through an
    # array for positional access (intercept first, slope second).
    coefficients = np.asarray(est.params)
    intercept, slope = coefficients[0], coefficients[1]
    equation = "Log T = %f + %f * Log Enrollees" % (intercept, slope)

    if show_graphs:
        fig, ax = plt.subplots(figsize=(8,8))
        # Plot the NaN-free rows: these are exactly the points used in the fit,
        # and builtin min()/max() on a column containing NaN is order-dependent.
        ax.scatter(no_nulls["LogEnrollees"], no_nulls["LogTransfers"])
        x = np.linspace(no_nulls["LogEnrollees"].min(), no_nulls["LogEnrollees"].max(), 1000)
        ax.plot(x, intercept + slope * x, label=equation)
        ax.set_title(label)
        ax.set_xlabel("Log # Enrollees")
        ax.set_ylabel("Log Transfers^2 + 1")
        ax.legend()
        plt.show()
    print(label)
    print(equation)
    
def state(df, state):
    """Return the rows of ``df`` for one state whose LogTransfers exceed 2.

    Rows failing either condition are blanked to NaN by ``where`` and then
    removed by ``dropna``; as a consequence, rows that already hold a NaN in
    any column are removed too.
    """
    in_state = df["STATE"] == state
    above_cutoff = df["LogTransfers"] > 2
    return df.where(in_state & above_cutoff).dropna()

def remove_outliers(df):
    """Keep only the rows of ``df`` whose LogTransfers is strictly greater than 2.

    Non-matching rows are blanked to NaN and then dropped, so rows that
    already contain a NaN in any column are dropped as well.
    """
    keep = df["LogTransfers"] > 2
    blanked = df.where(keep)
    return blanked.dropna()

In [3]:
def erfinv2(sample_size):
    """Print the sample size and return sqrt(2) * erfinv(1 - 0.5 * p / k).

    Here ``p`` is fixed at 0.05 and ``k`` is ``sample_size``.
    """
    print("Sample size %d" % (sample_size))
    p = 0.05
    tail = 0.5 * p / sample_size
    return special.erfinv(1 - tail) * np.sqrt(2)

def num_pass(df, a):
    """Count rows whose |T/sqrt(N)| is strictly below ``a * es_0``, logging each row.

    ``es_0`` is the largest absolute value in the ``TransfersPerSqrtEnrollee``
    column divided by ``delta = erfinv2(number of rows)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column ``TransfersPerSqrtEnrollee``; no other column
        is read.
    a : float
        Multiplier applied to ``es_0`` to form the comparison threshold.

    Returns
    -------
    int
        Number of rows satisfying ``|T/sqrt(N)| < a * es_0``.
    """
    delta = erfinv2(df.shape[0])
    print("Delta_0 % .3f" % (delta))

    scaled = df["TransfersPerSqrtEnrollee"]
    # Series.max skips NaN, whereas builtin max() over a Series containing NaN
    # returns a result that depends on where the NaN sits.
    es1 = scaled.abs().max() / delta
    print("Comparison es_0 = %.3f" % (es1))
    print("Comparison es_0*%.2f = %.3f" % (a, a*es1))
    threshold = a * es1

    count = 0
    # Only this one column is needed, so iterate it directly instead of
    # materialising every row with iterrows().
    for value in scaled:
        if abs(value) < threshold:
            count += 1
            print("T/sqrt(N)=%.3f \t <a*es_0 Satisfied" % (value))
        else:
            print("T/sqrt(N)=%.3f \t <a*es_0 NOT Satisfied \t Distance |T/sqrt(N)|-a*es_0 %.1f " % (value, abs(value) - threshold))
    return count
    
def binom(k, n, p):
    """Return the binomial CDF P(X <= k) for X ~ Binomial(n, p)."""
    distribution = stats.binom(n, p)
    return distribution.cdf(k)

In [4]:
def run_stat_test(data, label, a):
    """Print a binomial test of how many rows pass the ``num_pass`` criterion.

    The per-row pass probability is ``P01 = erf(a / sqrt(2))``. The printed
    P value is ``1 - binom(successes, n_rows, P01)``, i.e. one minus the
    binomial CDF evaluated at the observed number of successes.

    NOTE(review): 1 - CDF(k) is P(X > k), which excludes the observed count
    itself -- confirm that this is the intended tail.
    """
    print(label)
    single_pass_prob = special.erf(a / np.sqrt(2))
    print("P01 = %.3f" % (single_pass_prob))

    n_success = num_pass(data, a)
    print("Number of successes: %d" % (n_success))

    n_trials = data.shape[0]
    upper_tail = 1 - binom(n_success, n_trials, single_pass_prob)
    print("P value = %.3f" % (upper_tail))
    
def run_binom_stat_test2(beta, values):
    """Print a binomial test of how many values exceed ``2 * beta`` in magnitude.

    Parameters
    ----------
    beta : float
        Scale; a value counts as "above 2" when ``abs(value) > 2 * beta``.
    values : sized iterable of numbers
        List, NumPy array or pandas Series. Elements are visited by iteration,
        so a Series with a non-contiguous index (for example after filtering)
        is handled correctly.

    Returns
    -------
    None
        The count, the proportion and the P value are printed. The P value is
        ``1 - binom(count, len(values), p)`` with ``p = 1 - erf(sqrt(2))``.
    """
    # Iterate the elements themselves: values[i] would be a *label* lookup on
    # a pandas Series and fails when the index is not 0..n-1.
    above_2_count = sum(1 for value in values if abs(value) > 2*beta)
    n_values = len(values)
    p_above_2 = 1 - special.erf(np.sqrt(2))
    print("Number of above 2: %d" % (above_2_count))
    print("Proportion of above 2: %.3f" % (above_2_count / n_values))
    print("P value = %.6f" % (1- binom(above_2_count, n_values, p_above_2)))

In [5]:
def ratio(df, v):
    """Return the share of non-null TransfersPerSqrtEnrollee values with |d_i| < v.

    The numerator counts entries whose absolute value is strictly below ``v``
    (NaN entries never qualify); the denominator is the non-null count.
    """
    scaled = df["TransfersPerSqrtEnrollee"]
    within = (scaled.abs() < v).sum()
    return within / scaled.count()

# Same as ratio(), but operates on a plain list instead of a DataFrame column.
def ratio_list(t, v):
    """Return the fraction of entries in ``t`` whose absolute value is below ``v``.

    The comparison is strict (``abs(x) < v``) and the denominator is
    ``len(t)``.
    """
    total = len(t)
    hits = 0
    for item in t:
        if abs(item) < v:
            hits += 1
    return hits / total

In [6]:
def create_simulation(df3, es, thresholds=None, n_rounds=5, rng=None):
    """Simulate zero-mean normal costs and return the share of scaled transfers under each threshold.

    In every round one cost is drawn per row from Normal(0, es * sqrt(N_i)),
    where N_i is that row's ``Enrollees``. With
    ``c_mean = sum(costs) / sum(N)``, the scaled transfer of row i is
    ``(cost_i - c_mean * N_i) / sqrt(N_i)``. Transfers from all rounds are
    pooled before the ratios are computed with ``ratio_list``.

    Parameters
    ----------
    df3 : pandas.DataFrame
        Must contain the column ``Enrollees``.
    es : float
        Scale factor of the cost standard deviation.
    thresholds : iterable of float, optional
        Cut-offs passed to ``ratio_list``. When omitted, the notebook-level
        global ``v`` is used, which is what this function always did before
        the parameter existed.
    n_rounds : int, optional
        Number of simulation rounds to pool (default 5).
    rng : numpy.random.Generator, optional
        Source of randomness. When omitted, the global ``np.random`` state is
        used, so ``np.random.seed`` keeps working as before.

    Returns
    -------
    list of float
        One ratio per threshold, in threshold order.
    """
    if thresholds is None:
        # Backward compatibility: fall back to the global the original relied on.
        thresholds = v
    draw_normal = np.random.normal if rng is None else rng.normal

    # Loop-invariant quantities, computed once instead of via iterrows() twice
    # per round.
    enrollees = df3["Enrollees"].to_numpy(dtype=float)
    sqrt_enrollees = np.sqrt(enrollees)
    total_enrollees = enrollees.sum()
    mu = 0

    all_transfers3 = list()
    for _ in range(n_rounds):
        # One scalar draw per row, in row order.
        all_costs = np.array([draw_normal(mu, es * root) for root in sqrt_enrollees])
        c_mean = all_costs.sum() / total_enrollees
        all_transfers3.extend((all_costs - c_mean * enrollees) / sqrt_enrollees)

    return [ratio_list(all_transfers3, vi) for vi in thresholds]