In [1]:
import numpy as np
import pandas as pd

from epitools import between_p, within_p

In [29]:
df = pd.DataFrame([
    ("A", [40, 40, 40, 40, 0, 0, 0, 0]),
    ("B", [0, 0, 0, 0, 100, 100, 100, 100]),
    # ("C", [0, 0, 0, 0, 100, 100, 0, 0]),
    # ("D", [0, 0, 0, 0, 0, 0,100, 100]),
    # ("D", [100, 100, 100, 100, 100, 100]),
    # ("C", [100, 100, 100, 100, 100, 100]),
    # ("D", [50, 50, 50, 50, 50, 50]),
    # ("E", [50, 50, 50, 50, 50, 50]),
    # ("F", [50, 50, 50, 50, 50, 50])
])
df = df.explode(1)
df["polling_id"] = df.groupby(0).cumcount() + 1
df = df.rename(columns={0: "candidate", 1: "value"})
df["rate"] = df.groupby("polling_id", group_keys=False)["value"].apply(lambda x: x/x.sum())
df.head()

Unnamed: 0,candidate,value,polling_id,rate
0,A,40,1,1.0
0,A,40,2,1.0
0,A,40,3,1.0
0,A,40,4,1.0
0,A,0,5,0.0


In [30]:
within_p(df)

Unnamed: 0,candidate,value,weight,type
0,A,0.714286,0.285714,Within
1,B,0.285714,0.714286,Within


In [4]:
between_p(df)

Unnamed: 0,candidate,value,weight,type
0,A,0.0,0.333333,Between
1,B,0.0,0.333333,Between
2,C,0.0,0.333333,Between


In [5]:
df.groupby(["candidate"]).agg({"rate": "std"}) ** 2

Unnamed: 0_level_0,rate
candidate,Unnamed: 1_level_1
A,2.1e-05
B,5e-06
C,5e-06


In [6]:
values = df.pivot(index="candidate", columns="polling_id", values="value")
values

polling_id,1,2,3,4,5,6
candidate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,51,51,51,51,100,100
B,49,49,49,49,100,100
C,49,49,49,49,100,100


In [7]:
rates = df.pivot(index="candidate", columns="polling_id", values="rate")
rates

polling_id,1,2,3,4,5,6
candidate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,0.342282,0.342282,0.342282,0.342282,0.333333,0.333333
B,0.328859,0.328859,0.328859,0.328859,0.333333,0.333333
C,0.328859,0.328859,0.328859,0.328859,0.333333,0.333333


In [8]:
# candidate = "A"
output = []
total = values.sum().sum()
for candidate in rates.index:
    N_candidates = rates.shape[0]
    rates_c = rates[rates.index == candidate]
    values_c = values[values.index == candidate]

    xx = np.tile(rates_c, reps=(N_candidates, 1))
    yy = np.tile(values_c, reps=(N_candidates, 1))

    between = np.multiply(yy, (1 - np.absolute(xx - rates)))
    between = between[between.index != candidate]

    dv_between = 0 if np.sum(values_c).sum() == 0 else np.sum(between).sum() / ((N_candidates - 1) * np.sum(values_c).sum())
    output.append({
        "candidate": candidate,
        "value": dv_between,
        "weight": values_c.sum().sum()/total
    })

df_between = pd.DataFrame(output)
# df_between["value"] = df_between["weight"] * df_between["value"]
# df_between = df_between.drop(columns=["weight"])
df_between["type"] = "Between"
df_between

Unnamed: 0,candidate,value,weight,type
0,A,0.993222,0.337793,Between
1,B,0.996678,0.331104,Between
2,C,0.996678,0.331104,Between


In [9]:
def get_average(x, total=1):
    return x["value"].sum() / total

total = df["value"].sum()
df_mean = df.groupby("candidate").apply(lambda x: get_average(x, total=total)).reset_index().rename(columns={0: "mean"})
# display(df_mean)
xx = np.sum(values).reset_index().rename(columns={0: "total"})

df_sum = df.groupby("candidate").agg({"value": "sum"}).reset_index()
N_candidates = len(df.candidate.unique())
# N_candidates = 2

df_within = pd.merge(df, df_mean, on="candidate")
df_within = pd.merge(df_within, xx, on="polling_id")
df_within["diff_abs"] = np.absolute(df_within["rate"] - df_within["mean"])
df_within["epi"] = df_within["value"] * df_within["diff_abs"]
df_within = N_candidates * df_within.groupby("candidate").agg({"epi": "sum"}) / (N_candidates - 1)
# df_within = df_within.reset_index().rename(columns={"value": "weight", "total": "value"})
# df_within["value"] = 2 * df_within["value"]
# df_within["value"] = df_within["weight"] * df_within["value"]
# df_within = df_within.drop(columns=["weight"])
df_within["type"] = "Within"
df_within = pd.merge(df_within, df_sum, on="candidate")
df_within["epi"] = df_within.apply(lambda x: x["epi"] / x["value"] if x["value"] > 0 else 0, axis=1)
df_within

Unnamed: 0,candidate,epi,type,value
0,A,0.006712,Within,404
1,B,0.003356,Within,396
2,C,0.003356,Within,396


In [10]:
df_polarization = pd.concat([df_between, df_within])
df_polarization = pd.merge(df_polarization, df_mean, on="candidate")
df_polarization["value"] = df_polarization["value"] * df_polarization["mean"]
df_polarization = df_polarization.drop(columns=["mean"])
df_polarization

Unnamed: 0,candidate,value,weight,type,epi
0,A,0.335503,0.337793,Between,
1,A,136.468227,,Within,0.006712
2,B,0.330004,0.331104,Between,
3,B,131.117057,,Within,0.003356
4,C,0.330004,0.331104,Between,
5,C,131.117057,,Within,0.003356
