In [91]:
import numpy as np
import pandas as pd
from tqdm import tqdm

from comchoice.aggregate.ahp import ahp
from comchoice.aggregate.__set_card_id import __set_card_id
from comchoice.aggregate.__set_rank import __set_rank
from comchoice.preprocessing import to_pairwise


# TODO: Calculate Divisiveness with the Score
def divisiveness(
    df,
    alternative: str = "alternative",
    alternative_a: str = "alternative_a",
    alternative_b: str = "alternative_b",
    convert_pairwise: bool = False,
    method=ahp,
    method_kws: dict = dict(),
    selected: str = "selected",
    show_rank: bool = True,
    verbose: bool = True,
    voter: str = "voter"
):
    """Divisiveness

    Parameters
    ----------
    df : _type_
        _description_
    alternative : str, optional
        _description_, by default "id"
    method : _type_, optional
        _description_, by default borda
    alternative_a : str, optional
        _description_, by default "alternative_a"
    alternative_b : str, optional
        _description_, by default "alternative_b"
    selected : str, optional
        _description_, by default "selected"
    verbose : bool, optional
        _description_, by default True
    voter : str, optional
        _description_, by default "voter"

    Returns
    -------
    _type_
        _description_
    """
    tmp = df.copy()
    df_original = df.copy()
    if convert_pairwise:
        tmp = to_pairwise(tmp, origin="voting")

    tmp = __set_card_id(
        tmp.copy(),
        alternative_a=alternative_a,
        alternative_b=alternative_b,
        selected=selected,
        concat="_"
    )

    dd = tmp.groupby(["card_id", selected, voter]).agg({"id": "count"})
    # _data = df.copy().set_index(voter)

    def _f(idx, df_select):
        card_id = idx[0]
        s = idx[1]
        users = [item[2] for item in df_select.index.to_numpy()]

        # data_temp = _data.loc[users].reset_index()
        data_temp = df_original[df_original[voter].isin(users)]
        r_tmp = method(data_temp, **method_kws).dropna()
        r_tmp["card_id"] = card_id
        r_tmp[selected] = s
        r_tmp["users"] = len(users)
        
#         print(r_tmp)

        del data_temp, users

        return r_tmp

    tmp_list = []

    _data_tmp = dd.groupby(level=[0, 1])

    _iter = tqdm(_data_tmp, position=0,
                 leave=True) if verbose else _data_tmp

    for idx, df_select in _iter:
        tmp_list.append(_f(idx, df_select))

    tmp = pd.concat(tmp_list, ignore_index=True)

    tmp[[f"{alternative_a}_sorted", f"{alternative_b}_sorted"]
        ] = tmp["card_id"].str.split("_", expand=True)
    tmp["group"] = tmp[f"{alternative_a}_sorted"].astype(
        str) == tmp[selected].astype(str)
    tmp["group"] = tmp["group"].replace({True: "A", False: "B"})

    tmp_a = tmp[tmp["group"] == "A"]
    tmp_b = tmp[tmp["group"] == "B"]

    tmp_dv = pd.merge(
        tmp_a,
        tmp_b,
        on=["card_id", alternative,
            f"{alternative_a}_sorted", f"{alternative_b}_sorted"]
    )

#     tmp_dv = tmp_dv[[alternative, "card_id", "value_x",
#                      "value_y", f"{selected}_x", f"{selected}_y"]]
#     tmp_dv["value"] = tmp_dv["value_x"] - tmp_dv["value_y"]
    val = tmp_dv["users_x"] + tmp_dv["users_y"]
    tmp_dv["value"] = (tmp_dv["users_x"] - tmp_dv["value_y"])
    

    tmp_dv["value"] = tmp_dv["value"] ** 2
    return tmp_dv
    tmp_dv["value"] = (((tmp_dv["users_x"]/val)**1.25)*(tmp_dv["users_y"]/val))*np.sqrt(tmp_dv["value"])

#     return tmp_dv

    tmp_frag_a = tmp_dv[[alternative, f"{selected}_x", "value"]].rename(
        columns={f"{selected}_x": "selected"})
    tmp_frag_b = tmp_dv[[alternative, f"{selected}_y", "value"]].rename(
        columns={f"{selected}_y": "selected"})
    tmp = pd.concat([tmp_frag_a, tmp_frag_b])
    tmp = tmp[tmp[alternative]
              == tmp["selected"]]
    

    tmp = tmp.groupby(alternative).agg(
        {"value": "mean"}).reset_index()

    if show_rank:
        tmp = __set_rank(tmp)

    return tmp

In [5]:
df = pd.read_csv("data_output/France/2002_pairwise.csv.gz")
df.head()

Unnamed: 0,polling_id,alternative_a,alternative_b,selected
0,01-1-1,BRUNO MEGRET,ALAIN MADELIN,ALAIN MADELIN
1,01-1-1,BRUNO MEGRET,ARLETTE LAGUILLER,ARLETTE LAGUILLER
2,01-1-1,FRANCOIS BAYROU,BRUNO MEGRET,FRANCOIS BAYROU
3,01-1-1,FRANCOIS BAYROU,CHRISTIANE TAUBIRA,FRANCOIS BAYROU
4,01-1-1,FRANCOIS BAYROU,ALAIN MADELIN,FRANCOIS BAYROU


In [None]:
from comchoice.aggregate.win_rate import win_rate

In [79]:
df_fround = pd.read_csv(f"data_output/France/2002_first_round.csv.gz", compression="gzip")
df_fround.columns = [x.lower() for x in df_fround.columns]
v = df_fround.groupby("candidate").agg({"rate": "mean"}).sort_values("rate", ascending=False).head(7).index
v

Index(['JACQUES CHIRAC', 'JEAN-MARIE LE PEN', 'LIONEL JOSPIN',
       'FRANCOIS BAYROU', 'JEAN SAINT-JOSSE', 'ARLETTE LAGUILLER',
       'JEAN-PIERRE CHEVENEMENT'],
      dtype='object', name='candidate')

In [62]:
df.shape

(5002998, 4)

In [92]:
d = divisiveness(
    df.sample(150000),
    voter="polling_id",
    convert_pairwise=False,
    method=win_rate,
    method_kws={"voter": "polling_id"}
)
d

100%|██████████████████████████████████████████████████████████████████| 234/234 [00:15<00:00, 15.19it/s]


Unnamed: 0,alternative,value_x,rank_x,card_id,selected_x,users_x,alternative_a_sorted,alternative_b_sorted,group_x,value_y,rank_y,selected_y,users_y,group_y,value
0,JACQUES CHIRAC,0.987879,1,ALAIN MADELIN_ARLETTE LAGUILLER,ALAIN MADELIN,479,ALAIN MADELIN,ARLETTE LAGUILLER,A,0.852941,1,0,96,B,2.286246e+05
1,JACQUES CHIRAC,0.987879,1,ALAIN MADELIN_ARLETTE LAGUILLER,ALAIN MADELIN,479,ALAIN MADELIN,ARLETTE LAGUILLER,A,0.897177,1,ARLETTE LAGUILLER,1307,B,2.285823e+05
2,ALAIN MADELIN,0.882995,2,ALAIN MADELIN_ARLETTE LAGUILLER,ALAIN MADELIN,479,ALAIN MADELIN,ARLETTE LAGUILLER,A,0.490741,6,0,96,B,2.289711e+05
3,ALAIN MADELIN,0.882995,2,ALAIN MADELIN_ARLETTE LAGUILLER,ALAIN MADELIN,479,ALAIN MADELIN,ARLETTE LAGUILLER,A,0.077915,13,ARLETTE LAGUILLER,1307,B,2.293664e+05
4,JEAN-MARIE LE PEN,0.853107,3,ALAIN MADELIN_ARLETTE LAGUILLER,ALAIN MADELIN,479,ALAIN MADELIN,ARLETTE LAGUILLER,A,0.826923,2,0,96,B,2.286495e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1996,BRUNO MEGRET,0.204365,11,OLIVIER BESANCENOT_ROBERT HUE,OLIVIER BESANCENOT,1317,OLIVIER BESANCENOT,ROBERT HUE,A,0.195402,11,ROBERT HUE,503,B,1.733974e+06
1997,CHRISTIANE TAUBIRA,0.153409,12,OLIVIER BESANCENOT_ROBERT HUE,OLIVIER BESANCENOT,1317,OLIVIER BESANCENOT,ROBERT HUE,A,0.351351,10,0,124,B,1.733564e+06
1998,CHRISTIANE TAUBIRA,0.153409,12,OLIVIER BESANCENOT_ROBERT HUE,OLIVIER BESANCENOT,1317,OLIVIER BESANCENOT,ROBERT HUE,A,0.167568,12,ROBERT HUE,503,B,1.734048e+06
1999,ROBERT HUE,0.049208,13,OLIVIER BESANCENOT_ROBERT HUE,OLIVIER BESANCENOT,1317,OLIVIER BESANCENOT,ROBERT HUE,A,0.179487,13,0,124,B,1.734016e+06


In [57]:
d[(d["selected_y"] != "0") & (d["alternative"] == "JEAN-MARIE LE PEN")].groupby("alternative").agg({"value": "mean"})

Unnamed: 0_level_0,value
alternative,Unnamed: 1_level_1
JEAN-MARIE LE PEN,1398.062432
