In [1]:
import os
import polars as pl

In [2]:
# Read the raw and clean QRQ extracts into a dict keyed by stage name
# ('raw' / 'clean'). The literal string 'NA' in the CSVs is parsed as null.
qrq_data = {
    stage: pl.read_csv(f'../data/{stage}_qrq.csv', null_values='NA')
    for stage in ('raw', 'clean')
}

The ML model is meant to capture the following relationship:

$
flag = f(d2m_{i,g}, d2lm_{i,g}, d2pym_{i,g}, n_g, l_i, y2e_i, tps_i)
$

where,

- $flag = \begin{cases} 1 &\text{if individual was dropped from the final sample} \\ 0 &\text{if individual is present in the final sample} \end{cases}$

- $d2m = s_n - s_i$, and $s_n$ is the average group $g$ score and $s_i$ is the score obtained by the individual

- $d2lm = \dfrac{s_l - s_i}{d2m}$, and $s_l$ is the average longitudinal group $g$ score

- $d2pym = \dfrac{s_{py} - s_i}{d2m}$, and $s_{py}$ is the average group $g$ score for $t-1$

- $n$ is the size of group $g$

- $y2e$ is the number of years between the individual's answers and the ROLI edition

- $tps$ ... TBD

In [3]:
# Derive model features from the raw QRQ frame. Each step only adds columns;
# the three `with_columns` calls are kept separate because later steps read
# columns created by earlier ones.
qrq_data_raw_processed = (
    qrq_data['raw']
    # Step 1: mean and standard deviation of `roli` within each country.
    .with_columns(
        avg_country_score=pl.col('roli').mean().over('country'),
        std_country_score=pl.col('roli').std().over('country'),
    )
    # Step 2: group size, longitudinal mean, target flag and raw distances.
    .with_columns(
        # Number of rows sharing the same (country, edition) pair.
        # NOTE(review): the means here are per country only, while the pool
        # size is per country and edition -- confirm this is intended.
        pool_size=pl.len().over('country', 'edition'),
        # Country-level mean of `roli` using only rows with longitudinal == 1.
        avg_country_longitudinal_score=(
            pl.col('roli')
            .filter(pl.col('longitudinal').eq(1))
            .mean()
            .over('country')
        ),
        # 1 when the row's `unid` is absent from the clean frame, 0 when present.
        dropped=(~pl.col('unid').is_in(qrq_data['clean']['unid'])).cast(pl.Int8),
        # Signed gap between the row's score and its country mean.
        distance2mean=pl.col('roli') - pl.col('avg_country_score'),
        # Difference between the `year` and `edition` columns.
        distance2edition=pl.col('year') - pl.col('edition'),
        # TODO: N flags in subfactors (% of total flags) * distance2mean
        # TODO: N Flags in TPS * Distance to prev year mean
    )
    # Step 3: express the distance to the mean in country standard deviations.
    .with_columns(
        distance2mean_nstd=pl.col('distance2mean') / pl.col('std_country_score'),
    )
)

In [41]:
# Feature matrix for the model; currently only the signed distance to the
# country mean is selected.
features = qrq_data_raw_processed.select(pl.col('distance2mean'))