In [2]:
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

In [11]:
# This notebook only shows how we use the representativity dimension at Meta.
# It cannot be run locally, since it is usually applied to large-scale data and we implemented it using SQL/Hive under the hood.
# Most of the APIs of this dimension expect the data to be stored in a database, not in DataFrame format.

In [3]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to local modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
import pandas as pd
from random import seed, sample, randint
import sys
import numpy as np, numpy.random
import math
from collections import Counter

In [5]:
# Make modules under ../metrics importable. The path is relative, so it is
# resolved against the kernel's current working directory.
METRICS_DIR = '../metrics'

# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if METRICS_DIR not in sys.path:
    sys.path.append(METRICS_DIR)

## Representativity

In [6]:
# Specs for the continuous covariates; every entry carries a "name".
# NOTE(review): the optional keys presumably control how quantiles are derived
# (explicit cut points vs. percentiles) and which value marks nulls -- confirm
# against the representativeness API.
continuous_covars_dataset = [
    dict(name="var1"),
    dict(name="var2", quantile_literal=[3, 12, 25, 28]),
    dict(
        name="var3",
        quantile_percentile=[0.1, 0.3, 0.5, 0.8],
        null_value="-1.0",
    ),
]

In [7]:
# Column names used throughout the rest of the notebook.
# The continuous covariate names are taken from the "name" field of each spec.
continuous_covars = [spec.get("name") for spec in continuous_covars_dataset]
discrete_covars = ["st"]
holdout_covar = "score"
primary_key = "id"

# Later cells (covariate balance, proportion plot, MAU coverage) pass `covars`,
# which was never defined in this notebook.
# NOTE(review): presumably it is the continuous plus discrete covariates,
# excluding the hold-out one -- confirm against the representativeness API.
covars = continuous_covars + discrete_covars

In [None]:
# Call get_population_quantiles for the population table stored in Hive.
# NOTE(review): `representativeness`, `population_data_path_namespace` and
# `population_data_path_table` are not defined in this notebook; presumably
# they come from the internal environment -- confirm.
population_quantiles = representativeness.get_population_quantiles(
    population_data_hive_namespace=population_data_path_namespace,
    population_data_hive_table=population_data_path_table,
    primary_key=primary_key,
    continuous_covars=continuous_covars,
    continuous_covars_dataset=continuous_covars_dataset,
    discrete_covars=discrete_covars,
    holdout_covar=holdout_covar,
)  # the call was previously left unclosed, which made the cell a SyntaxError

In [None]:
# Summarize the target population from the population quantiles and the
# continuous, discrete and hold-out covariate names.
target_popn = representativeness.summarize_target_population(
    population_quantiles,
    continuous_covars,
    discrete_covars,
    holdout_covar,
)

In [None]:
# Fetch the sample's features, reusing the population quantiles and the same
# covariate configuration as the population step.
# NOTE(review): the sample table identifiers and `inclusive_columns_in_sample`
# are not defined in this notebook; presumably supplied by the user -- confirm.
sample_features = representativeness.get_sample_feature(
    sample_data_path_namespace, sample_data_path_tablename,
    population_quantiles,
    continuous_covars, discrete_covars, holdout_covar,
    inclusive_columns_in_sample,
    primary_key,
)

In [None]:
# Get the sample weights from an external resource.
# The first argument is `sample_features`, the variable assigned by the
# get_sample_feature cell; the previous `sample_feature` was an undefined name.
sample_weights = representativeness.weigh_sample_data_from_external_source(
    sample_features,
    sample_with_weight_namespace,
    sample_with_weight_tablename,
    primary_key,
)

### Metric Calculation

In [None]:
# Design effect, obtained from the sample weights.
design_effect = representativeness.get_design_effect(
    sample_weights,
)

In [None]:
# Covariate balance between the target population and the weighted sample,
# followed by its DataFrame form (which the plotting cell consumes).
covariate_balance = representativeness.get_covariate_balance(
    target_popn,
    sample_weights,
    covars,
)
covariate_balance_df = representativeness.get_covariate_balance_df(
    covariate_balance,
)

In [None]:
# Proportion plot built from the covariate-balance DataFrame.
representativeness.proportion_plot(
    covariate_balance_df,
    covars,
)

In [None]:
# MAU coverage, evaluated with a coverage cutoff of 0.5.
coverage_cutoff = 0.5
mau = representativeness.get_mau_coverage(
    covariate_balance,
    coverage_cutoff,
    target_popn,
    covars,
)

In [None]:
# Representativity of the hold-out covariate: score distribution DataFrame
# for the target population and the weighted sample.
score_distn_df = representativeness.get_score_distn_df(
    target_popn,
    sample_weights,
    holdout_covar,
)

In [None]:
# Plot the hold-out distribution from the score distribution DataFrame.
distn_plot = representativeness.gen_plot_distribution_of_hold_out(
    score_distn_df,
)

In [None]:
# Hellinger distance for the hold-out covariate.
# NOTE(review): "score" equals the value assigned to holdout_covar earlier in
# the notebook; presumably the two must stay in sync, and min/max/step
# presumably describe the binning of the score -- confirm.
hellinger_arguments = dict(name="score", min_value=0, max_value=1, step=0.02)
hellinger_distance = representativeness.get_hellinger(
    target_popn,
    sample_weights,
    holdout_covar,
    hellinger_arguments,
)