In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
from random import seed
from random import sample
import sys

In [4]:
seed(1)

In [5]:
sys.path.append('../metrics')

### Efficiency

In [6]:
import efficiency as efficiency

In [7]:
# Data Preparation
# Please note that all datasets are generated randomly. The metric results might be unreasonable.

# Datasets for cost efficiency
label_dataset_columns = {
    "ds_month": [sample(["2023-01-01", "2023-02-01"], 1)[0] for _ in range(1000)],
    "id": [sample(list(range(1000, 10000)), 1)[0] for _ in range(1000)],
    "vertical": [sample(["A", "B"], 1)[0] for _ in range(1000)],
    "use_case": [sample(["uc1", "uc2"], 1)[0] for _ in range(1000)],
    "vendor": [sample(["V1", "V2"], 1)[0] for _ in range(1000)],
    "labeler_type": [sample(["L1", "L2"], 1)[0] for _ in range(1000)],
    "decision_value": [sample(["dv1", "dv2", "not_valid"], 1)[0] for _ in range(1000)],
    "final_decision": [sample(["dv1", "dv2", "not_valid"], 1)[0] for _ in range(1000)],
    "handling_times_sec": [sample(range(1, 1000), 1)[0] for _ in range(1000)],
}
efficiency_label_dataset = pd.DataFrame.from_dict(label_dataset_columns)
cost_dataset_columns = {
    "language": ["english", "english", "english", "spanish", "spanish"],
    "labeler_type": ["L1", "L1", "L2", "L1", "L2"],
    "vendor": ["V1", "V2", "V1", "V1", "V2"],
    "cost_per_second": [0.1, 0.2, 0.3, 0.11, 0.19],
}
efficiency_cost_summary = pd.DataFrame.from_dict(cost_dataset_columns)

In [8]:
efficiency_label_dataset.head()

Unnamed: 0,ds_month,id,vertical,use_case,vendor,labeler_type,decision_value,final_decision,handling_times_sec
0,2023-01-01,3295,A,uc2,V1,L2,not_valid,not_valid,404
1,2023-01-01,5201,B,uc1,V2,L1,dv1,not_valid,127
2,2023-02-01,4686,A,uc2,V2,L1,dv2,not_valid,518
3,2023-01-01,2442,B,uc2,V2,L1,dv2,dv2,72
4,2023-02-01,9828,B,uc1,V2,L1,dv1,dv2,101


In [9]:
efficiency_cost_summary

Unnamed: 0,language,labeler_type,vendor,cost_per_second
0,english,L1,V1,0.1
1,english,L1,V2,0.2
2,english,L2,V1,0.3
3,spanish,L1,V1,0.11
4,spanish,L2,V2,0.19


In [10]:
# merge label dataset and commercial report and get cpd(cost per decision) column
merged_dataset = efficiency.merge_label_commercial_datsets(
    label_data=efficiency_label_dataset,
    commercial_report=efficiency_cost_summary,
    common_cols=["vendor", "labeler_type"],
    cost_per_second="cost_per_second",
    handling_time_second="handling_times_sec",
)
merged_dataset.head()

Unnamed: 0,ds_month,id,vertical,use_case,vendor,labeler_type,decision_value,final_decision,handling_times_sec,language,cost_per_second,cpd
0,2023-01-01,3295,A,uc2,V1,L2,not_valid,not_valid,404,english,0.3,121.2
1,2023-01-01,5201,B,uc1,V2,L1,dv1,not_valid,127,english,0.2,25.4
2,2023-02-01,4686,A,uc2,V2,L1,dv2,not_valid,518,english,0.2,103.6
3,2023-01-01,2442,B,uc2,V2,L1,dv2,dv2,72,english,0.2,14.4
4,2023-02-01,9828,B,uc1,V2,L1,dv1,dv2,101,english,0.2,20.2


In [11]:
task_level_cost_summary = efficiency.get_task_level_cost_summary(
    dataset=merged_dataset,
    detailed_group_list=["vertical", "vendor", "labeler_type"],
    non_converted_list=["not_valid"],
    decision_value_column="decision_value",
    handling_time_second="handling_times_sec",
    cpd="cpd",
)
task_level_cost_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,cnt_attempted,cnt_converted,conversion_rate,conversion_rate_sd,ht_attempted,ht_attempted_std,cpd_attempted,cpd_attempted_std,ht_converted,ht_converted_std,cpd_converted,cpd_converted_std
vertical,vendor,labeler_type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
A,V1,L1,238,170,0.714286,0.029283,514.689076,18.975357,54.042353,2.001633,720.564706,26.5655,75.659294,2.802286
A,V1,L2,117,78,0.666667,0.043581,510.076923,27.275895,153.023077,8.182769,765.115385,40.913843,229.534615,12.274153
A,V2,L1,134,89,0.664179,0.040798,538.402985,25.094493,107.680597,5.018899,810.629213,37.78272,162.125843,7.556544
A,V2,L2,129,84,0.651163,0.041962,497.860465,24.830518,94.593488,4.717798,764.571429,38.132581,145.268571,7.24519
B,V1,L1,236,148,0.627119,0.031478,486.822034,17.767958,51.116314,1.874458,776.283784,28.33269,81.509797,2.989001
B,V1,L2,130,85,0.653846,0.041725,483.823077,23.476589,145.146923,7.042977,739.964706,35.905371,221.989412,10.771611
B,V2,L1,134,83,0.619403,0.041944,485.179104,25.037176,97.035821,5.007435,783.301205,40.421465,156.660241,8.084293
B,V2,L2,119,76,0.638655,0.044037,469.512605,24.622588,89.207395,4.678292,735.157895,38.55379,139.68,7.32522


In [12]:
task_level_cost_summary_by_time = efficiency.get_task_level_cost_summary(
    dataset=merged_dataset,
    detailed_group_list=["vertical", "vendor", "labeler_type"],
    non_converted_list=["not_valid"],
    decision_value_column="decision_value",
    handling_time_second="handling_times_sec",
    cpd="cpd",
    time_column="ds_month",
)
task_level_cost_summary_by_time

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,cnt_attempted,cnt_converted,conversion_rate,conversion_rate_sd,ht_attempted,ht_attempted_std,cpd_attempted,cpd_attempted_std,ht_converted,ht_converted_std,cpd_converted,cpd_converted_std
ds_month,vertical,vendor,labeler_type,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2023-01-01,A,V1,L1,114,86,0.754386,0.040315,521.421053,26.924355,54.749211,2.840774,691.186047,35.690424,72.574535,3.765678
2023-01-01,A,V1,L2,51,34,0.666667,0.06601,513.843137,40.913618,154.152941,12.274085,770.764706,61.370427,231.229412,18.411128
2023-01-01,A,V2,L1,64,38,0.59375,0.061392,543.171875,34.184076,108.634375,6.836815,914.815789,57.573181,182.963158,11.514636
2023-01-01,A,V2,L2,69,46,0.666667,0.05675,528.681159,32.972703,100.44942,6.264813,793.021739,49.459054,150.67413,9.39722
2023-01-01,B,V1,L1,114,74,0.649123,0.044698,457.175439,24.27941,48.003421,2.561191,704.297297,37.403415,73.951216,3.945618
2023-01-01,B,V1,L2,64,40,0.625,0.060515,456.578125,31.956672,136.973437,9.587002,730.525,51.130676,219.1575,15.339203
2023-01-01,B,V2,L1,67,40,0.597015,0.059924,474.552239,35.121685,94.910448,7.024337,794.875,58.828822,158.975,11.765764
2023-01-01,B,V2,L2,56,35,0.625,0.064694,458.285714,35.842277,87.074286,6.810033,733.257143,57.347643,139.318857,10.896052
2023-02-01,A,V1,L1,124,84,0.677419,0.041979,508.5,26.703764,53.3925,2.816343,750.642857,39.419841,78.8175,4.157458
2023-02-01,A,V1,L2,66,44,0.666667,0.058026,507.166667,36.581127,152.15,10.974338,760.75,54.871691,228.225,16.461507


In [13]:
target_level_cost_summary = efficiency.get_target_level_cost_summary(
    dataset=merged_dataset,
    general_group_list=["vertical", "vendor"],
    final_decision_column="final_decision",
    ids=["id"],
    labeler_type="labeler_type",
    non_converted_list=["not_valid"],
    decision_value_column="decision_value",
    handling_time_second="handling_times_sec",
    cpd="cpd",
)
target_level_cost_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,cnt_task,cnt_target,cnt_final,final_conversion_rate,final_conversion_rate_std,ht_target,ht_target_std,cpd_target,cpd_target_std,ht_final,ht_final_std,cpd_final,cpd_final_std
vertical,vendor,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
A,V1,355,233,151,0.648069,0.031287,781.866953,35.772384,132.041974,5.418061,1206.456954,55.198447,203.746887,8.36032
A,V2,263,258,174,0.674419,0.029173,528.565891,18.383411,103.223876,3.602477,783.735632,27.25816,153.056092,5.341603
B,V1,366,242,163,0.673554,0.030143,734.657025,32.531454,127.820455,5.125063,1090.717791,48.298233,189.770245,7.608989
B,V2,253,253,167,0.660079,0.02978,477.810277,17.613026,93.353676,3.454899,723.868263,26.683206,141.428024,5.234068


In [14]:
target_level_cost_summary_by_time = efficiency.get_target_level_cost_summary(
    dataset=merged_dataset,
    general_group_list=["vertical", "vendor"],
    final_decision_column="final_decision",
    ids=["id"],
    labeler_type="labeler_type",
    non_converted_list=["not_valid"],
    decision_value_column="decision_value",
    handling_time_second="handling_times_sec",
    cpd="cpd",
    time_column="ds_month",
)
target_level_cost_summary_by_time

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,cnt_task,cnt_target,cnt_final,final_conversion_rate,final_conversion_rate_std,ht_target,ht_target_std,cpd_target,cpd_target_std,ht_final,ht_final_std,cpd_final,cpd_final_std
ds_month,vertical,vendor,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2023-01-01,A,V1,165,108,70,0.648148,0.045952,793.037037,51.327021,130.585278,7.483728,1223.542857,79.190262,201.474429,11.546323
2023-01-01,A,V2,133,131,91,0.694656,0.040239,543.832061,24.154168,105.981756,4.719966,782.879121,34.771385,152.567143,6.794677
2023-01-01,B,V1,178,119,80,0.672269,0.043029,683.521008,42.876153,119.652857,6.527625,1016.7375,63.778278,177.983625,9.709842
2023-01-01,B,V2,123,123,87,0.707317,0.041025,467.146341,25.15616,91.342764,4.93733,660.448276,35.565606,129.13977,6.980363
2023-02-01,A,V1,190,127,83,0.653543,0.042224,760.055118,48.630159,131.201339,7.463868,1162.975904,74.410002,200.753855,11.420617
2023-02-01,A,V2,130,129,83,0.643411,0.042173,504.868217,26.251959,98.822868,5.169254,784.674699,40.801238,153.592169,8.034142
2023-02-01,B,V1,188,126,86,0.68254,0.041469,765.460317,46.958602,132.490952,7.078543,1121.488372,68.799812,194.114651,10.370888
2023-02-01,B,V2,130,130,80,0.615385,0.042669,487.9,24.63394,95.256308,4.830016,792.8375,40.030152,154.7915,7.848777


In [15]:
survey_cost_summary = efficiency.get_survey_cost_summary(
    dataset=merged_dataset,
    detailed_group_list=["vertical", "vendor", "labeler_type"],
    non_converted_list=["not_valid"],
    decision_value_column="decision_value",
)
survey_cost_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,cnt_attempted,cnt_converted,conversion_rate,conversion_rate_sd,cost_attempted,cost_converted
vertical,vendor,labeler_type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A,V1,L1,238,170,0.714286,0.029283,1,1.4
A,V1,L2,117,78,0.666667,0.043581,1,1.5
A,V2,L1,134,89,0.664179,0.040798,1,1.505618
A,V2,L2,129,84,0.651163,0.041962,1,1.535714
B,V1,L1,236,148,0.627119,0.031478,1,1.594595
B,V1,L2,130,85,0.653846,0.041725,1,1.529412
B,V2,L1,134,83,0.619403,0.041944,1,1.614458
B,V2,L2,119,76,0.638655,0.044037,1,1.565789


In [16]:
survey_cost_summary_by_time = efficiency.get_survey_cost_summary(
    dataset=merged_dataset,
    detailed_group_list=["vertical",  "vendor", "labeler_type"],
    non_converted_list=["not_valid"],
    decision_value_column="decision_value",
    time_column="ds_month",
)
survey_cost_summary_by_time

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,cnt_attempted,cnt_converted,conversion_rate,conversion_rate_sd,cost_attempted,cost_converted
ds_month,vertical,vendor,labeler_type,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2023-01-01,A,V1,L1,114,86,0.754386,0.040315,1,1.325581
2023-01-01,A,V1,L2,51,34,0.666667,0.06601,1,1.5
2023-01-01,A,V2,L1,64,38,0.59375,0.061392,1,1.684211
2023-01-01,A,V2,L2,69,46,0.666667,0.05675,1,1.5
2023-01-01,B,V1,L1,114,74,0.649123,0.044698,1,1.540541
2023-01-01,B,V1,L2,64,40,0.625,0.060515,1,1.6
2023-01-01,B,V2,L1,67,40,0.597015,0.059924,1,1.675
2023-01-01,B,V2,L2,56,35,0.625,0.064694,1,1.6
2023-02-01,A,V1,L1,124,84,0.677419,0.041979,1,1.47619
2023-02-01,A,V1,L2,66,44,0.666667,0.058026,1,1.5
