## Pull Objects labelling sheet

This notebook pulls the Google Sheet that users fill in to label data, and processes that data to provide:
- A dataframe for the Single Feature to Label Mapping (SFLM)
- A dataframe with examples whose labels cannot be inferred using a one-to-one mapping from a feature to that label

It pulls the existing SFLM from the `{TENANT_ID}_class_mappings` Google Sheet, deduplicates the newly processed SFLM against it, and creates a pushable SFLM.

It then persists the SFLM back to Google Sheets. It also persists (and overwrites) the examples with labels but no features in a separate worksheet, and persists the SFLM statistics in another worksheet.


In [None]:
# Enable IPython's autoreload extension in mode 2, which reloads imported modules
# before executing each cell so edits to project code are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
import datetime

import pandas as pd
import tentaclio

from phoenix.common import artifacts, run_params, utils
from phoenix.tag.topic import single_feature_match_topic_config as sfm_topic_config
from phoenix.tag.labelling import pull_label_sheet
from phoenix.tag.labelling import utils as labelling_utils

In [None]:
# Apply the project's notebook output and logging setup helpers.
utils.setup_notebook_output()
utils.setup_notebook_logging()

In [None]:
# Parameters
# Default values for a run; presumably overridden by the notebook runner — TODO confirm.
# See phoenix/common/run_datetime.py for the expected format of this parameter.
RUN_DATETIME = None
# Tenant identifier; passed to the run params and used to build the spreadsheet names.
TENANT_ID = None

# See phoenix/common/artifacts/registry_environment.py for the expected format of this parameter.
ARTIFACTS_ENVIRONMENT_KEY = "local"

# Object type being processed; used as the prefix of every worksheet name.
OBJECT_TYPE = "facebook_posts"


In [None]:
# Build the run parameters object; it exposes the tenant config and run datetime used below.
cur_run_params = run_params.general.create(ARTIFACTS_ENVIRONMENT_KEY, TENANT_ID, RUN_DATETIME)

# INPUT
# Spreadsheet and worksheet that hold the manually labelled objects.
# NOTE(review): the spreadsheet name is built from the raw TENANT_ID parameter, so a run
# that leaves TENANT_ID as None would look for "None_manual_data_labelling" — confirm
# whether the id from cur_run_params.tenant_config should be used instead.
SPREADSHEET_NAME = f"{TENANT_ID}_manual_data_labelling"
WORKSHEET_NAME = f"{OBJECT_TYPE}_to_label"

# Google Drive folder id from the tenant config; passed to every sheets get/persist call.
TENANT_FOLDER_ID = cur_run_params.tenant_config.google_drive_folder_id

# OUTPUT
# A single output spreadsheet with one worksheet per artifact:
# the SFLM, the labelled examples without features, and the SFLM statistics.
OUTPUT_SPREADSHEET_NAME = f"{TENANT_ID}_class_mappings"
OUTPUT_WORKSHEET_NAME_SFLM = f"{OBJECT_TYPE}_feature_mappings"
OUTPUT_WORKSHEET_NAME_NO_FEATURES = f"{OBJECT_TYPE}_no_feature_labels"
OUTPUT_WORKSHEET_NAME_STATISTICS = f"{OBJECT_TYPE}_statistics"


In [None]:
# Show the resolved run parameters and sheet names, one per line, for the run record.
params_to_display = (
    cur_run_params.run_dt.dt,
    cur_run_params.tenant_config,
    SPREADSHEET_NAME,
    WORKSHEET_NAME,
    OUTPUT_SPREADSHEET_NAME,
    OUTPUT_WORKSHEET_NAME_SFLM,
    OUTPUT_WORKSHEET_NAME_NO_FEATURES,
    OUTPUT_WORKSHEET_NAME_STATISTICS,
)
print(*params_to_display, sep="\n")

In [None]:
# Client passed to every Google Sheets get/persist call in this notebook.
google_client = artifacts.google_sheets.get_client()

In [None]:
# Pull the worksheet of manually labelled objects from the tenant's labelling spreadsheet.
labelled_objects_df = artifacts.google_sheets.get(
    google_client, TENANT_FOLDER_ID, SPREADSHEET_NAME, WORKSHEET_NAME
)

In [None]:
# Inspect the labelled objects pulled from the sheet.
labelled_objects_df

In [None]:
# Number of distinct `object_id` values in the pulled sheet.
labelled_objects_df["object_id"].nunique()

In [None]:
# Split the labelled sheet into the feature-to-label mappings and the second frame
# returned by the extractor, which is later persisted to the no-feature-labels worksheet.
df_object_labels, labels_no_features_df = (
    pull_label_sheet.extract_features_to_label_mapping_objects(labelled_objects_df)
)

In [None]:
# Inspect the feature-to-label mappings extracted from the labelled sheet.
df_object_labels

In [None]:
# Number of distinct `class` values among the extracted mappings.
df_object_labels["class"].nunique()

In [None]:
# Pull the SFLM currently stored in the output spreadsheet so new mappings can be merged into it.
current_object_feature_mappings = artifacts.google_sheets.get(
    google_client, TENANT_FOLDER_ID, OUTPUT_SPREADSHEET_NAME, OUTPUT_WORKSHEET_NAME_SFLM
)

In [None]:
# Inspect the SFLM currently stored in the sheet.
current_object_feature_mappings

In [None]:
# Columns handed to the duplicate filter when comparing stored and new mappings.
cols = [
    "class",
    "unprocessed_features",
    "processed_features",
]

In [None]:
# Presumably keeps only the newly extracted mappings that are not already in the stored
# SFLM, matching on `cols` — confirm against labelling_utils.filter_out_duplicates.
appendable_single_feature_to_label_mapping = labelling_utils.filter_out_duplicates(
    current_object_feature_mappings, df_object_labels, cols
)

In [None]:
# Inspect the mappings returned by the duplicate filter, i.e. the rows to be appended.
appendable_single_feature_to_label_mapping

In [None]:
# Build the full SFLM to push: the mappings already in the sheet followed by the newly
# filtered ones. `pd.concat` replaces `DataFrame.append`, which was deprecated in
# pandas 1.4 and removed in pandas 2.0; the defaults used here (original index kept,
# columns not sorted) match what `append` did.
pushable_single_feature_to_label_mapping = pd.concat(
    [current_object_feature_mappings, appendable_single_feature_to_label_mapping]
)

In [None]:
# Write the merged SFLM back to its worksheet in the output spreadsheet.
artifacts.google_sheets.persist(
    google_client,
    TENANT_FOLDER_ID,
    OUTPUT_SPREADSHEET_NAME,
    OUTPUT_WORKSHEET_NAME_SFLM,
    pushable_single_feature_to_label_mapping,
)

In [None]:
# Inspect the frame that is persisted to the no-feature-labels worksheet.
labels_no_features_df

In [None]:
# Write the examples without features to their own worksheet in the output spreadsheet.
artifacts.google_sheets.persist(
    google_client,
    TENANT_FOLDER_ID,
    OUTPUT_SPREADSHEET_NAME,
    OUTPUT_WORKSHEET_NAME_NO_FEATURES,
    labels_no_features_df,
)

In [None]:
# Compute the SFLM statistics from the labelled objects and the full SFLM being pushed.
sflm_statistics_df = pull_label_sheet.compute_sflm_statistics(
    labelled_objects_df, pushable_single_feature_to_label_mapping
)

In [None]:
# Inspect the computed SFLM statistics.
sflm_statistics_df

In [None]:
# Write the SFLM statistics to their worksheet in the output spreadsheet.
artifacts.google_sheets.persist(
    google_client,
    TENANT_FOLDER_ID,
    OUTPUT_SPREADSHEET_NAME,
    OUTPUT_WORKSHEET_NAME_STATISTICS,
    sflm_statistics_df,
)