In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import datetime

import pandas as pd
import tentaclio

from phoenix.common import artifacts, run_params, utils
from phoenix.tag.topic import single_feature_match_topic_config as sfm_topic_config
from phoenix.tag.labelling import generate_label_sheet
from phoenix.tag.labelling import utils as labelling_utils
from phoenix.tag import normalise

In [None]:
# Configure notebook output and logging through the project's helpers in
# phoenix.common.utils (the exact settings applied are not visible here).
utils.setup_notebook_output()
utils.setup_notebook_logging()

In [None]:
# Parameters
# Default values for a run. NOTE(review): RUN_DATETIME and TENANT_ID default to
# None and are presumably overridden when the notebook is executed
# (e.g. by a parameter-injection tool) -- confirm with the pipeline runner.

# See phoenix/common/run_datetime.py for the expected format of this parameter.
RUN_DATETIME = None
TENANT_ID = None

# See phoenix/common/artifacts/registry_environment.py for the expected format
# of this parameter.
ARTIFACTS_ENVIRONMENT_KEY = "local"

# Filters selecting the batch of data to draw rows from.
YEAR_FILTER = 2021
# Month as a number, without zero padding.
MONTH_FILTER = 11
OBJECT_TYPE = "facebook_posts"

# Target total number of rows for the worksheet. The number of new rows
# requested is this value minus the rows already present in the worksheet.
GOAL_NUM_ROWS = 10000


In [None]:
# OUTPUT: spreadsheet and worksheet that receive the labelling data.
SPREADSHEET_NAME = f"{TENANT_ID}_manual_data_labelling"
WORKSHEET_NAME = f"{OBJECT_TYPE}_to_label"

# Run parameters for this environment / tenant / run datetime.
cur_run_params = run_params.general.create(
    ARTIFACTS_ENVIRONMENT_KEY,
    TENANT_ID,
    RUN_DATETIME,
)

# Values handed to the artifact URL registry. The filters are coerced to int,
# presumably because injected parameters can arrive as strings.
url_config = dict(
    YEAR_FILTER=int(YEAR_FILTER),
    MONTH_FILTER=int(MONTH_FILTER),
    OBJECT_TYPE=OBJECT_TYPE,
)

# INPUT: folder URL for the features-for-tagging artifacts of the filtered batch.
TAGGING_RUNS_URL_FEATURES_FOR_TAGGING_FOLDER = cur_run_params.art_url_reg.get_url(
    "tagging_runs-features_for_tagging",
    url_config,
)
# Google Drive folder of the tenant, read from the tenant configuration.
TENANT_FOLDER_ID = cur_run_params.tenant_config.google_drive_folder_id

In [None]:
# Display params: one resolved value per line, for the run log.
displayed_params = (
    TAGGING_RUNS_URL_FEATURES_FOR_TAGGING_FOLDER,
    GOAL_NUM_ROWS,
    cur_run_params.run_dt.dt,
    cur_run_params.tenant_config,
    YEAR_FILTER,
    MONTH_FILTER,
    WORKSHEET_NAME,
)
print(*displayed_params, sep="\n")

In [None]:
# Client shared by the Google Sheets read and the final persist in this notebook.
google_client = artifacts.google_sheets.get_client()

In [None]:
# Load what is already in the labelling worksheet. The new rows are later
# de-duplicated against this frame and appended to it before persisting.
labeled_objects_df = artifacts.google_sheets.get(
    google_client, TENANT_FOLDER_ID, SPREADSHEET_NAME, WORKSHEET_NAME
)

In [None]:
# Show the rows currently in the labelling worksheet.
labeled_objects_df

In [None]:
# Row count of the existing worksheet data; subtracted from the goal later on.
num_already_labeled_rows = labeled_objects_df.shape[0]

In [None]:
# Candidate data for labelling, produced by normalise.merge from the
# features-for-tagging folder URL (presumably by reading and merging the
# artifacts under that folder -- confirm in phoenix.tag.normalise).
df = normalise.merge(TAGGING_RUNS_URL_FEATURES_FOR_TAGGING_FOLDER)

In [None]:
# Show the frame returned by normalise.merge.
df

In [None]:
# Candidates that can still be added to the worksheet: presumably the rows of
# `df` that do not duplicate rows already in `labeled_objects_df`. The matching
# criteria live in labelling_utils.filter_out_duplicates.
appendable_data_df = labelling_utils.filter_out_duplicates(labeled_objects_df, df)

In [None]:
# Show the candidates returned by filter_out_duplicates.
appendable_data_df

In [None]:
# Rows still needed to reach GOAL_NUM_ROWS given what the worksheet already holds.
# NOTE(review): this is zero or negative once the worksheet has GOAL_NUM_ROWS or
# more rows -- confirm that get_goal_number_rows handles n <= 0 as intended.
GOAL_NUM_NEW_ROWS = GOAL_NUM_ROWS - num_already_labeled_rows

In [None]:
# Show how many new rows will be requested.
GOAL_NUM_NEW_ROWS

In [None]:
# Ask for GOAL_NUM_NEW_ROWS rows from the candidates, using "object_user_url"
# as the stratification column. The helper returns two frames: the second is
# used below as the rows to add; the first is presumably the remainder
# (see generate_label_sheet.get_goal_number_rows).
# NOTE: this rebinds `df`, which until now held the normalise.merge result.
excluded_df, df = generate_label_sheet.get_goal_number_rows(
    appendable_data_df, stratify_col="object_user_url", n=GOAL_NUM_NEW_ROWS
)

In [None]:
# Show the first frame returned by get_goal_number_rows.
excluded_df

In [None]:
# Order the selected rows by their "created_at" column before building the
# labelling frame. NOTE: rebinds `df` again, so re-running this cell operates
# on the already-sorted frame.
df = df.sort_values("created_at")

In [None]:
# Show the selected rows after sorting by "created_at".
df

In [None]:
# User notes are requested only when every goal row is a new row, i.e. when
# GOAL_NUM_NEW_ROWS equals GOAL_NUM_ROWS because the worksheet held no rows.
include_user_notes = GOAL_NUM_ROWS == GOAL_NUM_NEW_ROWS
new_object_labelling_df = generate_label_sheet.create_object_labelling_df(
    df, with_user_notes=include_user_notes
)

In [None]:
# Stack the existing worksheet rows and the newly generated labelling rows,
# then blank out missing values so that empty strings are persisted.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; with default arguments (original indexes kept, no
# column sorting) it produces the same result as the former append call.
labelling_df_to_push = pd.concat([labeled_objects_df, new_object_labelling_df]).fillna("")


In [None]:
# Show the combined frame that is about to be persisted.
labelling_df_to_push

In [None]:
# Persist the combined frame (existing rows plus new rows) to the same
# spreadsheet and worksheet that the existing rows were read from.
artifacts.google_sheets.persist(
    google_client, TENANT_FOLDER_ID, SPREADSHEET_NAME, WORKSHEET_NAME, labelling_df_to_push
)