# Topics
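Compute the single-feature-match topics for one batch of objects (selected by year, month and object type), then persist the topics and objects-topics tables as dataframe artifacts and as CSV files.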

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import datetime

import pandas as pd
import tentaclio

from phoenix.common import artifacts, run_params
from phoenix.common import utils
from phoenix.tag import topic
from phoenix.tag.topic import single_feature_match as sfm
from phoenix.tag.topic import single_feature_match_topic_config as sfm_topic_config

In [None]:
# Configure notebook output and logging through the project's notebook helpers.
# NOTE(review): max_rows=200 presumably caps how many dataframe rows are displayed — confirm in phoenix.common.utils.
utils.setup_notebook_output(max_rows=200)
utils.setup_notebook_logging()

In [None]:
# Parameters
# RUN_DATETIME, TENANT_ID and ARTIFACTS_ENVIRONMENT_KEY are passed unchanged to
# run_params.general.create() in the next cell.
# See phoenix/common/run_datetime.py for the expected format of this parameter.
RUN_DATETIME = None

TENANT_ID = None

# See phoenix/common/artifacts/registry_environment.py for the expected format of this parameter.
ARTIFACTS_ENVIRONMENT_KEY = "local"


# Filters selecting the batch to process; they are forwarded (as ints) in the
# URL config used to resolve the tagging-run artifact URLs.
YEAR_FILTER = 2021
# Month number without zero padding (e.g. 1 for January).
MONTH_FILTER = 11
OBJECT_TYPE = "facebook_posts"

In [None]:
# Build the run parameters, then resolve every artifact URL this notebook reads or writes.
cur_run_params = run_params.general.create(ARTIFACTS_ENVIRONMENT_KEY, TENANT_ID, RUN_DATETIME)

# Config handed to each tagging-run URL lookup. The filters are coerced to int
# so that numeric strings are accepted as well as integers.
url_config = {
    "YEAR_FILTER": int(YEAR_FILTER),
    "MONTH_FILTER": int(MONTH_FILTER),
    "OBJECT_TYPE": OBJECT_TYPE,
}

# Bind the registry lookup once to keep the assignments below short.
_get_url = cur_run_params.art_url_reg.get_url

TAGGING_RUNS_URL_PIPELINE_BASE = _get_url("tagging_runs-pipeline_base", url_config)
TAGGING_RUNS_URL_OBJECTS = _get_url("tagging_runs-objects", url_config)
TAGGING_RUNS_URL_ALL_FEATURES = _get_url("tagging_runs-all_features", url_config)
TAGGING_RUNS_URL_SFLM_UNPROCESSED_FEATURES = _get_url(
    "tagging_runs-sflm_unprocessed_features", url_config
)
TAGGING_RUNS_URL_TOPICS = _get_url("tagging_runs-topics", url_config)
TAGGING_RUNS_URL_OBJECTS_TOPICS = _get_url("tagging_runs-objects_topics", url_config)
TAGGING_RUNS_URL_TOPICS_CSV = _get_url("tagging_runs-topics_csv", url_config)
TAGGING_RUNS_URL_OBJECTS_TOPICS_CSV = _get_url("tagging_runs-objects_topics_csv", url_config)
# The topic config URL is looked up without the batch URL config.
SFM_TOPIC_CONFIG_URL = _get_url("static-legacy-sfm-config")

In [None]:
# Display params.
# Display params: every resolved URL followed by the run datetime, one per line.
resolved_params = (
    TAGGING_RUNS_URL_PIPELINE_BASE,
    TAGGING_RUNS_URL_ALL_FEATURES,
    TAGGING_RUNS_URL_SFLM_UNPROCESSED_FEATURES,
    TAGGING_RUNS_URL_OBJECTS,
    TAGGING_RUNS_URL_TOPICS,
    TAGGING_RUNS_URL_OBJECTS_TOPICS,
    TAGGING_RUNS_URL_TOPICS_CSV,
    TAGGING_RUNS_URL_OBJECTS_TOPICS_CSV,
    SFM_TOPIC_CONFIG_URL,
    cur_run_params.run_dt.dt,
)
print(*resolved_params, sep="\n")

In [None]:
# Load the all-features dataframe from its tagging-run artifact URL.
all_features_df = artifacts.dataframes.get(TAGGING_RUNS_URL_ALL_FEATURES).dataframe

In [None]:
# Preview the first rows of the loaded features.
all_features_df.head()

In [None]:
# Load the SFLM unprocessed-features dataframe; it is the third input to sfm.get_topics.
sflm_unprocessed_features = artifacts.dataframes.get(TAGGING_RUNS_URL_SFLM_UNPROCESSED_FEATURES).dataframe

In [None]:
# Preview the first rows of the SFLM unprocessed features.
sflm_unprocessed_features.head()

In [None]:
# Load the objects dataframe for this batch and preview its first rows.
objects_df = artifacts.dataframes.get(TAGGING_RUNS_URL_OBJECTS).dataframe
objects_df.head()

In [None]:
# Load the single-feature-match topic config from SFM_TOPIC_CONFIG_URL.
topic_config_df = sfm_topic_config.get_topic_config(SFM_TOPIC_CONFIG_URL)

In [None]:
# Preview the first rows of the topic config.
topic_config_df.head()

In [None]:
# Compute the topics from the topic config and both feature dataframes.
topics_df = sfm.get_topics(topic_config_df, all_features_df, sflm_unprocessed_features)

In [None]:
# Preview the first rows of the computed topics.
topics_df.head()

In [None]:
# Run the single-feature-match analysis over the computed topics
# (see sfm.analyse for what it reports).
analysis_df = sfm.analyse(topics_df)

In [None]:
# Display the analysis result in full (no head()).
analysis_df

In [None]:
# Number of rows in topics_df.
len(topics_df)

In [None]:
# Build the objects-topics table from the topics and the objects dataframe.
objects_topics = topic.get_object_topics(topics_df, objects_df)

In [None]:
# Preview the first rows of the objects-topics table.
objects_topics.head()

In [None]:
# Number of rows in objects_topics.
len(objects_topics)

In [None]:
# Total of the "has_topics" column (assumes a boolean flag, so this is the
# number of flagged rows — TODO confirm).
# Uses the vectorised Series.sum() instead of the builtin sum(), which would
# iterate over the column element by element in Python.
objects_topics["has_topics"].sum()

In [None]:
# Persist objects_topics as a dataframe artifact and display the URL of the
# artifact that persist() returned.
a = artifacts.dataframes.persist(TAGGING_RUNS_URL_OBJECTS_TOPICS, objects_topics)
a.url

In [None]:
# Persist topics_df as a dataframe artifact and display the URL of the
# artifact that persist() returned.
a = artifacts.dataframes.persist(TAGGING_RUNS_URL_TOPICS, topics_df)
a.url

In [None]:
# Also export topics_df as CSV to TAGGING_RUNS_URL_TOPICS_CSV.
# to_csv is called with its defaults, so the dataframe index is written as well.
with tentaclio.open(TAGGING_RUNS_URL_TOPICS_CSV, "w") as fb:
    topics_df.to_csv(fb)

In [None]:
# Also export objects_topics as CSV to TAGGING_RUNS_URL_OBJECTS_TOPICS_CSV.
# to_csv is called with its defaults, so the dataframe index is written as well.
with tentaclio.open(TAGGING_RUNS_URL_OBJECTS_TOPICS_CSV, "w") as fb:
    objects_topics.to_csv(fb)