## Tags objects with tensions and writes back tagged_objects_tensions

Reads objects, adds the columns below, and writes the result back to `object_tensions.parquet`. Each of the following columns is of type boolean:

- `is_economic_labour_tension`
- `is_sectarian_tension`
- `is_environmental_tension`
- `is_political_tension`
- `is_service_related_tension`
- `is_community_insecurity_tension`
- `is_geopolitics_tension`
- `is_intercommunity_relations_tension`
- `has_tension`

Currently uses the latest CountVectorizerTensionClassifier, trained to predict which tensions are present in objects. It only classifies the tensions it has been trained for; all other tension columns are set to False.

The `has_tension` column is True if any of the other tensions is True. 


In [None]:
# Enable IPython's autoreload extension; mode 2 reloads all modules
# before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
import datetime
import pickle

import pandas as pd
import tentaclio

from phoenix.common import artifacts, run_datetime
from phoenix.common import utils
from phoenix.custom_models.tension_classifier.tension_classifier import CountVectorizerTensionClassifier
from phoenix.tag.tension import tag_tension

In [None]:
# Apply the shared phoenix notebook setup for output and logging
# (see phoenix/common/utils for what each helper configures).
utils.setup_notebook_output()
utils.setup_notebook_logging()

In [None]:
# Parameters
# Run datetime as a string; when left as None the run is stamped with the
# current time. See phoenix/common/run_datetime.py for the expected format.
RUN_DATETIME = None

# Environment key handed to the artifact URL registry.
# See phoenix/common/artifacts/registry_environment.py for the expected format.
ARTIFACTS_ENVIRONMENT_KEY = "local"

# Filters for batch; these are passed to the registry when resolving URLs.
YEAR_FILTER = 2021
# Without zero padding
MONTH_FILTER = 7
OBJECT_TYPE = "facebook_posts"

# Model URLs
# Suffix passed to CountVectorizerTensionClassifier.get_model_url.
TENSIONS_CLASSIFIER_SUFFIX = ""
# Base URL of the tension classifier; when left empty/None it is resolved
# from the artifact URL registry.
STATIC_URL_CUSTOM_MODELS_TENSION_CLASSIFIER_BASE = None

In [None]:
# Use the supplied run datetime when one is given; otherwise stamp the run
# with the current time.
run_dt = (
    run_datetime.from_file_safe_str(RUN_DATETIME)
    if RUN_DATETIME
    else run_datetime.create_run_datetime_now()
)

# Batch filters handed to the registry each time a URL is resolved.
url_config = dict(
    YEAR_FILTER=int(YEAR_FILTER),
    MONTH_FILTER=int(MONTH_FILTER),
    OBJECT_TYPE=OBJECT_TYPE,
)
art_url_reg = artifacts.registry.ArtifactURLRegistry(
    run_dt,
    ARTIFACTS_ENVIRONMENT_KEY,
    artifacts.registry_mappers.get_default_mappers(),
)

# Resolve the tagging-run URLs in one pass; the key order matches the
# order of the names on the left-hand side.
(
    TAGGING_RUNS_URL_PIPELINE_BASE,
    TAGGING_RUNS_URL_OBJECTS,
    TAGGING_RUNS_URL_TOPICS,
    TAGGING_RUNS_URL_OBJECTS_TENSIONS,
) = [
    art_url_reg.get_url(artifact_key, url_config)
    for artifact_key in (
        "tagging_runs-pipeline_base",
        "tagging_runs-objects",
        "tagging_runs-topics",
        "tagging_runs-objects_tensions",
    )
]

# A classifier base URL supplied as a parameter wins; only an empty/None
# value is looked up in the registry.
STATIC_URL_CUSTOM_MODELS_TENSION_CLASSIFIER_BASE = (
    STATIC_URL_CUSTOM_MODELS_TENSION_CLASSIFIER_BASE
    or art_url_reg.get_url("static-custom_models_tension_classifier_base", url_config)
)

In [None]:
# Show the resolved URLs and the run datetime, one value per line.
print(
    "\n".join(
        str(resolved_value)
        for resolved_value in (
            TAGGING_RUNS_URL_PIPELINE_BASE,
            TAGGING_RUNS_URL_OBJECTS,
            TAGGING_RUNS_URL_TOPICS,
            TAGGING_RUNS_URL_OBJECTS_TENSIONS,
            STATIC_URL_CUSTOM_MODELS_TENSION_CLASSIFIER_BASE,
            run_dt.dt,
        )
    )
)

In [None]:
# Load the objects dataframe for this batch from its artifact URL.
objects_df = artifacts.dataframes.get(TAGGING_RUNS_URL_OBJECTS).dataframe

In [None]:
# Load the topics dataframe for the same batch from its artifact URL.
topics_df = artifacts.dataframes.get(TAGGING_RUNS_URL_TOPICS).dataframe

In [None]:
# NOTE(review): presumably collapses the topics to one row per object_id
# (topics gathered into a list) so the later merge on object_id does not
# duplicate objects -- confirm in phoenix.tag.tension.tag_tension.
topics_df = tag_tension.agg_list_topics(topics_df)

In [None]:
# Display the aggregated topics for visual inspection.
topics_df

In [None]:
# Left join on object_id keeps every object; objects with no matching row
# in topics_df get missing values (NaN) in the columns coming from topics_df.
objects_df = pd.merge(objects_df, topics_df, on="object_id", how="left")

In [None]:
# Build the model URL from the configured base and suffix, then load the
# classifier stored there.
tension_classifier_url = CountVectorizerTensionClassifier.get_model_url(
    STATIC_URL_CUSTOM_MODELS_TENSION_CLASSIFIER_BASE,
    TENSIONS_CLASSIFIER_SUFFIX,
)
tension_classifier = CountVectorizerTensionClassifier.get_model(tension_classifier_url)

In [None]:
# Report the tension labels exposed by the loaded classifier.
print(f"We will classify the following tensions: {tension_classifier.class_labels}")

In [None]:
# Run the classifier over the objects; the returned dataframe replaces
# objects_df. NOTE(review): presumably adds one boolean column per trained
# tension -- confirm against CountVectorizerTensionClassifier.predict.
objects_df = tension_classifier.predict(objects_df)

In [None]:
# NOTE(review): presumably fills the tension columns the classifier was not
# trained for with False and derives `has_tension`, as the notebook header
# describes -- confirm in phoenix.tag.tension.tag_tension.normalise.
objects_df = tag_tension.normalise(objects_df)

In [None]:
# Persist the tagged objects to the objects_tensions artifact URL, then
# show the URL of the persisted artifact as the cell output.
a = artifacts.dataframes.persist(TAGGING_RUNS_URL_OBJECTS_TENSIONS, objects_df)
a.url