## Tags objects with tensions and writes back `object_tensions`

Reads `objects`, adds the columns below, and writes the result to `object_tensions.parquet`. Each of the following columns is of type boolean:

- `is_economic_labour_tension`,
- `is_sectarian_tension`,
- `is_environmental_tension`,
- `is_political_tension`,
- `is_service_related_tension`,
- `is_community_insecurity_tension`,
- `is_geopolitics_tension`,
- `is_intercommunity_relations_tension`
- `has_tension`

Currently uses the latest CountVectorizerTensionClassifier, which is trained to predict which tensions are present in objects. It only classifies the tensions it has been trained for; all other tension columns are False.

The `has_tension` column is True if any of the other tensions is True. 


In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import datetime
import pickle

import pandas as pd
import tentaclio

from phoenix.common import artifacts
from phoenix.common import utils
from phoenix.custom_models.tension_classifier.tension_classifier import CountVectorizerTensionClassifier
from phoenix.tag.tension import tag_tension

In [None]:
# Apply the project's shared notebook output and logging setup
# (behaviour is defined in phoenix.common.utils).
utils.setup_notebook_output()
utils.setup_notebook_logging()

In [None]:
# Run parameters. This cell holds everything a run may need to tune.

# Format of the run date.
RUN_DATE_FORMAT = "%Y-%m-%d"
# Run execution date. This can be overwritten at execution time by Papermill to enable
# historic runs and backfills etc.
RUN_DATE = datetime.datetime.today().strftime(RUN_DATE_FORMAT)

# TODO: this has to be refactored so that MONTH_PREFIX is relational to the YEAR_FILTER and MONTH_FILTER
YEAR_FILTER = 2021
MONTH_FILTER = 7
MONTH_PREFIX = f"year_filter={YEAR_FILTER}/month_filter={MONTH_FILTER}/"

# Set Artefacts URL
ARTIFACTS_BASE_URL = f"{artifacts.urls.get_local()}tagging_runs/{MONTH_PREFIX}facebook_posts/"

# Model location. MODEL_BASE_URL is the *directory* for the run date; the model file name
# (derived from MODEL_SUFFIX) and the ".pickle" extension are appended when the model is loaded.
# An earlier assignment that pointed MODEL_BASE_URL at a ".sav" file was always overwritten
# by this one and has been removed.
MODEL_SUFFIX = ""
MODEL_BASE_URL = f"{artifacts.urls.get_local()}{RUN_DATE}/"

# Input
OBJECTS_PATH = artifacts.dataframes.url(ARTIFACTS_BASE_URL, "objects")
TOPICS_PATH = artifacts.dataframes.url(ARTIFACTS_BASE_URL, "topics")

# Output
TAGGED_OBJECTS_PATH = artifacts.dataframes.url(ARTIFACTS_BASE_URL, "object_tensions")

In [None]:
# Show the resolved run parameters, one per line.
run_params = (
    ARTIFACTS_BASE_URL,
    OBJECTS_PATH,
    TOPICS_PATH,
    TAGGED_OBJECTS_PATH,
    RUN_DATE,
)
print(*run_params, sep="\n")

In [None]:
# Load the objects artifact from OBJECTS_PATH and keep its dataframe.
objects_df = artifacts.dataframes.get(OBJECTS_PATH).dataframe

In [None]:
# Display the loaded objects for a quick visual check.
objects_df

In [None]:
# will later use this, but not yet
# topics_df = artifacts.dataframes.get(TOPICS_PATH)

In [None]:
# MODEL_SUFFIX can later be used to distinguish between models; for now it is left
# empty and we use the latest model.
model_filename = CountVectorizerTensionClassifier.get_model_name(MODEL_SUFFIX)

In [None]:
# Location of the pickled classifier: MODEL_BASE_URL + model name + ".pickle".
filepath_model = f"{MODEL_BASE_URL}{model_filename}.pickle"

In [None]:
# Load the trained classifier from filepath_model.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so filepath_model
# must only ever point at model files produced by our own training runs.
with tentaclio.open(filepath_model, "rb") as f:
    tension_classifier = pickle.load(f)

In [None]:
# Show the tension labels the loaded classifier exposes via `class_labels`.
print(f"We will classify the following tensions: {tension_classifier.class_labels}")

In [None]:
# Run the classifier over the "clean_text" column of the objects.
# NOTE(review): presumably this returns the frame with the tension columns added — confirm
# against CountVectorizerTensionClassifier.predict. Because objects_df is rebound here,
# re-running this cell feeds an already-predicted frame back into predict().
objects_df = tension_classifier.predict(objects_df, "clean_text")

In [None]:
# Tension flag columns, as listed in the notebook header. `has_tension` is excluded
# because it is derived from these flags in a later step.
# Defined here because the name was previously never assigned anywhere in the notebook,
# so this cell raised NameError on a fresh kernel.
# NOTE(review): assumes predict() adds every one of these columns (the header says
# untrained tensions are present and False) — confirm against the classifier.
tensions_cols = [
    "is_economic_labour_tension",
    "is_sectarian_tension",
    "is_environmental_tension",
    "is_political_tension",
    "is_service_related_tension",
    "is_community_insecurity_tension",
    "is_geopolitics_tension",
    "is_intercommunity_relations_tension",
]

# Cast the flags to bool so the written columns match the documented boolean type.
objects_df[tensions_cols] = objects_df[tensions_cols].astype(bool)

In [None]:
# Add the `has_tension` column (per the notebook header: True if any tension flag is True).
# NOTE(review): the actual logic lives in tag_tension.tag_object_has_tension — confirm.
objects_df = tag_tension.tag_object_has_tension(objects_df)

In [None]:
# Write the tagged objects to TAGGED_OBJECTS_PATH. The return value is bound to `a`
# and is not used again in the visible cells.
a = artifacts.dataframes.persist(TAGGED_OBJECTS_PATH, objects_df)