# Features
Compute the features for the objects in the `for_tagging` artifacts and persist the results.

In [None]:
# Enable IPython's autoreload extension in mode 2, so that edited modules are
# reloaded automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
import datetime

import pandas as pd
import tentaclio

from phoenix.common import artifacts
from phoenix.common import utils
from phoenix.tag import normalise
from phoenix.tag import feature
from phoenix.tag import feature_object_process
from phoenix.tag import export

In [None]:
# Configure the notebook's output and logging through the project helpers.
utils.setup_notebook_output()
utils.setup_notebook_logging()

In [None]:
# Run parameters.
# strftime pattern used to render the run date.
RUN_DATE_FORMAT = "%Y-%m-%d"
# Defaults to today's date; Papermill can override it at execution time to
# enable historic runs, backfills, etc.
RUN_DATE = datetime.datetime.today().strftime(RUN_DATE_FORMAT)

# TODO: refactor so that MONTH_PREFIX stays in sync with YEAR_FILTER and
# MONTH_FILTER.
# NOTE(review): the derived values below are computed in this same cell, so if
# Papermill injects its overrides after this cell they would still be built
# from the defaults above — confirm how the parameters are injected.
YEAR_FILTER = 2021
MONTH_FILTER = 5
MONTH_PREFIX = (
    f"year_filter={YEAR_FILTER}/"
    f"month_filter={MONTH_FILTER}/"
)

# Base URL under which this run's artefacts live.
ARTIFACTS_BASE_URL = (
    f"{artifacts.urls.get_local()}"
    f"{RUN_DATE}/facebook_posts/"
    f"{MONTH_PREFIX}"
)

# Input folder holding the data prepared for tagging.
FOR_TAGGING_ARTIFACTS_FOLDER = f"{ARTIFACTS_BASE_URL}for_tagging/"

In [None]:
# Show the resolved parameters, one per line.
print(
    ARTIFACTS_BASE_URL,
    FOR_TAGGING_ARTIFACTS_FOLDER,
    RUN_DATE,
    sep="\n",
)

In [None]:
# Set the DASK_CLUSTER_IP environment variable for this kernel.
# NOTE(review): the address is hard-coded to a local port; presumably it is
# read by utils.dask_global_init() and must match a running Dask scheduler —
# confirm before re-running.
%env DASK_CLUSTER_IP=tcp://127.0.0.1:40861

In [None]:
# Initialise Dask for this session through the project helper.
utils.dask_global_init()

In [None]:
# Build `all_objects` from the artifacts under the for_tagging folder.
all_objects = normalise.merge(FOR_TAGGING_ARTIFACTS_FOLDER)

In [None]:
# Preview the merged objects.
all_objects.head()

In [None]:
# Run the normalisation step over the merged objects.
all_objects_normalised = normalise.execute(all_objects)

In [None]:
# Preview the normalised objects.
all_objects_normalised.head()

In [None]:
# Language distribution of all normalised objects; the bare name on the last
# line displays the result as the cell output.
lang_dist_all = normalise.language_distribution(all_objects_normalised)
lang_dist_all

In [None]:
# Compute features from the normalised objects.
object_features_df = feature.features(all_objects_normalised)

In [None]:
# Expand the object features with feature.explode_features
# (presumably yields one row per feature — TODO confirm).
features_df = feature.explode_features(object_features_df)

In [None]:
# Preview the exploded features.
features_df.head()

In [None]:
# Derive the final objects, key objects and features from the normalised
# objects and the exploded features.
objects_final, key_objects_final, features_final = feature_object_process.finalise(
    all_objects_normalised,
    features_df,
)

In [None]:
# Preview the final objects.
objects_final.head()

In [None]:
# Preview the final key objects.
key_objects_final.head()

In [None]:
# Dimensions of the final key objects.
key_objects_final.shape

In [None]:
# Language distribution restricted to the key objects; the bare name on the
# last line displays the result as the cell output.
lang_dist_key = normalise.language_distribution(key_objects_final)
lang_dist_key

In [None]:
# Preview the final features.
features_final.head()

In [None]:
# Dimensions of the final features.
features_final.shape

In [None]:
# Sum of memory_usage() over features_final
# (bytes, assuming this is a pandas DataFrame — TODO confirm).
features_final.memory_usage().sum()

In [None]:
# Drop the intermediate results that are no longer needed, to reduce memory
# pressure before persisting.
del all_objects, all_objects_normalised, object_features_df, features_df

In [None]:
# Persist the export view of the key objects, then display the artifact's URL.
a = artifacts.dataframes.persist(
    artifacts.dataframes.url(ARTIFACTS_BASE_URL, "key_objects"),
    export.get_objects_for_export(key_objects_final),
)
a.url

In [None]:
# Persist the export view of all objects, then display the artifact's URL.
a = artifacts.dataframes.persist(
    artifacts.dataframes.url(ARTIFACTS_BASE_URL, "objects"),
    export.get_objects_for_export(objects_final),
)
a.url

In [None]:
# The object frames have been persisted; drop them to reduce memory pressure
# before persisting the features.
del key_objects_final, objects_final

In [None]:
# Persist the export view of all features; the returned artifact is discarded.
_ = artifacts.dataframes.persist(
    artifacts.dataframes.url(ARTIFACTS_BASE_URL, "all_features"),
    export.get_all_features_for_export(features_final),
)

In [None]:
# Export features for labeling with the third argument set to None
# (presumably meaning no object-type filter — TODO confirm).
export.features_for_labeling(ARTIFACTS_BASE_URL, features_final, None)

In [None]:
# Export features for labeling for "tweets"
# (presumably an object-type filter — TODO confirm).
# NOTE(review): ARTIFACTS_BASE_URL contains "facebook_posts" while this call
# exports "tweets" — confirm the intended output location.
export.features_for_labeling(ARTIFACTS_BASE_URL, features_final, "tweets")

In [None]:
# Export features for labeling for "facebook_posts"
# (presumably an object-type filter — TODO confirm).
export.features_for_labeling(ARTIFACTS_BASE_URL, features_final, "facebook_posts")

In [None]:
# Export features for labeling for "facebook_comments"
# (presumably an object-type filter — TODO confirm).
export.features_for_labeling(ARTIFACTS_BASE_URL, features_final, "facebook_comments")

In [None]:
# Export features for labeling for "key_tweets"
# (presumably an object-type filter — TODO confirm).
export.features_for_labeling(ARTIFACTS_BASE_URL, features_final, "key_tweets")

In [None]:
# Export features for labeling for "key_facebook_posts"
# (presumably an object-type filter — TODO confirm).
export.features_for_labeling(ARTIFACTS_BASE_URL, features_final, "key_facebook_posts")

In [None]:
# Export features for labeling for "key_facebook_comments"
# (presumably an object-type filter — TODO confirm).
export.features_for_labeling(ARTIFACTS_BASE_URL, features_final, "key_facebook_comments")