# May tagging
This notebook does the tagging for May.

In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to imported project code are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
import datetime

import pandas as pd
import tentaclio

from phoenix.common import artifacts
from phoenix.common import utils
from phoenix.tag import normalise
from phoenix.tag import feature
from phoenix.tag import data_pull

In [None]:
# Apply the project's notebook output and logging setup helpers.
# The exact settings live in phoenix.common.utils and are not visible here.
utils.setup_notebook_output()
utils.setup_notebook_logging()

In [None]:
# Run parameters for this notebook.
# NOTE(review): Papermill injects overrides in a new cell placed after the cell
# tagged "parameters". If this is that cell, overriding RUN_DATE alone leaves
# ARTIFACTS_BASE_URL on the default date -- confirm how backfills set it.

# strftime pattern used for the run date (year-month-day).
RUN_DATE_FORMAT = "%Y-%m-%d"
# Defaults to the current local date; Papermill can override it at execution
# time to enable historic runs and backfills.
RUN_DATE = datetime.datetime.now().strftime(RUN_DATE_FORMAT)

# Folder holding the input CSVs read by this notebook.
INPUT_FOLDER_CSV = f"{artifacts.urls.get_local()}input_csvs/facebook-ct-may/"

# Root URL for this run's artefacts: the local artefacts URL plus the run date.
ARTIFACTS_BASE_URL = f"{artifacts.urls.get_local()}{RUN_DATE}/"

In [None]:
# Echo the resolved run parameters, one per line.
print(f"{ARTIFACTS_BASE_URL}\n{INPUT_FOLDER_CSV}\n{RUN_DATE}")

In [None]:
# %env DASK_CLUSTER_IP=tcp://127.0.0.1:45143

In [None]:
# Initialise Dask through the project helper.
# NOTE(review): presumably honours DASK_CLUSTER_IP (see the commented-out %env
# cell above) -- confirm in phoenix.common.utils.
utils.dask_global_init()

In [None]:
# Load the posts from the CSV files under INPUT_FOLDER_CSV into one dataframe.
# Presumably CrowdTangle exports, going by the helper name -- confirm.
posts_df = data_pull.crowdtangle_csvs(INPUT_FOLDER_CSV)

In [None]:
# Preview the first rows of the pulled posts.
posts_df.head()

In [None]:
# Save the pulled posts under the "normalised_posts" artefact name for this run.
# NOTE(review): this runs before normalise.execute() in a later cell, so the
# persisted frame is the data_pull output -- confirm that is the intended content.
artifacts.dataframes.persist(
    artifacts.dataframes.url(ARTIFACTS_BASE_URL, "normalised_posts"),
    posts_df,
)

In [None]:
# Show the dimensions of the pulled posts dataframe.
posts_df.shape

In [None]:
# Run the normalisation step; posts_df is rebound to its result.
posts_df = normalise.execute(posts_df)

In [None]:
# Language distribution across all normalised posts.
lang_dist = normalise.language_distribution(posts_df)
# Bare name so the notebook renders the result as the cell output.
lang_dist

In [None]:
# Compute features for the normalised posts.
posts_features_df = feature.features(posts_df)

In [None]:
# Explode the computed features.
# Presumably yields one row per (post, feature) -- confirm in phoenix.tag.feature.
posts_features = feature.explode_features(posts_features_df)

In [None]:
# Show the dimensions of the exploded features dataframe.
posts_features.shape

In [None]:
# Select the key posts from the exploded features.
# The selection criteria live in phoenix.tag.feature and are not visible here.
key_posts = feature.get_key_posts(posts_features)

In [None]:
# Language distribution restricted to the key posts.
# This rebinds lang_dist, replacing the all-posts distribution computed earlier.
lang_dist = normalise.language_distribution(key_posts)
# Bare name so the notebook renders the result as the cell output.
lang_dist

In [None]:
# Derive the set of features to be labelled from the exploded features.
features_to_label = feature.get_features_to_label(posts_features)

In [None]:
# Preview the first rows of the features selected for labelling.
features_to_label.head()

In [None]:
# Persist a fixed subset of the exploded-feature columns as this run's
# "posts_features" artefact.
artifacts.dataframes.persist(
    artifacts.dataframes.url(ARTIFACTS_BASE_URL, "posts_features"),
    posts_features[["index", "facebook_id", "features", "features_count", "language"]],
)

In [None]:
# Write the features selected for labelling to features_to_label.csv under this
# run's artefacts URL. The target is opened for writing ("w") through tentaclio
# and the dataframe is streamed into the handle as CSV.
with tentaclio.open(ARTIFACTS_BASE_URL + "features_to_label.csv", "w") as fb:
    features_to_label.to_csv(fb)