# May tagging
This notebook does the tagging for May.

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell is executed, so edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import datetime

import pandas as pd
import tentaclio

from phoenix.common import artifacts
from phoenix.common import utils
from phoenix.tag import normalise
from phoenix.tag import feature
from phoenix.tag import data_pull
from phoenix.tag import export

In [None]:
# Configure notebook output and logging through the project helpers in
# phoenix.common.utils.
utils.setup_notebook_output()
utils.setup_notebook_logging()

In [None]:
# Parametrise the run execution date.
# Format of the run date (strftime pattern: year-month-day).
RUN_DATE_FORMAT = "%Y-%m-%d"
# Defaults to today's date (naive local time).
# This can be overwritten at execution time by Papermill to enable historic runs and backfills etc.
# NOTE(review): ARTIFACTS_BASE_URL below is derived from RUN_DATE inside this same cell.
# Papermill injects overrides in a new cell placed after the parameters cell, so an
# overridden RUN_DATE would presumably not change ARTIFACTS_BASE_URL unless that is
# overridden too — confirm how this notebook is executed.
RUN_DATE = datetime.datetime.today().strftime(RUN_DATE_FORMAT)

# Set artifacts URL: a RUN_DATE-named folder under the base returned by artifacts.urls.get_local().
ARTIFACTS_BASE_URL = f"{artifacts.urls.get_local()}{RUN_DATE}/"

# Input folders: Facebook CSV exports and Twitter JSON files for May.
FB_POSTS_INPUT_FOLDER_CSV = f"{artifacts.urls.get_local()}input_csvs/facebook-ct-may/"
TWEETS_INPUT_FOLDER_JSON = f"{artifacts.urls.get_local()}input_csvs/twitter-may/"

In [None]:
# Show the run parameters, one value per line.
print(
    "\n".join(
        map(
            str,
            (
                ARTIFACTS_BASE_URL,
                FB_POSTS_INPUT_FOLDER_CSV,
                TWEETS_INPUT_FOLDER_JSON,
                RUN_DATE,
            ),
        )
    )
)

In [None]:
# %env DASK_CLUSTER_IP=tcp://127.0.0.1:45143

In [None]:
# Initialise Dask for this session via the project helper.
# NOTE(review): presumably honours DASK_CLUSTER_IP when the %env line in the
# previous cell is enabled — confirm in phoenix.common.utils.
utils.dask_global_init()

## Facebook Posts

In [None]:
# Load the Facebook posts from the CSV files in the input folder.
posts_df = data_pull.crowdtangle_csvs(FB_POSTS_INPUT_FOLDER_CSV)

In [None]:
# Preview the first rows of the loaded posts.
posts_df.head()

In [None]:
# Persist the loaded posts under the "normalised_posts" artifact name.
# NOTE(review): this runs before normalise.execute(posts_df) further down, so the
# persisted frame is the direct output of data_pull.crowdtangle_csvs — confirm
# that this is the intended content for an artifact called "normalised_posts".
artifacts.dataframes.persist(
    artifacts.dataframes.url(ARTIFACTS_BASE_URL, "normalised_posts"),
    posts_df,
)

In [None]:
# Show the dimensions of the loaded posts.
posts_df.shape

In [None]:
# Run the normalisation step; the result replaces posts_df.
posts_df = normalise.execute(posts_df)

In [None]:
# Compute and display the language distribution of the normalised posts.
lang_dist = normalise.language_distribution(posts_df)
lang_dist

In [None]:
# Compute the features for the normalised posts.
posts_features_df = feature.features(posts_df)

In [None]:
# Explode the computed features (presumably to one row per feature — confirm
# against feature.explode_features).
posts_features = feature.explode_features(posts_features_df)

In [None]:
# Show the dimensions of the exploded post features.
posts_features.shape

In [None]:
# Split out the key posts. The second result is the feature frame that is later
# joined with the tweets' counterpart (presumably carrying the "has_key_feature"
# flag filtered on in the join section — confirm against feature.get_key_items).
key_posts, posts_features_has = feature.get_key_items(posts_features)

In [None]:
# Preview the first rows of the key posts.
key_posts.head()

In [None]:
# Derive the posts to scrape from the key posts and show the result's dimensions.
posts_to_scrape = export.get_posts_to_scrape(key_posts)
posts_to_scrape.shape

In [None]:
# Preview the first rows of the posts to scrape.
posts_to_scrape.head()

In [None]:
# Write the posts to scrape as a CSV into this run's artifact folder.
with tentaclio.open(f"{ARTIFACTS_BASE_URL}posts_to_scrape.csv", "w") as fb:
    posts_to_scrape.to_csv(fb)

In [None]:
# Compute and display the language distribution of the key posts only.
# This rebinds lang_dist, replacing the all-posts distribution from earlier.
lang_dist = normalise.language_distribution(key_posts)
lang_dist

In [None]:
# Persist a fixed subset of the exploded post feature columns as the
# "posts_features" artifact for this run.
artifacts.dataframes.persist(
    artifacts.dataframes.url(ARTIFACTS_BASE_URL, "posts_features"),
    posts_features[
        ["index", "facebook_id", "features", "features_count", "language"]
    ],
)

## Tweets

In [None]:
# Load the tweets from the JSON files in the input folder.
tweets_df = data_pull.twitter_json(TWEETS_INPUT_FOLDER_JSON)

In [None]:
# Preview the first rows of the loaded tweets.
tweets_df.head()

In [None]:
# Run the normalisation step; the result replaces tweets_df.
# NOTE(review): "full_text" is presumably the name of the tweet text column —
# confirm against the signature of normalise.execute.
tweets_df = normalise.execute(tweets_df, "full_text")

In [None]:
# Show the dimensions of the normalised tweets.
tweets_df.shape

In [None]:
# Compute and display the language distribution of the normalised tweets.
# This rebinds lang_dist, which previously held the posts' distribution.
lang_dist = normalise.language_distribution(tweets_df)
lang_dist

In [None]:
# Compute the features for the normalised tweets.
tweets_features_df = feature.features(tweets_df)

In [None]:
# Explode the computed features (presumably to one row per feature — confirm
# against feature.explode_features).
tweets_features = feature.explode_features(tweets_features_df)

In [None]:
# Split out the key tweets. The second result is the feature frame that is later
# joined with the posts' counterpart (presumably carrying the "has_key_feature"
# flag filtered on in the join section — confirm against feature.get_key_items).
key_tweets, tweets_features_has = feature.get_key_items(tweets_features)

In [None]:
# Show the dimensions of the key tweets.
key_tweets.shape

In [None]:
# Compute and display the language distribution of the key tweets only.
lang_dist = normalise.language_distribution(key_tweets)
lang_dist

In [None]:
# Persist a fixed subset of the exploded tweet feature columns as the
# "tweets_features" artifact for this run.
artifacts.dataframes.persist(
    artifacts.dataframes.url(ARTIFACTS_BASE_URL, "tweets_features"),
    tweets_features[
        ["index", "id_str", "features", "features_count", "language"]
    ],
)

## Join (all)

In [None]:
# Join the post and tweet feature frames returned by feature.get_key_items
# into a single frame; later cells filter it on "object_type" and
# "has_key_feature".
all_features = normalise.join_fb_posts_tweets(posts_features_has, tweets_features_has)

In [None]:
# Preview the first rows of the joined features.
all_features.head()

In [None]:
# Show the dimensions of the joined features.
all_features.shape

In [None]:
# Derive the features to label from the full joined set (posts and tweets).
features_to_label = feature.get_features_to_label(all_features)

In [None]:
# Preview the first rows of the features to label.
features_to_label.head()

In [None]:
# Show the dimensions of the features to label.
features_to_label.shape

In [None]:
# Write the features to label (posts and tweets combined) as a CSV into this
# run's artifact folder.
with tentaclio.open(f"{ARTIFACTS_BASE_URL}all_features_to_label.csv", "w") as fb:
    features_to_label.to_csv(fb)

In [None]:
# Features to label computed from the tweet rows only, exported as CSV.
tweets_features_to_label = feature.get_features_to_label(
    all_features[all_features["object_type"] == "tweets"]
)
with tentaclio.open(f"{ARTIFACTS_BASE_URL}tweets_features_to_label.csv", "w") as fb:
    tweets_features_to_label.to_csv(fb)

In [None]:
# Features to label computed from the Facebook post rows only, exported as CSV.
posts_features_to_label = feature.get_features_to_label(
    all_features[all_features["object_type"] == "facebook_posts"]
)
with tentaclio.open(f"{ARTIFACTS_BASE_URL}posts_features_to_label.csv", "w") as fb:
    posts_features_to_label.to_csv(fb)

In [None]:
# Features to label computed from tweet rows flagged with a key feature,
# exported as CSV.
key_tweets_features_to_label = feature.get_features_to_label(
    all_features[
        (all_features["object_type"] == "tweets")
        # Element-wise comparison that builds a row mask; kept as `== True`
        # on purpose because the column's dtype is not visible here.
        & (all_features["has_key_feature"] == True)  # noqa: E712
    ]
)
with tentaclio.open(f"{ARTIFACTS_BASE_URL}key_tweets_features_to_label.csv", "w") as fb:
    key_tweets_features_to_label.to_csv(fb)

In [None]:
# Features to label computed from Facebook post rows flagged with a key
# feature, exported as CSV.
key_posts_features_to_label = feature.get_features_to_label(
    all_features[
        (all_features["object_type"] == "facebook_posts")
        # Element-wise comparison that builds a row mask; kept as `== True`
        # on purpose because the column's dtype is not visible here.
        & (all_features["has_key_feature"] == True)  # noqa: E712
    ]
)
with tentaclio.open(f"{ARTIFACTS_BASE_URL}key_posts_features_to_label.csv", "w") as fb:
    key_posts_features_to_label.to_csv(fb)

In [None]:
# Write the key posts as a CSV into this run's artifact folder.
with tentaclio.open(f"{ARTIFACTS_BASE_URL}posts_with_key_features.csv", "w") as fb:
    key_posts.to_csv(fb)

In [None]:
# Write the key tweets as a CSV into this run's artifact folder.
with tentaclio.open(f"{ARTIFACTS_BASE_URL}tweets_with_key_features.csv", "w") as fb:
    key_tweets.to_csv(fb)