# Finalise twitter and facebook
Finalises the Twitter and Facebook data and persists it to S3.
See docs for more information.

In [None]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
import datetime

import pandas as pd
import tentaclio

from phoenix.common import artifacts
from phoenix.common import utils
from phoenix.tag import export
from phoenix.tag import finalise
from phoenix.tag import object_filters

In [None]:
# Apply the project's notebook output and logging setup
# (the exact settings are defined in phoenix.common.utils).
utils.setup_notebook_output()
utils.setup_notebook_logging()

In [None]:
# Run parameters.
# strftime format used to render RUN_DATE (year-month-day).
RUN_DATE_FORMAT = "%Y-%m-%d"
# Defaults to today's local date. Papermill can overwrite it at execution time
# to enable historic runs, backfills, etc.
RUN_DATE = datetime.datetime.today().strftime(RUN_DATE_FORMAT)

# Base URL for this run's artefacts: the value of artifacts.urls.get_local()
# followed by RUN_DATE and a trailing slash.
# NOTE(review): if Papermill injects RUN_DATE after this cell, this URL has
# already been built from the default date — confirm ARTIFACTS_BASE_URL is
# overridden as well when backfilling.
ARTIFACTS_BASE_URL = f"{artifacts.urls.get_local()}{RUN_DATE}/"

# OUTPUT
# S3 URLs that the final Facebook posts and tweets dataframes are persisted to.
# NOTE(review): these do not depend on RUN_DATE, so presumably every run
# writes to the same location — confirm this is intended.
FACEBOOK_POSTS_PERSIST = "s3://buildup-dev-us-tables/fb_posts/parquet_exports/fb_posts_may/persisted.parquet"
TWEETS_PERSIST = "s3://buildup-dev-us-tables/tweets/parquet_exports/tweets_may/persisted.parquet"

In [None]:
# Echo the run parameters, one per line, so they appear in the notebook output.
print(ARTIFACTS_BASE_URL, RUN_DATE, sep="\n")

In [None]:
# %env DASK_CLUSTER_IP=

In [None]:
# Initialise Dask for this session via the project helper.
# NOTE(review): presumably picks up DASK_CLUSTER_IP when it is set — confirm
# in phoenix.common.utils.
utils.dask_global_init()

In [None]:
# Load the "facebook_posts_pulled" dataframe artefact from this run's base URL.
facebook_posts_df = artifacts.dataframes.get(
    artifacts.dataframes.url(ARTIFACTS_BASE_URL, "facebook_posts_pulled")
).dataframe

In [None]:
# Preview the head of the loaded Facebook posts.
facebook_posts_df.head()

In [None]:
# Load the "tweets_pulled" dataframe artefact from this run's base URL.
tweets_df = artifacts.dataframes.get(
    artifacts.dataframes.url(ARTIFACTS_BASE_URL, "tweets_pulled")
).dataframe

In [None]:
# Preview the head of the loaded tweets.
tweets_df.head()

In [None]:
# Load the "objects" dataframe artefact from this run's base URL.
objects = artifacts.dataframes.get(
    artifacts.dataframes.url(ARTIFACTS_BASE_URL, "objects")
).dataframe

In [None]:
# Preview the head of the loaded objects.
objects.head()

In [None]:
# Build the final Facebook posts dataframe by joining the objects onto the
# pulled posts (see finalise.join_objects_to_facebook_posts).
facebook_posts_final = finalise.join_objects_to_facebook_posts(objects, facebook_posts_df)

In [None]:
# Preview the head of the final Facebook posts.
facebook_posts_final.head()

In [None]:
# Build the final tweets dataframe by joining the objects onto the pulled
# tweets (see finalise.join_objects_to_tweets).
tweets_final = finalise.join_objects_to_tweets(objects, tweets_df)

In [None]:
# Preview the head of the final tweets.
tweets_final.head()

In [None]:
# Filter the final Facebook posts down to the key objects, then derive the
# posts to scrape from that subset.
posts_to_scrape = export.get_posts_to_scrape(
    object_filters.get_key_objects(facebook_posts_final)
)

In [None]:
# Write the posts to scrape as "posts_to_scrape.csv" under this run's
# artefacts base URL. The target is opened through tentaclio by URL, and the
# context manager closes the stream once the CSV has been written.
with tentaclio.open(f"{ARTIFACTS_BASE_URL}posts_to_scrape.csv", "w") as fb:
    posts_to_scrape.to_csv(fb)

In [None]:
# Persist the final Facebook posts to the S3 URL in FACEBOOK_POSTS_PERSIST.
# The result is assigned to `_` so the cell displays nothing.
_ = artifacts.dataframes.persist(FACEBOOK_POSTS_PERSIST, facebook_posts_final)

In [None]:
# Also persist the final Facebook posts as the "facebook_posts_final"
# artefact of this run. The result is assigned to `_` so the cell displays nothing.
_ = artifacts.dataframes.persist(
    artifacts.dataframes.url(ARTIFACTS_BASE_URL, "facebook_posts_final"),
    facebook_posts_final,
)

In [None]:
# Persist the final tweets to the S3 URL in TWEETS_PERSIST.
# The result is assigned to `_` so the cell displays nothing.
_ = artifacts.dataframes.persist(TWEETS_PERSIST, tweets_final)

In [None]:
# Also persist the final tweets as the "tweets_final" artefact of this run.
# The result is assigned to `_` so the cell displays nothing.
_ = artifacts.dataframes.persist(
    artifacts.dataframes.url(ARTIFACTS_BASE_URL, "tweets_final"),
    tweets_final,
)