# Group messages through Latent Dirichlet Allocation
### Script that will group objects using LDA, visualise groups based on word clouds and tag the objects with the LDA grouping.

## Expected message input:

| Property | Data Type | Description |
| :------- | :-------- | :---------- |
| objectId | string | Id of the tweet, post or comment |
| clean_text  | string | Message data to be analysed |
| {grouping_column} | string | Optional column name used to run separate LDAs per group |

## Expected grouping output:
| Property | Data Type | Description |
| :------- | :-------- | :---------- |
| objectId | string | Id of the tweet, post or comment |
| clean_text  | string | Message data to be analysed |
| {grouping_column} | string | Optional column name used to run separate LDAs per group |
| lda_name | string | Name of the group the LDA was run for. Uses the values in {grouping_column} when a grouping_column is given; defaults to the group name 'all' otherwise |
| lda_cloud | int | cloud number which object belongs to (default 1-10) |
| lda_cloud_confidence | float | confidence that object belongs to the lda_cloud group (0.0-1.0) |


### The LatentDirichletAllocator will also be saved as a pickle file. To load it, uncomment the last cell and run it.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import datetime

import pandas as pd
import tentaclio

from phoenix.common import artifacts, run_params
from phoenix.common import utils
from phoenix.tag import clustering
from phoenix.tag.topic import single_feature_match

In [None]:
# Configure notebook output and logging using the project's shared helpers.
utils.setup_notebook_output()
utils.setup_notebook_logging()

In [None]:
# Parameters
# See phoenix/common/run_datetime.py for the expected format of this parameter.
RUN_DATETIME = None

# Tenant identifier handed to run_params.general.create together with the
# environment key and run datetime.
TENANT_ID = None

# See phoenix/common/artifacts/registry_environment.py for the expected format of this parameter.
ARTIFACTS_ENVIRONMENT_KEY = "local"

# Name of the grouping in the dataframe; passed as grouping_type when applying
# the grouping to the objects and as grouping_column to the LatentDirichletAllocator.
GROUPING_TYPE = "topic"
# Exclude the fill topic: the number of objects carrying the fill topic is
# usually large and degrades the performance of the notebook beyond use.
EXCLUDED_GROUPS = [single_feature_match.FILL_TOPIC]

# Filters selecting the batch; they are passed to the artifact URL lookups.
YEAR_FILTER = 2022
# Month number without zero padding.
MONTH_FILTER = 1
OBJECT_TYPE = "facebook_posts"

In [None]:
# Build the run parameters from the environment key, tenant and run datetime.
cur_run_params = run_params.general.create(
    ARTIFACTS_ENVIRONMENT_KEY,
    TENANT_ID,
    RUN_DATETIME,
)

# Batch filters handed to every artifact URL lookup below; year and month are
# coerced to int in case they were supplied as strings.
url_config = dict(
    YEAR_FILTER=int(YEAR_FILTER),
    MONTH_FILTER=int(MONTH_FILTER),
    OBJECT_TYPE=OBJECT_TYPE,
)

# Input artifacts.
TAGGING_RUNS_URL_PIPELINE_BASE = cur_run_params.art_url_reg.get_url(
    "tagging_runs-pipeline_base", url_config
)
TAGGING_RUNS_URL_OBJECTS = cur_run_params.art_url_reg.get_url(
    "tagging_runs-objects", url_config
)
TAGGING_RUNS_URL_TOPICS = cur_run_params.art_url_reg.get_url(
    "tagging_runs-topics", url_config
)
TAGGING_RUNS_URL_OBJECTS_TOPICS = cur_run_params.art_url_reg.get_url(
    "tagging_runs-objects_topics", url_config
)
# Clustering artifacts (plots, persisted results/model and dashboard data).
TAGGING_RUNS_URL_CLUSTERING = cur_run_params.art_url_reg.get_url(
    "tagging_runs-clustering", url_config
)
TAGGING_RUNS_URL_CLUSTERING_DASHBOARD = cur_run_params.art_url_reg.get_url(
    "tagging_runs-clustering_dashboard", url_config
)

In [None]:
# Display params.
# Every resolved artifact URL is printed, including the objects_topics and
# clustering dashboard URLs, together with the grouping settings, so the
# executed notebook records all of its inputs and output destinations.
print(
    TAGGING_RUNS_URL_PIPELINE_BASE,
    TAGGING_RUNS_URL_OBJECTS,
    TAGGING_RUNS_URL_TOPICS,
    TAGGING_RUNS_URL_OBJECTS_TOPICS,
    TAGGING_RUNS_URL_CLUSTERING,
    TAGGING_RUNS_URL_CLUSTERING_DASHBOARD,
    GROUPING_TYPE,
    EXCLUDED_GROUPS,
    cur_run_params,
    sep="\n",
)

In [None]:
# Load the objects dataframe from the tagging-runs objects artifact URL.
object_df = artifacts.dataframes.get(TAGGING_RUNS_URL_OBJECTS).dataframe

In [None]:
# Preview the first rows of the loaded objects.
object_df.head()

In [None]:
# Apply the GROUPING_TYPE grouping to the objects, passing the topics artifact
# URL and the groups to exclude; the result replaces object_df.
# NOTE(review): how objects are matched to groups and how exclusions are
# applied is defined in clustering.utils.apply_grouping_to_objects — confirm there.
object_df = clustering.utils.apply_grouping_to_objects(
    grouping_type=GROUPING_TYPE,
    object_df=object_df,
    topic_df_url=TAGGING_RUNS_URL_TOPICS,
    exclude_groupings=EXCLUDED_GROUPS,
)

In [None]:
# Show the dimensions of object_df after the grouping step.
object_df.shape

In [None]:
# Constructing the allocator immediately fits a StemmedCountVectorizer,
# so this cell might take a while to complete.
lda = clustering.latent_dirichlet_allocation.LatentDirichletAllocator(
    object_df,
    grouping_column=GROUPING_TYPE,
)

In [None]:
# Print the (key, value) pairs held in lda.dfs.
# NOTE(review): presumably one dataframe per group name — confirm in LatentDirichletAllocator.
print(lda.dfs.items())

In [None]:
# Display the vectorizers held by the allocator.
lda.vectorizers

In [None]:
# This will train the Latent Dirichlet Allocation model and use GridSearch
# to find the best hyperparameters. This will take quite a while to complete.
lda.train()

In [None]:
# Save the allocator's plot output to the clustering artifact URL.
# NOTE(review): presumably these are the word clouds mentioned in the notebook header — confirm.
lda.save_plot(TAGGING_RUNS_URL_CLUSTERING)

In [None]:
# Save the allocator's data for the dashboard to the clustering dashboard artifact URL.
clustering.utils.save_for_dashboard(lda, TAGGING_RUNS_URL_CLUSTERING_DASHBOARD)

In [None]:
# Tag the objects with their LDA grouping.
# NOTE(review): the notebook header lists lda_name, lda_cloud and lda_cloud_confidence
# as output columns; presumably they are added here — confirm.
lda.tag_dataframe()

In [None]:
# Persist the allocator's results to the clustering artifact URL.
# NOTE(review): presumably this writes the tagged dataframe(s) — confirm in LatentDirichletAllocator.persist.
lda.persist(TAGGING_RUNS_URL_CLUSTERING)

In [None]:
# Persist the allocator itself to the clustering artifact URL; per the notebook
# text it is saved as a pickle that the last cell can load back.
lda.persist_model(TAGGING_RUNS_URL_CLUSTERING)

### The LatentDirichletAllocator will also be saved as a pickle file. To load it, uncomment the last cell and run it.

In [None]:
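# Only unpickle model files that this pipeline wrote itself: pickle.load can
# execute arbitrary code from an untrusted file.
# NOTE(review): the model is persisted with TAGGING_RUNS_URL_CLUSTERING above;
# confirm the exact file name and path used by persist_model before running.
# import pickle
# with tentaclio.open(f"{TAGGING_RUNS_URL_CLUSTERING}latent_dirichlet_allocator_model.sav", 'rb') as f:
#     lda_loaded = pickle.load(f)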