In [None]:
# Enable IPython's autoreload extension in mode 2, so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import datetime
import pickle

import arabic_reshaper
from bidi.algorithm import get_display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from snowballstemmer import stemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import LatentDirichletAllocation
import tentaclio

from phoenix.common import artifacts
from phoenix.common import utils
from phoenix.visualise_posts import lda


In [None]:
# Run the project's notebook set-up helpers for output and logging; what exactly
# they configure is defined in phoenix.common.utils.
utils.setup_notebook_output()
utils.setup_notebook_logging()

In [None]:
# Run parameters.
# Format used to render the run date.
RUN_DATE_FORMAT = "%Y-%m-%d"
# Defaults to today. Papermill can overwrite RUN_DATE at execution time to enable
# historic runs and backfills (e.g. `-p RUN_DATE 2021-05-04`); do not hard-code a
# date here, as that silently pins every run to it.
# NOTE(review): Papermill injects overrides in a new cell placed after the cell
# tagged `parameters`; if this is that cell, the URLs derived from RUN_DATE below
# would still be built from the default — confirm the cell tagging / splitting.
RUN_DATE = datetime.datetime.today().strftime(RUN_DATE_FORMAT)

# Base URL under which the artefacts of this run are read and written.
ARTIFACTS_BASE_URL = f"{artifacts.urls.get_local()}{RUN_DATE}/"

# Dataset name, used as the prefix of every artefact written by this notebook.
DATASET_NAME = "all_posts"

# Input CSV (only displayed below; not read by this notebook).
INPUT_CSV = f"{artifacts.urls.get_local()}input.csv"

# Features artefact (parquet) that this notebook loads.
FEATURES_URL = f"{ARTIFACTS_BASE_URL}features_posts.parquet"

In [None]:
# Echo the effective run parameters, one per line, so that they are recorded
# in the executed notebook.
run_params = (ARTIFACTS_BASE_URL, INPUT_CSV, FEATURES_URL, DATASET_NAME, RUN_DATE)
print(*run_params, sep="\n")

In [None]:
# Fetch the features artefact and keep only its dataframe payload.
features_artifact = artifacts.dataframes.get(FEATURES_URL)
features_df = features_artifact.dataframe

In [None]:
# NOTE: this block will probably not be needed later on, once we expect the input to contain only a message and an object_id (to be confirmed).

In [None]:
# Bucket the rows by the value of their `language` column: {language -> sub-frame}.
dfs = dict(iter(features_df.groupby("language")))

In [None]:
# Continue with the rows whose `language` is "ar" only; this lookup raises
# KeyError when the features contain no such rows.
df_big = dfs["ar"]

In [None]:
# Collapse to one row per post. GroupBy.first() takes the first non-null value of
# each column within a group, so a resulting row may combine values from several
# source rows of the same post.
first_per_post = df_big.groupby("post_index").first()
df_big = first_per_post.reset_index()

In [None]:
# Keep only the text and its post identifier for topic modelling.
df = df_big.loc[:, ["message", "post_index"]]

In [None]:
# NOTE: this block will probably not be needed later on, once we expect the input to contain only a message and an object_id (to be confirmed).

In [None]:
# Display the two selected columns (message, post_index) for a visual check.
df

In [None]:
# Pass the "message" column through the project's link-removal helper.
# NOTE(review): `df` is reused further down when tagging, so this assumes that
# `remove_links` keeps the row order and does not mutate `df` in place — TODO confirm.
clean_df = lda.remove_links(df, "message")

In [None]:
# Build the project's stemming vectorizer from a snowball "arabic" stemmer and the
# project's stop-word list.
arabic_stemmer = stemmer("arabic")
stop_words = lda.get_stopwords()
count_vectorizer = lda.StemmedCountVectorizer(arabic_stemmer, stop_words=stop_words)

In [None]:
# Fit the vectorizer on the cleaned messages and transform them in one pass.
messages = clean_df["message"]
word_matrix = count_vectorizer.fit_transform(messages)

In [None]:
# Display the result of the vectorizer's fit_transform for a visual check.
word_matrix

In [None]:
# Chart the most frequent terms; the 15 is presumably the number of words to
# show — TODO confirm against the helper's signature.
count_vectorizer.plot_most_common_words(word_matrix, 15)

In [None]:
# Display the most common words as computed by the vectorizer helper.
count_vectorizer.get_most_common_words(word_matrix)

In [None]:
# Future check: the first 1-3k words of the posts tagged as Arabic are in the Roman alphabet.
# NOTE(review): scikit-learn deprecated `get_feature_names` in 1.0 in favour of
# `get_feature_names_out`; if StemmedCountVectorizer derives from CountVectorizer
# this call may need updating — TODO confirm.
count_vectorizer.get_feature_names()

In [None]:
# Simple grid search for the best hyper-parameters of the LDA model.
# LDA fitting is stochastic: a fixed seed keeps the selected model, and therefore
# the pickled model and the tagged dataset saved below, reproducible across runs.
LDA_RANDOM_STATE = 0
search_params = {"n_components": [10, 20, 30, 40], "max_iter": [10, 20, 40]}

# cv=None selects scikit-learn's default cross-validation splitter. As no `scoring`
# is given, candidates are ranked with the estimator's own `score` method.
model = GridSearchCV(
    LatentDirichletAllocation(random_state=LDA_RANDOM_STATE),
    param_grid=search_params,
    cv=None,
)
model.fit(word_matrix)

In [None]:
# Write the top-words plot of the best LDA estimator to the run's artefacts.
wordcloud_url = f"{ARTIFACTS_BASE_URL}{DATASET_NAME}_lda_wordcloud.png"
with tentaclio.open(wordcloud_url, mode="wb") as writer:
    best_lda = model.best_estimator_
    feature_names = count_vectorizer.get_feature_names()
    plot_title = f"{DATASET_NAME} Cloud Groupings"
    lda.save_plot_top_lda_words(best_lda, feature_names, 15, plot_title, writer)

In [None]:
# Build the tagged frame from the posts, the word matrix and the best LDA estimator.
# NOTE(review): `word_matrix` was built from `clean_df` whereas the original `df`
# is passed here; this presumably relies on both frames holding the same rows in
# the same order — TODO confirm.
tagged_df = lda.write_cloud_results(df, word_matrix, model.best_estimator_)

In [None]:
# Display the tagged frame for a visual check.
tagged_df

In [None]:
# Save the fitted GridSearchCV object (which holds the best LDA estimator).
# Requires the top-level `import pickle`, which the imports cell was missing.
# NOTE: unpickling can execute arbitrary code; only load this artefact from a
# trusted location.
lda_model_url = f"{ARTIFACTS_BASE_URL}{DATASET_NAME}_lda_model.sav"
with tentaclio.open(lda_model_url, mode="wb") as writer:
    pickle.dump(model, writer)

In [None]:
# Save the fitted count vectorizer, so that new text can later be transformed
# with the same vocabulary.
# Requires the top-level `import pickle`, which the imports cell was missing.
# NOTE: unpickling can execute arbitrary code; only load this artefact from a
# trusted location.
count_vectorizer_url = f"{ARTIFACTS_BASE_URL}{DATASET_NAME}_count_vectorizer.sav"
with tentaclio.open(count_vectorizer_url, mode="wb") as writer:
    pickle.dump(count_vectorizer, writer)

In [None]:
# Persist the tagged posts next to the other artefacts of this run.
tagged_df_url = f"{ARTIFACTS_BASE_URL}{DATASET_NAME}_tagged_df"
lda.persist(tagged_df_url, tagged_df)