In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import datetime

import pandas as pd
import tentaclio
# import hulearn
import sklearn

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from phoenix.common import artifacts
from phoenix.common import utils
from phoenix.custom_models.tension_classifier import process_annotations
from phoenix.custom_models.tension_classifier.tension_classifier import CountVectorizerTensionClassifier


In [None]:
# --- Run parameters ---
# strftime pattern used to render the run date.
RUN_DATE_FORMAT = "%Y-%m-%d"
# Defaults to today's date. Papermill can overwrite this at execution time to
# enable historic runs, backfills, etc.
# NOTE(review): Papermill injects overrides in a cell placed after the
# parameters cell, so values derived from RUN_DATE inside this cell (such as
# ARTIFACTS_BASE_URL) would still be built from today's date - confirm how
# overrides are expected to reach them.
RUN_DATE = datetime.date.today().strftime(RUN_DATE_FORMAT)

# Artefacts URL for this run: local artefacts root + run date.
ARTIFACTS_BASE_URL = f"{artifacts.urls.get_local()}{RUN_DATE}/"

# Input: folder containing the annotated CSV files.
FOLDER_ANNOTATIONS = f"{artifacts.urls.get_local()}input_csvs/annotated_data/"

# Crude cut-off: rows before this position form the train/test sample, rows
# from this position onwards are held back as the validation set.
MAX_TRAIN_TEST_ROW = 1797
MAX_ANNOTATION_ROW = 2000
# Only try to classify a tension if there are at least this many objects.
MIN_NUM_OBJECTS_PER_TENSION = 60

# Output: everything is written under ARTIFACTS_BASE_URL.


In [None]:
# Apply the project's shared notebook setup for output and logging.
# NOTE(review): the exact effects are defined in phoenix.common.utils and are
# not visible from this notebook.
utils.setup_notebook_output()
utils.setup_notebook_logging()

In [None]:
# Display the run parameters, one per line, so every executed copy of the
# notebook records the configuration it ran with. MIN_NUM_OBJECTS_PER_TENSION
# is included so that all tunable values from the parameters cell are shown.
print(
    ARTIFACTS_BASE_URL,
    FOLDER_ANNOTATIONS,
    MAX_TRAIN_TEST_ROW,
    MAX_ANNOTATION_ROW,
    MIN_NUM_OBJECTS_PER_TENSION,
    RUN_DATE,
    sep="\n",
)

In [None]:
# Load the annotated fb posts CSV. header=[0,1] tells pandas to use the first
# two rows as column labels, which yields two-level (MultiIndex) columns.
annotated_fb_df = pd.read_csv(f"{FOLDER_ANNOTATIONS}fb_posts_annotated.csv", header=[0,1])

### Process annotations for the full dataset to get as complete a count_vectorizer as possible.
We don't use `topic_df_full` for any modelling, to prevent leakage.

In [None]:
# Process every annotated row (no train/validation cut-off applied here).
# NOTE(review): the structure of the two returned objects is defined by
# process_annotations.process_annotations and is not visible here.
df_full, topic_df_full = process_annotations.process_annotations(annotated_fb_df)

## The sample is cut off at MAX_TRAIN_TEST_ROW to create a validation set
Rows from MAX_TRAIN_TEST_ROW onwards are held out, so the validation set has `MAX_ANNOTATION_ROW - MAX_TRAIN_TEST_ROW` samples.

In [None]:
# Train/test sample: the first MAX_TRAIN_TEST_ROW rows, taken as an independent
# copy so later changes cannot affect annotated_fb_df.
df_sample = annotated_fb_df.head(MAX_TRAIN_TEST_ROW).copy()

In [None]:
# Run the same annotation processing on the train/test sample only; these are
# the frames used in the cells that follow.
df, topic_df = process_annotations.process_annotations(df_sample)

In [None]:
# Binarise the tension columns via the project helper.
# NOTE(review): df is rebound to the helper's result, so re-running this cell
# feeds the already-binarised frame back in - confirm the helper is safe to
# apply twice.
df = process_annotations.binarise_tensions_columns(df)

### Get the tension_feature_mapping; it and topic_df will be used for the next iteration,
which uses human-learn to try to get human-made functions into a classifier.

In [None]:
# Build the tension -> feature mapping from the processed sample.
# NOTE(review): the exact structure of the result is defined by
# process_annotations.get_tension_feature_mapping and is not visible here.
tensions_dict = process_annotations.get_tension_feature_mapping(df)

In [None]:
# Inspect the mapping produced by get_tension_feature_mapping.
tensions_dict

In [None]:
# Inspect the topic frame returned by process_annotations for the sample.
topic_df

In [None]:
# Copy the topics column across from the unprocessed sample. .to_list() drops
# the index, so the values are assigned purely by position.
# NOTE(review): this assumes df has the same number of rows, in the same
# order, as df_sample - confirm process_annotations preserves both.
df["topics"] = df_sample["topics"].to_list()

In [None]:
# Check the topics column that was just attached to df.
df["topics"]

In [None]:
# Select the sample's columns whose label ends in "tension".
# NOTE(review): the regex is matched against the string form of each column
# label; annotated_fb_df was read with a two-row header, so confirm the
# columns are single-level strings by this point.
tensions_df = df_sample.filter(regex=".*tension$")

In [None]:
# Inspect the tension columns selected from the sample.
tensions_df

In [None]:
# Drop rows where every tension column is missing, then count the non-null
# values remaining in each tension column.
tensions_df.dropna(how="all").count()

## Train stemmed_count vectorizer on 'full' dataset to get a complete vectorizer.
This shouldn't cause model leakage; however, we'll need to find a way to handle new words that the count_vectorizer hasn't seen yet.


### Because some labels have very few examples, we only keep tensions with at least 60 examples (see MIN_NUM_OBJECTS_PER_TENSION)

In [None]:
# Keep only the tensions that have enough annotated examples to be worth
# modelling. The threshold comes from the MIN_NUM_OBJECTS_PER_TENSION run
# parameter instead of a hard-coded literal, so changing the parameter
# actually changes which tensions are accepted.
accepted_tension_list, tension_counts_results = process_annotations.get_tensions_with_enough_examples(
    df, minimum_num_examples=MIN_NUM_OBJECTS_PER_TENSION
)

In [None]:
# Inspect which tensions passed the minimum-examples threshold.
accepted_tension_list

In [None]:
# Inspect the counts returned alongside the accepted tension list.
tension_counts_results

In [None]:
# Create the classifier, passing it the tensions that have enough examples.
cv_tension_classifier = CountVectorizerTensionClassifier(accepted_tension_list)

In [None]:
# Usually takes around 6-7 min to train.
# Both the full processed dataset (df_full) and the processed train/test
# sample (df) are passed in.
# NOTE(review): the meaning of the third positional argument (1) is defined by
# CountVectorizerTensionClassifier.train and is not visible here; passing it
# by keyword would make the intent clear.
cv_tension_classifier.train(df_full, df, 1)

In [None]:
# Run the classifier's own analysis step on the trained model.
# NOTE(review): what it reports is defined by CountVectorizerTensionClassifier.
cv_tension_classifier.analyse()

In [None]:
# Persist the trained model under this run's artefacts URL.
cv_tension_classifier.persist_model(ARTIFACTS_BASE_URL)