# Extract features from textual data using BERT and Path signature

In [None]:
# Notebook setup (IPython magics).
# Disable Jedi-based completion to work around the autocomplete issue.
%config Completer.use_jedi = False

# Load the autoreload extension and use mode 2, which reloads modules before
# each execution so edits to imported code are picked up without a restart.
%load_ext autoreload
%autoreload 2
# Use matplotlib's inline backend so figures appear in the cell output.
%matplotlib inline

In [None]:
import pandas as pd
import torch
import pickle
import re

# import sys
# sys.path.insert(0, "../../timeline_generation/")  # Adds higher directory to python modules path
# import src.data_handler

## Load dataset

In [None]:
# loadHF loads a dataset from Hugging Face; here it is pointed at the
# "train" split of the "newspop" dataset.
from src import loadHF

data_loader = loadHF(dataset_name="newspop", split_name="train")

In [None]:
# Load the dataset as a dataframe using the loader's default "newspop"
# preprocessing. The result is read back from `data_loader.dataset_df`
# in the next cell.
# Note: default_preproces_newspop is implemented in loadHF
data_loader.load_preprocessed_df(default_preprocess="newspop")

In [None]:
# Keep a notebook-level handle on the dataframe held by the loader.
dataset_df = data_loader.dataset_df

In [None]:
# Preview the first rows of the loaded dataframe.
dataset_df.head()

## Encode labels

In [None]:
# TEST: use a slice of dataset_df
# Only the first 1500 rows are kept for the rest of the notebook; the last
# line displays how many of those rows carry each value of "label".

dataset_df = dataset_df[:1500]
dataset_df["label"].value_counts()

## Model specifics

Nested dictionary of model specifications.

This includes the settings for text encoding, dimensionality reduction, time injection, embeddings, the path signature and the classifier.

In [None]:
# Configuration for every stage of the pipeline, grouped by stage.
# The accepted alternatives for an option are listed on the line above it.
model_specifics = {
    # Text encoding: which dataframe column to encode and with which model.
    "encoder_args": {
        "col_name_text": "content",
        "model_name": "all-MiniLM-L6-v2",
        # Passed to textEncoder as `model_args`
        # (presumably forwarded to the sentence-transformer encode call — TODO confirm).
        "model_args": {
            "batch_size": 64,
            "show_progress_bar": True,
            "output_value": "sentence_embedding",
            "convert_to_numpy": True,
            "convert_to_tensor": False,
            "device": None,
            "normalize_embeddings": False,
        },
    },
    "dim_reduction": {
        # options: ppapca, ppapcappa, umap
        "method": "ppapca",
        # options: any int number between 1 and embedding dimensions
        "num_components": 10,
    },
    "time_injection": {
        # options: timestamp, None
        "history_tp": "timestamp",
        # options: timestamp, timediff, None
        "post_tp": "timestamp",
    },
    "embedding": {
        # options: SBERT, BERT_cls, BERT_mean, BERT_max
        "global_embedding_tp": "SBERT",
        # options: sentence, reduced
        "post_embedding_tp": "sentence",
        # options: concatenation, attention
        "feature_combination_method": "attention",
    },
    "signature": {
        # options: any int number larger than 1
        "dimensions": 3,
        # options: log, sig
        "method": "log",
        "interval": 1 / 12,
    },
    "classifier": {
        # options: FFN2hidden (any future classifiers added)
        "classifier_name": "FFN2hidden",
        # options: 3class (5class to be added in the future)
        "classes_num": "3class",
    },
}

## Encode text and reduce dimensionality

In [None]:
from src import textEncoder

# Build the text encoder over dataset_df; the column to encode, the model
# name and the model arguments all come from model_specifics["encoder_args"].
text_encoder = textEncoder(
    dataset_df,
    col_name_text=model_specifics["encoder_args"]["col_name_text"],
    model_name=model_specifics["encoder_args"]["model_name"],
    model_args=model_specifics["encoder_args"]["model_args"],
)

In [None]:
# Run the sentence-transformer encoding step. The embeddings are read back
# from `text_encoder.embeddings_sentence` in the next cell.
text_encoder.encode_sentence_transformer()

In [None]:
# Keep a notebook-level handle on the sentence embeddings held by the encoder.
embeddings_sentence = text_encoder.embeddings_sentence

In [None]:
from src import plotEmbedding

# Plot the sentence embeddings against their labels.
# The labels are read from `dataset_df`, the frame the embeddings were
# computed from. `df` is only created later, in the "Time injection"
# section, so referencing it here raises a NameError when the notebook is
# run top to bottom.
plt_embed = plotEmbedding(x_data=embeddings_sentence,
                          y_data=dataset_df["label"].values)

# Request a UMAP embedding ("dim": 3) and draw it with faint round markers.
plt_embed.plt_2d(
    embed_args={"method": "umap",
                "dim": 3
               },
    line_args={"alpha": 0.1,
               "marker": "o"
              }
)

## Dimensionality reduction

In [None]:
from src import DimensionalityReduction

# Set up the dimensionality reducer; both the method and the number of
# components are taken from model_specifics["dim_reduction"].
reduction = DimensionalityReduction(
    method=model_specifics["dim_reduction"]["method"],
    components=model_specifics["dim_reduction"]["num_components"],
)

In [None]:
# Fit the reducer on the sentence embeddings and keep the transformed result.
embeddings_reduced = reduction.fit_transform(embeddings_sentence)

In [None]:
# Show the shape before and after dimensionality reduction, one per line.
print(embeddings_sentence.shape, embeddings_reduced.shape, sep="\n")

In [None]:
from src import plotEmbedding

# Plot the reduced embeddings against their labels.
# The labels are read from `dataset_df`, the frame the embeddings were
# computed from. `df` is only created later, in the "Time injection"
# section, so referencing it here raises a NameError when the notebook is
# run top to bottom.
plt_embed = plotEmbedding(x_data=embeddings_reduced,
                          y_data=dataset_df["label"].values)

# Request a UMAP embedding ("dim": 3) and draw it with faint round markers.
plt_embed.plt_2d(
    embed_args={"method": "umap",
                "dim": 3
               },
    line_args={"alpha": 0.1,
               "marker": "o"
              }
)

## Time injection

In [None]:
from src.dataset import get_modeling_dataframe
from src.timeinjection import Padding, TimeFeatures

# Step 1: build the modelling dataframe from the dataset and both sets of
# embeddings (full sentence embeddings and their reduced counterpart).
df = get_modeling_dataframe(dataset_df, embeddings_sentence, embeddings_reduced)

# Step 2: add the time features to that dataframe.
tf = TimeFeatures()
df = tf.get_time_features(df)

# Step 3: pad the timelines, then display the shape of the padded result.
pad = Padding()
df_padded = pad.pad_timelines(df)
df_padded.shape

In [None]:
# Build the inputs of the signature step (path, time_feature, post_time,
# bert_embeddings) from the padded timelines and the modelling dataframe,
# following the choices made in `model_specifics`.

# History path: keep the last-axis entries from index 2 onwards when the
# history uses timestamps, otherwise from index 3 onwards.
# NOTE(review): presumably index 2 holds the time feature — confirm against
# Padding.pad_timelines.
if model_specifics["time_injection"]["history_tp"] == 'timestamp':
    path = torch.from_numpy(df_padded[:, :, 2:].astype(float))
else:
    path = torch.from_numpy(df_padded[:, :, 3:].astype(float))

# Post-level time feature: the selected column is standardised (mean
# subtracted, divided by its standard deviation). The double-bracket
# selection keeps it as a single-column 2-D array.
if model_specifics["time_injection"]["post_tp"] == 'timestamp':
    time_feature = torch.tensor(
        (df[['time_encoding']].values - df['time_encoding'].mean())
        / df['time_encoding'].std()
    )
    post_time = True
elif model_specifics["time_injection"]["post_tp"] == 'timediff':
    time_feature = torch.tensor(
        (df[['time_diff']].values - df['time_diff'].mean())
        / df['time_diff'].std()
    )
    post_time = True
else:
    time_feature = None
    post_time = False

# Post-level embeddings: every column whose name matches the pattern below
# (starts with "e", followed by word characters and a digit).
# The pattern is a raw string so that `\w` reaches the regex engine instead
# of being parsed as an (invalid) string escape, which emits a
# DeprecationWarning/SyntaxWarning on recent Python versions.
if model_specifics["embedding"]['post_embedding_tp'] == 'sentence':
    bert_embeddings = torch.tensor(
        df[[c for c in df.columns if re.match(r"^e\w*[0-9]", c)]].values
    )
else:
    bert_embeddings = None

In [None]:
#path = path.squeeze()

## Compute signature and create features

In [None]:
from src.dyadic_path import DyadicSignatures

# Configure the dyadic signature computation.
# Sizes come from the data (number of rows in df, last dimension of path);
# everything else comes from model_specifics. Note that `history_tp` is fed
# the signature method and `method` the feature combination method.
dsig = DyadicSignatures(
    original_size=df.shape[0],
    d=path.shape[2],
    sig_d=model_specifics["signature"]["dimensions"],
    intervals=model_specifics["signature"]["interval"],
    k_history=None,
    embedding_tp=model_specifics["embedding"]["post_embedding_tp"],
    method=model_specifics["embedding"]["feature_combination_method"],
    history_tp=model_specifics["signature"]["method"],
    add_time=post_time,
)

In [None]:
# Compute the signatures over the path tensor. The second return value,
# last_index_dt_all, is passed on to create_features in the next cell.
sig, last_index_dt_all = dsig.compute_signatures(path)
# Combine the computed signatures into a single object used to build features.
sig_combined = dsig.combine_signatures(sig)

In [None]:
# Build the final feature matrix from the path, the combined signatures, the
# indices returned by compute_signatures, and the optional post-level
# embeddings and time feature (either of which may be None).
x_data = dsig.create_features(path, sig_combined, last_index_dt_all, bert_embeddings, time_feature)

In [None]:
# Display the shapes of the intermediate results and of the final features.
sig.shape, last_index_dt_all.shape, sig_combined.shape, x_data.shape

## Plot embeddings

In [None]:
from src import plotEmbedding

# Plot helper over the final feature matrix, with the labels taken from the
# modelling dataframe.
plt_embed = plotEmbedding(x_data=x_data, y_data=df["label"].values)

In [None]:
# Plot the features using the "pca" embedding method ("dim": 3).
plt_embed.plt_2d(
    embed_args={"method": "pca", "dim": 3},
    line_args={"alpha": 0.1, "marker": "o"},
)

In [None]:
# Plot the features using the "umap" embedding method ("dim": 3), with a
# lower alpha (0.01) than the PCA plot.
plt_embed.plt_2d(
    embed_args={"method": "umap", "dim": 3},
    line_args={"alpha": 0.01, "marker": "o"},
)

## Missing: Training classifiers, cross validation, ...