In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `src` package) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

# Load and preprocess

In [2]:
import logging
import pandas as pd
from collections import defaultdict
from src.preprocessing import Preprocessor

# Configure root logging at INFO for the session.
logging.basicConfig(level="INFO")

# Read the formatted CSV; add `nrows=100` to this call for a quick partial run.
data = pd.read_csv("data/data_formatted/mst_all_exploded.csv")

# Preprocessing steps, applied in the order listed.
# "remove_number" and "remove_punctuation" are intentionally left out; the
# token-level "number_filter" / "punctuation_filter" steps are used instead.
preprocessing_steps = [
    "remove_identity_child",
    "remove_identity_therapist",
    "remove_narration",
    "lowercase",
    "normalize_i",
    "tokenize",
    "number_filter",
    "punctuation_filter",
    "detokenize",
]

# A single worker is used on purpose: the steps are cheap, so n_jobs > 1 is slower.
preprocessor = Preprocessor(steps=preprocessing_steps, n_jobs=1)

# Replace the raw child sentences with their preprocessed counterparts.
data = data.assign(child_sent=preprocessor(data["child_sent"].tolist()))

# Collect the preprocessed sentences per source file:
#   dataset[<file>]["sentences"] -> list of sentences
dataset = defaultdict(dict)
for filename, group in data.groupby("file"):
    sentences_in_file = group["child_sent"]
    dataset[filename]["sentences"] = sentences_in_file.tolist()

# The frame is no longer needed once the per-file dictionary is built.
del data

2022-02-21 20:49:04.215201: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-02-21 20:49:04.215241: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
100%|██████████| 9137/9137 [00:02<00:00, 3315.14it/s]


# Compute vectors

### Compute sentence vectors

In [3]:
import json
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model from its checkpoint name.
model_checkpoint = "emrecan/bert-base-turkish-cased-mean-nli-stsb-tr"
embedder = SentenceTransformer(model_checkpoint)

# Encode every file's sentences and keep the result as plain nested lists so
# that the whole dataset stays JSON-serialisable.
for filename, record in dataset.items():
    sentence_vectors = embedder.encode(record["sentences"])
    record["vectors"] = sentence_vectors.tolist()

# Write the sentences together with their vectors to disk.
with open('data/preprocessed_vectorized_data.json', 'w') as output_file:
    json.dump(dataset, output_file, indent=4)

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: emrecan/bert-base-turkish-cased-mean-nli-stsb-tr
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cuda
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.07s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.12it/s]
Batches: 100%|██████████| 3/3 [00:00<00:00,  8.07it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00, 10.33it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.63it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.10it/s]
Batches: 100%|██████████| 3/3 [00:00<00:00,  6.52it/s]
Batches: 100%|██████████| 3/3 [00:00<00:00,  5.16it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.27it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.50it/s]
Batches: 100%|██████████| 3/3 [00:00<00:00, 11.26it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.73it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.35it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.28it/s]
Batches: 100%|██████████|

### Compute weighted document vectors

In [4]:
import json

# Read the saved sentences/vectors back from disk, replacing any in-memory `dataset`.
with open("data/preprocessed_vectorized_data.json", "r") as vector_file:
    dataset = json.loads(vector_file.read())

##### Rule-based entity detector

In [10]:
from src.utils import generate_keyword_pattern

# Load the keyword lists from disk.
with open("data/keywords.json", "r") as keyword_file:
    keywords = json.load(keyword_file)

# The 'cas' (causality) entry is excluded for now.
del keywords['cas']

# Build the patterns and the pattern -> label mapping, then bundle them as the
# keyword arguments that are handed to the entity detector.
merged_keyword_patterns, pattern2label = generate_keyword_pattern(keywords)
entity_detector_kwargs = dict(
    merged_keyword_patterns=merged_keyword_patterns,
    pattern2label=pattern2label,
)

##### Get weighted document embeddings

In [11]:
import logging
from src.utils import detect_keywords
from weighted_bert.models import WeightedAverage, WeightedRemoval

# Switch the session to DEBUG-level logging.
# `logging.basicConfig` does nothing when the root logger already has handlers,
# which is the case whenever the preprocessing cell (which also calls
# `basicConfig`) has run in this kernel. The call is kept so that a handler is
# installed when this is the first logging setup of the session, and the root
# level is then set explicitly so DEBUG applies in both situations.
logging.basicConfig(level="DEBUG")
logging.getLogger().setLevel(logging.DEBUG)

# Keep urllib3 at WARNING so it does not add INFO/DEBUG records to the cell output.
logging.getLogger("urllib3").setLevel(logging.WARNING)

# Both weighting models use the rule-based keyword detector (with its patterns
# passed through `entity_detector_kwargs`) as their entity detector.
w_average = WeightedAverage(
    entity_detector=detect_keywords,
    entity_detector_kwargs=entity_detector_kwargs,
)
w_removal = WeightedRemoval(
    entity_detector=detect_keywords,
    entity_detector_kwargs=entity_detector_kwargs,
)

INFO:weighted_bert.models:Entity detector function you have provided is being used, not initializing HuggingFace model.
INFO:weighted_bert.models:Entity detector function you have provided is being used, not initializing HuggingFace model.


In [23]:
import numpy as np

# Weighted Average: one document vector per child, computed independently from
# that child's sentences and sentence vectors.
for child, record in dataset.items():
    averaged_embedding = w_average.get_document_embedding(
        document=record["sentences"],
        sentence_embeddings=np.array(record["vectors"]),
    )
    record["document_vector_avg"] = averaged_embedding.tolist()

# Weighted Removal: computed over the whole collection in a single call, so the
# inputs are gathered for all children first (in dictionary order).
documents = [record["sentences"] for record in dataset.values()]
collection_sentence_embeddings = [
    np.array(record["vectors"]) for record in dataset.values()
]
weighted_rm_embeddings = w_removal.get_document_embeddings(
    documents=documents,
    collection_sentence_embeddings=collection_sentence_embeddings,
)

# Results are paired with children by position, relying on the same dictionary order.
for child, embedding in zip(dataset, weighted_rm_embeddings):
    dataset[child]["document_vector_rm"] = embedding.tolist()

INFO:weighted_bert.models:	Calculating first singular vector...
INFO:weighted_bert.models:	Calculating corrected embeddings...


In [24]:
# Write the full dataset (sentences, sentence vectors and both document
# vectors) to the same JSON file that the sentence vectors were saved to.
with open('data/preprocessed_vectorized_data.json', 'w') as output_file:
    json.dump(dataset, output_file, indent=4)

### Visualize