In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data paths used later in this
# notebook live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the pinned release of url-text-module, whose helpers are imported below.
%pip install url-text-module==0.6.1

Note: The original computations in this notebook were run with torch==1.11.0, before url-text-module (which uses torch==1.9.0) was developed. For reproducibility, we keep torch==1.11.0 as the dependency in this notebook even though url-text-module uses torch==1.9.0.

In [None]:
# Pin torch to 1.11.0, the version the original computations in this notebook used.
%pip install torch==1.11.0

# **Don't forget to restart the runtime after the pip installs!**

In [None]:
from url_text_module import (
    seed_everything,
    FUGASHI_TAGGER,
    INPUT_COL_NAME,
    load_stopwords_json,
    tokenize_with_fugashi,
    AUTO_MODEL_MASKED_ML_STR,
    HF_BERT_MODELS,
    PretrainedBERTTokenizerAndModel
)

Using Version 0.6.1 of URL Text Module


In [None]:
import pandas as pd

import os
from os.path import join
# Make CUDA kernel launches blocking so that, if a GPU error happens, it is
# reported at the call that caused it rather than at a later line.
os.environ.update(CUDA_LAUNCH_BLOCKING="1")

In [None]:
## Manual seeding for reproducibility
# NOTE(review): presumably seed_everything (from url_text_module) seeds every RNG
# source in use, including CUDA — confirm in the library. Nothing in this cell
# moves code or data to the GPU.
SEED = 1
seed_everything(SEED)

In [None]:
# Google Drive folder that holds the intermediate data files for this analysis.
INTERMEDIATE_DATA_PATH = '/content/drive/MyDrive/05_REACT AI/Research/AI & ML/Datasets/FC Text Analysis Intermediate Data'
# CSV of cleaned FC text inputs inside that folder.
FC_TXT_DATA_PATH = join(INTERMEDIATE_DATA_PATH, 'cleaned_FC_txt_inputs.csv')
# Load the CSV into a DataFrame; later cells add token columns to it.
FC_data_df = pd.read_csv(FC_TXT_DATA_PATH)

### Preprocessing: Tokenize with Fugashi + Stopword Removal

#### Fetch JA stopwords (versioned to commit hash for reproducibility)

In [None]:
# Raw GitHub URL of the stopwords-iso Japanese stopword JSON; the URL embeds a
# commit hash so the same list is fetched on every run.
STOPWORDS_URL = "https://raw.githubusercontent.com/stopwords-iso/stopwords-ja/5a000f6a62f9e3a12f436f36d168e2fcd2fb1878/stopwords-ja.json"

In [None]:
# Load the Japanese stopwords from STOPWORDS_URL via url_text_module's helper.
JA_STOPWORDS = load_stopwords_json(STOPWORDS_URL)

In [None]:
# Sanity check: report how many stopwords were loaded.
num_stopwords = len(JA_STOPWORDS)
print(f"There are {num_stopwords} JA stopwords used in our preprocessing")

There are 134 JA stopwords used in our preprocessing


In [None]:
# Define Japanese text tokenizer to tokenize the input data for the BERT model
MLM_BERT_MODEL_NAME = "cl-tohoku/bert-base-japanese-v2"
REVISION_HASH = 'e4211d7c20b078ac29b022be35ae4b63f3fe1679'
ja_mlm_tokenizer_and_BERT_model = PretrainedBERTTokenizerAndModel(
    HF_BERT_MODELS[AUTO_MODEL_MASKED_ML_STR], MLM_BERT_MODEL_NAME, revision_hash = REVISION_HASH
)

Downloading:   0%|          | 0.00/174 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/517 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/230k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/517 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/427M [00:00<?, ?B/s]

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-v2 were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Column that will hold the Fugashi tokens of each input text.
TOKENS_COL = 'tokens'
# Column that will hold the BERT tokenizer's tokens of each input text.
BERT_TOKENS_COL = 'mecab_tokens'

In [None]:
# Tokenize every input text with Fugashi, requesting lemmatization and passing
# the JA stopword list, and store the result in the TOKENS_COL column.
FC_data_df[TOKENS_COL] = FC_data_df[INPUT_COL_NAME].map(
    lambda text: tokenize_with_fugashi(
        text,
        tagger=FUGASHI_TAGGER,
        lemmatize=True,
        stopwords=JA_STOPWORDS,
    )
)

In [None]:
# Tokenize every input text with the pretrained BERT tokenizer wrapper and
# store the result in the BERT_TOKENS_COL column.
FC_data_df[BERT_TOKENS_COL] = FC_data_df[INPUT_COL_NAME].map(
    lambda text: ja_mlm_tokenizer_and_BERT_model.tokenize(text)
)

### Create Japanese BERT MLM Embeddings via CLS Pooling


---


In [None]:
# Names of the human risk label and English translation columns.
HUMAN_RISK_COL = 'human_risk_label'
ENG_COL = 'eng_translation'

# Set of columns handed to embed_data_df as `columns_to_keep`.
columns_to_keep = {
    INPUT_COL_NAME,
    TOKENS_COL,
    BERT_TOKENS_COL,
    ENG_COL,
    HUMAN_RISK_COL,
}

## Use pretrained JA MLM BERT model for grabbing CLS pooling embeddings

Create Embedding DF

In [None]:
# Embed FC_data_df with the pretrained model wrapper (CLS pooling, per the
# section heading), forwarding the set of columns to keep.
embeddings_df = ja_mlm_tokenizer_and_BERT_model.embed_data_df(
    FC_data_df,
    columns_to_keep=columns_to_keep,
)



  0%|          | 0/1 [00:00<?, ?ba/s]

0ex [00:00, ?ex/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
import numpy as np
from url_text_module import (
    EMBEDDINGS_COL_NAME
)

In [None]:
# Persist embeddings_df as a pickle file inside the intermediate-data folder.
embeddings_df_path = join(INTERMEDIATE_DATA_PATH, 'preprocessed_text_data.pkl')
embeddings_df.to_pickle(embeddings_df_path)