# DeepKLM: A Library for Language Experiment using a Deep Language Model

## Setting up

To set up, run the following commands

In [None]:
# Run the repository's setup script through the notebook's shell escape.
!bash ./scripts/setup.sh

In [None]:
import torch
import sys

import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

from scripts.surprisal import bert_token_surprisal, bert_sentence_surprisal, confusion_score, confusion_score_batch
from scripts.visualization import attention_heatmap, visualize_attention_head
from scripts.boxplot_creator import draw_box_plot
from scripts.barplot_creator import draw_bar_plot

from sys import platform
from os import path
from torch import device
from transformers import AdamW, BertConfig, BertModel, BertTokenizer, BertForMaskedLM
from bertviz_lin.pytorch_pretrained_bert import BertForTokenClassification

%matplotlib inline

In [None]:
if platform == "linux" or platform == "linux2": 
    flist = fm.get_fontconfig_fonts()
    available_fonts = [fm.FontProperties(fname=fname).get_name() for fname in flist]
    if 'NanumGothic' in available_fonts:
        plt.rcParams['font.family'] = 'NanumGothic'
    else:
        print("Font NanumGothic was not found... Try installing a font")
        !apt-get update -qq
        !apt-get install fonts-nanum* -qq
        print("Installed the font!")
        fm._rebuild()
        print("=================IMPORTANT==============================")
        print("If on Colab, RESTART THE RUNTIME to apply the font.")
elif platform == "darwin":
    plt.rcParams['font.family'] = 'AppleGothic' 
elif platform == "win32":
    plt.rcParams['font.family'] = 'Malgun Gothic'
else:
    print("User platform could not be identified. Korean characters may not be shown correctly when visualizing.")

In [None]:
# Select the torch device once: the GPU when CUDA is usable, otherwise the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Report which device was chosen.
if use_cuda:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

## Loading Models

### English

Load BERT(Large, Uncased) by default

In [None]:
# Masked-LM model for 'bert-large-uncased', with attention outputs enabled.
mask_model_eng = BertForMaskedLM.from_pretrained('bert-large-uncased', output_attentions=True)
# Two-label token-classification model (class imported from bertviz_lin) for the same checkpoint name.
classification_model_eng = BertForTokenClassification.from_pretrained('bert-large-uncased', num_labels=2)
# Tokenizer for the same checkpoint name.
tokenizer_eng = BertTokenizer.from_pretrained('bert-large-uncased')

### Korean
Load KR-BERT by default

In [None]:
# All KR-BERT files used here live in the same pretrained directory.
krbert_path = "./KR-BERT/krbert_pytorch/pretrained/"
modelpath = krbert_path + "pytorch_model_char16424_ranked.bin"

# Read the model configuration and turn on attention outputs.
config = BertConfig.from_json_file(krbert_path + "bert_config_char16424.json")
config.output_attentions = True

# Build the case-preserving tokenizer from the vocabulary file, then load the masked-LM weights.
tokenizer_kr = BertTokenizer.from_pretrained(
    krbert_path + "vocab_snu_char16424.txt",
    do_lower_case=False,
)
mask_model_kr = BertForMaskedLM.from_pretrained(modelpath, config=config)

Load ETRI KorBert if available

In [None]:
# ETRI KorBERT is optional: load it only when its directory is present.
if not path.exists("KorBERT"):
    print("KorBERT not found. Skipping...")
else:
    # Make KorBERT's morpheme tokenizer module importable.
    sys.path.insert(1, "./KorBERT/001_bert_morp_pytorch/src_tokenizer")
    # If the import below raises ImportError:
    #   1. Open the tokenizer source under KorBERT's src_tokenizer.
    #   2. Go to line 32 (`from .file_utils import cached_path`).
    #   3. Replace it with
    #        from pytorch_pretrained_bert.file_utils import cached_path
    import tokenization_morp

    korbert_path = "./KorBERT/001_bert_morp_pytorch/"
    modelpath = korbert_path + "pytorch_model.bin"
    config = BertConfig.from_json_file(korbert_path + "bert_config.json")
    mask_model_etri = BertForMaskedLM.from_pretrained(modelpath, config=config)
    tokenizer_etri = tokenization_morp.BertTokenizer.from_pretrained(
        korbert_path + "vocab.korean_morp.list"
    )

# Experiment (Single Factor)

## Text setting

* How to do it
    * Replace the token that differs between conditions with [MASK] -- here, have and has
    * Pass the candidate tokens as the keywords

In [None]:
# Single stimulus sentence; [MASK] marks the slot for the candidate keywords.
# NOTE(review): "presentators" is probably meant to be "presenters" -- confirm
# before editing, since the wording is part of the experimental stimulus.
text = """
Each of the presentators [MASK] five minutes for their talk."""

## Surprisal

In [None]:
# Pass the masked text and the candidate keywords "have"/"has" to
# bert_token_surprisal, using the English masked-LM model and tokenizer.
bert_token_surprisal(text, ["have", "has"], mask_model_eng, tokenizer_eng, device)

# Experiment (Double Factors)

## Text setting

* How to do it
    * [MASK] a common token -- here, love and loves
    * set tokens to be input as keywords

In [None]:
# Two stimulus sentences, one per line, that differ only in the subject
# (John vs. I); [MASK] marks the verb slot.
text = """
John [MASK] Mary.
I [MASK] Mary."""

## Surprisal

In [None]:
# Pass the masked text and the candidate keywords "love"/"loves" to
# bert_token_surprisal, using the English masked-LM model and tokenizer.
bert_token_surprisal(text, ["love", "loves"], mask_model_eng, tokenizer_eng, device)

# Experiment (Triple Factors)

## Text setting

* How to do it
    * [MASK] a common token -- here, love and loves
    * set tokens to be input as keywords

In [None]:
# Four stimulus sentences, one per line, crossing girl/girls with that/who;
# [MASK] marks the verb slot.
text = """
John hates the girl that [MASK] me.
John hates the girl who [MASK] me.
John hates the girls that [MASK] me.
John hates the girls who [MASK] me."""

## Surprisal

In [None]:
# Pass the masked text and the candidate keywords "love"/"loves" to
# bert_token_surprisal, using the English masked-LM model and tokenizer.
bert_token_surprisal(text, ["love", "loves"], mask_model_eng, tokenizer_eng, device)

## Cautions

- Pairs may not be minimally different to a computer
    - e.g. There is a book/an apple.
        - while this pair can be argued to be minimally different at the syntactic level, since the an\~a alternation is phonological,
        - to a computer, an/a is _probabilistically determined_ rather than derived with rules.
        - therefore, the difference is NOT minimal to a computer and cannot be tested with a single \[MASK\]
    - In the same vein, be careful with the alternation of Korean case markers
        - e.g. chelsunun casinul/cakilul saranghanta. -- the lul\~ul alternation makes the pair not minimal.
- Some words are not "registered" in BERT
    - For efficiency, BERT uses subword tokenization (WordPiece, a technique similar to "byte-pair encoding")
    - i.e., some words are stored as a sequence of smaller units (which are not necessarily morphemes)
    - Check if the keyword is recognized as \[UNK\] (unknown) or not.
    - Particularly with Korean -- many "common" words are separated into several tokens

## Confusion Score
from Lin et al. (2019)

In [None]:
# The input string is the sentence followed by three tab-separated integers
# (0, 7, 4); their meaning is defined by confusion_score in scripts/surprisal
# -- TODO confirm against that module.
confusion_score("The scholar that published the paper has ever resigned the position	0	7	4", classification_model_eng, tokenizer_eng)

## Visualization
* Use attention_heatmap() for heatmap
    * input should be a list ([ ])
    * vis_opt is either 0, 1, or 2 and changes the shape of the heatmap
* Use visualize_attention_head() for BertViz

In [None]:
# Heatmap for a one-sentence list, using the English masked-LM model (loaded
# with output_attentions=True) and heatmap layout option 2.
attention_heatmap(["Students didn't do their syntax homework"],
                  mask_model_eng, tokenizer_eng, device, vis_opt=2)

In [None]:
# BertViz-style attention view of one sentence with the English model and tokenizer.
visualize_attention_head(mask_model_eng, tokenizer_eng, "Students submitted their syntax homework")

## Plotting
### Boxplot
* Format the provided template file with your result
    * Add up to factor1 if 2by2
        * leave factor2 empty
    * Add up to factor2 if 2by2by2
* Edit the boxplot_config.txt
    * filepath = _filename_
    * factor1 = _1st factor name_
    * factor1_vals = _values of the 1st factor_
    * factor2 = _2nd factor name_
        * leave empty if 2by2
    * factor2_vals = _values of the 2nd factor_
        * leave empty if 2by2
    * variables_value = _variable name_
    * mask_vals = _keywords in the mask_
    * notch = _True to add a notch_
    * title = _Title of the plot_
    * size = _22 for 2by2; 222 for 2by2by2_
* The resulting plots should be saved at the directory
* By default, it shows the result with the NPIs as an example.

In [None]:
# Called with no arguments; presumably the settings come from
# boxplot_config.txt as described in the notes above -- TODO confirm.
draw_box_plot()

### Barplot

* Format your data like barplot_sample.xlsx

In [None]:
# Draw a bar plot from the sample spreadsheet; pass your own file name to plot your data.
draw_bar_plot("barplot_sample.xlsx")

# References

- Lin, Y., Tan, Y. C., & Frank, R. (2019). Open Sesame: Getting Inside BERT's Linguistic Knowledge. arXiv preprint arXiv:1906.01698.