In [1]:
# pip install openpyxl

In [2]:
# pip install matplotlib

In [3]:
# pip install transformers

In [4]:
# pip install spacy

In [None]:
# Load packages
import pandas as pd 
import random
from matplotlib import pyplot as plt
import numpy as np

import torch
from transformers import BertTokenizer, BertModel

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt
# % matplotlib inline

import spacy 
from tqdm import tqdm

import pickle

import json

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

## Load Poetry Dataset

In [10]:
# load excel data
# df = pd.read_excel('../Excel_files/Full_Poem_Dataset_12-17.xlsx')
df = pd.read_excel('Full_Poem_Dataset_12-17.xlsx')
df = df.drop(columns=['Unnamed: 0'])
records = df.to_dict('records')
random.choice(records)

{'Text': 'Тропинка длилась и вилась,\nВилась и длилась.\nВорону отнесли назад -\nНе приккотиллассь.\n \nТуман белел, как молоко,\nБыл близко Тарту,\nА Таллин снова таллекко,\nИ Арво Пярту\n \nПриснилась музыка - она\nВилась и длилась.\nОн подобрал её тогда,\nИ пригодилась.',
 'Author': 'Дмитрий Веденяпин',
 'Before or after': 'Before',
 'Source': 'essentialpoetry',
 'Date posted': datetime.datetime(2020, 4, 10, 0, 0),
 'UniqueIndex': 2678}

In [11]:
# Load pre-trained model tokenizer (vocabulary)

tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

In [None]:
!python -m spacy download ru_core_news_lg
nlp = spacy.load("ru_core_news_lg")

In [14]:
# Convert inputs to PyTorch tensors
lemmaCts = dict()
data = []
for rec in tqdm(records):
    lines = rec['Text'].split('\n')
    for line in lines:
        lemmas = [l.lemma_ for l in nlp(line)]
        for l in lemmas:
            lemmaCts.setdefault(l, 0)
            lemmaCts[l] += 1
        
        if len(line.split(' ')) < 5:
            continue
            
        lemmatized_line = ' '.join(lemmas)
        doc = tokenizer.encode_plus(lemmatized_line)

        data.append({
            'doc' : doc,
            'tokens' : tokenizer.convert_ids_to_tokens(doc['input_ids']),
            'text' : line,
            'lemmatized_text' : lemmatized_line,
            'rec' : rec
        })

100%|██████████| 3222/3222 [10:47<00:00,  4.98it/s]


In [15]:
# with open('../../records4RuBerta1-30.pickle', 'wb') as handle:
#     pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Load Records Pickle

In [16]:
# !pip3 install pickle5
# import pickle5 as pickle

# with open('records4RuBerta1-30.pickle', 'rb') as handle:
#     data = pickle.load(handle)
    
# random.choice(data)

In [17]:
# Load pre-trained model (weights)
model = BertModel.from_pretrained('DeepPavlov/rubert-base-cased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()


Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
         

Next, let's evaluate BERT on our example text, and fetch the hidden states of the network!

*Side note: `torch.no_grad` tells PyTorch not to construct the compute graph during this forward pass (since we won't be running backprop here)--this just reduces memory consumption and speeds things up a little.*


### Sample

In [27]:
ns = [50,100,200]
thresholdedLists = dict()
for n in ns:
    thresholdedLists[n] = []
    
for l in lemmaCts:
  if l.isalpha():
    for n in ns:
        if lemmaCts[l] > n:
            thresholdedLists[n].append(l)

In [28]:
len(thresholdedLists[200])

229

In [29]:
# keyword = random.choice(thresholdedLists[200])
keyword = 'мир'
print(keyword)
sample = [d for d in data if keyword in d['lemmatized_text'].split(' ')]
len(sample)

мир


509

In [30]:
keywordList = list(set(['мир','война','жизнь','писать','слово','язык','насилие','любить','делать'] + random.sample(thresholdedLists[100],10)))
keywordList

['музыка',
 'без',
 'жизнь',
 'солнце',
 'слово',
 'язык',
 'было',
 'сколько',
 'насилие',
 'война',
 'писать',
 'делать',
 'цвет',
 'любить',
 'над',
 'всех',
 'чем',
 'граница',
 'мир']

In [32]:
%%time
keywordData = []
verbose=True
for keyword in tqdm(keywordList):
  # Run the text through BERT, and collect all of the hidden states produced from all 12 layers. 
  hidden_state_list = []
  sample = [d for d in data if keyword in d['lemmatized_text'].split(' ')]
  with torch.no_grad():
      for d in sample:

          tokens_tensor = torch.tensor([d['doc']['input_ids']])
          segments_tensor = torch.tensor([d['doc']['attention_mask']])
          
          outputs = model(tokens_tensor, segments_tensor)

          # Evaluating the model will return a different number of objects based on 
          # how it's  configured in the `from_pretrained` call earlier. In this case, 
          # becase we set `output_hidden_states = True`, the third item will be the 
          # hidden states from all layers. See the documentation for more details:
          # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
          hidden_states = outputs[2]

          # create a new dimension in the tensor
          token_embeddings = torch.stack(hidden_states, dim=0)
          if verbose:
              print(token_embeddings.size())
          # Remove dimension 1, the "batches"
          token_embeddings = torch.squeeze(token_embeddings, dim=1)
          if verbose:
              print(token_embeddings.size())
          token_embeddings = token_embeddings.permute(1,0,2)
          if verbose:
              print(token_embeddings.size())
              
          keywordFound = False
          for t_i, token_str in enumerate(d['tokens']):
              if token_str == keyword:
                  index = t_i
                  keywordFound = True
          if not keywordFound:
              badsamples.append(d)
              continue
          
          # hiddenDict = dict()
          eachLayerDict = dict()
          for n in range(0,13):
              eachLayerDict[n] = token_embeddings[index][n].tolist()

          forjson.append({
              'linetext' : d['text'],
              'lemmatized_text' : d['lemmatized_text'],
              'Author' : d['rec']['Author'],
              'fulltext' : d['rec']['Text'],
              'Before or after' : d['rec']['Before or after'],
              'Source' : d['rec']['Source'],
              'eachLayer' : eachLayerDict,
          })

          hidden_state_list.append(hidden_states)
          keywordData.append({
              'keyword' : keyword,
              'hidden_state_list' : hidden_state_list
          })
        break
  break

IndentationError: ignored

In [12]:
# generate hidden states
for k in tqdm(keywordData):
  keyword = k['keyword']
  
  forjson = []
  verbose = False
  badsamples = []
  d_i = 0
  for d in tqdm(sample):
      hidden_states = hidden_state_list[d_i]
      d_i += 1
      # create a new dimension in the tensor
      token_embeddings = torch.stack(hidden_states, dim=0)
      if verbose:
          print(token_embeddings.size())
      # Remove dimension 1, the "batches"
      token_embeddings = torch.squeeze(token_embeddings, dim=1)
      if verbose:
          print(token_embeddings.size())
      token_embeddings = token_embeddings.permute(1,0,2)
      if verbose:
          print(token_embeddings.size())
          
      keywordFound = False
      for t_i, token_str in enumerate(d['tokens']):
          if token_str == keyword:
              index = t_i
              keywordFound = True
              
      if not keywordFound:
          badsamples.append(d)
          continue
      
      # hiddenDict = dict()
      eachLayerDict = dict()
      for n in range(0,13):
          token_vecs_sum = []
          one_layer_only = []
          for token in token_embeddings:
              sum_vec = torch.sum(token[-n:], dim=0)
              token_vecs_sum.append(sum_vec)
              one_layer_only.append(token[n])
          # hiddenDict[n] = token_vecs_sum[index].tolist()
          eachLayerDict[n] = token_embeddings[index][n].tolist()

      forjson.append({
          'linetext' : d['text'],
          'lemmatized_text' : d['lemmatized_text'],
          'Author' : d['rec']['Author'],
          'fulltext' : d['rec']['Text'],
          'Before or after' : d['rec']['Before or after'],
          'Source' : d['rec']['Source'],
          'eachLayer' : eachLayerDict,
          # 'hiddenStates' : hiddenDict
      })
  keyword
  print(len(forjson))

мир


509it [00:02, 246.76it/s]

509





In [13]:
random.choice(forjson)['eachLayer'][12][:10]

[0.08460793644189835,
 0.306570827960968,
 -0.1129867285490036,
 -0.077754445374012,
 0.025800544768571854,
 0.20589053630828857,
 -0.1388918161392212,
 -0.2782538831233978,
 0.016375930979847908,
 0.32987314462661743]

In [None]:
with open('мир-1-30layers.json', 'w', encoding='utf-8') as f:
    json.dump(forjson, f, ensure_ascii=False, indent=4)

## Load JSON for analysis

In [None]:
with open('мир-1-30layers.json', 'r') as f:
  forjson = json.load(f)

JSONDecodeError: ignored

### Convert using PCA and TSNE

#### PCA

In [14]:
# get last layer for all 
Xs = np.array([d['eachLayer'][12] for d in forjson])
Xs.shape

(509, 768)

In [15]:
# convert from 728 to 50
pca = PCA(n_components=50)
pca.fit(Xs.T)
pca.components_.shape

(50, 509)

In [16]:
Xs_PCA = pca.components_

#### TSNE

In [None]:
# convert 50 to 2
# X_embedded = TSNE(n_components=2, learning_rate='auto', init='random', perplexity=3).fit_transform(X)

In [18]:
tsne = TSNE(n_components=2, random_state=0)
tsne.fit(Xs_PCA.T)
Xs_TSNE = tsne.embedding_
Xs_TSNE.shape



(509, 2)

In [20]:
Xs_TSNE[0]

array([ 4.675441, 10.600295], dtype=float32)

## Cite
Chris McCormick and Nick Ryan. (2019, May 14). *BERT Word Embeddings Tutorial*. Retrieved from http://www.mccormickml.com
