In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
%%capture
!pip install transformers
!pip install datasets
!pip install torch

In [3]:
from transformers import BertTokenizer, BertModel
import torch

In [4]:
# One checkpoint name drives both the tokenizer and the encoder so they share
# a vocabulary; the cased checkpoint must not lower-case its input.
model_name = "bert-base-cased"
bert_tokenizer = BertTokenizer.from_pretrained(
    model_name,
    do_lower_case=False,
)
bert_model = BertModel.from_pretrained(model_name)

# The results are bound to throwaway names only so the notebook does not echo
# them as cell output: clear any gradients and put the model in eval mode.
z = bert_model.zero_grad()
e = bert_model.eval()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
# Spreadsheet holding the preprocessed documents; its first column is used as
# the DataFrame index.
file_name = "preprocess_docs.xlsx"
df = pd.read_excel(io=file_name, index_col=0)

In [6]:
%%time
token_embd = {}
for ix in df.index:
  preprocess = eval(df.loc[ix]["preprocess"])
  doc_clean = preprocess["doc_clean"]
  matching = preprocess["matching"]
  uni_stop = max([i for i, v in enumerate(matching) if type(v[0]) == int])+1
  doc = " ".join(doc_clean[:uni_stop])
  label = df.loc[ix]["label"]

  tokens = bert_tokenizer.tokenize(doc)
  if len(tokens) > 512:
    tokens1 = tokens[:512]
    tokens2 = tokens[512:512*2]

    tokens_ids1 = bert_tokenizer.convert_tokens_to_ids(tokens1)
    tokens_ids1_tensor = torch.tensor(tokens_ids1)
    attn_mask1 = (tokens_ids1_tensor != 1).long() # [PAD] => 1

    print(ix, len(tokens_ids1))

    cont1 = bert_model(tokens_ids1_tensor.unsqueeze(0), attention_mask=attn_mask1.unsqueeze(0))

    token_embd_per_doc = []
    for i, token in enumerate(tokens1):
      embd = cont1.last_hidden_state[0][i].detach().numpy()
      token_embd_per_doc.append(embd)

    tokens_ids2 = bert_tokenizer.convert_tokens_to_ids(tokens2)
    tokens_ids2_tensor = torch.tensor(tokens_ids2)
    attn_mask2 = (tokens_ids2_tensor != 1).long() # [PAD] => 1

    print(ix, len(tokens_ids2))

    cont2 = bert_model(tokens_ids2_tensor.unsqueeze(0), attention_mask=attn_mask2.unsqueeze(0))

    for i, token in enumerate(tokens2):
      embd = cont2.last_hidden_state[0][i].detach().numpy()
      token_embd_per_doc.append(embd)

    token_embd[label] = (tokens, token_embd_per_doc)
    
  else:
    tokens_ids = bert_tokenizer.convert_tokens_to_ids(tokens)
    tokens_ids_tensor = torch.tensor(tokens_ids)
    attn_mask = (tokens_ids_tensor != 1).long() # [PAD] => 1

    print(ix, len(tokens_ids))

    cont = bert_model(tokens_ids_tensor.unsqueeze(0), attention_mask=attn_mask.unsqueeze(0))


    token_embd_per_doc = []
    for i, token in enumerate(tokens):
      embd = cont.last_hidden_state[0][i].detach().numpy()
      token_embd_per_doc.append(embd)

    token_embd[label] = (tokens, token_embd_per_doc)

0 238
1 137
2 156
3 250
4 163
5 212
6 125
7 124
8 195
9 180
10 165
11 116
12 182
13 165
14 234
15 90
16 132
17 381
18 152
19 143
20 204
21 237
22 184
23 160
24 202
25 216
26 207
27 140
28 126
29 328
30 161
31 200
32 148
33 196
34 162
35 90
36 293
37 235
38 135
39 150
40 211
41 140
42 197
43 114
44 161
45 173
46 172
47 166
48 209
49 143
50 175
51 160
52 128
53 160
54 185
55 123
56 193
57 222
58 230
59 175
60 188
61 166
62 201
63 209
64 170
65 176
66 198
67 244
68 178
69 181
70 139
71 170
72 180
73 130
74 194
75 202
76 130
77 189
78 160
79 172
80 206
81 112
82 248
83 310
84 120
85 141
86 205
87 166
88 201
89 129
90 265
91 116
92 113
93 194
94 161
95 140
96 230
97 202
98 109
99 136
100 223
101 116
102 150
103 195
104 125
105 102
106 154
107 321
108 101
109 225
110 111
111 146
112 208
113 79
114 82
115 149
116 181
117 145
118 182
119 140
120 111
121 128
122 111
123 159
124 127
125 156
126 165
127 155
128 204
129 200
130 136
131 145
132 132
133 126
134 33
135 236
136 140
137 117
138 106
139

In [7]:
# Persist the per-document token embeddings; the context manager guarantees
# the file is flushed and closed even if pickling fails.
with open("token_embd.pickle", "wb") as fh:
    pickle.dump(token_embd, fh)

In [3]:
# Reload the cached token embeddings, closing the file handle afterwards.
# NOTE(security): pickle.load can execute arbitrary code; only load a file
# that this notebook itself produced.
with open("token_embd.pickle", "rb") as fh:
    token_embd = pickle.load(fh)

In [6]:
%%time
new_token_embd = {}
for label in token_embd.keys():
    ix = df[df["label"] == label].index[0]
    preprocess = eval(df.loc[ix]["preprocess"])
    doc_clean = preprocess["doc_clean"]
    matching = preprocess["matching"]
    uni_stop = max([i for i, v in enumerate(matching) if type(v[0]) == int])+1
    
    doc_clean = doc_clean[:uni_stop]
    doc_clean = [ x.replace("\ue4f8", "") for x in doc_clean ]
    
    tokens, token_embd_per_doc = token_embd[label]

    new_tokens = []
    new_token_embd_per_doc = []
    
    j = 0
    token_j = doc_clean[j]
    token_k = ""
    token_embd_k = np.zeros(768)
    lenght_k = 0
    for i, token_i in enumerate(tokens):
        token_embd_i = token_embd_per_doc[i]
        token_i = token_i if token_i[:2] != "##" else token_i[2:]
        
        token_k += token_i
        token_embd_k += token_embd_i
        lenght_k += 1
        if token_k == token_j:
            j += 1
            token_embd_k = token_embd_k/lenght_k
            new_tokens.append(token_k)
            new_token_embd_per_doc.append(token_embd_k)      
            
            token_k = ""
            token_embd_k = np.zeros(768)
            lenght_k = 0
            try: token_j = doc_clean[j].replace("\ue4f8", "")
            except: pass
        
    new_token_embd[label] = (new_tokens, new_token_embd_per_doc)
#     print(ix, new_tokens == doc_clean)

CPU times: total: 609 ms
Wall time: 617 ms


In [7]:
# Extend each document's unigram terms with bigram terms. A bigram is named
# "<first>_<second>" and embedded as the mean of its two word embeddings.
term_embd = {}
for label, (tokens, token_embd_per_doc) in new_token_embd.items():
    bi_grams = [f"{first}_{second}" for first, second in zip(tokens, tokens[1:])]
    bi_gram_embds = [
        (first_embd + second_embd) / 2
        for first_embd, second_embd in zip(token_embd_per_doc, token_embd_per_doc[1:])
    ]

    # Build fresh lists rather than appending to the ones stored in
    # new_token_embd: mutating them in place changed the input dictionary and
    # made re-running this cell generate bigrams of bigrams.
    terms = list(tokens) + bi_grams
    terms_embd_per_doc = list(token_embd_per_doc) + bi_gram_embds

    term_embd[label] = (terms, terms_embd_per_doc)

In [8]:
# Persist the unigram + bigram term embeddings; the context manager guarantees
# the file is flushed and closed even if pickling fails.
with open("term_embd.pickle", "wb") as fh:
    pickle.dump(term_embd, fh)