In [98]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from pathlib import Path
from collections import Counter
import numpy as np
import scipy.sparse as sp
from math import log
import pickle as pkl
from transformers import AutoModel, AutoTokenizer
from models.bert_gcn import BertGCN
import torch as th 
from models.train_bert_gcn import get_model
from utils.graph_utils import *
import dgl
import torch.utils.data as Data


In [102]:
# Token sequence length handed to the tokenizer, and the mini-batch size used by
# the index loaders further down.
max_length, batch_size = 128, 128
# Device handle used wherever tensors or the model are moved with `.to(...)`.
cpu = th.device("cpu")

In [4]:
def load_pickle(filename):
    """Open `filename` in binary mode and return the object unpickled from it.

    Note: unpickling can run arbitrary code, so only use this on files you trust.
    """
    with open(filename, 'rb') as handle:
        return pkl.load(handle)

def save_as_pickle(filename, data):
    """Pickle `data` into `filename`, opening the file in binary write mode.

    Returns None; an existing file at that path is overwritten.
    """
    with open(filename, 'wb') as sink:
        pkl.dump(data, sink)

def encode_input(text, tokenizer, max_len=None):
    """Tokenize `text` and return the padded token ids with their attention mask.

    Args:
        text: the input passed straight to `tokenizer` (the notebook passes a
            list of strings).
        tokenizer: a callable accepting `max_length`, `truncation`, `padding`
            and `return_tensors` keyword arguments and returning an object with
            `input_ids` and `attention_mask` attributes.
        max_len: sequence length to truncate/pad to. When omitted, the
            notebook-level `max_length` constant is used, which matches the
            previous behaviour.

    Returns:
        A `(input_ids, attention_mask)` tuple taken from the tokenizer output.
    """
    if max_len is None:
        # Fall back to the notebook's configuration constant.
        max_len = max_length
    # Named `encoded` rather than `input` so the builtin is not shadowed; the
    # former debugging print of the result's keys has been removed.
    encoded = tokenizer(
        text,
        max_length=max_len,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    )
    return encoded.input_ids, encoded.attention_mask

In [5]:
# Absolute path of the "data" folder inside the current working directory.
data_dir = os.path.abspath('data')
data_dir

'c:\\Users\\nafta\\Desktop\\Project\\TweetAnalyzer.git\\backend\\latest_bgsrd\\data'

In [7]:
# Load the pickled corpus frame, then collapse each `text` entry (expected to be
# a sequence of string tokens) into one space-separated string.
df_text = load_pickle(os.path.join(data_dir, 'df_data.pkl'))
df_text['text'] = df_text['text'].map(' '.join)
df_text

Unnamed: 0,text,label,type
0,rt ncidirector helpful covid19 resource stjude...,1,train
1,two positive cases covid19 detected india coro...,1,train
2,rt kevin_shipp expert created `` us bioweapons...,0,train
3,new china reported new confirmed cases covid-1...,1,train
4,span 12 hours mohfw_india reported 240 new cas...,1,train
...,...,...,...
395,feb 27 update canada 13th case covid19 confirm...,1,test
396,asadpxki nope ebola still epidemic especially ...,0,test
397,rt cdcgov protect others covid19 shopping vide...,1,test
398,rt 9gag nt forget protect eyes since coronavir...,0,test


In [11]:
# Label distribution of each split, shown in train / val / test order.
display(*(
    df_text.loc[df_text.type == split, 'label'].value_counts()
    for split in ('train', 'val', 'test')
))

0    145
1    143
Name: label, dtype: int64

1    20
0    12
Name: label, dtype: int64

0    43
1    37
Name: label, dtype: int64

In [13]:
# Load the pickled graph bundle from the data directory; it is indexed like a
# mapping, and its "graph" entry is the graph used for the rest of the notebook.
G_dict = load_pickle(os.path.join(data_dir, "text_graph.pkl"))
G = G_dict["graph"]

In [67]:
# Size bookkeeping: total graph nodes, documents per split, and the remainder,
# which is treated as the number of word nodes.
nb_node = G.number_of_nodes()
nb_train, nb_val, nb_test = (
    int((df_text.type == split).sum()) for split in ('train', 'val', 'test')
)
nb_word = nb_node - (nb_train + nb_val + nb_test)
nb_class = df_text['label'].nunique()  # number of distinct labels
nb_node

3376

In [17]:
# Build the model with the project's `get_model` helper (imported from
# models.train_bert_gcn); later cells read `model.tokenizer`, `model.feat_dim`
# and `model.bert_model` from it.
model = get_model()
# Bare last expression: the cell output is the model's repr.
model

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertGCN(
  (bert_model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,)

In [18]:
# Tokenize every row of the corpus frame (all splits at once) with the model's
# own tokenizer, then show the resulting tensor shape.
input_ids, attention_mask = encode_input(df_text['text'].tolist(), model.tokenizer)
display(input_ids.shape)

dict_keys(['input_ids', 'attention_mask'])


In [20]:
# Insert nb_word all-zero rows between the train/val rows and the test rows, so
# the tensors have one row per graph node with the test documents occupying the
# last nb_test rows (the same layout the index loaders below assume).
#
# Two safeguards compared with a plain concatenation:
#   * the padding is only inserted while the tensors still hold one row per
#     document, so re-running this cell does not pad a second time;
#   * the split point is counted from the front, because `x[:-nb_test]` would be
#     empty when nb_test == 0.
nb_doc = nb_train + nb_val + nb_test
if input_ids.shape[0] == nb_doc and nb_word > 0:
    _split = nb_doc - nb_test
    _word_rows = th.zeros((nb_word, max_length), dtype=th.long)
    input_ids = th.cat([input_ids[:_split], _word_rows, input_ids[_split:]])
    attention_mask = th.cat([attention_mask[:_split], _word_rows, attention_mask[_split:]])
assert input_ids.shape[0] == nb_node and attention_mask.shape[0] == nb_node, \
    "token tensors must have exactly one row per graph node"

In [23]:
# Inspect the padded token tensors and their shapes.
display(input_ids, input_ids.shape, attention_mask, attention_mask.shape)

tensor([[    0,  9713,   295,  ...,     1,     1,     1],
        [    0,  7109,  1313,  ...,     1,     1,     1],
        [    0,  9713,  7321,  ...,     1,     1,     1],
        ...,
        [    0,  9713,   740,  ...,     1,     1,     1],
        [    0,  9713,   361,  ...,     1,     1,     1],
        [    0, 45061, 13561,  ...,     1,     1,     1]])

torch.Size([3376, 128])

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])

torch.Size([3376, 128])

In [25]:
# create mask
def sample_mask(idx, l):
    """Return a boolean vector of length `l` that is True at the positions in `idx`.

    Args:
        idx: positions to switch on; anything NumPy accepts as an index
            (list of ints, range, integer array).
        l: length of the returned vector.

    Returns:
        A 1-D `numpy.ndarray` of dtype `bool`.
    """
    # Use the builtin `bool`: the `np.bool` alias was deprecated in NumPy 1.20
    # and removed in later releases, so it fails on current installs.
    mask = np.zeros(l, dtype=bool)
    mask[idx] = True
    return mask

3376

In [44]:
# Node ids per split: train/val documents reuse their DataFrame index labels,
# while test documents take the last nb_test node slots.
idx_train = df_text.index[df_text.type == 'train'].to_list()
idx_val = df_text.index[df_text.type == 'val'].to_list()
idx_test = range(nb_node - nb_test, nb_node)
display(idx_train, idx_val, idx_test)


[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


[288,
 289,
 290,
 291,
 292,
 293,
 294,
 295,
 296,
 297,
 298,
 299,
 300,
 301,
 302,
 303,
 304,
 305,
 306,
 307,
 308,
 309,
 310,
 311,
 312,
 313,
 314,
 315,
 316,
 317,
 318,
 319]

range(3296, 3376)

In [39]:
# One boolean vector of length nb_node per split, True at that split's node ids.
train_mask, val_mask, test_mask = (
    sample_mask(node_ids, nb_node) for node_ids in (idx_train, idx_val, idx_test)
)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  return np.array(mask, dtype=np.bool)


In [69]:
# DataFrame index labels of the documents whose label is 1, one list per split.
train_real_index = df_text.index[(df_text.label == 1) & (df_text.type == 'train')].to_list()
val_real_index = df_text.index[(df_text.label == 1) & (df_text.type == 'val')].to_list()
test_real_index = df_text.index[(df_text.label == 1) & (df_text.type == 'test')].to_list()
display(test_real_index)
# Shift the test ids by nb_word to turn them into node ids.
test_real_index = [doc_id + nb_word for doc_id in test_real_index]
display(test_real_index)

[321,
 323,
 324,
 326,
 336,
 337,
 338,
 341,
 342,
 343,
 346,
 347,
 349,
 350,
 352,
 355,
 356,
 358,
 361,
 363,
 364,
 366,
 368,
 370,
 371,
 373,
 375,
 379,
 380,
 384,
 385,
 387,
 391,
 393,
 395,
 397,
 399]

[3297,
 3299,
 3300,
 3302,
 3312,
 3313,
 3314,
 3317,
 3318,
 3319,
 3322,
 3323,
 3325,
 3326,
 3328,
 3331,
 3332,
 3334,
 3337,
 3339,
 3340,
 3342,
 3344,
 3346,
 3347,
 3349,
 3351,
 3355,
 3356,
 3360,
 3361,
 3363,
 3367,
 3369,
 3371,
 3373,
 3375]

In [70]:
# Per-split label vectors of length nb_node: 1 where the node belongs to that
# split and its label is 1, 0 everywhere else.
y_train, y_val, y_test = (np.zeros(nb_node) for _ in range(3))
y_train[train_real_index] = 1.0
y_val[val_real_index] = 1.0
y_test[test_real_index] = 1.0


In [71]:
# Number of positive (label 1) nodes recorded for train, val and test.
display(
    np.count_nonzero(y_train),
    np.count_nonzero(y_val),
    np.count_nonzero(y_test),
)

143

20

37

In [72]:
# Merge the per-split label vectors into a single vector over all nodes. Each
# split is expected to mark different rows, so the element-wise sum stays 0/1.
y = y_train + y_val + y_test

In [73]:
# Inspect the merged label vector: values, number of positives, shape.
display(y, np.count_nonzero(y), y.shape)


array([1., 1., 0., ..., 1., 0., 1.])

200

(3376,)

In [74]:
# True for every node that is a document in any split (element-wise OR of the
# three boolean split masks).
doc_mask = train_mask | val_mask | test_mask

In [76]:
# Inspect the document mask: values, number of document nodes, shape.
display(doc_mask, np.count_nonzero(doc_mask), doc_mask.shape)

array([ True,  True,  True, ...,  True,  True,  True])

400

(3376,)

In [88]:
# build DGL Graph
# `normalize_adj` is not imported by name in this notebook; presumably it comes
# from the star import of utils.graph_utils. It returns two values, of which
# only the first is used below. NOTE(review): confirm what `f` holds.
adj_norm, f = normalize_adj(G)
# Convert the normalized adjacency into a SciPy CSR sparse matrix.
adj_norm_sp = sp.csr_matrix(adj_norm)
# Create the DGL graph from the float32 sparse matrix; its non-zero entries are
# stored as the edge feature named 'edge_weight'.
g = dgl.from_scipy(adj_norm_sp.astype('float32'), eweight_name='edge_weight')

In [92]:
# Show the graph's edge endpoints and its node ids, one per line.
print(g.edges(), g.nodes(), sep='\n')


(tensor([   0,    0,    0,  ..., 3375, 3375, 3375]), tensor([   0, 1146, 1147,  ..., 3297, 3325, 3375]))
tensor([   0,    1,    2,  ..., 3373, 3374, 3375])


In [93]:
# Attach the per-node tensors to the graph, one field per statement.
g.ndata['input_ids'] = input_ids
g.ndata['attention_mask'] = attention_mask
# Labels and split masks are 1-D with one entry per node: (nb_node,).
g.ndata['label'] = th.LongTensor(y)
g.ndata['train'] = th.FloatTensor(train_mask)
g.ndata['val'] = th.FloatTensor(val_mask)
g.ndata['test'] = th.FloatTensor(test_mask)
g.ndata['label_train'] = th.LongTensor(y_train)
# Zero-initialised feature buffer of shape (nb_node, model.feat_dim); the
# document rows are filled in later by update_feature().
g.ndata['cls_feats'] = th.zeros((nb_node, model.feat_dim))

In [99]:
# Index datasets: each one holds the node ids of one split. Train ids come
# first, val ids follow directly, and test ids are the last nb_test node ids.
train_idx = Data.TensorDataset(th.arange(nb_train, dtype=th.long))
val_idx = Data.TensorDataset(th.arange(nb_train, nb_train + nb_val, dtype=th.long))
test_idx = Data.TensorDataset(th.arange(nb_node - nb_test, nb_node, dtype=th.long))
doc_idx = Data.ConcatDataset((train_idx, val_idx, test_idx))

# Loaders over those ids; only the train loader and the all-documents loader
# shuffle.
idx_loader_train = Data.DataLoader(train_idx, batch_size=batch_size, shuffle=True)
idx_loader_val = Data.DataLoader(val_idx, batch_size=batch_size, shuffle=False)
idx_loader_test = Data.DataLoader(test_idx, batch_size=batch_size, shuffle=False)
idx_loader = Data.DataLoader(doc_idx, batch_size=batch_size, shuffle=True)

In [100]:
# Training
def update_feature():
    """Refresh the `cls_feats` rows of all document nodes on the global graph.

    Runs `model.bert_model` in eval mode, without gradients, over the token ids
    and attention masks of the nodes selected by `doc_mask`, takes position 0
    along the second axis of the encoder's first output, and writes the result
    into `g.ndata['cls_feats'][doc_mask]`.

    Side effects: rebinds the globals `model` and `g` to their CPU versions and
    leaves the model in eval mode.

    Returns:
        The updated graph `g`.
    """
    global model, g, doc_mask
    # Gradients are not needed here, so a large batch size keeps this quick.
    doc_tokens = Data.TensorDataset(
        g.ndata['input_ids'][doc_mask],
        g.ndata['attention_mask'][doc_mask],
    )
    with th.no_grad():
        model = model.to(cpu)
        model.eval()
        cls_chunks = []
        for ids, mask in Data.DataLoader(doc_tokens, batch_size=1024):
            ids, mask = ids.to(cpu), mask.to(cpu)
            encoder_out = model.bert_model(input_ids=ids, attention_mask=mask)[0]
            # Keep only the first position of every sequence.
            cls_chunks.append(encoder_out[:, 0].cpu())
        cls_feat = th.cat(cls_chunks, dim=0)
    g = g.to(cpu)
    g.ndata['cls_feats'][doc_mask] = cls_feat
    return g

In [87]:
#g.ndata['cls_feats'] 
#model.feat_dim

#adj_norm_sp_dense = adj_norm_sp.todense()
#print((adj_norm_sp_dense == adj_norm).all())


True
