In [39]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from pathlib import Path
from collections import Counter
import numpy as np
import scipy.sparse as sp
from math import log
import pickle as pkl
from transformers import AutoModel, AutoTokenizer
from models.bert_gcn import BertGCN
import torch as th 
from models.train_bert_gcn import get_model

In [22]:
def load_pickle(filename):
    with open(filename, 'rb') as pkl_file:
        data = pkl.load(pkl_file)
    return data

def encode_input(text, tokenizer):
    input = tokenizer(text, max_length = max_length, truncation = True, padding = 'max_length', return_tensors = 'pt')
    print(input.keys())
    return input.input_ids, input.attention_mask

In [4]:
data_dir = (os.path.join(os.path.abspath(''), 'data'))
data_dir

'c:\\Users\\nafta\\Desktop\\Project\\TweetAnalyzer.git\\backend\\latest_bgsrd\\data'

In [51]:
df_text = load_pickle(os.path.join(data_dir, 'df_data2.pkl'))
df_text['text'] = df_text['text'].apply(lambda x: ' '.join(x))
df_text

Unnamed: 0,text,label
0,1/6 covid19 key concerns 🇨🇦 today goc released...,1
1,2/2 sustainable changes including appropriate ...,1
2,rt scmpnews china coronavirus hong kong resear...,0
3,today 2:30 secretarylevine provide update covi...,1
4,yo literally racialize politicize coronavirus ...,0
...,...,...
395,rt nineralex ever drank immune corona virus ht...,0
396,moteging became clear trying scatter us hong k...,0
397,rt peterzeihan clearly immune https //tco/lpc4...,0
398,`` vaccine still least 18 months away meantime...,1


In [52]:
X = df_text.iloc[:, 0:-1]
y = df_text.iloc[:, -1]

In [53]:
display(X)
display(y)

Unnamed: 0,text
0,1/6 covid19 key concerns 🇨🇦 today goc released...
1,2/2 sustainable changes including appropriate ...
2,rt scmpnews china coronavirus hong kong resear...
3,today 2:30 secretarylevine provide update covi...
4,yo literally racialize politicize coronavirus ...
...,...
395,rt nineralex ever drank immune corona virus ht...
396,moteging became clear trying scatter us hong k...
397,rt peterzeihan clearly immune https //tco/lpc4...
398,`` vaccine still least 18 months away meantime...


0      1
1      1
2      0
3      1
4      0
      ..
395    0
396    0
397    0
398    1
399    1
Name: label, Length: 400, dtype: int64

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, shuffle = False)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.1, random_state = 42, shuffle = False) # 0.1 x 0.8 = 0.08

In [55]:
df_train = pd.concat([X_train, y_train], axis=1)
df_train['type'] = 'train'
df_val = pd.concat([X_val, y_val], axis=1)
df_val['type'] = 'val'
df_test = pd.concat([X_test, y_test], axis=1)
df_test['type'] = 'test'

display(df_train)
display(df_val)
display(df_test)

Unnamed: 0,text,label,type
0,1/6 covid19 key concerns 🇨🇦 today goc released...,1,train
1,2/2 sustainable changes including appropriate ...,1,train
2,rt scmpnews china coronavirus hong kong resear...,0,train
3,today 2:30 secretarylevine provide update covi...,1,train
4,yo literally racialize politicize coronavirus ...,0,train
...,...,...,...
283,rt iiideaton god already gave us vaccine disea...,0,train
284,meme false claim us charging 3000 per test ” p...,1,train
285,a_r_j_u_n_e surgeon_general waltmossberg one c...,1,train
286,rt mamachell ok guys let talk coronavirus step...,1,train


Unnamed: 0,text,label,type
288,rt q protect coronavirus https //tco/pkzkao2yf...,1,val
289,additional nine cases covid-19 diagnosed since...,1,val
290,hilowff dynogametheory ok also immune covid 🤷‍♂️,0,val
291,hotel rooms intended people tested positive co...,1,val
292,rt deepstateexpose bill gates patent holder co...,0,val
293,rt ferdiriva killer coronavirus could spread e...,0,val
294,rt tombollyky much done communication potentia...,0,val
295,help cure coronavirus pc leftover processing p...,0,val
296,rt d4publichealth nationalnutritionmonth quara...,1,val
297,`` global polio eradication initiative working...,1,val


Unnamed: 0,text,label,type
320,rt dgolndrcon penpenchase might help https //t...,1,test
321,protect coronavirus research site cdcgov accur...,1,test
322,rt epochchanger comes vaccine mandates health ...,0,test
323,coronvirus today lady china chen wei professio...,0,test
324,rt ryanfc706 people drank water hose kid immun...,0,test
...,...,...,...
395,rt nineralex ever drank immune corona virus ht...,0,test
396,moteging became clear trying scatter us hong k...,0,test
397,rt peterzeihan clearly immune https //tco/lpc4...,0,test
398,`` vaccine still least 18 months away meantime...,1,test


In [58]:
display(df_train.label.value_counts())
display(df_val.label.value_counts())
display(df_test.label.value_counts())

0    145
1    143
Name: label, dtype: int64

1    17
0    15
Name: label, dtype: int64

1    40
0    40
Name: label, dtype: int64

In [48]:
df_text_splitted = pd.concat([df_train, df_val, df_test]).reset_index(drop = True)
df_text_splitted

Unnamed: 0,text,label,type
0,false claim saying 300 sadhus tested covid pos...,1,train
1,rt sonud40718416 nomeat_nocoronavirus disease ...,0,train
2,icmrdelhi guidelines also issued private secto...,1,train
3,3d anatomic modeling lab prints 3d model virus...,1,train
4,rt iiideaton god already gave us vaccine disea...,0,train
...,...,...,...
395,rt foreignpolicy bacillus calmette-guérin bcg ...,0,test
396,addition well-known breathing problems blood c...,1,test
397,gaga proud announce developed vaccine covid 🚬s...,0,test
398,new zealand relied science empathy take covid-...,1,test


In [13]:
G_dict = load_pickle(os.path.join(data_dir, "text_graph2.pkl"))
G = G_dict["graph"]


3376

In [15]:
# adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, train_size, test_size = load_corpus(dataset)
nb_node = G.number_of_nodes()
nb_train, nb_val, nb_test = len(df_train), len(df_val), len(df_test)
nb_word = nb_node - nb_train - nb_val - nb_test 
nb_class = df_text['label'].nunique() # number of classes

2

In [33]:
model = get_model()
model

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertGCN(
  (bert_model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,)

In [37]:
input_ids, attention_mask = encode_input(df_text.text.to_list(), model.tokenizer)

dict_keys(['input_ids', 'attention_mask'])


In [41]:
input_ids.shape

torch.Size([400, 128])

In [38]:
input_ids = th.cat([input_ids[:-nb_test], th.zeros((nb_word, max_length), dtype=th.long), input_ids[-nb_test:]])
attention_mask = th.cat([attention_mask[:-nb_test], th.zeros((nb_word, max_length), dtype=th.long), attention_mask[-nb_test:]])

tensor([[    0,   134,    73,  ...,     1,     1,     1],
        [    0,   176,    73,  ...,     1,     1,     1],
        [    0,  9713,  2850,  ...,     1,     1,     1],
        ...,
        [    0,  9713,   181,  ...,     1,     1,     1],
        [    0, 49519,  9937,  ...,     1,     1,     1],
        [    0,   879,  3368,  ...,     1,     1,     1]])

torch.Size([400, 128])

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])

torch.Size([400, 128])