In [None]:
import ast
import itertools
import numbers

import numpy as np
import pandas as pd
import torch_geometric.transforms as T
from nltk.parse.corenlp import CoreNLPParser, CoreNLPDependencyParser
from torch_geometric.datasets import TUDataset, OGB_MAG
from tqdm.auto import trange, tqdm

# preprocessing 

In [None]:
def create_dataset(df_ccat, num_authors_to_pick = None, picked_author_ids = None, num_sent_per_text = None, save_folder = None, train=True):
    """Build an (author, text) dataset of sliding sentence windows.

    Each document in ``df_ccat['text']`` is split on newlines, every line is
    stripped, and empty lines are discarded. Every run of
    ``num_sent_per_text`` consecutive lines then becomes one sample labelled
    with the document's ``author_id``.

    Args:
        df_ccat: DataFrame with at least the columns ``'author_id'`` and ``'text'``.
        num_authors_to_pick: Number of authors to sample (without replacement,
            using NumPy's global RNG) when ``picked_author_ids`` is not given.
        picked_author_ids: Explicit author ids to keep; takes precedence over
            ``num_authors_to_pick``.
        num_sent_per_text: Number of consecutive lines joined into one sample.
        save_folder: If given, the dataset is also written there as CSV.
        train: Selects the ``'train'`` / ``'val'`` suffix of the CSV file name.

    Returns:
        ``df`` when ``save_folder`` is falsy, otherwise ``(df, file_name)``,
        where ``df`` has the columns ``'author'`` and ``'text'``.

    Raises:
        ValueError: If ``num_sent_per_text`` is missing or < 1, or if neither
            ``picked_author_ids`` nor ``num_authors_to_pick`` is provided.
    """
    if num_sent_per_text is None or num_sent_per_text < 1:
        raise ValueError("num_sent_per_text must be a positive integer")
    # `is None or len(...) == 0` instead of `not ...` so array-likes work too.
    if picked_author_ids is None or len(picked_author_ids) == 0:
        if num_authors_to_pick is None:
            raise ValueError("provide either picked_author_ids or num_authors_to_pick")
        unique_authors = list(df_ccat['author_id'].unique())
        picked_author_ids = sorted(np.random.choice(unique_authors, replace=False, size=num_authors_to_pick).tolist())
    authors = []
    texts = []
    for author in picked_author_ids:
        author_docs = df_ccat.loc[df_ccat['author_id'] == author, 'text']
        for raw_doc in author_docs:
            # Drop *all* blank lines; list.remove('') only removed the first
            # one and raised ValueError when the document had none.
            sentences = [line.strip() for line in raw_doc.split('\n')]
            sentences = [sent for sent in sentences if sent]
            # +1 so the final window (ending on the last sentence) is kept.
            for start in range(len(sentences) - num_sent_per_text + 1):
                authors.append(author)
                texts.append(' '.join(sentences[start:start + num_sent_per_text]))
    df = pd.DataFrame({'author':authors, 'text':texts})
    if save_folder:
        str_author = ','.join(map(str, picked_author_ids))
        file_name = f"author_{str_author}_sent_{num_sent_per_text}_{'train' if train else 'val'}.csv"
        df.to_csv(f"{save_folder}/{file_name}", index=False)
        return df, file_name
    return df

In [None]:
# Build the sentence-window training set for the selected authors and save it as CSV.
picked_author_ids = [0, 1]
num_sent_per_text = 2
save_folder = '../../data/CCAT50/processed/'
df_ccat = pd.read_csv('../../data/CCAT50/processed/CCAT50_train.csv')
df, file_name = create_dataset(
    df_ccat,
    picked_author_ids=picked_author_ids,
    num_sent_per_text=num_sent_per_text,
    save_folder=save_folder,
)

In [None]:
# Same procedure for the validation split; train=False selects the 'val' file-name suffix.
picked_author_ids = [0, 1]
num_sent_per_text = 2
save_folder = '../../data/CCAT50/processed/'
df_ccat = pd.read_csv('../../data/CCAT50/processed/CCAT50_AA_val.csv')
df, file_name = create_dataset(
    df_ccat,
    picked_author_ids=picked_author_ids,
    num_sent_per_text=num_sent_per_text,
    save_folder=save_folder,
    train=False,
)

In [None]:
# Dependency-parser client pointed at http://localhost:9000; used as a global by get_dep_edges.
# NOTE(review): presumably a CoreNLP server must already be listening there — its startup is not shown in this notebook.
depparser = CoreNLPDependencyParser(url='http://localhost:9000')

In [None]:
def get_dep_edges(texts):
    """Dependency-parse every text and collect its edges and tag sequence.

    Each text is sent to the module-level ``depparser``; only the first parse
    it yields is used, rendered as 4-column CoNLL (tab-separated). For the
    k-th CoNLL row (1-based) the function records:

    * ``[k, head]``            in the unlabeled edge list,
    * ``[k, head, label]``     in the labeled edge list,
    * column 1 of the row      in the tag sequence,

    where ``head`` is column 2 parsed as an int and ``label`` is column 3.

    Args:
        texts: Iterable of strings to parse.

    Returns:
        Tuple ``(homo_edges, hetoro_edges, pos_seqs)`` of three lists, each
        holding one entry per input text.
    """
    all_plain_edges, all_labeled_edges, all_tags = [], [], []
    for sentence in tqdm(texts):
        graph = next(depparser.raw_parse(sentence))
        # The CoNLL string ends with a newline, so the last split item is dropped.
        rows = graph.to_conll(4).split('\n')[:-1]
        plain_edges, labeled_edges, tags = [], [], []
        for node_id, row in enumerate(rows, start=1):
            fields = row.split('\t')
            head = int(fields[2])
            plain_edges.append([node_id, head])
            labeled_edges.append([node_id, head, fields[3]])
            tags.append(fields[1])
        all_plain_edges.append(plain_edges)
        all_labeled_edges.append(labeled_edges)
        all_tags.append(tags)
    return all_plain_edges, all_labeled_edges, all_tags

In [None]:
# # processing train set
# file = '../../data/CCAT50/processed/author_0,1_sent_2.csv'
# df = pd.read_csv(file)
# homo_edges, hetoro_edges, pos_seqs = get_dep_edges(df['text'])
# df['homo_edges'] = homo_edges
# df['hetoro_edges'] = hetoro_edges
# df['pos_seqs'] = pos_seqs
# df.to_csv(file, index=False)

  0%|          | 0/1299 [00:00<?, ?it/s]

In [None]:
# # processing val set
# file = '../../data/CCAT50/processed/author_0,1_sent_2_val.csv'
# df_val = pd.read_csv(file)
# homo_edges, hetoro_edges, pos_seqs = get_dep_edges(df_val['text'])
# df_val['homo_edges'] = homo_edges
# df_val['hetoro_edges'] = hetoro_edges
# df_val['pos_seqs'] = pos_seqs
# df_val.to_csv(file, index=False)

  0%|          | 0/324 [00:00<?, ?it/s]

In [None]:
# Load the pre-parsed training set.
# NOTE(review): create_dataset(..., train=True) writes 'author_0,1_sent_2_train.csv',
# whereas this cell reads 'author_0,1_sent_2.csv' — confirm which file is intended.
file = '../../data/CCAT50/processed/author_0,1_sent_2.csv'
df = pd.read_csv(file)
# These columns are parsed back from their string form into Python lists.
for col in ('homo_edges', 'hetoro_edges', 'pos_seqs'):
    df[col] = df[col].apply(ast.literal_eval)

In [None]:
# Load the pre-parsed validation set.
file = '../../data/CCAT50/processed/author_0,1_sent_2_val.csv'
df_val = pd.read_csv(file)
# These columns are parsed back from their string form into Python lists.
for col in ('homo_edges', 'hetoro_edges', 'pos_seqs'):
    df_val[col] = df_val[col].apply(ast.literal_eval)

In [None]:
def freeze_model(model, freeze_bert):
    '''
    Disable gradients for (part of) a BERT-style model and return it.

    if freeze_bert is True, freeze all layers.
    if freeze_bert is False or None, nothing is frozen.
    if freeze_bert is an integer, freeze the embeddings and the encoder layers
    selected by model.encoder.layer[:freeze_bert] (so a positive value freezes
    the bottom {freeze_bert} attention layers; negative values follow normal
    slice semantics).

    Raises TypeError for any other type of freeze_bert.
    '''
    # Booleans must be handled before integers: bool is a subclass of int and
    # `1 == True`, so the previous `freeze_bert==True` test froze the whole
    # model for freeze_bert=1, and False fell into the integer branch.
    if isinstance(freeze_bert, (bool, np.bool_)):
        if freeze_bert:
            for param in model.parameters():
                param.requires_grad = False
    elif freeze_bert is None:
        pass
    # numbers.Integral covers Python ints and NumPy integer scalars.
    elif isinstance(freeze_bert, numbers.Integral):
        for param in model.embeddings.parameters():
            param.requires_grad = False
        for layer in model.encoder.layer[:freeze_bert]:
            for param in layer.parameters():
                param.requires_grad = False
    else:
        raise TypeError(
            f"freeze_bert must be a bool, None or an integer, got {type(freeze_bert).__name__}"
        )
    return model

In [None]:
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from transformers import AutoTokenizer
from transformers.models.bert.modeling_bert import BertModel
import torch

In [None]:
# Directory of the pretrained checkpoint; the tokenizer and the BERT encoder are both loaded from it.
# NOTE(review): hard-coded absolute path — consider moving it to a configurable constant.
checkpoint = '/scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_8/checkpoint-155000/'
# local_files_only=True: load from the directory above only.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, local_files_only=True)

In [None]:
# Load the encoder from the checkpoint without the pooling layer (add_pooling_layer=False).
bert = BertModel.from_pretrained(checkpoint, local_files_only=True, add_pooling_layer = False)
# bert = freeze_model(bert, True)
# eval() switches the module to inference mode; it does not by itself disable gradient tracking.
bert = bert.eval()

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_8/checkpoint-155000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Build one graph per training sample:
#   edge_index <- dependency edges ('homo_edges'), laid out as [2, num_edges]
#   x          <- BERT last hidden states of the tag sequence ('pos_seqs')
#   y          <- author label
# NOTE(review): assumes one BERT token per node index used in homo_edges and that
# no sequence is truncated at max_length=128 — otherwise edge_index can point
# past the rows of x; confirm.
data_list = []
# The embeddings are fixed, precomputed node features, so no autograd graph
# through BERT is needed. Without no_grad every sample would keep its graph
# alive, and a second backward pass through it would fail.
with torch.no_grad():
    for i in range(len(df)):
        curr = df.iloc[i]
        tokens = tokenizer(' '.join(curr['pos_seqs']), padding=True, truncation=True, max_length=128, return_tensors='pt')
        data = Data(
            x=bert(**tokens).last_hidden_state.squeeze(0),
            # reshape(-1, 2) keeps the [2, E] layout even for an empty edge list.
            edge_index=torch.tensor(curr['homo_edges'], dtype=torch.long).reshape(-1, 2).t().contiguous(),
            y=torch.tensor([curr['author']]),
        )
        data_list.append(data)

# Rows are grouped by author (create_dataset appends author by author), so an
# unshuffled loader would yield single-class mini-batches during training.
train_loader = DataLoader(data_list, batch_size=32, shuffle=True)


In [None]:
# Build one graph per validation sample (same recipe as for the training set):
#   edge_index <- dependency edges ('homo_edges'), laid out as [2, num_edges]
#   x          <- BERT last hidden states of the tag sequence ('pos_seqs')
#   y          <- author label
# NOTE(review): assumes one BERT token per node index used in homo_edges and that
# no sequence is truncated at max_length=128 — confirm.
data_list = []
# Precomputed features only: no autograd graph through BERT should be kept per sample.
with torch.no_grad():
    for i in range(len(df_val)):
        curr = df_val.iloc[i]
        tokens = tokenizer(' '.join(curr['pos_seqs']), padding=True, truncation=True, max_length=128, return_tensors='pt')
        data = Data(
            x=bert(**tokens).last_hidden_state.squeeze(0),
            # reshape(-1, 2) keeps the [2, E] layout even for an empty edge list.
            edge_index=torch.tensor(curr['homo_edges'], dtype=torch.long).reshape(-1, 2).t().contiguous(),
            y=torch.tensor([curr['author']]),
        )
        data_list.append(data)

# Evaluation data is iterated in its original order (no shuffling).
test_loader = DataLoader(data_list, batch_size=32)


In [None]:
from dataclasses import dataclass
@dataclass
class myGCNoutput:
    """Bundle of the values produced by one ``myGCN.forward`` call."""
    # Result of the model's loss function on (logit, y).
    loss: "Optional[torch.Tensor]"
    # Output of the classifier layer.
    logit: "torch.Tensor"
    # Readout embedding (after dropout) that was fed to the classifier.
    emb: "torch.Tensor"

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool

# Default node-feature width used by myGCN when no emb_dim is passed.
pos_emb_dim = 32
class myGCN(torch.nn.Module):
    """Stack of GCN layers followed by a graph-level readout and a linear classifier.

    Args:
        num_gcn: Number of ``GCNConv`` layers (each maps emb_dim -> emb_dim).
        num_class: Number of output classes of the classifier.
        emb_dim: Width of the node features; defaults to the module-level
            ``pos_emb_dim`` when omitted.
    """

    def __init__(self, num_gcn, num_class, emb_dim=None):
        super().__init__()
        if emb_dim is None:
            emb_dim = pos_emb_dim
        self.num_gcn = num_gcn
        self.num_class = num_class
        self.emb_dim = emb_dim

        self.gcns = nn.ModuleList([GCNConv(emb_dim, emb_dim) for _ in range(num_gcn)])

        self.classifier = nn.Linear(emb_dim, num_class)
        self.lossfn = nn.CrossEntropyLoss()

    def forward(self, x, edge_index, batch, y=None, ptr=None, readout='pool'):
        """Run the GCN stack, pool per graph, classify, and optionally compute the loss.

        Args:
            x: Node feature matrix.
            edge_index: Graph connectivity passed to every ``GCNConv``.
            batch: Node-to-graph assignment used by ``global_mean_pool``.
            y: Target labels; when ``None`` no loss is computed.
            ptr: Index tensor whose entries except the last select one node
                row per graph for the ``'cls'`` readout (presumably the
                per-graph start offsets of a PyG batch — confirm).
            readout: ``'pool'`` (mean over each graph's nodes) or ``'cls'``.

        Returns:
            ``myGCNoutput`` with ``loss`` (``None`` if ``y`` is ``None``),
            ``logit`` and ``emb`` (the readout after dropout).

        Raises:
            ValueError: For an unknown ``readout`` or ``'cls'`` without ``ptr``.
        """
        # Fail fast: previously an unknown readout silently skipped pooling and
        # produced node-level logits that do not match graph-level labels.
        if readout not in ('pool', 'cls'):
            raise ValueError(f"readout must be 'pool' or 'cls', got {readout!r}")
        if readout == 'cls' and ptr is None:
            raise ValueError("readout='cls' requires ptr")

        for conv in self.gcns:
            x = F.relu(conv(x, edge_index))

        if readout == 'pool':
            x = global_mean_pool(x, batch)
        else:
            x = x[ptr[:-1], :]

        x = F.dropout(x, training=self.training)
        logit = self.classifier(x)
        # Labels are optional so the model can also be used for pure inference.
        loss = self.lossfn(logit, y) if y is not None else None
        return myGCNoutput(loss=loss, logit=logit, emb=x)

In [None]:
# Hyper-parameter grid: number of GCN layers x learning rate.
NUM_GCN = [1, 2, 3, 4]
LR = [1e-4, 5e-4, 1e-5]
# itertools.product needs the two lists to enumerate every (num_gcn, lr) pair,
# and the loop needs a body to be valid Python.
for num_gcn, lr in itertools.product(NUM_GCN, LR):
    # TODO: train and evaluate a model for this (num_gcn, lr) configuration.
    pass

In [None]:
# Instantiate the classifier with num_gcn=2 GCN layers and num_class=2 output classes.
model = myGCN(2,2)

In [None]:
# Single forward pass with the mean-pool readout.
# NOTE(review): `data` must expose .batch and .ptr here, so it is presumably a mini-batch drawn
# from a DataLoader in a cell that is not shown — confirm before Restart & Run All.
output = model(data.x, data.edge_index, data.batch, data.y, data.ptr, readout='pool')


myGCNoutput(loss=tensor(0.6419, grad_fn=<NllLossBackward0>), logit=tensor([[ 0.1829,  0.0710],
        [ 0.1230, -0.0620],
        [ 0.0836,  0.0427],
        [ 0.4229,  0.0772],
        [ 0.1335,  0.1438],
        [ 0.3159,  0.3549],
        [ 0.2732,  0.0921],
        [ 0.4138,  0.0917],
        [ 0.4260,  0.0989],
        [ 0.1684,  0.2587],
        [ 0.3383,  0.1021],
        [ 0.3610,  0.1949],
        [ 0.1934,  0.3034],
        [ 0.2862,  0.0408],
        [ 0.4418,  0.1869],
        [ 0.3754, -0.0122],
        [ 0.4137,  0.4449],
        [ 0.2167,  0.0185],
        [ 0.3180,  0.3312],
        [ 0.0681,  0.0132],
        [ 0.4914,  0.1193],
        [ 0.0255,  0.0149],
        [ 0.0727,  0.0960],
        [ 0.0630,  0.2063],
        [ 0.1948,  0.0625],
        [ 0.1778, -0.0577],
        [ 0.1658, -0.0850],
        [ 0.2723,  0.2755],
        [ 0.3028,  0.2816],
        [ 0.1967,  0.3544],
        [ 0.1140,  0.0819],
        [ 0.1865,  0.1038]], grad_fn=<AddmmBackward0>), emb=tenso