<a href="https://colab.research.google.com/github/hshuai97/Colab20210803/blob/main/GAT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Using  dgl GATConv to finish the classification task:


Reference:
1. [dgl libraries](https://github.com/dmlc/dgl/blob/master/examples/pytorch/gat/gat.py)
2.collate_fn in DataLoader refer [Francisco Massa]( https://discuss.pytorch.org/t/supplying-arguments-to-collate-fn/25754/2),  [this](https://androidkt.com/create-dataloader-with-collate_fn-for-variable-length-input-in-pytorch/), and [BertGCN](https://github.com/ZeroRin/BertGCN/blob/main/model/models.py)

# Install libraries

In [1]:
import torch
try:
  import dgl
except ModuleNotFoundError:
  CUDA = 'cu' + torch.version.cuda.replace('.','')
  !pip install dgl-{CUDA} -f https://data.dgl.ai/wheels/repo.html

try:
  import word2vec
except ModuleNotFoundError:
  !pip install word2vec

import nltk
nltk.download('punkt')
nltk.download('tagsets')
nltk.download('averaged_perceptron_tagger')

try:
  import transformers
except ModuleNotFoundError:
  !pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.dgl.ai/wheels/repo.html
Collecting dgl-cu113
  Downloading https://data.dgl.ai/wheels/dgl_cu113-0.8.2-cp37-cp37m-manylinux1_x86_64.whl (220.6 MB)
[K     |████████████████████████████████| 220.6 MB 44 kB/s 
Collecting psutil>=5.8.0
  Downloading psutil-5.9.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (281 kB)
[K     |████████████████████████████████| 281 kB 5.1 MB/s 
Installing collected packages: psutil, dgl-cu113
  Attempting uninstall: psutil
    Found existing installation: psutil 5.4.8
    Uninstalling psutil-5.4.8:
      Successfully uninstalled psutil-5.4.8
Successfully installed dgl-cu113-0.8.2 psutil-5.9.1


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting word2vec
  Downloading word2vec-0.11.1.tar.gz (42 kB)
[K     |████████████████████████████████| 42 kB 973 kB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: word2vec
  Building wheel for word2vec (PEP 517) ... [?25l[?25hdone
  Created wheel for word2vec: filename=word2vec-0.11.1-py2.py3-none-any.whl size=156420 sha256=54ecd489430783abe179e9002529d91da47091f59312f249bd9ef2da22637fbc
  Stored in directory: /root/.cache/pip/wheels/c9/c0/d4/29d797817e268124a32b6cf8beb8b8fe87b86f099d5a049e61
Successfully built word2vec
Installing collected packages: word2vec
Successfully installed word2vec-0.11.1


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Unzipping help/tagsets.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.0-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 5.0 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 53.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 72.2 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.2 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Unins

# parsing

In [13]:
%%writefile parsing.py

import os
import time
import argparse
import numpy as np

import torch as th
from torch.nn import functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import  DataLoader

import dgl
from dgl.nn import GATConv

import word2vec

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

from transformers import get_linear_schedule_with_warmup


def Data(dataset_name):
    NAME = dataset_name
    if NAME not in ['20ng', 'r8', 'r52', 'oh', 'mr']:
      raise ValueError('The dataset is not support')

    PATH = '/content/drive/MyDrive/Colab_Notebooks/TextLevelGNN/data/'+NAME

    train_texts = []
    train_labels = []
    with open(os.path.join(PATH, NAME+'-train-stemmed.txt'), 'r') as f:
        data = f.readlines()

        for line in data:
          t = line.split('\t')
          train_texts.append(t[1])
          train_labels.append(t[0])

    dev_texts = []
    dev_labels = []
    with open(os.path.join(PATH, NAME+'-dev-stemmed.txt'), 'r') as f:
        data = f.readlines()

        for line in data:
          t = line.split('\t')
          dev_texts.append(t[1])
          dev_labels.append(t[0])

    test_texts = []
    test_labels = []
    with open(os.path.join(PATH, NAME+'-test-stemmed.txt'), 'r') as f:
        data = f.readlines()

        for line in data:
          t = line.split('\t')
          test_texts.append(t[1])
          test_labels.append(t[0])

    target_names = list(set(train_labels))
    label2idx = {label: idx for idx, label in enumerate(target_names)}

    print(f'Dataset: {NAME}, Total train: {len(train_texts)+len(dev_texts)}, Train size: {len(train_texts)}, Dev size: {len(dev_texts)}, Test size: {len(test_texts)}, Num_class: {len(label2idx)}')
    print(f'labels: {label2idx}')
    print('*'*50)

    return train_texts,  train_labels, dev_texts, dev_labels, test_texts, test_labels, label2idx

def  buildvocab(sample, min_count=5):
  '''
  sample: ['wo xihuan ziran yuyan chuli', 'wo ai shengdu xuexi',  'wo xihuan jiqi xuexi']

  '''

  MIN_COUNT = min_count

  freq = {}
  for i in sample:
    for t in word_tokenize(i):
      if t not in freq:
        freq[t] = 0

  for i in sample:
    for t in word_tokenize(i):
      freq[t] +=1

  del_key = []
  for i in freq:
    if freq[i]<MIN_COUNT:
      del_key.append(i)
  
  for i in del_key:
    freq.pop(i)

  vocab_id = {}
  for i, key in enumerate(freq):
    vocab_id[key] = i
  print(f'vocab_id size: {len(vocab_id)}')
  print('*'*50)
  
  return vocab_id



class GAT(th.nn.Module):
  def __init__(self, vocab, num_layers, in_dim, num_hidden, num_classes, heads, activation, feat_drop, attn_drop, negative_slope, residual, max_length):
    super(GAT, self).__init__()
    self.max_length = max_length
    self.vocab_size = len(vocab)
    self.vocab = vocab

    self.node_hidden = th.nn.Embedding(self.vocab_size, 300)  # (num_vocab+1, num_hidden), include 'unk
    self.node_hidden.weight.data.copy_(th.tensor(self.load_w2v('/content/drive/MyDrive/Colab_Notebooks/TextLevelGNN/model/glove.6B/glove.6B.300d.w2vformat.txt')))
    self.node_hidden.weight.requires_grad = True

    self.gram = 2
    self.vocab = vocab

    self.num_layers = num_layers
    self.gat_layers = th.nn.ModuleList()
    self.activation = activation


    if num_layers > 1:
        # input projection (no residual)
        self.gat_layers.append(GATConv(
            in_dim, num_hidden, heads[0],
            feat_drop, attn_drop, negative_slope, False, self.activation))
        # hidden layers
        for l in range(1, num_layers-1):
            # due to multi-head, the in_dim = num_hidden * num_heads
            self.gat_layers.append(GATConv(
                num_hidden * heads[l-1], num_hidden, heads[l],
                feat_drop, attn_drop, negative_slope, residual, self.activation))
        # output projection
        self.gat_layers.append(GATConv(
            num_hidden * heads[-2], num_classes, heads[-1],
            feat_drop, attn_drop, negative_slope, residual, None))
    else:
        self.gat_layers.append(GATConv(
            in_dim, num_classes, heads[0],
            feat_drop, attn_drop, negative_slope, residual, None))
  
  def load_w2v(self, path):
    w2v = word2vec.load(path)
    embedding_matrix = []
    for word in self.vocab:
      try:
        embedding_matrix.append(w2v[word])
      except KeyError:
        embedding_matrix.append(np.zeros(len(w2v['the'])))
    
    embedding_matrix = np.array(embedding_matrix)
    
    return embedding_matrix

  def add_edges(self, sample,  local_vocab_id):
    sample = sample.cpu().numpy()  # To numpy
    edges = []
    for i, src in enumerate(sample):
      u = local_vocab_id[src]
      for j in range(max(0, i-self.gram), min(i+self.gram +1, len(sample))):
        dst = sample[j]
        v = local_vocab_id[dst]

        edges.append([u, v])
      edges.append([u, u])
    return edges
  
  def sample2graph(self, sample):  # sample: [78, 63, 63, 33, 78,  ...]
    sample = sample.cpu().numpy()  # To numpy
    index = np.where(sample == self.vocab_size-1)[0] # delete <pad> nodes
    if len(index)>0:
      sample = sample[:index[0]]  # Numpy array

    if len(sample) > self.max_length:
      sample = sample[:self.max_length]  # Numpy array

    local_vocab = set(sample)  # {78, 63, 33, ...}
    sample = th.from_numpy(sample).to(DEVICE)

    local_vocab_tensor = th.tensor(list(local_vocab), dtype=th.int, device=DEVICE)
    n = len(local_vocab)
    local_vocab_id = dict(zip(local_vocab, range(n)))  # {78:0, 63:1, 33:2, ...}
    u, v = zip(*self.add_edges(sample, local_vocab_id))

    g = dgl.graph((u, v), num_nodes=n).to(DEVICE)

    g.ndata['h'] = self.node_hidden(local_vocab_tensor)

    return g


  def forward(self, inputs):  # (batch_size, token)

    gs = [self.sample2graph(s) for s in inputs]
    bg =dgl.batch(gs)
    h = (bg.ndata['h']).float()
    
    for l in range(self.num_layers):
        h = self.gat_layers[l](bg, h)
        h = h.flatten(1) if l != self.num_layers - 1 else h.mean(1)

    bg.ndata['h'] = h
    hg =  dgl.mean_nodes(bg, feat='h')  # (batch_size, num_class)

    return hg

def train(model, input, dev_input, epoch, batch_size):
  num_train_steps = int(len(input.dataset)/batch_size) * epoch
  num_warmup_steps = int(0.15 * num_train_steps)
  

  loss_func = th.nn.CrossEntropyLoss(label_smoothing=0.1)
  optimizer = th.optim.Adam(model.parameters(), weight_decay=1e-4, lr=1e-4)
  scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps = num_train_steps)
  
  PATIENCE = 8  # Patience on dev set to finish training
  no_improv = 0  # No improvement on dev set

  best_acc = 0.0
  dur = []
  for e in range(epoch):
    t0 = time.time()
    improved = ''
    model.train()

    for i, ba in enumerate(input):
      b = tuple(t.to(DEVICE) for t in ba)
      x, y = b
      outputs = model(x)
      loss = loss_func(outputs, y)

      optimizer.zero_grad()
      loss.backward()

      optimizer.step()
      scheduler.step()

    val_acc = dev(model, dev_input)
    if val_acc>best_acc:
      best_acc = val_acc
      no_improve = 0
      improved = '*'
      th.save(model, f'/content/drive/MyDrive/Colab_Notebooks/TextLevelGNN/model/gat_{DATASET}.pkl')
    else:
      no_improve+=1
    dur.append(time.time()-t0)
    print(f'Epoch: {e}, Train loss:{loss.item():.4f}, Val acc: {val_acc:.4f}, Times: {np.mean(dur):.4f}s, {improved}')

    if no_improve>=PATIENCE:
      print(f'No improvement on dev set, early stopping')
      break

def dev(model, input):
  model.eval()
  total_pred = 0.0
  correct = 0.0
  for i, ba in enumerate(input):
    b = tuple(t.to(DEVICE) for t in ba)
    x, y = b

    with th.no_grad():
      outputs = model(x)
      pred = th.argmax(outputs, dim=1)
      
      correct_pred = th.sum(pred==y)
      correct += correct_pred
      total_pred += len(y)
  
  return th.div(correct, total_pred)  # Acc on dev set

def test(model, input):
  model.eval()
  total_pred = 0.0
  correct = 0.0
  for i, ba in enumerate(input):
    b = tuple(t.to(DEVICE) for t in ba)
    x, y = b

    with th.no_grad():
      outputs = model(x)
      pred = th.argmax(outputs, dim=1)
      
      correct_pred = th.sum(pred==y)
      correct += correct_pred
      total_pred += len(y)
  
  return th.div(correct, total_pred)  # Acc on dev set

def word2token(words, vocab_id):  # words: ['i', 'am', 'a', 'word'] --> [0, 1, 2, 44, ...]
  token = []
  for w in words:
    if w in vocab_id:
      token.append(vocab_id[w])
    else:
      token.append(vocab_id['<unk>'])
  return token

def batch(x, y, batch_size, shuffle=False):
    if len(x)!=len(y):
      raise ValueError('# x not equal y')
    
    te_x = [x[j: j+batch_size] for j in range(0, len(x), batch_size)]
    te_y = [y[j: j+batch_size] for j in range(0, len(y), batch_size)]

    output = [[te_x[j], te_y[j]] for j in range(len(te_x))]

    return output

class MyCollator(object):  # Refer Francisco Massa: https://discuss.pytorch.org/t/supplying-arguments-to-collate-fn/25754/2
    def __init__(self, vocab_id):
        self.vocab_id = vocab_id
    def __call__(self, samples):
      x, y, = [], []
      for (_text,_label) in samples:
        y.append(_label)
        processed_text = th.tensor(_text, dtype=th.int64)
        x.append(processed_text)
      y = th.tensor(y, dtype=th.int64)
      x = pad_sequence(x, batch_first=True, padding_value=self.vocab_id['<pad>'])

      return x, y

parser = argparse.ArgumentParser()
parser.add_argument('--dataset', required=True, type=str, default='mr',help='dataset name')
parser.add_argument('--max_len', required=True, type=int)
parser.add_argument('--min_count', required=True, type=int, default=5)
parser.add_argument('--epoch', required=True, type=int, default=50)
args = parser.parse_args()

DATASET = args.dataset
MAX_LENGTH = args.max_len
EPOCH = args.epoch
MIN_COUNT = args.min_count
NUM_LAYER = 2

SEED = 42
th.manual_seed(SEED)
th.cuda.manual_seed(SEED)

DEVICE = th.device('cuda:0' if th.cuda.is_available() else 'cpu')
if th.cuda.is_available():
  print(f'device: {DEVICE}')
  print(f'name: {th.cuda.get_device_name(0)}')
  print(f'memory: {th.cuda.get_device_properties(0).total_memory/1e9}')
  print(f'*'*50)


train_texts,  train_labels, dev_texts, dev_labels, test_texts, test_labels, label2idx = Data(DATASET)
NUM_CLASS = len(label2idx)

vocab_id = buildvocab(train_texts, min_count=MIN_COUNT)
vocab_id['<unk>']=len(vocab_id)  # for OOV
vocab_id['<pad>'] = len(vocab_id)
print(f'vocab id: {vocab_id}')
print(f'*'*50)

tr_x = [word2token(word_tokenize(t), vocab_id) for t in train_texts]
tr_y = th.tensor([label2idx[t] for t in train_labels])
de_x = [word2token(word_tokenize(t), vocab_id) for t in dev_texts]
de_y = th.tensor([label2idx[t] for t in dev_labels])
te_x = [word2token(word_tokenize(t), vocab_id) for t in test_texts]
te_y = th.tensor([label2idx[t] for t in test_labels])

train_data = [(tr_x[i], tr_y[i]) for i in range(len(tr_y))]
dev_data = [(de_x[i], de_y[i]) for i in range(len(de_y))]
test_data = [(te_x[i], te_y[i]) for i in range(len(te_y))]
#train_data = batch(tr_x, tr_y, batch_size=128)
#dev_data = batch(de_x, de_y, batch_size=128)
#test_data = batch(te_x, te_y, batch_size=128)

my_collator = MyCollator(vocab_id)
train_dataloader = DataLoader(train_data, batch_size=32, collate_fn=my_collator, shuffle=True)
dev_dataloader = DataLoader(dev_data, batch_size=32, collate_fn=my_collator, shuffle=False)
test_dataloader = DataLoader(test_data, batch_size=32, collate_fn=my_collator, shuffle=False)

model = GAT(vocab_id, num_layers=NUM_LAYER, in_dim=300, num_hidden=600,  num_classes=NUM_CLASS, heads=[9]*(NUM_LAYER-1)+[1]
               , activation=F.elu, feat_drop=0.1, attn_drop=0.1, negative_slope=0.2, residual=False, max_length=MAX_LENGTH)
model.to(DEVICE)

train(model, train_dataloader, dev_dataloader, epoch=EPOCH, batch_size=32)

best_model = th.load(f'/content/drive/MyDrive/Colab_Notebooks/TextLevelGNN/model/gat_{DATASET}.pkl')
res = test(best_model, test_dataloader)
print(f'Test accuracy: {res.cpu().numpy():.4f}')

Overwriting parsing.py


# run

In [16]:
!python parsing.py --dataset='oh' --min_count=1  --max_len=200 --epoch=100

device: cuda:0
name: Tesla P100-PCIE-16GB
memory: 17.071734784
**************************************************
Dataset: oh, Total train: 3357, Train size: 3021, Dev size: 336, Test size: 4043, Num_class: 23
labels: {'C13': 0, 'C03': 1, 'C11': 2, 'C19': 3, 'C05': 4, 'C12': 5, 'C18': 6, 'C17': 7, 'C06': 8, 'C04': 9, 'C21': 10, 'C08': 11, 'C23': 12, 'C14': 13, 'C22': 14, 'C20': 15, 'C07': 16, 'C15': 17, 'C10': 18, 'C01': 19, 'C02': 20, 'C09': 21, 'C16': 22}
**************************************************
vocab_id size: 20957
**************************************************
**************************************************
Epoch: 0, Train loss:3.1185, Val acc: 0.1667, Times: 8.4254s, *
Epoch: 1, Train loss:3.0258, Val acc: 0.1756, Times: 7.6782s, *
Epoch: 2, Train loss:3.0190, Val acc: 0.1756, Times: 7.4393s, 
Epoch: 3, Train loss:2.5441, Val acc: 0.1815, Times: 7.3103s, *
Epoch: 4, Train loss:2.7935, Val acc: 0.2113, Times: 7.2782s, *
Epoch: 5, Train loss:3.0591, Val acc: 0.2679,

In [None]:

text = 'the whole affair , true story or not , feels incredibly hokey . ..  it comes off like a hallmark commercial .'
sentences = sent_tokenize(text)
print(f'sentences: {sentences}')

words = word_tokenize(sentences[0])
print(f'words: {words}')

pos = pos_tag(words)
print(f'pos: {pos}')

In [None]:
import numpy as np

a = [[1,2,3],[4,5,6]]
b = np.array(a)
b

array([[1, 2, 3],
       [4, 5, 6]])