<a href="https://colab.research.google.com/github/focusondoingonething/mypapercode/blob/main/MyModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Prerequisites

'readme':

1) This is the code of paper: "An Improved Graph Neural Network Frame for Text Classification".

2) Open this notebook in Google Colab, click "Connect", and run all cells to reproduce the results (replace the dataset file paths with your own).

3) The original training set and test set are predetermined, so k-fold cross-validation is not applicable. Instead, this experiment shuffles the data 5 times and, after each shuffle, runs the model 5 times and takes the average.

Import or Install Libraries:

In [None]:
# Import Libraries
import os
import re
import time
import numpy as np
import random
import pandas as pd
import torch as th
from sklearn.model_selection import train_test_split  # for train and dev set split
from torch.nn import functional as F
import matplotlib.pyplot as plt
from datetime import datetime


# Install and Import Libraries
try:
  import word2vec
except ModuleNotFoundError:
  !pip install word2vec
  import word2vec

try:
  import nnsplit
except ModuleNotFoundError:
  !pip install nnsplit

try:
  import torch_scatter
except ModuleNotFoundError:
  TORCH = th.__version__.split('+')[0]
  CUDA = 'cu' + th.version.cuda.replace('.','')
  !pip install torch-scatter     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html

try:
  import dgl
except ModuleNotFoundError:
  CUDA = 'cu' + th.version.cuda.replace('.','')
  !pip install dgl-{CUDA} -f https://data.dgl.ai/wheels/repo.html
  import dgl

import nltk
nltk.download('punkt')
nltk.download('tagsets')
nltk.download('averaged_perceptron_tagger')

from nltk.tokenize import word_tokenize
from dgl.nn import GATConv


# Set Random Seed
# Seed every global RNG this notebook imports so that repeated runs are
# comparable: PyTorch (CPU and CUDA), Python's `random` (used by Batch to
# shuffle the training samples) and NumPy's global generator, which was
# previously left unseeded.
SEED = 42
th.manual_seed(SEED)
th.cuda.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

#Dataset Preprocessing

##Clean Text

Just run it once.

In [None]:
# Clean Data
def Clean_Text(text):
  """Normalise one raw document into lower-case text.

  The text is lower-cased and stripped, common English contractions are
  expanded, every character other than letters and ``( ) . ? ! ' ` `` is
  replaced by a space, then parentheses, apostrophes and backticks are
  removed and runs of whitespace are collapsed to a single space.

  Args:
    text: raw document string.

  Returns:
    The cleaned string. It may still end with a single space, because
    stripping happens before the substitutions.
  """
  # Applied strictly in this order: contractions must be expanded while the
  # apostrophes they are recognised by are still present.
  rewrites = (
    (r"\'s", " is"),
    (r"\'ve", " have"),
    (r"n\'t", " not"),
    (r"\'re", " are"),
    (r"\'d", " would"),
    (r"\'ll", " will"),
    (r"[^A-Za-z().?!\'\`]", " "),
    (r"\(", ""),
    (r"\)", ""),
    (r"\'", ""),
    (r'`', ''),
    (r"\s{2,}", " "),
  )

  cleaned = text.lower().strip()
  for pattern, replacement in rewrites:
    cleaned = re.sub(pattern, replacement, cleaned)

  return cleaned

def Clean_and_Save(path, dataset_name):
  """Clean one raw corpus and write its train/test splits as TSV files.

  Reads ``<path>/<dataset_name>.txt`` (one document per line) and
  ``<path>/<dataset_name>_mapping.txt`` (one tab-separated line per document
  whose last two columns are the split name and the class label).  Each
  document is cleaned with Clean_Text and written as ``label<TAB>text`` to
  ``<path>/<dataset_name>-stemmed.txt`` (train) or
  ``<path>/<dataset_name>-test-stemmed.txt`` (test).

  The output now goes to ``path`` instead of a hard-coded Drive directory
  that ignored the argument; for the directory used by this notebook the
  resulting files are the same.

  Args:
    path: directory that holds the raw files and receives the cleaned ones.
    dataset_name: one of '20ng', 'r8', 'r52', 'oh', 'mr'.

  Raises:
    ValueError: if ``dataset_name`` is not a supported corpus.
    Exception: if the corpus and mapping files differ in length.
  """
  NAME = dataset_name
  if NAME not in ['20ng', 'r8', 'r52', 'oh', 'mr']:
    raise ValueError('The dataset is not support')

  with open(os.path.join(path, NAME+'.txt'), encoding='latin1') as f:
    data = f.readlines()
  print(f'{NAME}, Total sample: {len(data)}')
  with open(os.path.join(path, NAME+'_mapping.txt'), encoding='latin1') as f:
    mapping = f.readlines()
  print(f'{NAME}, Total mapping: {len(mapping)}')
  if len(mapping) != len(data):
    raise Exception('Map size not equal to data')

  test_splits = ('20news-bydate-test', 'test')
  train_splits = ('20news-bydate-train', 'train', 'training')

  # Clean Sample and Save
  tra_x, tra_y = [], []
  tes_x, tes_y = [], []
  for raw_text, meta in zip(data, mapping):
    fields = meta.split('\t')
    if len(fields) < 2:
      continue  # no split column on this row: it belongs to neither set
    split_name = fields[-2]
    label = fields[-1].replace('\n', '')
    if split_name in test_splits:
      xs, ys = tes_x, tes_y
    elif split_name in train_splits:
      xs, ys = tra_x, tra_y
    else:
      continue  # unknown split name: row is dropped, as before
    xs.append(Clean_Text(raw_text).strip())  # X
    ys.append(label)  # y

  print(f'Total train: {len(tra_x)}')
  print(f'Total test: {len(tes_x)}')

  # Save as header-less, tab-separated "label<TAB>text" rows
  df_tra = pd.DataFrame({0: tra_y, 1: tra_x})
  df_tes = pd.DataFrame({0: tes_y, 1: tes_x})
  df_tra.to_csv(os.path.join(path, f'{NAME}-stemmed.txt'), index=False, sep='\t', header=False)
  df_tes.to_csv(os.path.join(path, f'{NAME}-test-stemmed.txt'), index=False, sep='\t', header=False)

path = '/content/drive/MyDrive/Colab_Notebooks/TextLevelGNN/data/'
# Clean every supported corpus once and report how long each one took.
# To process a single corpus, skip the others here,
# e.g. `if name not in ["oh"]: continue`.
for name in ('20ng', 'r8', 'r52', 'oh', 'mr'):
  t0 = time.time()
  Clean_and_Save(path, name)
  elapsed = time.time() - t0
  print(f'Time: {elapsed:.4f}s')
  print('='*50)


##Convert to Token

Text to Token

In [None]:
# Build Vocab on All Train Sample
def BuildVocab(TrainSample, min_count=5):
  """Build the token -> id vocabulary from the training documents.

  Tokens are produced by ``word_tokenize``.  A token is kept when it occurs
  at least ``min_count`` times.  Ids are contiguous (0..len-1) and follow
  the order in which tokens first appear, which is also the dict's
  iteration order.  The out-of-vocabulary entry "unk" is always present.

  Fixes over the previous version:
    * the first occurrence of a token was counted as 0, so a token needed
      ``min_count + 1`` occurrences to survive;
    * an empty vocabulary raised NameError when assigning the "unk" id;
    * if "unk" was itself a frequent corpus token, its id was overwritten
      with ``len(vocab)``, an index outside the embedding table.

  Args:
    TrainSample: iterable of document strings.
    min_count: minimum number of occurrences for a token to be kept.

  Returns:
    dict mapping each kept token (plus "unk") to its integer id.
  """
  freq = {}
  for x in TrainSample:
    for t in word_tokenize(x):
      freq[t] = freq.get(t, 0) + 1

  vocab_id = {}
  for token, count in freq.items():
    if count >= min_count:
      vocab_id[token] = len(vocab_id)

  # Unknown word index (OOV); reuse the existing id when "unk" is a real token
  if "unk" not in vocab_id:
    vocab_id["unk"] = len(vocab_id)

  print(f'vocab size: {len(vocab_id)}')
  print('='*50)

  return vocab_id


# Convert Word to ID Number
def X2Id(X, vocab):
  """Map one document string to a list of vocabulary ids.

  Args:
    X: document text; it is split with ``word_tokenize``.
    vocab: dict token -> id; its "unk" entry is used for unseen tokens.

  Returns:
    List of ids, one per token, in document order.
  """
  return [vocab[w] if w in vocab else vocab["unk"] for w in word_tokenize(X)]


# Convert String Tex to Token Number
def Text2Token(vocab, text_list):
  """Convert every document in ``text_list`` to its id sequence via X2Id.

  Args:
    vocab: dict token -> id, passed through to X2Id.
    text_list: iterable of document strings.

  Returns:
    List with one id list per document, in input order.
  """
  return [X2Id(doc, vocab) for doc in text_list]


# Split Train and Dev Set
def Tra_Dev_Set(all_train_text, all_train_label, dataset_name):
  """Split the full training data into a train part and a dev part.

  "r52" holds out 6% without stratification; every other dataset holds out
  10%, stratified by label.  Both cases shuffle with a fixed random_state of
  42, so the split is the same on every call.
  NOTE(review): presumably r52 is left unstratified because some of its
  classes are too small to stratify - confirm.

  Returns:
    (train_x, dev_x, train_y, dev_y)
  """
  if dataset_name == "r52":
    split_options = {"test_size": 0.06}
  else:
    split_options = {"test_size": 0.1, "stratify": all_train_label}

  train_x, dev_x, train_y, dev_y = train_test_split(
      all_train_text, all_train_label, shuffle=True, random_state=42, **split_options)

  return train_x, dev_x, train_y, dev_y


# Shuffle All Train Sample, Split and Token
def MyDataset(dataset_name):
  """Load one cleaned dataset, split off a dev set and tokenize everything.

  Reads ``/content/data/<name>-stemmed.txt`` and
  ``/content/data/<name>-test-stemmed.txt`` (rows of ``label<TAB>text``),
  splits the training rows with Tra_Dev_Set, builds the vocabulary on the
  train part only and converts texts and labels to integer ids.

  Fixes over the previous version:
    * the label -> index map is built from the sorted union of the labels
      of all splits.  It used to come from ``set(train_label)``, whose order
      changes between interpreter runs and which raised KeyError when a
      label appeared only in the dev or test split;
    * rows without text are skipped (and counted) instead of raising
      IndexError; the model cannot build a graph for an empty document.

  Args:
    dataset_name: one of '20ng', 'r8', 'r52', 'oh', 'mr'.

  Returns:
    (train_text_tok, train_label_tok, dev_text_tok, dev_label_tok,
     test_text_tok, test_label_tok, number_of_classes, vocab)

  Raises:
    ValueError: if ``dataset_name`` is not a supported corpus.
  """
  NAME = dataset_name
  if NAME not in ['20ng', 'r8', 'r52', 'oh', 'mr']:
    raise ValueError('The dataset is not support')

  frame = {"all_train_text":[], "all_train_label":[], "test_text":[], "test_label":[]}
  PATH = '/content/data/'
  skipped = 0
  for t1,t2,t3 in [("-stemmed.txt", "all_train_text", "all_train_label"),
                      ("-test-stemmed.txt", "test_text", "test_label")]:
    with open(os.path.join(PATH, NAME+t1), 'r') as f:
      for line in f:
        temp = line.strip().split('\t')
        if len(temp) < 2:
          skipped += 1  # label without text (or blank line)
          continue
        frame[t2].append(temp[1])  # X
        frame[t3].append(temp[0])  # y
  if skipped:
    print(f'Skipped {skipped} line(s) without text')

  train_text, dev_text, train_label, dev_label = Tra_Dev_Set(frame["all_train_text"], frame["all_train_label"], NAME)
  test_text, test_label = frame["test_text"], frame["test_label"]

  # Sorted so that the same label always gets the same index
  num_class = sorted(set(frame["all_train_label"]) | set(test_label))
  label2idx = {label: idx for idx, label in enumerate(num_class)}
  print(f'Dataset={NAME}, Num_class={len(num_class)}, labels={label2idx}')
  print(f'All_train={len(frame["all_train_text"])}, Train_size={len(train_text)}, Dev_size={len(dev_text)}, Test_size={len(test_text)}')

  vocab = BuildVocab(train_text)
  train_text_tok = Text2Token(vocab, train_text)
  dev_text_tok = Text2Token(vocab, dev_text)
  test_text_tok = Text2Token(vocab, test_text)

  train_label_tok = [label2idx[t] for t in train_label]
  dev_label_tok = [label2idx[t] for t in dev_label]
  test_label_tok = [label2idx[t] for t in test_label]

  return train_text_tok,  train_label_tok, dev_text_tok, dev_label_tok, \
          test_text_tok, test_label_tok, len(num_class), vocab

# Model Architecture

In [None]:
# Model Architecture
class GATLayer(th.nn.Module):
  """Stack of DGL ``GATConv`` layers that ends in per-node class scores.

  With the fixed ``num_layer = 3`` the stack is: input -> hidden (3 heads),
  hidden*3 -> hidden (3 heads), hidden*3 -> num_class (1 head).  All layers
  but the last use ELU; the last has no activation.
  """

  def __init__(self, in_dim, hidden_dim, num_class):
    super().__init__()
    self.num_layer = 3
    self.heads = [3]*(self.num_layer-1) + [1]
    self.feat_drop = 0.4
    self.attn_drop = 0.4
    self.neg_slope = 0.2
    self.act = F.elu
    self.gat_layers = th.nn.ModuleList()

    final = self.num_layer - 1
    for idx in range(self.num_layer):
      is_final = idx == final
      # The heads of the previous layer are concatenated, which widens the input
      fan_in = in_dim if idx == 0 else hidden_dim * self.heads[idx-1]
      fan_out = num_class if is_final else hidden_dim
      activation = None if is_final else self.act
      # Positional arguments after the head count: feat_drop, attn_drop,
      # negative_slope, residual (off), activation
      self.gat_layers.append(GATConv(fan_in, fan_out, self.heads[idx], self.feat_drop,
                                     self.attn_drop, self.neg_slope, False, activation))

  def forward(self, g, inputs):
    """Run the node features ``inputs`` of graph ``g`` through every layer.

    Intermediate layers flatten their head outputs into one feature vector
    per node; the final layer averages over its heads.
    """
    final = self.num_layer - 1
    h = inputs
    for idx, layer in enumerate(self.gat_layers):
      h = layer(g, h)
      h = h.mean(1) if idx == final else h.flatten(1)

    return h


class MyModel(th.nn.Module):
  """Text classifier that builds one small graph per document.

  The nodes of a document graph are its distinct token ids.  Each token
  occurrence is connected to every occurrence within ``gram`` positions on
  either side, itself included.  Node features come from an embedding table
  initialised from pre-trained word vectors and kept trainable.  A GATLayer
  turns them into per-node class scores, which are averaged over the nodes
  of each document and passed through log-softmax.
  """

  def __init__(self, vocab, in_dim, hidden_dim, num_class, device):
    super().__init__()
    self.gram = 3  # window radius used when adding edges
    self.vocab = vocab
    self.num_class = num_class
    self.device = device
    self.max_length = 300  # longer documents are truncated to this many tokens
    self.vocab_size = len(vocab)
    self.node_hidden = th.nn.Embedding(self.vocab_size, in_dim)
    pretrained = self.Load_w2v('/content/glove.6B.300d.w2vformat.txt')
    self.node_hidden.weight.data.copy_(th.tensor(pretrained))
    self.node_hidden.weight.requires_grad = True
    self.gat = GATLayer(in_dim, hidden_dim, num_class)

  def Load_w2v(self, path):
    """Return one pre-trained vector per vocabulary entry as a matrix.

    Rows follow the iteration order of ``self.vocab``.  Words that are
    missing from the loaded vectors get an all-zero row.
    """
    w2v = word2vec.load(path)
    unk_d = len(w2v['the'])  # vector dimension, probed with a common word
    rows = []
    for word in self.vocab:
      try:
        vector = w2v[word]
      except KeyError:
        vector = np.zeros(unk_d)
      rows.append(vector)

    return np.array(rows)

  def AddEdges(self, sample, local_vocab_id):
    """List the ``[src, dst]`` node pairs for one tokenized document.

    Position ``i`` is linked to positions ``i-gram .. i+gram`` (clipped to
    the document, ``i`` included), so every pair appears in both directions
    and repeated co-occurrences yield repeated edges.
    """
    length = len(sample)
    return [
      [local_vocab_id[src], local_vocab_id[sample[j]]]
      for i, src in enumerate(sample)
      for j in range(max(0, i-self.gram), min(i + self.gram + 1, length))
    ]

  def BuildGraph(self, sample):
    """Build the DGL graph of one document and attach its node embeddings.

    Raises:
      Exception: if ``sample`` is empty.
    """
    if len(sample) == 0:
      raise Exception('sample length is equal 0')
    if len(sample) > self.max_length:
      sample = sample[:self.max_length]

    local_vocab = set(sample)
    n = len(local_vocab)
    # Local node index for every distinct token id of this document
    local_vocab_id = {token: node for node, token in enumerate(local_vocab)}
    u, v = zip(*self.AddEdges(sample, local_vocab_id))
    g = dgl.graph((u, v), num_nodes=n).to(self.device)
    # Same iteration order as above, so row k belongs to node k
    token_ids = th.tensor(list(local_vocab)).to(self.device)
    g.ndata['h'] = self.node_hidden(token_ids)

    return g

  def forward(self, sample):
    """Score a batch of tokenized documents.

    Args:
      sample: iterable of token-id lists, one per document.

    Returns:
      Log-probabilities over the classes, one row per document.
    """
    one_g = dgl.batch([self.BuildGraph(doc) for doc in sample])  # One graph
    one_g.ndata['h'] = self.gat(one_g, one_g.ndata['h'])  # Update node embedding
    output = dgl.mean_nodes(one_g, feat='h')

    return F.log_softmax(output, dim=-1)  # Score on each class


#Model Train and Test

In [None]:
# Train Function
def Train(model, epoch, train_X, train_y, dev_X, dev_y, device,
          patience=10, save_path='/content/MyModel.pkl'):
  """Train ``model`` with AdamW and early stopping on dev accuracy.

  After every epoch the model is evaluated with Dev.  Whenever dev accuracy
  improves, the whole model is saved to ``save_path``.  Training stops once
  ``patience`` consecutive epochs bring no improvement.

  Fixes over the previous version:
    * the best accuracy starts below any reachable value, so the first
      epoch always writes a checkpoint.  It used to start at 0.0 with a
      strict ``>``, so a run whose dev accuracy stayed at 0 saved nothing
      and the caller loaded a stale or missing file;
    * the epoch log no longer fails with NameError when there are no
      training batches;
    * patience and checkpoint path are keyword parameters whose defaults
      equal the old hard-coded values.

  Args:
    model: module called as ``model(batch_X)`` that returns log-probabilities.
    epoch: maximum number of epochs.
    train_X, train_y: batched inputs and label tensors (see Batch).
    dev_X, dev_y: batched dev inputs and label tensors.
    device: device the label tensors are moved to.
    patience: epochs without dev improvement before stopping.
    save_path: where the best model is written with ``th.save``.
  """
  optimizer = th.optim.AdamW(model.parameters(), weight_decay=0.01, lr=0.001)
  no_improve = 0
  best_acc = float('-inf')
  for e in range(epoch):
    improved = ''
    loss = None
    model.train()
    for batch_X, batch_y in zip(train_X, train_y):
      batch_y = batch_y.to(device)
      optimizer.zero_grad()  # Clear gradient
      score = model(batch_X)
      loss = F.nll_loss(score, batch_y)
      loss.backward()  # Derive gradient
      optimizer.step()  # Update parameters
    # As before, the reported loss is that of the epoch's last batch
    train_loss = loss.item() if loss is not None else float('nan')
    dev_acc = float(Dev(model, dev_X, dev_y, device))
    if dev_acc > best_acc:
      best_acc = dev_acc
      no_improve = 0
      improved = '*'
      th.save(model, save_path)
    else:
      no_improve += 1
    print(f'Epoch={e+1}/{epoch}, Train_loss={train_loss:.4f}, Dev_acc={dev_acc:.4f}, {improved}')
    if no_improve >= patience:
      print(f'No improvement on dev set, early stopping')
      break


# Dev Function
def Dev(model, X, y, device):
  """Return the accuracy of ``model`` over batched data as a 0-dim tensor.

  The model is switched to eval mode and is not switched back.  Empty input
  now yields an accuracy of 0.0; it used to raise TypeError because
  ``th.div`` was called on two Python floats.

  Args:
    model: module called as ``model(batch_X)`` that returns per-class scores.
    X: list of input batches.
    y: list of label tensors, aligned with ``X``.
    device: device the labels and the running count are placed on.

  Returns:
    0-dim float tensor holding correct / total.
  """
  model.eval()
  total = 0
  correct = th.zeros((), device=device)
  with th.no_grad():
    for batch_X, batch_y in zip(X, y):
      batch_y = batch_y.to(device)
      pred = th.argmax(model(batch_X), dim=1)
      correct += th.sum(pred == batch_y)
      total += len(batch_y)

  if total == 0:
    return correct  # nothing evaluated: report 0.0 rather than divide by zero
  return correct / total


# Test Function (Same as Dev)
def Test(model, X, y, device):
  """Return the accuracy of ``model`` on the test batches.

  The computation is identical to Dev, so this delegates to it instead of
  carrying a second copy of the loop.

  Args:
    model: module called as ``model(batch_X)`` that returns per-class scores.
    X: list of input batches.
    y: list of label tensors, aligned with ``X``.
    device: device the labels are moved to.

  Returns:
    Accuracy as a tensor, exactly as returned by Dev.
  """
  return Dev(model, X, y, device)


#Run Experiment

In [None]:
# Copy Data from Cloud Disk to Local Machine
def CopyData():
  if os.path.exists('/content/data')==False:
    %cp -av '/content/drive/MyDrive/Colab_Notebooks/TextLevelGNN/data/'  '/content'
  if os.path.exists('/content/glove.6B.300d.w2vformat.txt')==False:
    %cp -av '/content/drive/MyDrive/Colab_Notebooks/TextLevelGNN/model/glove.6B/glove.6B.300d.w2vformat.txt'  '/content'


# Batch the Dataset
def Batch(X, y, batch_size=64, shuffle=False):
  """Cut samples and labels into consecutive mini-batches.

  Args:
    X: list of samples.
    y: list of integer labels, aligned with ``X``.
    batch_size: maximum number of samples per batch.
    shuffle: when true, shuffle the (sample, label) pairs with the global
      ``random`` module before batching.

  Returns:
    (batched_X, batched_y): a list of sample lists and a list of label
    tensors of matching lengths.
  """
  pairs = [(X[k], y[k]) for k in range(len(y))]
  if shuffle:
    random.shuffle(pairs)
    X = [sample for sample, _ in pairs]  # Shuffled X
    y = [label for _, label in pairs]  # Shuffled y

  starts = range(0, len(pairs), batch_size)
  batched_X = [X[s:s+batch_size] for s in starts]
  batched_y = [th.tensor(y[s:s+batch_size]) for s in starts]

  return batched_X, batched_y


# Save Experiment Result (Acuuracy and Running Time)
def SaveResult(acc, time, dataset_name):
  """Write the accuracies and run times of one dataset to two CSV files.

  Args:
    acc: sequence of accuracies, written as column "Acc".
    time: sequence of run times, written as column "Time".
    dataset_name: used in the file names ``Acc-<name>.csv`` / ``Time-<name>.csv``.
  """
  frames = (pd.DataFrame({"Acc": acc}), pd.DataFrame({"Time": time}))
  targets = (
    f'/content/drive/MyDrive/Colab_Notebooks/TextLevelGNN/model/Acc-{dataset_name}.csv',
    f'/content/drive/MyDrive/Colab_Notebooks/TextLevelGNN/model/Time-{dataset_name}.csv',
  )
  for table, target in zip(frames, targets):
    table.to_csv(target, index=False)

def MeanAcc(dataset_name):
  """Read the saved accuracies of each dataset and summarise them.

  Args:
    dataset_name: iterable of dataset names; ``Acc-<name>.csv`` is read for each.

  Returns:
    (acc_mean, acc_std): two lists aligned with ``dataset_name``, obtained by
    applying ``np.mean`` and ``np.std`` to the "Acc" column (a pandas Series).
  """
  columns = [
    pd.read_csv(f'/content/drive/MyDrive/Colab_Notebooks/TextLevelGNN/model/Acc-{name}.csv')["Acc"].iloc[0:]
    for name in dataset_name
  ]
  acc_mean = [np.mean(col) for col in columns]
  acc_std = [np.std(col) for col in columns]

  return acc_mean, acc_std

def MeanTime(dataset_name):
  """Return the mean of the saved "Time" column for each dataset.

  Args:
    dataset_name: iterable of dataset names; ``Time-<name>.csv`` is read for each.

  Returns:
    List of mean values aligned with ``dataset_name``.
  """
  return [
    np.mean(
      pd.read_csv(f'/content/drive/MyDrive/Colab_Notebooks/TextLevelGNN/model/Time-{name}.csv')["Time"].iloc[0:]
    )
    for name in dataset_name
  ]

# Save Average Result for All Dataset
def SaveTotalResult(dataset):
  """Print and save the per-dataset mean accuracy, its std and the mean time.

  The summary is written to ``Total-result<timestamp>.csv``; the timestamp is
  the current local time with spaces, colons and dots replaced by dashes.

  Args:
    dataset: list of dataset names whose Acc/Time CSV files already exist.
  """
  my_acc, my_std = MeanAcc(dataset)
  my_time = MeanTime(dataset)
  print(f"acc_mean={my_acc}")
  print(f"acc_std={my_std}")
  print(f"time_mean(s)={my_time}")

  t = re.sub(r' |:|\.', '-', str(datetime.now()))
  summary = pd.DataFrame({
    "dataset": dataset,
    "acc_mean": my_acc,
    "acc_std": my_std,
    "time_mean(s)": my_time,
  })
  summary.to_csv(f'/content/drive/MyDrive/Colab_Notebooks/TextLevelGNN/model/Total-result{t}.csv', index=False)

# Set Training Device
# Set Training Device
DEVICE = th.device('cuda:0' if th.cuda.is_available() else 'cpu')
if DEVICE.type == 'cuda':
  print(f'device: {DEVICE}')
  print(f'name: {th.cuda.get_device_name(0)}')
  print('='*50)

# Set Constant
shuffle_time = 2
run_time = 3  # Run n times to get average
epoch=50
dataset = ['r8',  'r52', 'oh', 'mr', '20ng']

CopyData()
print("="*50)
# For every dataset: load and split it `shuffle_time` times, and for each of
# those train `run_time` fresh models on a reshuffled training set.  The best
# checkpoint written by Train is reloaded and scored on the test set.
# NOTE(review): Tra_Dev_Set uses a fixed random_state, so every MyDataset call
# yields the same train/dev split; only the batch order differs between runs.
for NAME in dataset:
  all_acc, consume_time = [], []
  for s in range(shuffle_time):
    (train_text_tok, train_label_tok, dev_text_tok, dev_label_tok,
     test_text_tok, test_label_tok, num_class, vocab) = MyDataset(NAME)
    dev_X, dev_y = Batch(dev_text_tok, dev_label_tok)
    test_X, test_y = Batch(test_text_tok, test_label_tok)
    for i in range(run_time):
      # Shuffle train dataset run n times to get average
      train_X, train_y = Batch(train_text_tok, train_label_tok, shuffle=True)
      t0 = time.time()
      row_model = MyModel(vocab, 300, 300, num_class, device=DEVICE)
      row_model.to(DEVICE)
      Train(row_model, epoch, train_X, train_y, dev_X, dev_y, DEVICE)
      trained_model = th.load(f'/content/MyModel.pkl', map_location=DEVICE)
      acc = Test(trained_model, test_X, test_y, DEVICE).cpu().numpy()
      # NOTE(review): Ave_Epoch_Time divides by the configured epoch count,
      # even when Train stopped early.
      print(f'{NAME}, ShuffleTime={s+1}/{shuffle_time}, RunTime={i+1}/{run_time}, Test_acc={acc:.4f}, Ave_Epoch_Time={(time.time()-t0)/epoch:.1f}s, All_Epoch_Time={(time.time()-t0)/60:.1f}m')
      all_acc.append(acc)
      consume_time.append(time.time()-t0)
      print("="*50)
    print("="*100)
  SaveResult(all_acc, consume_time, NAME)

SaveTotalResult(dataset)


#Result Plot

The experiment results are read back from the saved result files on disk so that they can be analyzed later.

In [None]:
def ShowAllResult(result_dir="/content/drive/MyDrive/Colab_Notebooks/TextLevelGNN/model/"):
  """Collect the "acc_mean" column of every saved ``Total*`` summary file.

  Files are visited in sorted name order.  ``os.listdir`` gives no ordering
  guarantee, so the run numbering used by the plots was unstable before.
  The ``Total-result<timestamp>.csv`` files written by SaveTotalResult
  therefore come out oldest first.

  Args:
    result_dir: directory holding the summary CSV files; defaults to the
      directory that was previously hard-coded.

  Returns:
    List of pandas Series, one per summary file.
  """
  all_acc_mean = []
  for fname in sorted(os.listdir(result_dir)):
    if fname.startswith("Total"):
      df = pd.read_csv(os.path.join(result_dir, fname))
      all_acc_mean.append(df["acc_mean"])

  return all_acc_mean

dataset = ['r8',  'r52', 'oh', 'mr', '20ng']
# Show All Results
all_acc_mean = ShowAllResult()

fig = plt.figure(figsize=(18,9),dpi=100)

# Top panel: one line per saved summary, every point annotated with its value
ax1 = fig.add_subplot(211)
for i, series in enumerate(all_acc_mean):
  ax1.plot(dataset, series, "o-.", label=f'{i+1}')
  for a, b in zip(dataset, series):
    ax1.text(a, b, round(b, 4), fontsize=5)

# Bottom panel: the same numbers as grouped bars, one bar per summary
ax2 = fig.add_subplot(212)
x_length = np.arange(len(dataset))
bar_width = 0.2
for i, y in enumerate(all_acc_mean):
  x = x_length + i*bar_width
  ax2.bar(x, y, width=bar_width, label=f'{i+1}')
  for a, b in zip(x, y):
    ax2.text(a, b, round(b, 4), ha="center", fontsize=10)

plt.xticks(x_length, dataset)

ax1.legend()
ax2.legend()
plt.show()

#Compare Baseline Result

In [None]:
# Baseline Method
# Accuracy per dataset, in the order of `dataset`; None marks a missing value
TextGCN = [0.9707, 0.9356, 0.6836, 0.7674, 0.8634]
TextLevelGCN = [0.978, 0.946, 0.6994, None, None]
my_best_acc = [0.9, 0.9, 0.6, 0.7, 0.8]

fig = plt.figure(figsize=(6,6),dpi=100)
ax = fig.add_subplot(111)

# One line per method, drawn in this order
curves = (
  (TextGCN, '^-.', "TextGCN"),
  (my_best_acc, '<-.', "MyModel"),
  (TextLevelGCN, ">-.", "TextLevelGCN"),
)
for values, marker, title in curves:
  ax.plot(dataset, values, marker, label=title)

ax.legend()
plt.show()