<a href="https://colab.research.google.com/github/ereshmittal/Text-GCN-Classification/blob/main/GCN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

import networkx as nx

from argparse import ArgumentParser
from collections import OrderedDict
import random
from random import shuffle
from itertools import combinations
import math
from tqdm import tqdm
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string

nltk.download('punkt')
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

import pickle

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [None]:
t_bbe_url = "https://raw.githubusercontent.com/plkmo/Bible_Text_GCN/master/data/t_bbe.csv"
key_url = "https://raw.githubusercontent.com/plkmo/Bible_Text_GCN/master/data/key_english.csv"
bbe = pd.read_csv(t_bbe_url)
key = pd.read_csv(key_url)

In [None]:
bbe.drop(['id', 'v'], axis=1, inplace=True)

In [None]:
def nCr(n,r):
    f = math.factorial
    return int(f(n)/(f(r)*f(n-r)))

def word_word_edges(p_ij):
    word_word = []
    cols = list(p_ij.columns); cols = [str(w) for w in cols]

    for w1, w2 in tqdm(combinations(cols, 2), total=nCr(len(cols), 2)):
        if (p_ij.loc[w1,w2] > 0):
            word_word.append((w1,w2,{"weight":p_ij.loc[w1,w2]}))
    return word_word

In [None]:
bookmap = {book.lower(): number for number, book in zip(key['field'], key['field.1'])}

In [None]:
df_data = pd.DataFrame(columns=['b', 'c'])
for book in bbe['b'].unique():
  dum = pd.DataFrame()
  dum['c'] = (bbe[bbe['b'] == book].groupby('c')).apply(lambda x: (" ".join(x["t"])).lower())
  dum['b'] = book
  df_data = pd.concat([df_data, dum], ignore_index=True)


def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree

def remove_stopwords(text):
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stopwords]
    return (" ".join(filtered_text))

def stemming(text):
    porter = PorterStemmer()
    
    result=[]
    for word in text:
        result.append(porter.stem(word))
    return ("".join(result))


df_data['c'] = df_data['c'].apply(remove_punctuation)
df_data['c'] = df_data['c'].apply(remove_stopwords)
df_data['c'] = df_data['c'].apply(stemming)

In [None]:
df_data['c']

0       first god made heaven earth earth waste withou...
1       heaven earth things complete seventh day god c...
2       snake wiser beast field lord god made said wom...
3       man connection eve wife became child gave birt...
4       book generations adam day god made man made im...
                              ...                        
1184    things saw another angel coming heaven great a...
1185    things came ears sound like voice great band p...
1186    saw angel coming heaven key great deep great c...
1187    saw new heaven new earth first heaven first ea...
1188    saw river water life clear glass coming high s...
Name: c, Length: 1189, dtype: object

In [None]:
vectorizer = TfidfVectorizer(input="content", max_features=None, lowercase=False)
vectorizer.fit(df_data["c"])
df_tfidf = vectorizer.transform(df_data["c"])
df_tfidf = df_tfidf.toarray()
vocab = vectorizer.get_feature_names_out()
vocab = np.array(vocab)
df_tfidf = pd.DataFrame(df_tfidf,columns=vocab)

In [None]:
# https://github.com/codeKgu/Text-GCN
# https://github.com/yao8839836/text_gcn
# https://github.com/iworldtong/text_gcn.pytorch
# https://github.com/andrejmiscic/gcn-pytorch
# https://towardsdatascience.com/text-based-graph-convolutional-network-for-semi-supervised-bible-book-classification-c71f6f61ff0f
# https://pytorch-geometric.readthedocs.io/en/latest/notes/introduction.html
# https://paperswithcode.com/paper/graph-convolutional-networks-for-text

In [None]:
vocab = np.array(vocab)

In [None]:
word2idx = {name: index for (index, name) in enumerate(vocab)}

In [None]:
occurrences = np.zeros((len(vocab),len(vocab)), dtype=np.int32)

In [None]:
n_i  = OrderedDict((name, 0) for name in vocab)

In [None]:
df_data['c'] = df_data['c'].map(lambda x: word_tokenize(x))
df_data['c']

0       [first, god, made, heaven, earth, earth, waste...
1       [heaven, earth, things, complete, seventh, day...
2       [snake, wiser, beast, field, lord, god, made, ...
3       [man, connection, eve, wife, became, child, ga...
4       [book, generations, adam, day, god, made, man,...
                              ...                        
1184    [things, saw, another, angel, coming, heaven, ...
1185    [things, came, ears, sound, like, voice, great...
1186    [saw, angel, coming, heaven, key, great, deep,...
1187    [saw, new, heaven, new, earth, first, heaven, ...
1188    [saw, river, water, life, clear, glass, coming...
Name: c, Length: 1189, dtype: object

In [None]:
window=10
no_windows = 0

for l in tqdm(df_data["c"], total=len(df_data["c"])):
    for i in range(len(l)-window):
      d = set(l[i: (i+window)])

      for w in d:
        n_i[w] += 1
      for w1,w2 in combinations(d,2):
        i1 = word2idx[w1]
        i2 = word2idx[w2]
        occurrences[i1][i2] +=1 
        occurrences[i2][i1] +=1 

p_ij = pd.DataFrame(occurrences, index = vocab,columns=vocab)/no_windows
p_i = pd.Series(n_i, index=n_i.keys())/no_windows

100%|██████████| 1189/1189 [01:38<00:00, 12.12it/s]


In [None]:
for col in p_ij.columns:
       p_ij[col] = p_ij[col]/p_i[col]

for row in p_ij.index:
    p_ij.loc[row,:] = p_ij.loc[row,:]/p_i[row]

p_ij = p_ij + 1E-9

for col in p_ij.columns:
    p_ij[col] = p_ij[col].apply(lambda x: math.log(x))

In [None]:
G = nx.Graph()

In [None]:
G.add_nodes_from(df_tfidf.index)
G.add_nodes_from(vocab)

In [None]:
word_word = word_word_edges(p_ij)

100%|██████████| 22913065/22913065 [03:17<00:00, 115868.76it/s]


In [None]:
document_word = [(doc,w,{"weight":df_tfidf.loc[doc,w]}) for doc in tqdm(df_tfidf.index, total=len(df_tfidf.index))\
                 for w in df_tfidf.columns]

100%|██████████| 1189/1189 [01:16<00:00, 15.64it/s]


In [None]:
G.add_edges_from(document_word)
G.add_edges_from(word_word)

In [None]:
class gcn(nn.Module):
    def __init__(self, X_size, A_hat, args, bias=True): # X_size = num features
        super(gcn, self).__init__()
        self.A_hat = torch.tensor(A_hat, requires_grad=False).float()
        self.weight = nn.parameter.Parameter(torch.FloatTensor(X_size, args.hidden_size_1))
        var = 2./(self.weight.size(1)+self.weight.size(0))
        self.weight.data.normal_(0,var)
        self.weight2 = nn.parameter.Parameter(torch.FloatTensor(args.hidden_size_1, args.hidden_size_2))
        var2 = 2./(self.weight2.size(1)+self.weight2.size(0))
        self.weight2.data.normal_(0,var2)
        if bias:
            self.bias = nn.parameter.Parameter(torch.FloatTensor(args.hidden_size_1))
            self.bias.data.normal_(0,var)
            self.bias2 = nn.parameter.Parameter(torch.FloatTensor(args.hidden_size_2))
            self.bias2.data.normal_(0,var2)
        else:
            self.register_parameter("bias", None)
        self.fc1 = nn.Linear(args.hidden_size_2, args.num_classes)
        
    def forward(self, X): ### 2-layer GCN architecture
        X = torch.mm(X, self.weight)
        if self.bias is not None:
            X = (X + self.bias)
        X = F.relu(torch.mm(self.A_hat, X))
        X = torch.mm(X, self.weight2)
        if self.bias2 is not None:
            X = (X + self.bias2)
        X = F.relu(torch.mm(self.A_hat, X))
        return self.fc1(X)

In [None]:
A = nx.to_numpy_array(G, weight="weight")
A = A + np.eye(G.number_of_nodes())

In [None]:
degrees = []
for d in G.degree(weight=None):
    if d == 0:
        degrees.append(0)
    else:
        degrees.append(d[1]**(-0.5))
degrees = np.diag(degrees)

In [None]:
X = np.eye(G.number_of_nodes()) # Features are just identity matrix
A_hat = (degrees@A@degrees)
f = X # (n X n) X (n X n) x (n X n) X (n X n) input of net

In [None]:
A_hat.shape

(7959, 7959)

In [None]:
parser = ArgumentParser()
parser.add_argument("--hidden_size_1", type=int, default=512, help="Size of first GCN hidden weights")
parser.add_argument("--hidden_size_2", type=int, default=256, help="Size of second GCN hidden weights")
parser.add_argument("--num_classes", type=int, default=66, help="Number of prediction classes")
parser.add_argument("--test_ratio", type=float, default=0.2, help="Ratio of test to training nodes")
parser.add_argument("--num_epochs", type=int, default=10000, help="No of epochs")
parser.add_argument("--lr", type=float, default=0.01, help="learning rate")
parser.add_argument("--model_no", type=int, default=0, help="Model ID")
args = parser.parse_known_args()

In [None]:
test_idxs = []
for b_id in df_data["b"].unique():
    dum = df_data[df_data["b"] == b_id]
    if len(dum) >= 4:
        test_idxs.extend(list(np.random.choice(dum.index, size=round(args[0].test_ratio*len(dum)), replace=False)))

In [None]:
selected = []
for i in range(len(df_data)):
    if i not in test_idxs:
        selected.append(i)

In [None]:
f_selected = f[selected]
f_selected = torch.from_numpy(f_selected).float()
labels_selected = [l for idx, l in enumerate(df_data["b"]) if idx in selected]
f_not_selected = f[test_idxs]
f_not_selected = torch.from_numpy(f_not_selected).float()
labels_not_selected = [l for idx, l in enumerate(df_data["b"]) if idx not in selected]
f = torch.from_numpy(f).float().to(device)

In [None]:
from numpy import savetxt

savetxt('/content/X.csv', X, delimiter=',')
savetxt('/content/A_hat.csv', A_hat, delimiter=',')
# torch.save(X, '/content/X.pt')
# torch.save(A_hat, '/content/A_hat.pt')

In [None]:
net = gcn(torch.tensor(X.shape[1]).to(device), torch.tensor(A_hat).clone().to(device), args[0]).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=args[0].lr)

  self.A_hat = torch.tensor(A_hat, requires_grad=False).float()


In [None]:
net

gcn(
  (fc1): Linear(in_features=256, out_features=66, bias=True)
)

In [None]:
def evaluate(output, labels_e):
  _, labels = output.max(1); labels = labels.numpy()
  return sum([(e-1) for e in labels_e] == labels)/len(labels)

In [None]:
losses_per_epoch=[]
evaluation_trained=[]
evaluation_untrained=[]
outputs=[]

for e in range(args[0].num_epochs):
  net.train()
  optimizer.zero_grad()
  output = net(f).to(device)
  loss = criterion(output[selected], (torch.tensor(labels_selected).long() -1).to(device))
  losses_per_epoch.append(loss.item())
  loss.backward()
  optimizer.step()
  if e % 50 == 0:
    ### Evaluate other untrained nodes and check accuracy of labelling
    net.eval()
    with torch.no_grad():
        pred_labels = net(f)
        trained_accuracy = evaluate(output[selected].to('cpu'), labels_selected)
        untrained_accuracy = evaluate(pred_labels[test_idxs].to('cpu'), labels_not_selected)
    evaluation_trained.append((e, trained_accuracy))
    evaluation_untrained.append((e, untrained_accuracy))
    outputs.append(output)
    print("[Epoch %d]: Evaluation accuracy of trained nodes: %.7f" % (e, trained_accuracy))
    print("[Epoch %d]: Evaluation accuracy of test nodes: %.7f" % (e, untrained_accuracy))
    # print("Labels of trained nodes: \n", output[selected].max(1)[1])
    net.train()

[Epoch 0]: Evaluation accuracy of trained nodes: 0.9749216
[Epoch 0]: Evaluation accuracy of test nodes: 0.4525862
[Epoch 50]: Evaluation accuracy of trained nodes: 0.9770115
[Epoch 50]: Evaluation accuracy of test nodes: 0.4525862
[Epoch 100]: Evaluation accuracy of trained nodes: 0.9801463
[Epoch 100]: Evaluation accuracy of test nodes: 0.4525862
[Epoch 150]: Evaluation accuracy of trained nodes: 0.9832811
[Epoch 150]: Evaluation accuracy of test nodes: 0.4525862
[Epoch 200]: Evaluation accuracy of trained nodes: 0.9843260
[Epoch 200]: Evaluation accuracy of test nodes: 0.4482759
[Epoch 250]: Evaluation accuracy of trained nodes: 0.9853710
[Epoch 250]: Evaluation accuracy of test nodes: 0.4482759
[Epoch 300]: Evaluation accuracy of trained nodes: 0.9864159
[Epoch 300]: Evaluation accuracy of test nodes: 0.4482759
[Epoch 350]: Evaluation accuracy of trained nodes: 0.9885057
[Epoch 350]: Evaluation accuracy of test nodes: 0.4482759
[Epoch 400]: Evaluation accuracy of trained nodes: 0.9

In [None]:
from google.colab import files

# torch.save(net.state_dict(), '/content/textgcn.pt')
# downloaded = files.download('/content/textgcn.pt')

In [None]:
# uploaded = files.upload()
loaded = torch.load('/content/textgcn.pt')

In [None]:
loaded

# model = torch.load(PATH)