In [72]:
import re
import os
import numpy as np
import pandas as pd

In [73]:
all_df = pd.read_csv('data/20ng/raw/all_df_mask.tsv', sep='\t')

In [74]:
all_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18846 entries, 0 to 18845
Data columns (total 7 columns):
doc_id        18846 non-null object
label         18846 non-null object
path          18846 non-null object
test_mask     18846 non-null bool
text          18846 non-null object
train_mask    18846 non-null bool
type          18846 non-null object
dtypes: bool(2), object(5)
memory usage: 920.2+ KB


In [75]:
all_df.head()

Unnamed: 0,doc_id,label,path,test_mask,text,train_mask,type
0,doc_id_0,alt.atheism,data/20ng/20news-bydate-test/alt.atheism/53068,False,decay cbnewsj cb att com \( dean kaflowitz \) ...,True,20news-bydate-test
1,doc_id_1,alt.atheism,data/20ng/20news-bydate-test/alt.atheism/53257,False,cfaehl vesta unm edu \( chris faehl \) subject...,True,20news-bydate-test
2,doc_id_2,alt.atheism,data/20ng/20news-bydate-test/alt.atheism/53260,False,mathew mathew mantis co uk subject yet rushdie...,True,20news-bydate-test
3,doc_id_3,alt.atheism,data/20ng/20news-bydate-test/alt.atheism/53261,False,"dps nasa kodak com \( dan schaertel , , , \) s...",True,20news-bydate-test
4,doc_id_4,alt.atheism,data/20ng/20news-bydate-test/alt.atheism/53262,False,halat panther bears \( jim halat \) subject 20...,True,20news-bydate-test


### Dataset
---

In [147]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from collections import defaultdict
from scipy.sparse import csr_matrix
from torch_geometric.utils import to_scipy_sparse_matrix, from_scipy_sparse_matrix
from copy import deepcopy
import re, math
from utils import DocumentStatsBuilder

def get_counts(df):
    cooccur = defaultdict(int)
    doc_freq = defaultdict(int)
    for i, row in df.iterrows():
        ts = row["text"].split()
        wpairs = set([(w1, w2) for w1 in ts for w2 in ts])
        ws = set(ts)
        for wpair in wpairs:
            cooccur[wpair] += 1
        for w in ws:
            doc_freq[w] += 1
    
    return cooccur, doc_freq


def compute_TFIDF(corpus):
#     corpus = df.text.values
    counter = CountVectorizer(tokenizer=lambda x: x.split())
    tfidf_trans = TfidfTransformer()    
    counts = counter.fit_transform(corpus)
    
    # -- tfidf 
    print("building tfidf...")
    tfidf = tfidf_trans.fit_transform(counts)
    # words have specific order when computing the features
    # not neccessarily the same to the graph nodes -> needs conversion
    features = counter.get_feature_names()
    feature2i = dict(zip(features, range(len(features))))
    i2feature = dict(zip(range(len(features)), features))
    
    # -- pmi -- 
    print("building pmi...")
    word_counts = counts.sum(axis=0)
    n_words = word_counts.shape[1]
    n_docs = tfidf.shape[0]
    
#     print("computing counts")
#     cooccur, doc_freq = get_counts(df)
    
#     import pdb; pdb.set_trace()
#     idx = [k for k in cooccur]
#     rows, cols = tuple(zip(*idx))
    
#     rows = [feature2i[w] for w in rows]
#     cols = [feature2i[w] for w in cols]
#     print("building cooccur_data...")
#     cooccur_data = [math.log(cooccur[(row, col)]*n_docs/(doc_freq[row]*doc_freq[col]))
#                     for (row, col) in idx]
#     print("building cooccur_matrix...")


    pmi = DocumentStatsBuilder.PMI(corpus, feature2i, window_size=2)

    idx, cooccur_data = zip(*pmi.items())
    rows, cols = tuple(zip(*idx))
    rows = [feature2i[w] if w in feature2i else feature2i["<unk>"] for w in rows]
    cols = [feature2i[w] if w in feature2i else feature2i["<unk>"] for w in cols]
    cooccur_matrix = csr_matrix((cooccur_data, (rows, cols)), shape=(n_words, n_words))
    return tfidf, cooccur_matrix, feature2i, i2feature, idx


In [167]:
import torch
import torch.nn as nn
from torch_geometric.data import InMemoryDataset, Data
from torch_geometric.utils import to_undirected, is_undirected

EMBED_DIM = 300
REMOVE_STOP_WORDS = True
MIN_COUNTS = 5
DEBUG = False

class Dataset(InMemoryDataset):
    def __init__(self, root, transform=None, pre_transform=None):
        super(Dataset, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])
        
    @property
    def raw_file_names(self):
        return ['all_df_mask.tsv']
    
    @property
    def processed_file_names(self):
        return ['20ng_train.pt']
    
    def process(self):
        df = pd.read_csv(self.raw_paths[0], sep="\t")
        if DEBUG:
            df = df.head(1000)
        import pdb; pdb.set_trace()
        texts = list(map(lambda x: x.split(), df.text.values))
        doc_ids = df.doc_id.values
        
        # convert str labels to ints
        labels = df.label.values
        unique_labels = df.label.unique()
        self.label_dict = dict(zip(unique_labels, range(len(unique_labels))))
        labels = [self.label_dict[l] for l in labels]
        
        doc_label_dict = dict(zip(doc_ids, labels))
        is_train_dict = dict(zip(doc_ids, df.train_mask.values))
        print("building vocab...")
        vocab = Vocab(texts, doc_ids=doc_ids.tolist())
        
        text_int = vocab.map_dataset_words2index(texts)
        
        # nodes_idx mapping will be the same as vocab.i2w
        print("building nodes...")
        n_nodes = len(vocab.w2i)
        self.embed = nn.Embedding(n_nodes, EMBED_DIM)
        nodes = self.embed(torch.tensor(range(n_nodes)))  # (vocab_size, EMBED_DIM)
        
        # edges
        print("building edges...")
        tfidf, cooccur_matrix, edge_feature2i, edge_i2feature, cooccur_idx = compute_TFIDF(df.text.values)
        
        edge_index = []
        edge_attr = []
        
        # --- doc to words ---
        print("building doc to word edges...")
        for i, row in df.iterrows():
            doc_id = vocab.w2i[row["doc_id"]]
            edges_ = zip([doc_id for _ in range(len(text_int[i]))], text_int[i])
            edges_ = edges_ 
            edge_index.extend(list(edges_))
        
        edge_index = list(set(edge_index))  # a word may occur mult times in a doc
        for (doc_id, word_id) in edge_index:
            d, w = doc_id, edge_feature2i[vocab.i2w[word_id]]
            edge_attr.append(tfidf[d, w])
            
        # --- words to doc ---
        print("building word to doc edges...")
        edge_index_back = [(e2, e1) for (e1, e2) in edge_index] 
        for (word_id, doc_id) in edge_index_back:
            d, w = doc_id, edge_feature2i[vocab.i2w[word_id]]
            edge_attr.append(tfidf[d, w])
        edge_index += edge_index_back
            
        # --- word to word ---
        print("building word to word edges...")
        for (w1_edge_idx, w2_edge_idx) in cooccur_idx: 
            # notice, w1_edge_idx, w2_edge_idx are ints in edge_feature space, 
            # convert to node feature spaces
            w1, w2 = edge_i2feature[w1_edge_idx], edge_i2feature[w2_edge_idx]
            w1_vocab_idx, w2_vocab_idx = vocab.w2i[w1], vocab.w2i[w2]

            weight = cooccur_matrix[w1_edge_idx, w2_edge_idx] if w1 != w2 else 1
            edge_index.append((w1_vocab_idx, w2_vocab_idx))
            edge_attr.append(weight)

        edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
        edge_attr = torch.tensor(edge_attr, dtype=torch.float)
        print("so many edges...")
        
        # --- masks: ---
        print("building masks...")
        train_mask = []
        test_mask = []
        for i in range(n_nodes):
            w = vocab.i2w[i]
            if w in doc_label_dict:
                is_train = is_train_dict[w]
                train_mask.append(is_train)
                test_mask.append(not is_train)
            else:
                train_mask.append(False)
                test_mask.append(False)
        labels = np.concatenate((labels, np.array([-1 for i in range(n_nodes-len(labels))])))
        labels = torch.tensor(labels, dtype=torch.long)
        
        print("building data...")
        data_list = [Data(x=nodes, y=labels, edge_index=edge_index, edge_attr=edge_attr)]
        data_list[0].train_mask = torch.tensor(train_mask)
        data_list[0].test_mask = torch.tensor(test_mask)
        
        data, slices = self.collate(data_list)
        print("saving...")
        torch.save((data, slices), self.processed_paths[0])



In [168]:
import itertools
import pandas as pd
import numpy as np
from nltk.corpus import stopwords 

class Vocab(object):
  
  def __init__(self, L, doc_ids=None, remove_stop_words=False, min_counts=1):    
    if isinstance(L[0], list):
      tokens = list(itertools.chain(*L))
      self.token_counts = pd.Series(tokens).value_counts().to_frame().sort_index(ascending=True)
      self.token_counts.columns = ["counts"]
      if remove_stop_words:
        self.stop_words = set(stopwords.words('english')) 
        idx = self.token_counts.index.isin(self.stop_words)
        self.token_counts = self.token_counts[~idx]
      self.token_counts = self.token_counts[self.token_counts.counts >= min_counts]
        
      self.vocab = ["<unk>"] + self.token_counts.index.to_list()
    else:
      raise Error

    self.vocab = list(set(self.vocab))
    self.vocab = sorted(self.vocab)
    if doc_ids is not None:
      self.vocab = doc_ids + self.vocab 
    
    self.w2i = dict(zip(self.vocab, range(len(self.vocab))))
    self.i2w = dict(zip(range(len(self.vocab)), self.vocab))

  def map_words2index(self, L):
    return list(map(lambda x: self.w2i[x] if x in self.w2i else self.w2i['<unk>'], L))

  def map_index2words(self, L):
    return list(map(lambda x: self.i2w[x], L))

  def map_words2unk(self, L):
    return map(lambda x: x if x in self.vocab else "<unk>", L)

  def map_dataset_words2index(self, L):
    return np.array(list(map(self.map_words2index, L)))

  def map_dataset_index2words(self, L):
    return np.array(list(map(self.map_index2words, L)))

  def map_dataset_words2unk(self, L):
    return np.array(list(map(self.map_words2unk, L)))

  def remove_sw_helper(self, L):
    return filter(lambda x: x not in self.stop_words, L)

  def remove_stop_words(self, L):
    return np.array(list(map(self.remove_sw_helper, L)))

  def get_counts(self):
    return self.token_counts

In [169]:
df = pd.read_csv('data/20ng/raw/all_df_mask.tsv', sep="\t")
texts = list(map(lambda x: x.split(), df.text.values))
doc_ids = df.doc_id.values
labels = df.label.values
doc_label_dict = dict(zip(doc_ids, labels))
is_train_dict = dict(zip(doc_ids, df.train_mask.values))
print("building vocab...")
vocab = Vocab(texts, doc_ids=doc_ids.tolist(),
              remove_stop_words=REMOVE_STOP_WORDS, min_counts=MIN_COUNTS)
# texts = vocab.remove_stop_words(texts)
# texts = vocab.map_dataset_words2unk(texts)
# text_int = vocab.map_dataset_words2index(texts)
print(len(vocab.w2i))

building vocab...
61604


In [170]:
%time dataset = Dataset('data/20ng')

Processing...
> <ipython-input-167-106aafce2481>(29)process()
-> texts = list(map(lambda x: x.split(), df.text.values))


(Pdb)  df.shape


(18846, 7)


(Pdb)  


(18846, 7)


(Pdb)  c


building vocab...
building nodes...
building edges...
building tfidf...
building pmi...
building doc to word edges...
building word to doc edges...
building word to word edges...
so many edges...
building masks...
building data...
saving...
Done!
CPU times: user 3min 35s, sys: 3.32 s, total: 3min 38s
Wall time: 3min 38s


In [171]:
dataset.data

Data(edge_attr=[6980982], edge_index=[2, 6980982], test_mask=[61604], train_mask=[61604], x=[61604, 300], y=[61604])

### Net
---

In [172]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class Net(nn.Module):
    def __init__(self, hidden_dim):
        super(Net, self).__init__()
        self.conv1 = GCNConv(dataset.num_node_features, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, dataset.num_classes)
        
    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        
        return F.log_softmax(x, dim=1)

### Training
---

In [173]:
import time

In [None]:
HIDDEN_DIM = 200

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net(HIDDEN_DIM).to(device)
data = dataset[0].to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005) #, weight_decay=5e-4)
model.train()
start = time.time()
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    if epoch % 10 == 9:
        _, pred = model(data).max(dim=1)
        train_correct = float(pred[data.train_mask].eq(data.y[data.train_mask]).sum().item())
        train_acc = train_correct / data.train_mask.sum().item()
        valid_correct = float(pred[data.test_mask].eq(data.y[data.test_mask]).sum().item())
        valid_acc = valid_correct / data.test_mask.sum().item()
        
        print("epoch {}, loss = {:.4f}, train_acc = {:.4f}, valid_acc = {:.4f}, time taken: {:.2f}".format(epoch, loss, train_acc, valid_acc, time.time()-start))
        start = time.time()

In [None]:
model.eval()
_, pred = model(data).max(dim=1)
correct = float(pred[data.test_mask].eq(data.y[data.test_mask]).sum().item())
acc = correct / data.test_mask.sum().item()
print('Accuracy: {:.4f}'.format(acc))