## Formatting and Downloading

In [1]:
import csv
import pandas as pd
import networkx as nx
import os
import pickle

In [2]:
with open(r"training.txt", "r") as f:
    reader = csv.reader(f)
    training  = list(reader)
# in order of training examples
training = [element[0].split(" ") for element in training]
training = pd.DataFrame(training, columns=['Node1', 'Node2', 'Link'])
print("Training examples shape: {}".format(training.shape))

with open(r"testing.txt", "r") as f:
    reader = csv.reader(f)
    testing  = list(reader)
# in order of testing examples
testing = [element[0].split(" ") for element in testing]
testing = pd.DataFrame(testing, columns=['Node1', 'Node2'])
print("Testing examples shape: {}".format(testing.shape))

Training examples shape: (453797, 3)
Testing examples shape: (113450, 2)


In [3]:
'''
uncomment lines for reduced corpus with stopword removal. In future integrate stemmer here, multi-language
'''
NODE_INFO_DIRECTORY = r"node_information/text/"

corpus_path = r"pickles/simple_corpus.PICKLE" 
ids_path = r"pickles/ids.PICKLE"
if os.path.exists(corpus_path):
    with open(corpus_path, 'rb') as f:
        corpus = pickle.load(f)
    f.close()
    with open(ids_path, 'rb') as f:
        ids = pickle.load(f)
    f.close()
else:
    corpus = []
    ids = []
    for filename in tqdm(os.listdir(NODE_INFO_DIRECTORY), position=0, leave=True):
        with open(NODE_INFO_DIRECTORY + filename, 'r', encoding='UTF-8', errors='ignore') as f:
            doc_string = []
            for line in f:
                [doc_string.append(token.strip()) for token in line.lower().strip().split(" ") if token != ""]
            corpus.append(' '.join(doc_string))
            ids.append(filename[:-4])
    with open(corpus_path, '+wb') as f:
        pickle.dump(corpus, f)
    f.close()
    with open(ids_path, '+wb') as f:
        pickle.dump(ids, f)
    f.close() 

In [4]:
stemmed_corpus_path = r"pickles/stemmed_corpus.PICKLE" 
if os.path.exists(stemmed_corpus_path):
    with open(stemmed_corpus_path, 'rb') as f:
        stemmed_corpus = pickle.load(f)
    f.close()
else:
    print('Stemmed corpus unavailable')

# in order of alphabetical text information i.e. 0, 1, 10, 100
node_info = pd.DataFrame({'id': ids, 'corpus': corpus, 'stemmed': stemmed_corpus})
node_info_id = node_info.set_index(['id'])
print("Training node info shape: {}".format(node_info.shape))

Training node info shape: (33226, 3)


In [5]:
train_graph_split_path = 'pickles/train_graph_split.PICKLE'

if os.path.exists(train_graph_split_path):
    with open(train_graph_split_path, 'rb') as f:
        keep_indices = pickle.load(f)
    f.close()
else:
    keep_indices = random.sample(range(len(training)), k=int(len(training) * 0.05))
    with open(train_graph_split_path, '+wb') as f:
        pickle.dump(keep_indices, f)
    f.close()

data_train_val = training.iloc[keep_indices]
data_train = training.loc[~training.index.isin(keep_indices)]

In [6]:
lda_path = r"pickles/stemmed_lda_64_matrix.PICKLE"
if os.path.exists(lda_path):
    with open(lda_path, 'rb') as f:
        lda = pickle.load(f)
    f.close()
else:
    print("Fitting LDA models with tf features, "
          "n_samples=%d and n_features=%d..."
          % (n_samples, n_features))
    lda = LatentDirichletAllocation(n_components=n_features, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0).fit_transform(corpus_word_matrix)
    
    with open(lda_path, '+wb') as f:
        pickle.dump(lda, f)
    f.close()

## Formatting for Graph Sage

In [8]:
linked_nodes = training.loc[training['Link']=='1']
linked_nodes = linked_nodes[['Node1', 'Node2']]
linked_nodes.to_csv('all_linked_nodes.txt', sep=' ', index=False, header=False)
G=nx.read_edgelist('all_linked_nodes.txt', create_using=nx.Graph(), nodetype = int)

In [9]:
import json
import math

In [10]:
for i in (list(node_info.index)):
    if i in G:
        G.node[i]['feature'] = list(lda[i])       
        G.node[i]['test'] = False
        G.node[i]['val'] = False

    else:
        print('adding: ' + str(i))
        G.add_node((i))
        G.node[(i)]['feature'] = list(lda[i])
        G.node[(i)]['test'] = False
        G.node[(i)]['val'] = False
        

adding: 2015
adding: 5703
adding: 10049
adding: 12990
adding: 17120
adding: 20596
adding: 21465
adding: 21986
adding: 22507
adding: 23438
adding: 23525
adding: 23709
adding: 24663
adding: 25156
adding: 25283
adding: 25499
adding: 25516
adding: 25601
adding: 25769
adding: 26210
adding: 26897
adding: 26905
adding: 27435
adding: 27520
adding: 27534
adding: 27553
adding: 28061
adding: 28360
adding: 28372
adding: 28823
adding: 28860
adding: 28936
adding: 28944
adding: 28947
adding: 28948
adding: 29048
adding: 29280
adding: 30474
adding: 30504
adding: 30556
adding: 30660
adding: 30697
adding: 30766
adding: 30829
adding: 30874
adding: 30959
adding: 31007
adding: 31081
adding: 31140
adding: 31157
adding: 31377
adding: 31785
adding: 31995
adding: 32447
adding: 32605
adding: 32806
adding: 32864
adding: 33100
adding: 33113
adding: 33165
adding: 33175
adding: 33199
adding: 33213
adding: 33221


In [11]:
i = data_train_val.index[1]
G.edge[int(data_train_val['Node1'].loc[i])][int(data_train_val['Node2'].loc[i])]['test_removed'] = False
G.edge[int(data_train_val['Node1'].loc[i])][int(data_train_val['Node2'].loc[i])]['train_removed'] = False


In [12]:
for i in (list(data_train_val.index)):
    if i in G:
        G.node[i]['feature'] = list(lda[i])       
        G.node[i]['test'] = False
        G.node[i]['val'] = False

    else:
        print('adding: ' + str(i))
        G.add_node((i))
        G.node[(i)]['feature'] = list(lda[i])
        G.node[(i)]['test'] = False
        G.node[(i)]['val'] = False
        
for i in (list(data_train.index)):
    if i in G:
        G.node[i]['feature'] = list(lda[i])       
        G.node[i]['test'] = False
        G.node[i]['val'] = False

    else:
        print('adding: ' + str(i))
        G.add_node((i))
        G.node[(i)]['feature'] = list(lda[i])
        G.node[(i)]['test'] = False
        G.node[(i)]['val'] = False
        

adding: 74300


IndexError: index 74300 is out of bounds for axis 0 with size 33226

In [67]:
dict_id_map = {}
binaries = int(math.log(len(node_info_id.index), 2)) + 1
for j in list(node_info_id.index):
    bin_repr = bin(int(j))[2:]
    dict_id_map[j] = [int(i) for i in ((16 - len(bin_repr))*['0'] + list(bin_repr))]
with open('example-class_map.json', '+w') as f:
    json.dump(dict_id_map, f)

In [None]:
dict_id_map = {}
for i, j in enumerate(list(node_info_id.index)):
    dict_id_map[j] = int(j)
with open('example-id_map.json', '+w') as f:
    json.dump(dict_id_map, f)

In [18]:
from networkx.readwrite import json_graph

In [19]:
data = json_graph.node_link_data(G)

In [49]:
np.save('example-feats.npy', np.array(lda))

In [20]:
with open('example-G.json', '+w') as f:
    json.dump(data, f)

### Testing 

In [None]:
prefix = "example"
G_data = json.load(open(prefix + "-G.json"))
G = json_graph.node_link_graph(G_data)
if isinstance(G.nodes()[0], int):
    conversion = lambda n : int(n)
else:
    conversion = lambda n : n

if os.path.exists(prefix + "-feats.npy"):
    feats = np.load(prefix + "-feats.npy")
else:
    print("No features present.. Only identity features will be used.")
    feats = None
id_map = json.load(open(prefix + "-id_map.json"))
id_map = {conversion(k):int(v) for k,v in id_map.items()}
walks = []
class_map = json.load(open(prefix + "-class_map.json"))
if isinstance(list(class_map.values())[0], list):
    lab_conversion = lambda n : n
else:
    lab_conversion = lambda n : int(n)

class_map = {conversion(k):lab_conversion(v) for k,v in class_map.items()}


In [None]:
## Remove all nodes that do not have val/test annotations
## (necessary because of networkx weirdness with the Reddit data)
broken_count = 0
for node in G.nodes():
    if (not 'val' in G.node[node]) or (not 'test' in G.node[node]):
        G.remove_node(node)
        broken_count += 1
print("Removed {:d} nodes that lacked proper annotations due to networkx versioning issues".format(broken_count))
