In [None]:
import pandas as pd
import numpy as np
import networkx as nx
import pickle as pkl
import itertools
from scipy import sparse
from collections import defaultdict

In [None]:
%matplotlib inline

# 1. Load Network Data

In [None]:
# Load the mention records from a space-separated file; the three columns are
# named explicitly via `names` (time, src, trg).
data = pd.read_csv("/mnt/idms/temporalNodeRanking/data/filtered_timeline_data/tsv/15o/15o_mentions.csv",sep=" ",names=["time","src","trg"])

In [None]:
data.head()

In [None]:
first_epoch = data["time"].min()

In [None]:
# Keep only the rows whose time is less than 4 * 86400 after the earliest record
# (four days if `time` is in seconds -- TODO confirm the unit). The two prints
# show the row count before and after the cut.
print(len(data))
data = data[data["time"] < first_epoch + 4 * 86400]
print(len(data))

# 2. Preprocess Network

## a.) Top authorities

In [None]:
# Mention counts of the `top_k` most frequently mentioned targets
# (value_counts sorts by descending frequency).
top_k = 5
top_auth_with_freq = data["trg"].value_counts().head(top_k)
top_auth_with_freq

In [None]:
top_authorities = list(top_auth_with_freq.index)
top_authorities

## b.) Filter for top authorities

In [None]:
filtered_data = data[data["trg"].isin(top_authorities)]
print(len(data),len(filtered_data))

In [None]:
print(len(filtered_data["src"].unique()),len(filtered_data["trg"].unique()))

In [None]:
num_targets_for_sources = filtered_data.groupby(by=["src"])["trg"].nunique()

In [None]:
num_targets_for_sources.hist()

## c.) Generate group labels for sources

In [None]:
def get_label(target_list, keys=None):
    """Encode which of `keys` occur in `target_list` as a string of '0'/'1'.

    Parameters
    ----------
    target_list : iterable
        The targets mentioned by one source (elements must be hashable).
    keys : sequence, optional
        Ordered targets to test for. When omitted, the module-level
        `top_authorities` is used as it is *at call time*; binding it as a
        default value would freeze the list that existed when this cell was
        executed and silently produce stale labels after `top_authorities`
        is recomputed.

    Returns
    -------
    str
        One character per key, in key order: '1' if the key is contained in
        `target_list`, '0' otherwise.
    """
    if keys is None:
        keys = top_authorities
    mentioned = set(target_list)
    return ''.join('1' if key in mentioned else '0' for key in keys)

def generate_labels(df):
    """Build one row per source with its unique targets and its group label.

    Groups `df` by "src", collects the unique "trg" values of every source,
    derives a label from them with `get_label`, prints the number of sources
    and the number of distinct labels, and returns the resulting frame
    (columns "src", "trg", "label").
    """
    labelled = df.groupby(by=["src"])["trg"].unique().to_frame().reset_index()
    labelled["label"] = labelled["trg"].apply(get_label)
    node_count = len(labelled)
    label_count = len(labelled["label"].unique())
    print("Number of nodes: %i" % node_count)
    print("Number of unique labels: %i" % label_count)
    return labelled

#### Conclusion: top_k=10
   * too many nodes (18824)
   * too many labels (not all 2^10 combinations occur - only 294)
   
#### Conclusion: top_k=5
   * too many nodes? (14721)
   * 31 labels could be reduced to labels with at least 10,100 node support

#### Conclusion: top_k=4
   * 12372 nodes are fine
   * 15 labels is fine

#### Conclusion: top_k=3 (I should try this setting first)
   * 9681 nodes are fine
   * 7 labels is fine

In [None]:
sources_with_labels_df = generate_labels(filtered_data)

In [None]:
sources_with_labels_df["label"].value_counts()

### Label changes

In [None]:
filtered_size = len(filtered_data)
train_part = int(filtered_size * 0.5)
print(filtered_size,train_part)

In [None]:
# Label the first `train_part` rows and the remaining rows independently, so a
# source's label in each part reflects only the targets it mentioned there.
labels_first_part = generate_labels(filtered_data.head(train_part))
labels_second_part = generate_labels(filtered_data.tail(filtered_size-train_part))

In [None]:
# Inner-join the two labelings on "src": only sources present in both parts
# remain, with their labels in "label_x" (first part) and "label_y" (second part).
cols = ['src','label']
merged_df = labels_first_part[cols].merge(labels_second_part[cols],on=["src"],how="inner")

#### top_k=10
   * merged_size=2273
   * mismatch_size=1815

#### top_k=5
   * merged_size=1983
   * mismatch_size=1465

#### top_k=3
   * merged_size=1800
   * mismatch_size=1201 

#### top_k=3
   * merged_size=1703
   * mismatch_size=1078 

In [None]:
# Number of sources present in both parts, and how many of them have a
# different label in the second part than in the first.
print(len(merged_df), len(merged_df[merged_df["label_x"] != merged_df["label_y"]]))

## d.) Generate edges for the network

   * connect two nodes with an edge if they mentioned the same entity within a short time frame
   * what should be the length of this timeframe?
   * I should collect nodes who mentioned the same entity, then count the sources from each timestamp in the group

In [None]:
def add_edges_to_graph(mentions_df,G,trg_id,time_frame):
    """Link sources that mentioned `trg_id` less than `time_frame` apart.

    Parameters
    ----------
    mentions_df : pandas.DataFrame
        Mention records with at least the columns "time", "src" and "trg".
    G : graph object
        Receives the edges through `G.add_edges_from(edges, weight=trg_id)`,
        i.e. every added edge carries `trg_id` in its "weight" attribute.
    trg_id
        The target whose mentions are processed.
    time_frame : number
        Window length, in the same unit as the "time" column.

    Behaviour
    ---------
    * All mentions earlier than `min_time + time_frame` form the initial
      window; every pair of their sources is connected.
    * Each later mention is then connected to the sources of the mentions
      that are still in the window, i.e. whose time is strictly greater
      than `current_time - time_frame`.
    * No filtering of equal sources is done, so a source mentioning the
      target twice inside one window yields a self-loop pair.

    Prints a confirmation line when done; returns None.
    """
    filtered_for_trg = mentions_df[mentions_df["trg"] == trg_id]
    # The sliding window below only works on chronologically ordered rows.
    # A stable sort keeps the relative order of equal timestamps, so input
    # that is already sorted is processed exactly as before.
    filtered_for_trg = filtered_for_trg.sort_values("time", kind="mergesort")
    filtered_for_trg = filtered_for_trg.reset_index()[["time","src"]]
    # Plain array of times: positional lookups replace the removed `.ix`
    # indexer and avoid building a row Series on every access.
    times = filtered_for_trg["time"].values

    min_time = filtered_for_trg["time"].min()
    idx_set = list(filtered_for_trg.index[filtered_for_trg["time"] < min_time + time_frame])
    edge_set = get_node_pairs(idx_set,filtered_for_trg,all_pair=True)
    G.add_edges_from(edge_set, weight=trg_id)

    for i in range(len(idx_set),len(filtered_for_trg)):
        window_start = times[i] - time_frame
        # Position of the first active mention that is still inside the
        # window; everything before it has expired. If none qualifies the
        # whole active list is dropped.
        low_idx = len(idx_set)
        for j, active_idx in enumerate(idx_set):
            if times[active_idx] > window_start:
                low_idx = j
                break
        idx_set = idx_set[low_idx:] + [i] # update active indices
        edge_set = get_node_pairs(idx_set,filtered_for_trg)
        G.add_edges_from(edge_set, weight=trg_id)
    print("Edges were added for trg=%i" % trg_id)

def get_node_pairs(idx_list, filtered_for_trg, all_pair=False):
    """Return source-node pairs for the mentions selected by `idx_list`.

    Parameters
    ----------
    idx_list : list
        Index labels of the selected rows of `filtered_for_trg`.
    filtered_for_trg : pandas.DataFrame
        Frame with a "src" column.
    all_pair : bool, default False
        If True, return every 2-combination of the selected sources (in row
        order). If False, pair each source except the last one with the
        last source.

    Returns
    -------
    list of tuple
        Empty when fewer than two rows are selected.
    """
    if len(idx_list) <= 1:
        return []
    # `.loc` is the label-based replacement for the removed `.ix` indexer;
    # `.tolist()` yields plain Python scalars.
    node_list = filtered_for_trg.loc[idx_list, "src"].tolist()
    if all_pair:
        return list(itertools.combinations(node_list, 2))
    # Pair every earlier source with the newest one directly instead of
    # multiplying an int32 array by the id, which cannot represent ids
    # beyond the int32 range and only works for numeric ids.
    newest = node_list[-1]
    return [(earlier, newest) for earlier in node_list[:-1]]

In [None]:
top_authorities

#### Graph stats for different topics (top_k=5, time_frame=60 ~ 1min)

The stats are in this order: **#nodes, #edges, #connected_components**

   * for [59848]: 5408, 39501, 132
   * for [59848,55250]: 7503, 55914, 177
   * for [59848,55250,20696]: 8656, 66142, 191
   * for [59848,55250,20696,54151]: 11058, 89485, 232
   * for [59848,55250,20696,54151,40577]: 13130, 101032, 329
   
#### We should use a more reasonable time_frame, like 300 ~ 5min

In [None]:
G = nx.Graph()

In [None]:
# Add the co-mention edges of each top authority (time_frame=300) and print the
# running node / edge / connected-component counts after every one.
# Iterating over a slice instead of indexing with range(top_k) avoids an
# IndexError when fewer than `top_k` authorities are available.
for trg_id in top_authorities[:top_k]:
    add_edges_to_graph(filtered_data, G, trg_id, 300)
    print(G.number_of_nodes(), G.number_of_edges(), nx.number_connected_components(G))

### We should filter the graphs for bigger components

   * only nodes in the giant component are interesting
   * the giant component is sparse (which is good)

for comp in nx.connected_components(G):
    print(len(comp))

In [None]:
# Extract the largest connected component as its own graph.
# `nx.connected_component_subgraphs` was removed in NetworkX 2.4, so the
# component is selected by node set and materialised with `subgraph(...).copy()`.
giant = G.subgraph(max(nx.connected_components(G), key=len)).copy()
N = giant.number_of_nodes()

In [None]:
print(len(giant.nodes()),len(giant.edges()))

# Export binary files

In [None]:
# Labelled sources restricted to the giant component, re-indexed from 0.
giant_sources_df = (
    sources_with_labels_df
    .loc[sources_with_labels_df["src"].isin(giant.nodes()), ["src","label"]]
    .reset_index(drop=True)
)

In [None]:
len(giant_sources_df), len(sources_with_labels_df)

In [None]:
# Earliest mention time of every source that belongs to the giant component,
# as a two-column frame ("src", "time").
giant_with_time = filtered_data[filtered_data["src"].isin(giant.nodes())]
giant_with_min_time = giant_with_time.groupby(by=["src"])["time"].min().reset_index()

In [None]:
# Attach each source's first mention time. Selecting ["src", "label"] first
# keeps this cell re-runnable: merging into a frame that already carries a
# "time" column would produce "time_x"/"time_y" and break the later
# giant_sources_df["time"] lookups.
giant_sources_df = giant_sources_df[["src","label"]].merge(giant_with_min_time,on=["src"])

In [None]:
half_time = (giant_sources_df["time"].min() + giant_sources_df["time"].max()) // 2

In [None]:
giant_sources_df.head()

In [None]:
# Split the sources at the midpoint of their first-mention times.
# NOTE(review): the *later* half (time > half_time) becomes the training set and
# the earlier half the test set -- the reverse of the head/tail split used in
# the "Label changes" section; confirm that this direction is intended.
train_giant_sources_df = giant_sources_df[giant_sources_df["time"] > half_time]
test_giant_sources_df = giant_sources_df[giant_sources_df["time"] <= half_time]

In [None]:
np.savetxt("/mnt/idms/fberes/network/gcn_project/data/ind.15o.test.index",test_giant_sources_df.index,fmt="%i")

In [None]:
# (frame, file-name infix) pairs consumed by the label and feature export cells.
# The infix is substituted into "ind.15o.%sy" / "ind.15o.%sx", so 'all' yields
# the "ally"/"allx" files and 't' the "ty"/"tx" files.
export_tuples = [
    #(giant_sources_df,'all'),
    (train_giant_sources_df,'all'),
    (test_giant_sources_df,'t')
]

# Node id -> row index of that node in giant_sources_df.
n2i_map = dict(zip(giant_sources_df["src"],giant_sources_df.index))

# Giant-component edges expressed as pairs of row indices.
# NOTE(review): edges_array is not used in the visible part of the notebook.
edges_array = np.array([[n2i_map[edge[0]],n2i_map[edge[1]]] for edge in giant.edges()])

## 1.) Export edges to binary file

In [None]:
edge_list_map = defaultdict(list)

In [None]:
# Rebuild the adjacency lists from scratch so that re-running this cell does
# not append every neighbour a second time.
edge_list_map = defaultdict(list)
for s,t in giant.edges():
    # Key by row index (n2i_map) rather than by raw node id, so the graph uses
    # the same positional ids as ind.15o.test.index and the rows of the
    # exported label / feature matrices.
    # NOTE(review): each undirected edge is stored once (s -> t only), as
    # before; confirm the consumer of the pickle does not need both directions.
    edge_list_map[n2i_map[s]].append(n2i_map[t])

In [None]:
with open("/mnt/idms/fberes/network/gcn_project/data/ind.15o.graph","wb") as outfile:
    pkl.dump(edge_list_map, outfile)

## 2.) Export target labels

In [None]:
def str2arr(label):
    """Convert a digit string such as '101' into the list of its digits as ints."""
    return list(map(int, str(label)))
#str2arr('10000')

In [None]:
# Pickle one label matrix per export split: row r holds the digits of the
# label string of the r-th source in that split.
for frame, infix in export_tuples:
    label_arr = np.array([str2arr(label) for label in frame["label"]])
    with open("/mnt/idms/fberes/network/gcn_project/data/ind.15o.%sy" % infix,"wb") as outfile:
        pkl.dump(label_arr, outfile)

## 3.) Export features

In [None]:
# Pickle a placeholder feature matrix per export split: a sparse
# (num_nodes x 1) column in which every entry equals 0.001.
# The value array is deliberately NOT called `data`: that name holds the
# mentions DataFrame loaded at the top of the notebook, and rebinding it here
# would break any earlier cell that is re-run afterwards.
for item in export_tuples:
    num_nodes = len(item[0])
    row_idx = np.arange(num_nodes)
    col_idx = np.zeros(num_nodes, dtype=int)  # integer column indices (all column 0)
    values = np.ones(num_nodes) * 0.001
    coord_sparse = sparse.csr_matrix((values, (row_idx, col_idx)), shape=(num_nodes, 1))
    with open("/mnt/idms/fberes/network/gcn_project/data/ind.15o.%sx" % item[1],"wb") as outfile:
        pkl.dump(coord_sparse, outfile)