In [None]:
import pandas as pd
import numpy as np
import networkx as nx
import pickle as pkl
from scipy import sparse
from collections import defaultdict

In [None]:
import sys, os
sys.path.insert(0,"../python/")
import generator_utils as gu

In [None]:
%matplotlib inline

In [None]:
# Load the pipeline parameters: ParamHelper is given the pipeline JSON file and
# the command-line arguments, and is queried below through ph.get(<name>).
from datawand.parametrization import ParamHelper
ph = ParamHelper("../pipelines/GcnProject.json",sys.argv)

In [None]:
# Path prefix of the raw input files, read from the pipeline parameters.
input_prefix = ph.get("input_prefix")

In [None]:
# Output directory of this run: "<experiment_dir>/data/<gu.get_experiment_dir(ph)>".
preprocessed_dir = "%s/data/%s" % (ph.get("experiment_dir"), gu.get_experiment_dir(ph))

In [None]:
# Create the output directory (and any missing parents). exist_ok=True replaces
# the separate os.path.exists() check, which could race with another run that
# creates the same directory between the check and the makedirs() call.
os.makedirs(preprocessed_dir, exist_ok=True)
print(preprocessed_dir)

# 1. Load Network Data

In [None]:
# Load the mention records. The file is split on single spaces and its three
# columns are named time, src, trg; because `names` is passed without `header`,
# the first line of the file is read as data, not as a header.
data = pd.read_csv("%s/15o/15o_mentions.csv" % input_prefix,sep=" ",names=["time","src","trg"])

In [None]:
# Preview the first rows of the loaded table.
data.head()

In [None]:
# Smallest value of the "time" column; the start of the time window kept below.
first_epoch = data["time"].min()

In [None]:
# Keep only the records that fall strictly inside the first 4 * 86400 time units
# after first_epoch (4 days, assuming "time" is in seconds - TODO confirm).
# The row count is printed before and after the filter.
print(data.shape[0])
data = data.loc[data["time"].lt(first_epoch + 4 * 86400)]
print(data.shape[0])

# 2. Preprocess Network

## a.) Top authorities

In [None]:
# Number of authorities to keep, read from the pipeline parameters.
top_k = ph.get("top_k")
# value_counts() sorts targets by descending frequency, so the first top_k
# positions are the most mentioned targets.
top_auth_with_freq = data["trg"].value_counts().iloc[:top_k]
top_auth_with_freq

In [None]:
# Ids of the top authorities, in the same (descending frequency) order as the
# index of top_auth_with_freq. The bare name on the last line displays the list.
top_authorities = list(top_auth_with_freq.index)
top_authorities

## b.) Filter for top authorities

In [None]:
# Keep only the mentions whose target is one of the top authorities, then
# report the row counts before and after the filter.
filtered_data = data.loc[data["trg"].isin(top_authorities)]
print(data.shape[0], filtered_data.shape[0])

In [None]:
# Number of distinct sources and distinct targets left after the filter.
print(len(filtered_data["src"].unique()),len(filtered_data["trg"].unique()))

In [None]:
# For every source: the number of distinct targets it mentions in filtered_data.
num_targets_for_sources = filtered_data.groupby(by=["src"])["trg"].nunique()

In [None]:
# Histogram of the per-source distinct-target counts.
num_targets_for_sources.hist()

## c.) Generate group labels for sources

#### Conclusion: top_k=10
   * too many nodes (18824)
   * too many labels (not all 2^10 combinations occur - only 294)
   
#### Conclusion: top_k=5
   * too many nodes? (14721)
   * 31 labels could be reduced to labels with at least 10,100 node support

#### Conclusion: top_k=4
   * 12372 nodes are fine
   * 15 labels is fine

#### Conclusion: top_k=3 (I should try this setting first)
   * 9681 nodes are fine
   * 7 labels is fine

In [None]:
# Label every source with gu.generate_labels (defined in generator_utils, not
# shown here). Later cells read the "src" and "label" columns of the result.
# NOTE(review): presumably the label encodes which of the top authorities a
# source mentioned - confirm against generator_utils.
sources_with_labels_df = gu.generate_labels(filtered_data, top_authorities)

In [None]:
# Number of rows per distinct label value.
sources_with_labels_df["label"].value_counts()

### Label changes

In [None]:
# Split point for the label-stability check: half of the filtered rows,
# rounded down.
filtered_size = filtered_data.shape[0]
train_part = filtered_size // 2
print(filtered_size, train_part)

In [None]:
# Generate labels separately from the first train_part rows and from the
# remaining rows of filtered_data (split by row position).
# NOTE(review): this is a split in time only if filtered_data is ordered by
# "time" - confirm against the input file.
labels_first_part = gu.generate_labels(filtered_data.iloc[:train_part], top_authorities)
labels_second_part = gu.generate_labels(filtered_data.iloc[train_part:], top_authorities)

In [None]:
# Inner-join the two label tables on the source id. Only sources present in
# both halves remain; their labels end up in "label_x" (first half) and
# "label_y" (second half).
cols = ['src','label']
merged_df = pd.merge(
    labels_first_part[cols],
    labels_second_part[cols],
    how="inner",
    on=["src"],
)

#### top_k=10
   * merged_size=2273
   * mismatch_size=1815

#### top_k=5
   * merged_size=1983
   * mismatch_size=1465

#### top_k=3
   * merged_size=1800
   * mismatch_size=1201 

#### top_k=3
   * merged_size=1703
   * mismatch_size=1078 

In [None]:
# Number of sources present in both halves, and how many of them received a
# different label in the second half than in the first.
print(merged_df.shape[0], (merged_df["label_x"] != merged_df["label_y"]).sum())

## d.) Generate edges for the network

In [None]:
# Display the authority ids again for reference.
top_authorities

#### Graph stats for different topics (top_k=5, time_frame=60 ~ 1min)

The stats are in this order: **#nodes, #edges, #connected_components**

   * for [59848]: 5408, 39501, 132
   * for [59848,55250]: 7503, 55914, 177
   * for [59848,55250,20696]: 8656, 66142, 191
   * for [59848,55250,20696,54151]: 11058, 89485, 232
   * for [59848,55250,20696,54151,40577]: 13130, 101032, 329
   
#### We should use a more reasonable time_frame, like 300 ~ 5min

In [None]:
# Start from an empty undirected graph; the following cell adds the edges.
G = nx.Graph()

In [None]:
# Width of the time window handed to gu.add_edges_to_graph, read from the
# pipeline parameters.
time_frame = ph.get("time_frame")
# Iterate over the authorities themselves instead of range(top_k):
# value_counts()[:top_k] yields fewer than top_k entries when the data has
# fewer distinct targets, and top_authorities[i] would then raise IndexError.
for authority in top_authorities:
    gu.add_edges_to_graph(filtered_data, G, authority, time_frame = time_frame)
    # Progress after each authority: #nodes, #edges, #connected components.
    print(G.number_of_nodes(), G.number_of_edges(), nx.number_connected_components(G))

## e.) Filter the graph for the giant component

   * only nodes in the giant component are interesting - the other components have very few vertices
   * the giant component is sparse (which is good)

for comp in nx.connected_components(G):
    print(len(comp))

In [None]:
# Extract the largest connected component as a standalone graph.
# nx.connected_component_subgraphs() was removed in networkx 2.4; selecting the
# largest node set and taking G.subgraph(...).copy() works in both 1.x and 2.x.
giant_nodes = max(nx.connected_components(G), key=len)
giant = G.subgraph(giant_nodes).copy()
# Size of the giant component: number of nodes (N) and edges (M).
N = giant.number_of_nodes()
M = giant.number_of_edges()

In [None]:
# Node and edge counts of the giant component.
print(N,M)

### Edge density

In [None]:
# Edge density of the giant component: M / (N*(N-1)/2) for an undirected graph.
# nx.density computes the same ratio but returns 0 instead of raising
# ZeroDivisionError for a single-node graph, and always uses float division.
nx.density(giant)

### Calculate some centrality scores on the giant component

In [None]:
# PageRank score of every node of the giant component (dict: node -> score).
giant_pagerank = nx.pagerank(giant)
# Degree of every node as a plain dict (node -> degree). networkx >= 2.0 returns
# a DegreeView of (node, degree) pairs here, which pandas cannot align with the
# pagerank dict when both are used as DataFrame columns; dict() normalises it
# and is a no-op copy on networkx 1.x, where nx.degree already returns a dict.
giant_degree = dict(nx.degree(giant))

In [None]:
# One row per node of the giant component with its degree and PageRank.
# The node ids form the (unnamed) index of the frame built from the two
# mappings; reset_index() moves them into a column called "index", which is
# renamed to "src" so the table can later be joined on the source id.
giant_scores_df = (
    pd.DataFrame({"pagerank": giant_pagerank, "degree": giant_degree})
    .reset_index()
    .rename(columns={"index": "src"})
    [["src", "degree", "pagerank"]]
)

In [None]:
# Preview the centrality table.
giant_scores_df.head()

# Export binary files

In [None]:
# Labelled sources that belong to the giant component, reduced to the "src" and
# "label" columns and re-indexed from 0.
giant_sources_df = (
    sources_with_labels_df
    .loc[sources_with_labels_df["src"].isin(giant.nodes()), ["src", "label"]]
    .reset_index(drop=True)
)

In [None]:
# Number of labelled sources inside the giant component vs. overall.
len(giant_sources_df), len(sources_with_labels_df)

### Get part of filtered_data related to giant component

In [None]:
# Mention records whose source node belongs to the giant component.
giant_with_time = filtered_data[filtered_data["src"].isin(giant.nodes())]

### Extract minimum mention time for each node

In [None]:
# Earliest mention time of every source, as a two-column table ("src", "time").
giant_with_min_time = (
    giant_with_time.groupby(by=["src"])["time"].min().reset_index()
)

In [None]:
# Attach the earliest mention time to every labelled source. merge() defaults
# to an inner join, so sources without a matching row are dropped.
giant_sources_df = giant_sources_df.merge(giant_with_min_time,on=["src"])

### Extract number of mentions for sources

In [None]:
# Number of mention records per source as a two-column table
# ("src", "frequency"); the count is stored as float32 (dtype code "f").
giant_with_num_of_mentions = (
    giant_with_time.groupby(by=["src"])["time"]
    .count()
    .astype("f")
    .rename("frequency")
    .reset_index()
)

In [None]:
# Attach the mention frequency to every source (inner join on "src").
giant_sources_df = giant_sources_df.merge(giant_with_num_of_mentions,on=["src"])

### Join centrality score features

In [None]:
# Attach the degree and PageRank columns to every source (inner join on "src").
giant_sources_df = giant_sources_df.merge(giant_scores_df,on=["src"])

### Create node index decoder

In [None]:
# Map every source node id to its index label in giant_sources_df. The merges
# above return a fresh 0..n-1 index, so the label is also the row position.
n2i_map = dict(zip(giant_sources_df["src"],giant_sources_df.index))

## 0.) Split into train and test sets

In [None]:
# Preview the joined node table.
giant_sources_df.head(3)

In [None]:
# Range of the per-source earliest mention times in the giant component.
giant_min_time = giant_sources_df["time"].min()
giant_max_time = giant_sources_df["time"].max()

In [None]:
# Length of that range in whole days - assumes "time" is in seconds
# (86400 per day); TODO confirm the unit.
print( (giant_max_time-giant_min_time) // 86400 )

In [None]:
# Train/test boundary: the point that lies cut_ratio of the way from the
# earliest to the latest per-source first mention time.
cut_ratio = ph.get("cut_ratio")
cut_time = giant_min_time + (giant_max_time-giant_min_time) * cut_ratio

In [None]:
# Sources first seen at or before cut_time form the training set, later ones
# the test set. Both keep the index labels of giant_sources_df.
train_giant_sources_df = giant_sources_df[giant_sources_df["time"] <= cut_time]
test_giant_sources_df = giant_sources_df[giant_sources_df["time"] > cut_time]

In [None]:
# Number of training and test nodes.
print(len(train_giant_sources_df),len(test_giant_sources_df))

In [None]:
# Write the index labels of the test rows to a text file, one integer per line.
np.savetxt("%s/ind.15o.test.index" % preprocessed_dir,test_giant_sources_df.index,fmt="%i")

In [None]:
# (frame, file-name infix) pairs consumed by the export loops below, which
# write "ind.15o.<infix>y" and "ind.15o.<infix>x" for every pair.
# The training frame is written under the 'all' infix; the test frame is
# written twice, under the empty infix and under 't'.
export_tuples = [
    #(giant_sources_df,'all'),
    (train_giant_sources_df,'all'),
    (test_giant_sources_df,''),
    (test_giant_sources_df,'t')
]

## 1.) Export edges to binary file

In [None]:
# Adjacency lists keyed by node index; a missing key starts as an empty list.
edge_list_map = defaultdict(list)

In [None]:
# Translate every edge of the giant component to node indices. Each edge is
# stored once, under the endpoint that giant.edges() yields first. A node that
# is missing from n2i_map raises KeyError here.
for s,t in giant.edges():
    edge_list_map[n2i_map[s]].append(n2i_map[t])

In [None]:
# Pickle the adjacency-list mapping (a defaultdict) to
# "<preprocessed_dir>/ind.15o.graph".
with open("%s/ind.15o.graph" % preprocessed_dir,"wb+") as outfile:
    pkl.dump(edge_list_map, outfile)

## 2.) Export target labels

In [None]:
# Distinct labels in order of first appearance, and the position of each label
# in that list (used by str2onehot).
unique_labels = list(giant_sources_df["label"].unique())
index_pos_map = {label: pos for pos, label in enumerate(unique_labels)}

In [None]:
def str2arr(label):
    """Return the characters of str(label) converted to a list of ints.

    Raises ValueError if any character of str(label) is not a valid integer
    literal.
    """
    return list(map(int, str(label)))

def str2onehot(label):
    """Return a one-hot vector (C int dtype) of length len(unique_labels).

    The single 1 sits at the position recorded for `label` in index_pos_map.
    Raises KeyError for a label that is not in index_pos_map.
    """
    positions = np.arange(len(unique_labels))
    return (positions == index_pos_map[label]).astype("i")

In [None]:
# Export the label matrix of every (frame, infix) pair to
# "<preprocessed_dir>/ind.15o.<infix>y".
for frame, suffix in export_tuples:
    # Iterate the label column directly instead of frame.iterrows(): iterrows
    # builds a Series per row (slow) and coerces all columns of the row to one
    # common dtype, which can alter the label value before str2arr sees it.
    # One row per node, one column per label character.
    label_arr = np.array([str2arr(label) for label in frame["label"]])
    #label_arr = np.array([str2onehot(label) for label in frame["label"]])
    with open("%s/ind.15o.%sy" % (preprocessed_dir, suffix),"wb+") as outfile:
        pkl.dump(label_arr, outfile)

## 3.) Export features

#feature_set = ["time","frequency","degree","pagerank"] #low performance
#feature_set = ["frequency","degree","pagerank"] #low performance
feature_set = ["frequency","pagerank"] #low performance
#feature_set = ["frequency","degree"] #low performance
#feature_set = ["frequency","time"] #low performance

for item in export_tuples:
    coord_sparse = sparse.csr_matrix(item[0][feature_set].as_matrix())
    with open("/mnt/idms/fberes/network/gcn_project/data/ind.15o.%sx" % item[1],"wb+") as outfile:
        pkl.dump(coord_sparse, outfile)

In [None]:
# Export the feature matrix of every (frame, infix) pair to
# "<preprocessed_dir>/ind.15o.<infix>x". The only feature is the mention
# frequency, stored as a sparse (num_nodes x 1) CSR matrix.
for frame, suffix in export_tuples:
    num_nodes = len(frame)
    # .values replaces .as_matrix(), which was deprecated in pandas 0.23 and
    # removed in 1.0. The array is bound to its own name so the `data`
    # DataFrame loaded at the top of the notebook is no longer overwritten.
    frequencies = frame["frequency"].values
    # Coordinates of the entries: row i, column 0. Integer index arrays are
    # used; np.zeros() without a dtype yields floats and relies on scipy
    # casting them to indices.
    row_idx = np.arange(num_nodes)
    col_idx = np.zeros(num_nodes, dtype=int)
    coord_sparse = sparse.csr_matrix((frequencies, (row_idx, col_idx)), shape=(num_nodes, 1))
    with open("%s/ind.15o.%sx" % (preprocessed_dir, suffix),"wb+") as outfile:
        pkl.dump(coord_sparse, outfile)