In [None]:
import pandas as pd
import numpy as np
import networkx as nx

In [None]:
import sys, os
sys.path.insert(0,"../python/")
import preprocessing as pp

In [None]:
%matplotlib inline

In [None]:
# Load the pipeline parameters: ParamHelper is given the pipeline JSON path and
# this process's command-line arguments.
# NOTE(review): presumably CLI values can override the JSON defaults — confirm in datawand.
from datawand.parametrization import ParamHelper
ph = ParamHelper("../pipelines/GcnProject.json",sys.argv)

In [None]:
# Pull the run parameters used throughout the notebook from the parameter helper,
# in the same order as before.
input_prefix, dataset_id, topic_support = (
    ph.get(key) for key in ("input_prefix", "dataset_id", "topic_support")
)

In [None]:
# Output location: <experiment_dir>/data/<dataset_id>/<experiment sub-directory>.
experiment_root = ph.get("experiment_dir")
experiment_subdir = pp.get_experiment_dir(ph)
preprocessed_dir = "%s/data/%s/%s" % (experiment_root, dataset_id, experiment_subdir)

In [None]:
# Create the output directory together with any missing parents.
# exist_ok=True makes this a no-op when the directory already exists, without a
# separate exists-check that another process could invalidate in between.
os.makedirs(preprocessed_dir, exist_ok=True)
print(preprocessed_dir)

# 1. Load Network Data

In [None]:
# The mention log is a space-separated file without a header row; name its
# three columns explicitly.
mentions_path = "%s/%s/%s_mentions.csv" % (input_prefix, dataset_id, dataset_id)
data = pd.read_csv(mentions_path, sep=" ", names=["time", "src", "trg"])

In [None]:
data.head()

In [None]:
# Smallest value of the "time" column; the next cell uses it as the start of
# the time window that is kept.
first_epoch = data["time"].min()

In [None]:
# Keep only rows whose time falls within 4 * 86400 units of the first record
# (four days if "time" is in seconds — TODO confirm the unit).
rows_before = len(data)
print(rows_before)
window_end = first_epoch + 4 * 86400
data = data[data["time"] < window_end]
print(len(data))

# 2. Preprocess Network

## a.) Top authorities

In [None]:
# Count how often each target is mentioned (most frequent first) and keep the
# first top_k entries.
top_k = ph.get("top_k")
target_counts = data["trg"].value_counts()
top_auth_with_freq = target_counts[:top_k]
top_auth_with_freq

In [None]:
# Target ids of the top authorities, in descending frequency order.
top_authorities = [authority for authority in top_auth_with_freq.index]
top_authorities

## b.) Filter for top authorities

In [None]:
# Keep only the mentions that point at one of the top authorities.
targets_top_authority = data["trg"].isin(top_authorities)
filtered_data = data[targets_top_authority]
print(len(data), len(filtered_data))

In [None]:
# Number of distinct sources and distinct targets left after filtering.
distinct_sources = filtered_data["src"].unique()
distinct_targets = filtered_data["trg"].unique()
print(len(distinct_sources), len(distinct_targets))

In [None]:
# For every source, count how many different top authorities it mentions.
targets_by_source = filtered_data.groupby(by=["src"])["trg"]
num_targets_for_sources = targets_by_source.nunique()

#### The number of distinct targets per source (among the top authorities)

In [None]:
num_targets_for_sources.hist()

## c.) Generate group labels for sources

#### Conclusion: top_k=10
   * too many nodes (18824)
   * too many labels (not all 2^10 combinations occur - only 294 do)
   
#### Conclusion: top_k=5
   * too many nodes? (14721)
   * 31 labels could be reduced to labels with at least 10,100 node support

#### Conclusion: top_k=4
   * 12372 nodes are fine
   * 15 labels is fine

#### Conclusion: top_k=3 (I should try this setting first)
   * 9681 nodes are fine
   * 7 labels is fine

In [None]:
# Derive labels for the sources from the filtered mentions and the list of top
# authorities. Later cells read the "src" and "label" columns of the result.
# NOTE(review): the label encoding is defined in preprocessing.generate_labels — confirm there.
sources_with_labels_df = pp.generate_labels(filtered_data, top_authorities)

In [None]:
# Number of rows per label; value_counts() returns them sorted most frequent first.
label_frequencies = sources_with_labels_df["label"].value_counts()

In [None]:
label_frequencies

In [None]:
import matplotlib.pyplot as plt

# Plot the frequency of every label, most frequent first (the order in which
# value_counts() returned them).
tick_positions = range(len(label_frequencies))
plt.figure(figsize=(10,5))
# The title follows the configured top_k instead of a hard-coded value.
plt.title("Label distribution for k=%s" % top_k)
plt.plot(label_frequencies.values,'bx')
# Positions, labels and rotation are set in one call: rotation applied before
# the ticks are replaced is not guaranteed to reach the final tick labels.
plt.xticks(tick_positions, list(label_frequencies.index), rotation=50)
plt.xlabel("Labels")
plt.ylabel("Frequency")
plt.show()

### Filter for frequent groups: only labels with support greater than `topic_support` are kept

In [None]:
# Labels whose frequency is strictly greater than topic_support.
is_supported = label_frequencies > topic_support
supported_labels = list(label_frequencies[is_supported].index)
supported_labels

In [None]:
# Drop the sources whose label is not among the supported ones; print the row
# count before and after.
print(len(sources_with_labels_df))
has_supported_label = sources_with_labels_df["label"].isin(supported_labels)
sources_with_labels_df = sources_with_labels_df[has_supported_label]
print(len(sources_with_labels_df))

In [None]:
# Source ids that survived the label-support filter.
supported_nodes = [src for src in sources_with_labels_df["src"]]

In [None]:
# Restrict the mention log to the supported sources; print the row count
# before and after.
print(len(filtered_data))
from_supported_source = filtered_data["src"].isin(supported_nodes)
filtered_data = filtered_data[from_supported_source]
print(len(filtered_data))

## 15o topic 4 disturbances

topic_4_src = list(sources_with_labels_df[sources_with_labels_df["label"] == '00010']["src"])

topic_4_links = filtered_data[filtered_data["src"].isin(topic_4_src)]

len(topic_4_links)

topic_4_links

### Label changes

In [None]:
# Row index at which the filtered mention log is cut into two halves
# (rounded down).
filtered_size = len(filtered_data)
train_part = int(0.5 * filtered_size)
print(filtered_size, train_part)

In [None]:
# Generate labels separately from the first train_part rows and from the
# remaining rows, so that the two labelings can be compared.
first_half = filtered_data.head(train_part)
second_half = filtered_data.tail(filtered_size - train_part)
labels_first_part = pp.generate_labels(first_half, top_authorities)
labels_second_part = pp.generate_labels(second_half, top_authorities)

In [None]:
# Inner-join the two labelings on the source id. The shared "label" column
# comes out as label_x (first part) and label_y (second part).
cols = ['src','label']
first_labels = labels_first_part[cols]
second_labels = labels_second_part[cols]
merged_df = first_labels.merge(second_labels, on=["src"], how="inner")

In [None]:
# Sources present in both halves, and how many of them changed label.
label_changed = merged_df["label_x"] != merged_df["label_y"]
print(len(merged_df), len(merged_df[label_changed]))

## d.) Generate edges for the network

In [None]:
top_authorities

In [None]:
# Start from an empty undirected graph; the edge-generation cell below passes
# it to pp.add_edges_to_graph and then inspects it, so it is filled in place.
G = nx.Graph()

In [None]:
# Add edges to G for every top authority in turn and report the graph size
# (nodes, edges, connected components) after each step.
# Iterating over top_authorities itself, rather than range(top_k), avoids an
# IndexError when the data holds fewer than top_k distinct targets.
time_frame = ph.get("time_frame")
for authority in top_authorities:
    pp.add_edges_to_graph(filtered_data, G, authority, time_frame = time_frame)
    print(G.number_of_nodes(), G.number_of_edges(), nx.number_connected_components(G))

## e.) Filter the graph for the giant component

   * only nodes in the giant component are interesting - other components have very few vertices
   * the giant component is sparse (which is good)

for comp in nx.connected_components(G):
    print(len(comp))

### Keep only giant component

giant = max(nx.connected_component_subgraphs(G), key=len)

### Keep whole topic network

In [None]:
# Keep the whole graph: "giant" is just another name for G (no copy is made),
# so the cells below operate on every component.
giant = G

In [None]:
# Node count and edge count of the graph under study.
N = giant.number_of_nodes()
M = giant.number_of_edges()

In [None]:
print(N,M)

### Edge density

In [None]:
# Fraction of all possible undirected node pairs that are connected:
# M / (N * (N - 1) / 2).
# A graph with fewer than two nodes has no possible pair, so report 0.0
# instead of dividing by zero.
M / ((N-1)*N/2) if N > 1 else 0.0

### Calculate some centrality scores on the giant component

In [None]:
# Per-node centrality scores, both as plain {node: score} dicts.
giant_pagerank = nx.pagerank(giant)
# networkx >= 2.0 returns a DegreeView here rather than a dict, and
# pd.DataFrame cannot combine that view with the pagerank dict. dict() yields
# the {node: degree} mapping on new versions and is harmless on 1.x.
giant_degree = dict(nx.degree(giant))

In [None]:
# Build a node-indexed table of the two scores, then turn the node id into a
# regular "src" column and fix the column order.
scores_by_node = pd.DataFrame({"pagerank": giant_pagerank, "degree": giant_degree})
scores_by_node["src"] = scores_by_node.index
giant_scores_df = scores_by_node.reset_index()[["src", "degree", "pagerank"]]

In [None]:
giant_scores_df.head()

# Export preprocessed data

In [None]:
# Labeled sources that are nodes of the graph, re-indexed from zero and
# reduced to the id and label columns.
is_graph_node = sources_with_labels_df["src"].isin(giant.nodes())
giant_sources_df = sources_with_labels_df[is_graph_node].reset_index()[["src", "label"]]

In [None]:
len(giant_sources_df), len(sources_with_labels_df)

In [None]:
giant_sources_df["label"].value_counts()

### Get part of filtered_data related to giant component

In [None]:
# Mentions whose source is a node of the graph.
source_in_graph = filtered_data["src"].isin(giant.nodes())
giant_with_time = filtered_data[source_in_graph]

### Extract minimum mention time for each node

In [None]:
# Earliest mention time of each source, as a two-column frame ("src", "time").
first_mention_time = giant_with_time.groupby(by=["src"])["time"].min()
giant_with_min_time = pd.DataFrame(first_mention_time).reset_index()

In [None]:
# Attach each source's earliest mention time. merge() defaults to an inner
# join, so sources without a row in giant_with_min_time are dropped.
giant_sources_df = giant_sources_df.merge(giant_with_min_time,on=["src"])

### Extract number of mentions for sources

In [None]:
# Number of mentions made by each source, stored as a float column named
# "frequency" ("f" is NumPy's single-precision float type code).
mention_counts = giant_with_time.groupby(by=["src"])["time"].count().reset_index()
mention_counts.columns = ["src", "frequency"]
mention_counts["frequency"] = mention_counts["frequency"].astype("f")
giant_with_num_of_mentions = mention_counts

In [None]:
# Attach the per-source mention count ("frequency" column); inner join on "src".
giant_sources_df = giant_sources_df.merge(giant_with_num_of_mentions,on=["src"])

### Join centrality score features

In [None]:
# Attach the "degree" and "pagerank" columns; inner join on "src".
giant_sources_df = giant_sources_df.merge(giant_scores_df,on=["src"])

## 1.) Split into train test set

In [None]:
# Split configuration and the identifier built from it, e.g. "<split_type>_0.80";
# the export cells use it as a sub-directory name.
split_type, train_ratio = ph.get("split_type"), ph.get("train_ratio")
split_id = "%s_%.2f" % (split_type, train_ratio)

In [None]:
giant_sources_df.head(3)

In [None]:
# Split the per-source table into a train and a test frame according to
# split_type and train_ratio.
# NOTE(review): the meaning of each split_type lives in preprocessing.get_train_test — confirm there.
train_giant_sources_df, test_giant_sources_df = pp.get_train_test(giant_sources_df, split_type, train_ratio)

In [None]:
print(len(train_giant_sources_df),len(test_giant_sources_df))

## 2.) Export binary files for modeling

#### Only half of the training data will be labeled for GCN 

In [None]:
# Half of the training rows (rounded down), taken from the top of the frame.
# NOTE(review): observed_set is assigned again further down from the balanced sample.
sample_size = len(train_giant_sources_df) // 2
observed_set = train_giant_sources_df.head(sample_size)
print(sample_size)

### Observed set with balanced label frequencies

In [None]:
# Per-label quota for the observed set: ceil(2 * ln(frequency)), so frequent
# labels contribute only logarithmically more rows than rare ones.
# Note that a label with a single training row gets a quota of 0 (ln 1 == 0).
value_counts = train_giant_sources_df["label"].value_counts()
print(value_counts)
labels = list(value_counts.index)
freqs = list(value_counts)
log_scaled_freqs = 2 * np.log(freqs)
balanced_freqs = list(np.ceil(log_scaled_freqs).astype("i"))

In [None]:
balanced_freqs

In [None]:
# For every label take the index values of its first `quota` training rows.
observed_indices = []
for label, quota in zip(labels, balanced_freqs):
    rows_with_label = train_giant_sources_df[train_giant_sources_df["label"] == label]
    observed_indices.extend(list(rows_with_label.index)[:quota])

In [None]:
train_giant_sources_df.head()

In [None]:
# Select the balanced sample by index label. observed_indices was collected
# from train_giant_sources_df.index, so label-based .loc is the matching
# accessor; the former .ix indexer is deprecated and has been removed from pandas.
observed_set = train_giant_sources_df.loc[observed_indices]
print(len(observed_set))

In [None]:
# (frame, tag) pairs handed to the label and feature export helpers below.
# NOTE(review): the tags ('', 'all', 't') are presumably used by the
# preprocessing export functions to tell the three subsets apart — confirm there.
export_tuples = [
    (observed_set,''),
    (train_giant_sources_df,'all'),
    (test_giant_sources_df,'t')
]

### Export test indices

In [None]:
# Export the test set, targeting <preprocessed_dir>/<split_id>.
export_dir = "%s/%s" % (preprocessed_dir, split_id)
pp.export_test_indices(test_giant_sources_df, export_dir, dataset_id)

### Export edges to binary file

In [None]:
# Export the graph's edges, targeting <preprocessed_dir>/<split_id>.
export_dir = "%s/%s" % (preprocessed_dir, split_id)
pp.export_edges(giant_sources_df, giant, export_dir, dataset_id)

### Export target labels

In [None]:
# Export the target labels of every subset, targeting <preprocessed_dir>/<split_id>.
export_dir = "%s/%s" % (preprocessed_dir, split_id)
pp.export_labels(export_tuples, giant_sources_df, export_dir, dataset_id)

### Export features

In [None]:
# Export the features of every subset, targeting <preprocessed_dir>/<split_id>.
export_dir = "%s/%s" % (preprocessed_dir, split_id)
pp.export_features(export_tuples, export_dir, dataset_id)