In [1]:
import pandas as pd
import time

# Loading consolidated data

In [2]:
# Per-user profile table; its "user_id" column is read later to size the graphs.
userprofile = pd.read_csv('WeightLoss/userprofile.csv')

In [None]:
#can_record_consolidated = pd.read_csv('WeightLoss/can_record_consolidated.csv')

In [None]:
#comment_consolidated = pd.read_csv('WeightLoss/comment_consolidated.csv')

In [None]:
#mention_consolidated = pd.read_csv('WeightLoss/mention_consolidated.csv')

In [None]:
#post_consolidated = pd.read_csv('WeightLoss/post_consolidated.csv')

In [15]:
# NOTE(review): this frame is not referenced again in the visible part of the
# notebook — confirm it is still needed before paying for the load.
users_consolidated = pd.read_csv('WeightLoss/users_consolidated.csv')

In [None]:
#weight_record_consolidated = pd.read_csv('WeightLoss/weight_record_consolidated.csv')

In [33]:
# Largest user id present in the profile table.
print(userprofile["user_id"].max())

12338632


# Loading networks

In [3]:
# Friendship edge list; only its first two columns are used later, as the two
# endpoints of each edge.
friend_consolidated = pd.read_csv('WeightLoss/network/friend_consolidated.csv')

In [4]:
# Comment edge list; only its first two columns are used later, as the two
# endpoints of each edge.
comment_rela = pd.read_csv('WeightLoss/network/comment_rela.csv')

In [36]:
# Load the mention edges, discard rows that have no 'who-mention' id, and
# store that column as int64 so it can be used as an integer node index.
mention_rela = (
    pd.read_csv('WeightLoss/network/mention_rela.csv')
    .dropna(subset=['who-mention'])
    .astype({'who-mention': 'int64'})
)

In [37]:
# Show the mention edge table (pandas abbreviates the printed frame).
print(mention_rela)

         who-mention  mention-who              type   type-id  \
0                121           33     Mention::Post        11   
1                121          143     Mention::Post        26   
2                121          143     Mention::Post        27   
3                121          121     Mention::Post        42   
4                121           33  Mention::Comment         3   
...              ...          ...               ...       ...   
6502773      7796031      5683623  Mention::Comment  38734963   
6502774      4917321      7190027  Mention::Comment  38734966   
6502775     11719132     11939872  Mention::Comment  38734971   
6502776     11998946     12252269  Mention::Comment  38734973   
6502777     10125396      4156327  Mention::Comment  38734979   

                  post-id   post-user        date  
0              Post_ID:11       121.0  2013-09-30  
1              Post_ID:26       121.0  2013-10-01  
2              Post_ID:27       121.0  2013-10-01  
3          

# Working with PyTorch

In [6]:
import torch
from torch_geometric.data import Data
from torch_geometric.utils import degree

In [38]:
def filter_low_degree_nodes(graph, d=5):
    """Repeatedly prune nodes whose degree is below ``d`` until nothing changes.

    A node's degree is the number of times it appears in
    ``graph.edge_index[0]``. Removing a node can push its neighbours below the
    threshold, so the pruning is repeated on the induced subgraph until every
    surviving node meets the threshold, or until no node survives.

    Args:
        graph: object exposing ``edge_index`` (a 2 x E integer tensor) and
            ``subgraph(node_index_tensor)``. The code relies on ``subgraph``
            relabelling the kept nodes ``0..k-1`` in the order of the (sorted)
            index tensor it is given, e.g. ``torch_geometric.data.Data``.
        d: minimum degree a node must have to be kept. The same threshold is
            applied on every pruning round.

    Returns:
        1-D tensor of node ids, in the numbering of ``graph``, that all have
        degree >= ``d`` inside the subgraph they induce. Empty if no such set
        exists.

    NOTE(review): the original author reported running out of memory when this
    was applied directly to ``friend_net``. This loop no longer keeps every
    intermediate subgraph referenced, but whether that is sufficient for
    ``friend_net`` has not been verified.
    """
    kept = None        # ids (in the numbering of `graph`) of the nodes still alive
    current = graph
    while True:
        # nodes that have at least one edge, together with their degrees
        nodes, degrees = torch.unique(current.edge_index[0], return_counts=True)
        print("Current Size:", nodes.shape)
        # True where the degree meets the threshold
        keep_mask = torch.ge(degrees, d)
        survivors = torch.masked_select(nodes, keep_mask)
        # `survivors` are indices into the current (relabelled) graph; compose
        # them with the running mapping to get back to the original ids
        kept = survivors if kept is None else kept[survivors]
        # stop when nothing survives (this also covers a graph with no edges)
        # or when nothing had to be removed in this round
        if survivors.numel() == 0 or bool(keep_mask.all()):
            return kept
        current = current.subgraph(survivors)

In [39]:
# User ids are used directly as 0-based node indices, so a graph that must be
# able to hold the largest id needs (max id + 1) node slots, not (max id).
# NOTE(review): this assumes no edge list references an id larger than the
# largest id in userprofile — confirm against the data.
num_nodes = int(userprofile["user_id"].max()) + 1

# convert data to appropriate form and generate a PyG graph
# (the first two columns of each frame are the edge endpoints; PyG expects
# edge_index with shape 2 x E, hence the transpose)
friend_edge_index = torch.from_numpy(friend_consolidated.iloc[:,[0,1]].to_numpy())
friend_net = Data(edge_index=friend_edge_index.t().contiguous())
friend_net.num_nodes = num_nodes

comment_edge_index = torch.from_numpy(comment_rela.iloc[:,[0,1]].to_numpy())
comment_net = Data(edge_index=comment_edge_index.t().contiguous())
comment_net.num_nodes = num_nodes

mention_edge_index = torch.from_numpy(mention_rela.iloc[:,[0,1]].to_numpy())
mention_net = Data(edge_index=mention_edge_index.t().contiguous())
mention_net.num_nodes = num_nodes

In [40]:
# filter out the low degree nodes
print("Filtering on comment_net:")
nodes_after_filter_on_comment_net = filter_low_degree_nodes(comment_net)
print("Nodes after filter on comment net:", nodes_after_filter_on_comment_net)
print("Filtering on mention_net:")
mention_sub_net = mention_net.subgraph(nodes_after_filter_on_comment_net)
nodes_after_filter_on_mention_net = nodes_after_filter_on_comment_net[filter_low_degree_nodes(mention_sub_net)]
print("Nodes after filter on mention net:", nodes_after_filter_on_mention_net)
print("Filtering on friend_net:")
friend_sub_net = friend_net.subgraph(nodes_after_filter_on_mention_net)
final_nodes = nodes_after_filter_on_mention_net[filter_low_degree_nodes(friend_sub_net)]
print("Final nodes:", final_nodes)

Filtering on comment_net:
Current Size: torch.Size([498705])
Current Size: torch.Size([162621])
Current Size: torch.Size([158370])
Current Size: torch.Size([158219])
Current Size: torch.Size([158214])
Current Size: torch.Size([158213])
Nodes after filter on comment net: tensor([       3,        4,       10,  ..., 12325382, 12325432, 12325695])
Filtering on mention_net:
Current Size: torch.Size([116365])
Current Size: torch.Size([70191])
Current Size: torch.Size([62368])
Current Size: torch.Size([61242])
Current Size: torch.Size([61036])
Current Size: torch.Size([60998])
Current Size: torch.Size([60987])
Current Size: torch.Size([60985])
Current Size: torch.Size([60984])
Nodes after filter on mention net: tensor([      10,       27,       31,  ..., 12275466, 12288211, 12309780])
Filtering on friend_net:
Current Size: torch.Size([56413])
Current Size: torch.Size([40258])
Current Size: torch.Size([38691])
Current Size: torch.Size([38470])
Current Size: torch.Size([38440])
Current Size: to

In [41]:
# get final networks
final_friend_net = friend_net.subgraph(final_nodes)
final_comment_net = comment_net.subgraph(final_nodes)
final_mention_net = mention_net.subgraph(final_nodes)

In [46]:
# checker
nodes, degs = torch.unique(final_mention_net.edge_index[0], return_counts=True)
unique_degs,counts = torch.unique(degs, return_counts=True)
print(unique_degs, counts)

tensor([    1,     2,     3,  ..., 18890, 19411, 24730]) tensor([ 56, 148, 312,  ...,   1,   1,   1])


# Testing

In [None]:
# Toy graph for experimenting with the PyG API: 3 nodes, 5 edges.
# Column i of edge_index holds the two endpoints of edge i.
edge_index = torch.tensor([[0, 1, 2, 1,0],
                           [1, 0, 1, 2,2]], dtype=torch.long)
# one scalar feature per node
x = torch.tensor([[-1], [0], [1]], dtype=torch.float)

data = Data(x=x, edge_index=edge_index)

In [None]:
# Per-node count of appearances in edge_index[0] for the toy graph.
deg = degree(data.edge_index[0], data.num_nodes)
print(deg[2])  # degree of node 2 (index 2 selects node 2, not node 0)

In [None]:
# Induced subgraph of the toy graph on nodes 2 and 0; printing the result
# shows how subgraph() relabels the kept nodes in its edge_index.
subs = torch.tensor([2,0])
sub = data.subgraph(subs)
print(sub)
print(sub.edge_index)

In [None]:
# Per-node count of appearances in edge_index[0] for the toy subgraph.
deg = degree(sub.edge_index[0], sub.num_nodes)
print(deg[1])  # degree of the subgraph's node 1 (relabelled index; which original node this is depends on subgraph()'s relabelling — TODO confirm)

In [None]:
# Indexing one tensor with another picks the listed positions: a[b] selects
# a[0] and a[2]. filter_low_degree_nodes uses this kind of lookup to turn
# subgraph indices back into the ids of the graph it was given.
a = torch.tensor([3,2,0])
b = torch.tensor([0,2])

print(a[b])