In [1]:
import os
import time

import pandas as pd

# Loading consolidated data

In [2]:
# Load the user profile table; its "user_id" column is read further down
# to determine how many nodes the graphs need.
userprofile = pd.read_csv('WeightLoss/userprofile.csv')

In [None]:
#can_record_consolidated = pd.read_csv('WeightLoss/can_record_consolidated.csv')

In [None]:
#comment_consolidated = pd.read_csv('WeightLoss/comment_consolidated.csv')

In [None]:
#mention_consolidated = pd.read_csv('WeightLoss/mention_consolidated.csv')

In [None]:
#post_consolidated = pd.read_csv('WeightLoss/post_consolidated.csv')

In [15]:
#users_consolidated = pd.read_csv('WeightLoss/users_consolidated.csv')

In [None]:
#weight_record_consolidated = pd.read_csv('WeightLoss/weight_record_consolidated.csv')

In [33]:
# Largest user id present in the profile table; the graph node count
# is derived from this same value later in the notebook.
print(userprofile["user_id"].max())

12338632


# Loading networks

In [3]:
# Friend network edge list. Only the first two columns are used later,
# as the two endpoints of each edge.
friend_consolidated = pd.read_csv('WeightLoss/network/friend_consolidated.csv')

In [4]:
# Comment network edge list. Only the first two columns are used later,
# as the two endpoints of each edge.
comment_rela = pd.read_csv('WeightLoss/network/comment_rela.csv')

In [5]:
# Read the mention edge list, discard rows whose 'who-mention' value is
# missing, then store that column as int64 (it cannot be an integer column
# while it still contains NaN).
mention_rela = (
    pd.read_csv('WeightLoss/network/mention_rela.csv')
    .dropna(subset=['who-mention'])
    .astype({'who-mention': 'int64'})
)

# Working with PyTorch

In [6]:
import torch
from torch_geometric.data import Data
from torch_geometric.utils import degree

In [7]:
# Iteratively filters low degree nodes out of a graph.
# Returns the nodes that still have degree at least d once no further node
# can be removed, expressed as indices into the graph that was passed in.
# NOTE(review): the earlier recursive version reportedly ran out of memory on
# friend_net. This loop keeps only one intermediate subgraph alive at a time,
# which may help, but that has not been verified on friend_net.
def filter_low_degree_nodes(graph, d=5):
    """Repeatedly drop nodes whose degree is below ``d`` until none remain.

    "Degree" here is the number of times a node appears in row 0 of
    ``graph.edge_index``. Removing a node can push its neighbours below the
    threshold, so the filter is re-applied to each induced subgraph until
    every remaining node qualifies (or nothing is left).

    Args:
        graph: object exposing ``edge_index`` (a 2 x E integer tensor) and
            ``subgraph(node_indices)``.
            # assumes subgraph() relabels the kept nodes to 0..k-1 in the
            # order given, as PyG's Data.subgraph does - TODO confirm
        d: minimum degree a node must keep to be included.

    Returns:
        1-D tensor of node indices (in the numbering of ``graph``) that all
        have degree >= d within the subgraph they induce. Empty when no node
        qualifies or the graph has no edges.
    """
    kept = None  # survivors so far, always expressed in the numbering of `graph`
    current = graph
    while True:
        # nodes that have at least one edge in the current graph, with their degrees
        nodes, degrees = torch.unique(current.edge_index[0], return_counts=True)
        print("Current Size:", nodes.shape)
        # True for degrees >= d and False otherwise
        keep_mask = torch.ge(degrees, d)
        survivors = nodes[keep_mask]
        # `survivors` indexes the current graph; translate back to the original numbering
        kept = survivors if kept is None else kept[survivors]
        # Done when nothing was removed this round, or when nothing is left at all.
        if survivors.numel() == 0 or bool(keep_mask.all()):
            return kept
        # Otherwise filter again on the subgraph induced by the survivors.
        current = current.subgraph(survivors)

In [8]:
# User ids are used directly as 0-based node indices, so a graph that can hold
# the largest id needs max_id + 1 node slots (using max_id alone would leave
# the highest-id user out of range).
# NOTE(review): assumes no edge endpoint exceeds the largest profile user_id - confirm.
num_nodes = int(userprofile["user_id"].max()) + 1


def _build_net(edge_list, node_count):
    """Wrap an (E, 2) edge-list tensor in a PyG Data object with a fixed node count."""
    net = Data(edge_index=edge_list.t().contiguous())
    net.num_nodes = node_count
    return net


# convert data to appropriate form and generate a PyG graph;
# only the first two columns of each edge table are used (the edge endpoints)
friend_edge_index = torch.from_numpy(friend_consolidated.iloc[:, [0, 1]].to_numpy())
friend_net = _build_net(friend_edge_index, num_nodes)

comment_edge_index = torch.from_numpy(comment_rela.iloc[:, [0, 1]].to_numpy())
comment_net = _build_net(comment_edge_index, num_nodes)

mention_edge_index = torch.from_numpy(mention_rela.iloc[:, [0, 1]].to_numpy())
mention_net = _build_net(mention_edge_index, num_nodes)

In [9]:
# filter out the low degree nodes
def checker(nodes=None, min_degree=5):
    """Check that every candidate node has enough edges in all three networks.

    Builds the friend, comment and mention subgraphs induced by ``nodes`` and
    verifies that each node appears at least ``min_degree`` times in row 0 of
    every subgraph's ``edge_index``. Nodes with no edges at all in a subgraph
    count as degree 0 and therefore fail the check.

    Args:
        nodes: 1-D tensor of node indices to test; defaults to the notebook's
            current ``cur_nodes``.
        min_degree: required minimum degree per node in each network.

    Returns:
        True (after printing "Done!") when all three subgraphs pass, otherwise
        False (after printing "Failed...").
    """
    if nodes is None:
        nodes = cur_nodes
    node_count = int(nodes.numel())
    test_nets = [friend_net.subgraph(nodes), comment_net.subgraph(nodes), mention_net.subgraph(nodes)]

    for test_net in test_nets:
        # bincount with minlength gives one entry per candidate node, including
        # isolated ones that never show up in edge_index.
        # assumes subgraph() relabels nodes to 0..node_count-1 - TODO confirm
        degs = torch.bincount(test_net.edge_index[0], minlength=node_count)
        if bool((degs < min_degree).any()):
            print("Failed...")
            return False
    #
    print("Done!")
    return True
    
# Filtering one network can push nodes below the degree threshold in the
# others, so the three filters are applied in sequence and the whole pass is
# repeated until checker() accepts the node set (or the pass limit is hit).
i = 0
hard_loop_cap = 10  # upper bound on full passes, guards against a runaway loop
cur_nodes = torch.arange(num_nodes)
while i < hard_loop_cap:
    print("Filtering on comment_net:")
    comment_sub_net = comment_net.subgraph(cur_nodes)
    # filter_low_degree_nodes returns positions within the subgraph; index back into the ids
    nodes_after_filter_on_comment_net = cur_nodes[filter_low_degree_nodes(comment_sub_net)]
    print("Nodes after filter on comment net:", nodes_after_filter_on_comment_net)
    print("Filtering on mention_net:")
    mention_sub_net = mention_net.subgraph(nodes_after_filter_on_comment_net)
    nodes_after_filter_on_mention_net = nodes_after_filter_on_comment_net[filter_low_degree_nodes(mention_sub_net)]
    print("Nodes after filter on mention net:", nodes_after_filter_on_mention_net)
    print("Filtering on friend_net:")
    friend_sub_net = friend_net.subgraph(nodes_after_filter_on_mention_net)
    cur_nodes = nodes_after_filter_on_mention_net[filter_low_degree_nodes(friend_sub_net)]
    print("Current nodes:", cur_nodes)
    print("Checking...")
    if checker():
        break
    i += 1
else:
    # Only reached when the loop ran out of passes without checker() succeeding.
    print("Warning: stopped after", hard_loop_cap, "passes without passing the degree check; final_nodes may still contain low degree nodes")
final_nodes = cur_nodes

Filtering on comment_net:
Current Size: torch.Size([498705])
Current Size: torch.Size([162621])
Current Size: torch.Size([158370])
Current Size: torch.Size([158219])
Current Size: torch.Size([158214])
Current Size: torch.Size([158213])
Nodes after filter on comment net: tensor([       3,        4,       10,  ..., 12325382, 12325432, 12325695])
Filtering on mention_net:
Current Size: torch.Size([116365])
Current Size: torch.Size([70191])
Current Size: torch.Size([62368])
Current Size: torch.Size([61242])
Current Size: torch.Size([61036])
Current Size: torch.Size([60998])
Current Size: torch.Size([60987])
Current Size: torch.Size([60985])
Current Size: torch.Size([60984])
Nodes after filter on mention net: tensor([      10,       27,       31,  ..., 12275466, 12288211, 12309780])
Filtering on friend_net:
Current Size: torch.Size([56413])
Current Size: torch.Size([40258])
Current Size: torch.Size([38691])
Current Size: torch.Size([38470])
Current Size: torch.Size([38440])
Current Size: to

In [10]:
# get final networks
# Induce each of the three networks on the surviving node set, in the same
# friend / comment / mention order as before.
final_friend_net, final_comment_net, final_mention_net = [
    full_net.subgraph(final_nodes)
    for full_net in (friend_net, comment_net, mention_net)
]

In [19]:
directory = "ProcessedData"
# make sure the output directory exists; to_csv fails if it does not
os.makedirs(directory, exist_ok=True)

# write final nodes to file (one 'user_id' column, no index column)
final_nodes_path = directory + "/included_users_(sorted)"
final_nodes_DF = pd.DataFrame(final_nodes.numpy(),columns=['user_id'])
final_nodes_DF.to_csv(final_nodes_path, index = False)

# read the file back as a quick check that it was written as expected
print(pd.read_csv(final_nodes_path))

# TODO: write filtered network edgelists to files (not implemented yet)

# Testing

In [None]:
# network degrees checker
# network degrees checker: shows each distinct degree in the final mention
# network together with how many nodes have it
# (nodes with no edges do not appear in edge_index and are not counted here)
degs = torch.unique(final_mention_net.edge_index[0], return_counts=True)[1]
degree_histogram = torch.unique(degs, return_counts=True)
unique_degs, counts = degree_histogram
print(unique_degs, counts)