In [1]:
import pandas as pd

# Loading consolidated data

In [2]:
# Full user profile table; its largest user_id is used further down to size the graphs.
userprofile = pd.read_csv("WeightLoss/userprofile.csv")
# Only the id column of the consolidated valid-user table is needed.
valid_users = pd.read_csv(
    "ProcessedData/valid_user_info_consolidated.csv",
    usecols=["user_id"],
)

# Loading networks

In [3]:
# Friendship edge list; its first two columns are read as edge endpoints when the graph is built.
friend_consolidated = pd.read_csv(
    "WeightLoss/network/friend_consolidated.csv"
)

In [4]:
# Comment edge list; its first two columns are read as edge endpoints when the graph is built.
comment_rela = pd.read_csv(
    "WeightLoss/network/comment_rela.csv"
)

In [5]:
# Mention edge list. Rows with a missing 'who-mention' value are dropped first,
# which is what allows that column to be cast to int64 afterwards.
mention_rela = (
    pd.read_csv("WeightLoss/network/mention_rela.csv")
    .dropna(subset=["who-mention"])
    .astype({"who-mention": "int64"})
)

# Working with PyTorch

In [6]:
import torch
from torch_geometric.data import Data

In [7]:
def filter_low_degree_nodes(graph, d=5):
    """Repeatedly prune nodes of degree below ``d`` until the node set is stable.

    Removing a node lowers the degree of its neighbours, so a single pass is
    not enough; pruning is repeated on the induced subgraph until every
    remaining node has degree >= ``d`` or nothing is left.

    The loop is iterative rather than recursive, so only the current subgraph
    is referenced at any time and the threshold ``d`` applies to every pass.

    Parameters
    ----------
    graph : object exposing ``edge_index`` (2 x E integer tensor) and
        ``subgraph(subset)`` that returns the induced subgraph with nodes
        relabelled to their position in ``subset``.
        Degree is counted from ``edge_index[0]``, so this assumes both
        directions of every edge are stored -- TODO confirm for new callers.
    d : int, default 5
        Minimum degree a node must keep to survive.

    Returns
    -------
    torch.Tensor
        1-D tensor of node indices, expressed in the labelling of ``graph``
        as passed in, that survive the pruning. Empty if no node survives.
    """
    # kept[i] is the index, in the graph passed in, of node i of `current`
    kept = None
    current = graph
    while True:
        # nodes appearing as an edge source, and how many edges each one starts
        included_nodes, degrees = torch.unique(current.edge_index[0], return_counts=True)
        print("Current Size:", included_nodes.shape)
        survivors = torch.masked_select(included_nodes, torch.ge(degrees, d))
        # translate the survivors back into the caller's labelling
        kept_now = survivors if kept is None else kept[survivors]
        # stop when nothing is left, or when this pass removed nobody
        if survivors.numel() == 0 or survivors.numel() == included_nodes.numel():
            return kept_now
        kept = kept_now
        current = current.subgraph(survivors)

In [8]:
def to_undirected(edge_index):
    """Mirror every edge so that each (u, v) is accompanied by (v, u).

    ``edge_index`` unpacks into a source row and a target row. The result is a
    2 x 2E tensor holding the original edges followed by their reversals.
    Duplicates are not removed here.
    """
    src, dst = edge_index
    forward = torch.stack([src, dst], dim=0)
    backward = torch.stack([dst, src], dim=0)
    return torch.cat([forward, backward], dim=1)

# Node indices are the raw user ids, and a graph with N nodes addresses them as
# 0 .. N-1, so the node count has to be one more than the largest id.
num_nodes = int(userprofile["user_id"].max()) + 1


def _undirected_graph(edge_pairs, num_nodes):
    """Build an undirected PyG graph from an (E x 2) tensor of node-id pairs.

    The pairs are transposed to the 2 x E layout, mirrored with
    ``to_undirected`` and coalesced. ``num_nodes`` is set explicitly so that
    ids with no edges still count as nodes.
    """
    net = Data(edge_index=to_undirected(edge_pairs.t().contiguous()))
    # sort / de-duplicate the mirrored edges (called for its in-place effect)
    net.coalesce()
    net.num_nodes = num_nodes
    return net


# convert data to appropriate form and generate an undirected PyG graph
# (the first two columns of each table are taken as the edge endpoints)
friend_edge_index = torch.from_numpy(friend_consolidated.iloc[:, [0, 1]].to_numpy())
friend_net = _undirected_graph(friend_edge_index, num_nodes)

comment_edge_index = torch.from_numpy(comment_rela.iloc[:, [0, 1]].to_numpy())
comment_net = _undirected_graph(comment_edge_index, num_nodes)

mention_edge_index = torch.from_numpy(mention_rela.iloc[:, [0, 1]].to_numpy())
mention_net = _undirected_graph(mention_edge_index, num_nodes)



In [9]:
# filter out the low degree nodes
def checker(nodes=None, min_degree=5):
    """Check that every node keeps at least ``min_degree`` edges in all three networks.

    Each of ``friend_net``, ``comment_net`` and ``mention_net`` is restricted
    to ``nodes`` and the degree of every node in that induced subgraph is
    computed.

    Parameters
    ----------
    nodes : torch.Tensor, optional
        1-D tensor of node indices to test. Defaults to the module-level
        ``cur_nodes`` at call time.
    min_degree : int, default 5
        Minimum degree required in each network.

    Returns
    -------
    bool
        True (after printing "Done!") if all nodes pass in all networks,
        otherwise False (after printing "Failed...").
    """
    if nodes is None:
        nodes = cur_nodes

    for net in (friend_net, comment_net, mention_net):
        test_net = net.subgraph(nodes)
        # bincount with minlength yields one entry per node, so nodes with no
        # edges at all show up with degree 0 instead of being skipped; it also
        # copes with a subgraph that has no edges.
        degs = torch.bincount(test_net.edge_index[0], minlength=nodes.numel())
        if bool((degs < min_degree).any()):
            print("Failed...")
            return False
    #
    print("Done!")
    return True
    
# Alternate the degree filter over the three networks until one node set
# satisfies the degree requirement in all of them. Filtering on one network
# can push nodes below the threshold in the others, hence the repetition.
i = 0
hard_loop_cap = 10  # upper bound on full comment -> mention -> friend rounds
cur_nodes = torch.from_numpy(valid_users.to_numpy().ravel())
while(i < hard_loop_cap):
    # filter_low_degree_nodes returns positions within the subgraph it was
    # given, so each result is indexed back into the node tensor that
    # produced that subgraph to recover the real node ids.
    print("Filtering on comment_net:")
    comment_sub_net = comment_net.subgraph(cur_nodes)
    nodes_after_filter_on_comment_net = cur_nodes[filter_low_degree_nodes(comment_sub_net)]
    print("Nodes after filter on comment net:", nodes_after_filter_on_comment_net)
    print("Filtering on mention_net:")
    mention_sub_net = mention_net.subgraph(nodes_after_filter_on_comment_net)
    nodes_after_filter_on_mention_net = nodes_after_filter_on_comment_net[filter_low_degree_nodes(mention_sub_net)]
    print("Nodes after filter on mention net:", nodes_after_filter_on_mention_net)
    print("Filtering on friend_net:")
    friend_sub_net = friend_net.subgraph(nodes_after_filter_on_mention_net)
    cur_nodes = nodes_after_filter_on_mention_net[filter_low_degree_nodes(friend_sub_net)]
    print("Current nodes:", cur_nodes)
    print("Checking...")
    if checker(): break
    i += 1
else:
    # reached only when the cap was hit without checker() ever succeeding
    print("WARNING: no stable node set after", hard_loop_cap,
          "rounds; final_nodes has NOT passed the degree check")
#
final_nodes = cur_nodes

Filtering on comment_net:
Current Size: torch.Size([3665811])
Current Size: torch.Size([153805])
Current Size: torch.Size([149047])
Current Size: torch.Size([148738])
Current Size: torch.Size([148709])
Current Size: torch.Size([148706])
Nodes after filter on comment net: tensor([       3,        4,       10,  ..., 12309780, 12318685, 12319222])
Filtering on mention_net:
Current Size: torch.Size([142627])
Current Size: torch.Size([85601])
Current Size: torch.Size([81874])
Current Size: torch.Size([81307])
Current Size: torch.Size([81198])
Current Size: torch.Size([81174])
Current Size: torch.Size([81165])
Current Size: torch.Size([81163])
Nodes after filter on mention net: tensor([       3,        4,       10,  ..., 12296696, 12298914, 12306610])
Filtering on friend_net:
Current Size: torch.Size([80053])
Current Size: torch.Size([79572])
Current nodes: tensor([       3,        4,       10,  ..., 11949862, 11950136, 11950314])
Checking...
Failed...
Filtering on comment_net:
Current Size:

In [10]:
# Restrict each of the three networks to the final node set.
final_friend_net, final_comment_net, final_mention_net = (
    net.subgraph(final_nodes)
    for net in (friend_net, comment_net, mention_net)
)

# Saving Data

In [11]:
directory = "ProcessedData"
# Persist the surviving node ids as a single 'user_id' column; the row
# position is written alongside as an index column named "Indices".
final_nodes_path = f"{directory}/included_users_(sorted).csv"
final_nodes_DF = pd.DataFrame({"user_id": final_nodes.numpy()}).rename_axis("Indices")
final_nodes_DF.to_csv(final_nodes_path)

In [12]:
# Write the filtered friend network as a two-column edge list (no index column).
friend_edge_list_path = f"{directory}/friend_edges.csv"
friend_edge_list_DF = pd.DataFrame(
    final_friend_net.edge_index.t().numpy(),
    columns=["user_id", "follower_id"],
)
friend_edge_list_DF.to_csv(friend_edge_list_path, index=False)

In [13]:
# Write the filtered comment network as a two-column edge list (no index column).
comment_edge_list_path = f"{directory}/comment_edges.csv"
comment_edge_list_DF = pd.DataFrame(
    final_comment_net.edge_index.t().numpy(),
    columns=["who-comment", "comment-who"],
)
comment_edge_list_DF.to_csv(comment_edge_list_path, index=False)

In [14]:
# Write the filtered mention network as a two-column edge list (no index column).
mention_edge_list_path = f"{directory}/mention_edges.csv"
mention_edge_list_DF = pd.DataFrame(
    final_mention_net.edge_index.t().numpy(),
    columns=["who-mention", "mention-who"],
)
mention_edge_list_DF.to_csv(mention_edge_list_path, index=False)

# Testing

In [15]:
# Degree distribution of the final mention network: first count how often each
# node appears as an edge source, then count how many nodes share each count.
_, degs = final_mention_net.edge_index[0].unique(return_counts=True)
unique_degs, counts = degs.unique(return_counts=True)
print(unique_degs, counts)

tensor([    5,     6,     7,     8,     9,    10,    11,    12,    13,    14,
           15,    16,    17,    18,    19,    20,    21,    22,    23,    24,
           25,    26,    27,    28,    29,    30,    31,    32,    33,    34,
           35,    36,    37,    38,    39,    40,    41,    42,    43,    44,
           45,    46,    47,    48,    49,    50,    51,    52,    53,    54,
           55,    56,    57,    58,    59,    60,    61,    62,    63,    64,
           65,    66,    67,    68,    69,    70,    71,    72,    73,    74,
           75,    76,    77,    78,    79,    80,    81,    82,    83,    84,
           85,    86,    87,    88,    89,    90,    91,    92,    93,    94,
           95,    96,    97,    98,    99,   100,   101,   102,   103,   104,
          105,   106,   107,   108,   109,   110,   111,   112,   113,   114,
          115,   116,   117,   118,   119,   120,   121,   122,   123,   124,
          125,   126,   127,   128,   129,   130,   131,   132, 