In [None]:
!pip install pyg-lib torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-1.13.0+cu116.html
!pip install torch-geometric

In [None]:
import torch
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'


In [None]:
import pandas as pd
import networkx as nx
import torch
from torch_geometric.utils import negative_sampling

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Base directory on the mounted Drive; file names are appended to it when loading data.
pre_path='/content/drive/MyDrive/F22/SI671/'

In [None]:
# Load the tab-separated user/friend table from the Drive folder.
friends = pd.read_csv(pre_path+"user_friend.csv", sep="\t")

In [None]:
# Remove the "Unnamed: 0" column (presumably a row index saved into the CSV -- verify).
# Rebinding instead of inplace=True, together with errors="ignore", keeps this cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
friends = friends.drop(columns="Unnamed: 0", errors="ignore")

In [None]:
# The parsing assumes each raw cell is a string such as "[1, 2, 3]": the first and last
# characters are stripped, the rest is split on ", " and every token is converted to int.
friends['friend_id'] = [
    [int(token) for token in raw[1:-1].split(', ')]
    for raw in friends['friend_id']
]

In [None]:
# Expand each friend list into separate rows, giving one (id, friend_id) pair per row.
relationship = friends.explode('friend_id', ignore_index=True)

In [None]:
# Ids that appear as a source user; friends outside this set are discarded below.
user_id = set(relationship['id'].values)

relationship = (
    relationship
    # explode leaves friend_id as an object column; cast it to int.
    .assign(friend_id=lambda frame: frame['friend_id'].astype(int))
    # keep only friendships whose other endpoint is itself a known user
    .loc[lambda frame: frame['friend_id'].isin(user_id)]
    # keep a single orientation (id < friend_id) of every pair
    .loc[lambda frame: frame['id'] < frame['friend_id']]
    # renumber rows 0..n-1; the previous index is retained as an 'index' column
    .reset_index()
)

In [None]:
# Preview the first rows of the filtered edge table.
relationship.head()

In [None]:
# Number of rows in relationship, i.e. one per retained (id, friend_id) pair.
edge_num=len(relationship)

In [None]:
# Full undirected friendship graph built from the id / friend_id columns.
G = nx.from_pandas_edgelist(relationship, "id", "friend_id", create_using=nx.Graph())

In [None]:
# Edge count of the full graph.
G.number_of_edges()

In [None]:
# Node count of the full graph.
G.number_of_nodes()

In [None]:
# Bridges of G: edges whose removal would increase the number of connected components.
bridges = list(nx.bridges(G))

In [None]:
# How many bridge edges the full graph has.
len(bridges)

In [None]:
# Set form, used for the (a, b) / (b, a) membership tests when picking removable edges.
bridges = set(bridges)

In [None]:
# Accumulator for the row labels of edges that are eligible for removal.
drop_edges = []



In [None]:
# Greedily collect removable edges: an edge is chosen only if it is not a bridge of G
# and neither endpoint already belongs to a previously chosen edge, so every node
# takes part in at most one chosen edge.
# The accumulators are reset here so that re-running this cell never appends duplicates.
drop_edges = []
d = set()  # nodes already used by a chosen edge

# Iterate the columns directly; iterrows() would build a Series object per row.
for row_label, a, b in zip(relationship.index, relationship['id'], relationship['friend_id']):
    if a in d or b in d:
        continue
    # The bridge set may hold the edge in either orientation, so test both.
    if (a, b) in bridges or (b, a) in bridges:
        continue
    drop_edges.append(row_label)
    d.update((a, b))

# Kept for compatibility with the original cell: number of chosen edges.
cnt = len(drop_edges)

In [None]:
# Number of edges eligible for removal.
len(drop_edges)

In [None]:
import random

# Draw the held-out edges (by row label) from the removable candidates.
random.seed(671) # for reproducibility
# NOTE(review): random.sample raises ValueError if drop_edges has fewer than 5000 entries.
random_edges_idx = random.sample(drop_edges, 5000)
len(random_edges_idx)

In [None]:
# relationship.drop(columns='index', inplace=True)

In [None]:
# Row positions that remain in the training data: every row not sampled for removal.
# sorted() gives a deterministic, ascending row order instead of relying on the
# iteration order of a set, which is an implementation detail.
reserve_id = sorted(set(range(0, edge_num)) - set(random_edges_idx))
print(len(reserve_id))

In [None]:
# Training edges: the rows of relationship at the positions listed in reserve_id.
reserved_df = relationship.iloc[reserve_id]
reserved_df

In [None]:
# Training graph: the full graph minus the sampled held-out edges.
G_train = nx.from_pandas_edgelist(reserved_df, "id", "friend_id", create_using=nx.Graph())

In [None]:
# Preview the training edge table.
reserved_df.head()

In [None]:
# Preview the full edge table for comparison with the training subset.
relationship.head()

In [None]:
# Edge lists as integer tensors with two rows: row 0 holds id, row 1 holds friend_id.
edge_columns = ['id', 'friend_id']

# Edges kept for training.
reserved_edges = torch.LongTensor(reserved_df[edge_columns].to_numpy().T)

# Edges sampled for removal.
held_out_rows = relationship.iloc[random_edges_idx]
droped_edges = torch.LongTensor(held_out_rows[edge_columns].to_numpy().T)



In [None]:
# All known friendships (training + held-out) concatenated column-wise.
known_edges = torch.cat([reserved_edges, droped_edges], dim=1)
# Sample node pairs to use as negatives, passing known_edges as the existing edges.
# NOTE(review): with force_undirected=True the result may contain both (i, j) and (j, i),
# so the number of distinct pairs can be below 5000 -- confirm against the PyG docs.
neg_test_pairs = negative_sampling(known_edges, num_neg_samples=5000, force_undirected=True)
# known_edges = torch.cat([known_edges, neg_test_pairs], dim=1)

In [None]:
# Shape of the sampled negative-pair tensor.
neg_test_pairs.size()

In [None]:
# Turn the known_edges tensor back into an edge table and build a graph from it.
# NOTE(review): known_edges holds only training + held-out edges (the line that would
# append neg_test_pairs is commented out), so G_neg contains no sampled negative pairs.
known_edges_df = pd.DataFrame(known_edges.numpy()).transpose()
known_edges_df.columns=["id", "friend_id"]
G_neg=nx.from_pandas_edgelist(known_edges_df, "id", "friend_id", create_using=nx.Graph())

In [None]:
# Inspect the scalar type of an id value taken from the first row.
type(relationship.iloc[0]['id'])

In [None]:
# Sanity check on the three graphs (full, training, known-edge):
# print the type of each edge view first, then each edge count.
graphs = (G, G_train, G_neg)
for graph in graphs:
    print(type(graph.edges))
for graph in graphs:
    print(len(graph.edges))
# G.edges - G_train.edges | G_neg.edges

In [None]:
# Candidate pairs to score: edges missing from the training graph, plus every edge of G_neg.
# The parentheses spell out how `-` and `|` were already being evaluated.
# Each pair is stored as (smaller, larger) because the union of two edge views can hold
# both (a, b) and (b, a) for one undirected edge; that would score the edge twice and
# yield orientations that do not match the id < friend_id convention of the edge tables.
# NOTE(review): G_neg is built from known_edges, so this pool also contains the training
# edges themselves -- confirm that is intended for a link-prediction evaluation.
candidate_pairs = {
    (u, v) if u <= v else (v, u)
    for u, v in (G.edges - G_train.edges) | G_neg.edges
}

# Adamic-Adar scores are computed on the training graph only.
adamic_adar_df = pd.DataFrame(
    nx.adamic_adar_index(G_train, ebunch=candidate_pairs),
    columns=["id", "friend_id", 'Adamic_adar'],
)

In [None]:
# Predicted links: the 5000 candidate pairs with the highest Adamic-Adar score.
ranked_df = adamic_adar_df.sort_values(by='Adamic_adar', ascending=False)
predicted_df = ranked_df.head(5000)

In [None]:
# Precision of the top-ranked predictions, measured against the held-out edges.
keys = ["id", "friend_id"]


def _canonical_pair_index(frame):
    """Return a MultiIndex of (smaller, larger) node ids for each row of ``frame``.

    Ordering the endpoints makes (a, b) and (b, a) compare equal, which is required
    because the graph is undirected and predictions may come in either orientation.
    """
    endpoints = frame[keys]
    return pd.MultiIndex.from_arrays(
        [endpoints.min(axis=1), endpoints.max(axis=1)], names=keys
    )


# Ground truth is the set of removed edges. reserved_df is what G_train was built from,
# so matching against it would only reward re-predicting the training data.
held_out_df = relationship.iloc[random_edges_idx]

i1 = _canonical_pair_index(predicted_df)
i2 = _canonical_pair_index(held_out_df)
correct_df = predicted_df[i1.isin(i2)]

# NOTE(review): predicted_df is ranked from whatever candidate pool adamic_adar_df was
# built from; training edges left in that pool take top-k slots and lower this number.
# An empty prediction table reports 0.0 instead of raising ZeroDivisionError.
print(len(correct_df) / len(predicted_df) if len(predicted_df) else 0.0)

In [None]:
# Number of candidate pairs that were scored.
len(adamic_adar_df)

In [None]:
# Number of pairs kept as predictions.
len(predicted_df)

In [None]:
# Number of training edges.
len(reserved_df)