In [1]:
import networkx as nx
import pandas as pd
import torch
import random

**Edge_index**

In [2]:
friends = pd.read_csv("user_friend.csv", sep="\t")
friends.drop(columns="Unnamed: 0", inplace=True)
friends['friend_id'] = list(map(lambda row: list(map(lambda x: int(x), row[1:-1].split(', '))), friends['friend_id']))
relationship = friends.explode('friend_id', ignore_index=True)

In [3]:
user_id = set(relationship['id'].values)
relationship['friend_id'] = relationship['friend_id'].astype(int)
relationship = relationship[relationship.friend_id.isin(user_id)]
relationship = relationship[relationship.id < relationship.friend_id]
relationship = relationship.reset_index()

In [4]:
relationship.head()

Unnamed: 0,index,id,friend_id
0,11,2594736,4489237
1,16,2594736,4554789
2,18,2594736,4554798
3,20,2594736,4292656
4,30,2594736,3473483


**Re-index User**

In [5]:
user = pd.read_json("cleaned_user.json")

In [6]:
mapper = {}
for x, y in zip(user['id'].index.values, user['id'].values):
    mapper[y] = x

relationship['id'] = relationship['id'].apply(lambda x: mapper[x])
relationship['friend_id'] = relationship['friend_id'].apply(lambda x: mapper[x])

In [7]:
G = nx.from_pandas_edgelist(relationship, "id", "friend_id", create_using=nx.Graph())
bridges = set(nx.bridges(G))

In [8]:
d = set()
cnt = 0
drop_edges = []


for row in relationship.iterrows():
    a = row[1]['id']
    b = row[1]['friend_id']
    
    if a not in d and b not in d and (a, b) not in bridges and (b, a) not in bridges:
        drop_edges.append(row[0])
        cnt += 1
        d.add(a)
        d.add(b)

In [9]:
train_test_num_edges = 20000
test_num_edges = 10000

In [10]:
random.seed(671) # for reproducibility
random_edges_idx = random.sample(drop_edges, train_test_num_edges)
len(random_edges_idx)

20000

In [11]:
test_id = random.sample(range(train_test_num_edges), test_num_edges)
train_id = list(set(range(train_test_num_edges)) - set(test_id))

In [12]:
test_edge_id = [random_edges_idx[id] for id in test_id]
train_edge_id = [random_edges_idx[id] for id in train_id]

In [13]:
reserved_id = list(set(range(relationship.shape[0])) - set(random_edges_idx))

In [14]:
train_edges = torch.LongTensor(relationship.iloc[train_edge_id][['id', 'friend_id']].transpose().values)
test_edges = torch.LongTensor(relationship.iloc[test_edge_id][['id', 'friend_id']].transpose().values)
reserved_edges = torch.LongTensor(relationship.iloc[reserved_id][['id', 'friend_id']].transpose().values)

**Node Features**

In [15]:
user['elite'] = user['elite'].apply(lambda row: len(set(row.split(','))) if row else 0)
user.drop(columns=['Unnamed: 0', 'name', 'yelping_since', 'friend_id', 'id'], inplace=True)

In [16]:
user.columns

Index(['average_stars', 'compliment_cool', 'compliment_cute',
       'compliment_funny', 'compliment_hot', 'compliment_list',
       'compliment_more', 'compliment_note', 'compliment_photos',
       'compliment_plain', 'compliment_profile', 'compliment_writer', 'cool',
       'elite', 'fans', 'funny', 'review_count', 'useful'],
      dtype='object')

In [17]:
x = torch.Tensor(user.values)
x_scaled = torch.nn.functional.normalize(x)

In [18]:
x_scaled.shape, reserved_edges.shape

(torch.Size([67095, 18]), torch.Size([2, 1880802]))

**Graph**

In [19]:
import torch
from torch_geometric.data import Data

data = Data(x=x_scaled, edge_index=reserved_edges)

In [20]:
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
data = data.to(device=device)
train_edges = train_edges.to(device=device)
test_edges = test_edges.to(device=device)
reserved_edges = reserved_edges.to(device=device)

In [21]:
data.is_cuda

True

**Model**

In [74]:
from torch_geometric.nn import GCNConv, GraphSAGE
import torch.nn as nn


class MyModel(nn.Module):
    def __init__(self, in_dims, hidden_dims, num_layers, out_dims) -> None:
        super().__init__()
        self.gcn =  GraphSAGE(in_channels = in_dims, hidden_channels=hidden_dims, num_layers=num_layers, 
                        out_channels=out_dims, project=True, normalize=True)
        self.mlp = nn.Sequential(nn.Linear(out_dims, out_dims), nn.ReLU(), nn.Linear(out_dims, out_dims))
        self.init_weights(self.mlp)
    
    def init_weights(self, layers):
        for layer in layers:
            if isinstance(layer, nn.Linear):
                nn.init.xavier_normal_(layer.weight.data)
    
    def forward(self, data):
        # x = self.linear(data.x)
        embedding = self.mlp(self.gcn(x=data.x, edge_index=data.edge_index))
        return embedding

In [75]:
in_dims = data.x.shape[1]
model = MyModel(in_dims, 64, 2, 8).to(device)

**Go Training**

In [76]:
from tqdm import tqdm
from torch.optim import Adam
from torch_geometric.utils import negative_sampling

In [77]:
num_epochs = 2000
lr = 5e-4

loss_histroy = []
optimizer = Adam(model.parameters(), lr=lr)

criterion = nn.BCEWithLogitsLoss()

In [78]:
def run(model, data, pairs):
    embeddings = model(data)
    a = embeddings[pairs[0]]
    b = embeddings[pairs[1]]
    val = torch.sum(a * b, dim=1)
    return val

In [79]:
known_edges = torch.cat([reserved_edges, train_edges, test_edges], dim=1)
neg_test_pairs = negative_sampling(known_edges, data.x.shape[0], test_num_edges, force_undirected=True).to(device)
known_edges = torch.cat([known_edges, neg_test_pairs], dim=1)

test_pairs = torch.cat([test_edges, neg_test_pairs], dim=1)
test_marks = torch.cat([torch.ones(test_edges.shape[1]), torch.zeros(neg_test_pairs.shape[1])]).to(device)

In [80]:
for x in tqdm(range(num_epochs)):
    
    model.train()
    loss = 0
    
    neg_pairs = negative_sampling(known_edges, data.x.shape[0], train_test_num_edges - test_num_edges, force_undirected=True).to(device)
    pairs = torch.cat([train_edges, neg_pairs], dim=1)
    marks = torch.cat([torch.ones(train_edges.shape[1]), torch.zeros(neg_pairs.shape[1])]).to(device)
    
    optimizer.zero_grad()
    
    val = run(model, data, pairs)
    loss = criterion(val, marks).mean()
    
    loss.backward()
    optimizer.step()
    loss_histroy.append(loss)
    
    if (x + 1) % 100 == 0:
        model.eval()
        with torch.no_grad():
            val = torch.sigmoid(run(model, data, pairs))
            torch.round_(val)
            acc = (val == marks).sum() / val.shape[0]
            print("Train:", acc)
            val = torch.sigmoid(run(model, data, test_pairs))
            torch.round_(val)
            acc = (val == test_marks).sum() / val.shape[0]
            print("Test:", acc)

  5%|▌         | 102/2000 [00:05<01:42, 18.57it/s]

Train: tensor(0.6030, device='cuda:0')
Test: tensor(0.6000, device='cuda:0')


 10%|█         | 203/2000 [00:10<01:36, 18.64it/s]

Train: tensor(0.6332, device='cuda:0')
Test: tensor(0.6253, device='cuda:0')


 15%|█▌        | 304/2000 [00:15<01:26, 19.71it/s]

Train: tensor(0.6436, device='cuda:0')
Test: tensor(0.6418, device='cuda:0')


 20%|██        | 402/2000 [00:20<01:19, 20.01it/s]

Train: tensor(0.6482, device='cuda:0')
Test: tensor(0.6474, device='cuda:0')


 25%|██▌       | 503/2000 [00:25<01:14, 19.97it/s]

Train: tensor(0.6549, device='cuda:0')
Test: tensor(0.6541, device='cuda:0')


 30%|███       | 602/2000 [00:30<01:09, 20.21it/s]

Train: tensor(0.6654, device='cuda:0')
Test: tensor(0.6534, device='cuda:0')


 35%|███▌      | 704/2000 [00:35<01:04, 20.21it/s]

Train: tensor(0.6696, device='cuda:0')
Test: tensor(0.6560, device='cuda:0')


 40%|████      | 803/2000 [00:40<00:59, 20.24it/s]

Train: tensor(0.6628, device='cuda:0')
Test: tensor(0.6607, device='cuda:0')


 45%|████▌     | 902/2000 [00:45<00:54, 20.06it/s]

Train: tensor(0.6672, device='cuda:0')
Test: tensor(0.6668, device='cuda:0')


 50%|█████     | 1004/2000 [00:50<00:49, 20.28it/s]

Train: tensor(0.6847, device='cuda:0')
Test: tensor(0.6683, device='cuda:0')


 55%|█████▌    | 1103/2000 [00:55<00:44, 20.12it/s]

Train: tensor(0.6804, device='cuda:0')
Test: tensor(0.6716, device='cuda:0')


 60%|██████    | 1202/2000 [01:00<00:39, 20.08it/s]

Train: tensor(0.6844, device='cuda:0')
Test: tensor(0.6716, device='cuda:0')


 65%|██████▌   | 1304/2000 [01:05<00:34, 20.05it/s]

Train: tensor(0.6814, device='cuda:0')
Test: tensor(0.6743, device='cuda:0')


 70%|███████   | 1403/2000 [01:09<00:29, 20.29it/s]

Train: tensor(0.6876, device='cuda:0')
Test: tensor(0.6748, device='cuda:0')


 75%|███████▌  | 1502/2000 [01:14<00:24, 19.98it/s]

Train: tensor(0.6896, device='cuda:0')
Test: tensor(0.6759, device='cuda:0')


 80%|████████  | 1604/2000 [01:19<00:19, 20.28it/s]

Train: tensor(0.6878, device='cuda:0')
Test: tensor(0.6754, device='cuda:0')


 85%|████████▌ | 1703/2000 [01:24<00:14, 20.17it/s]

Train: tensor(0.6967, device='cuda:0')
Test: tensor(0.6774, device='cuda:0')


 90%|█████████ | 1802/2000 [01:29<00:09, 19.93it/s]

Train: tensor(0.6939, device='cuda:0')
Test: tensor(0.6775, device='cuda:0')


 95%|█████████▌| 1904/2000 [01:34<00:04, 20.29it/s]

Train: tensor(0.6890, device='cuda:0')
Test: tensor(0.6769, device='cuda:0')


100%|██████████| 2000/2000 [01:39<00:00, 20.18it/s]

Train: tensor(0.6921, device='cuda:0')
Test: tensor(0.6727, device='cuda:0')



