In [122]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

In [123]:
import wandb, os
sampling_size = 10000
batch_size = 16384
os.environ['WANDB_WATCH'] = 'all'
wandb.login(key = "d737480c1d812ca4c1c791b14b0888051f72b45a")
wandb.init(name="testing_from-jongmo",
           project="GCN-recommendaiton",
           tags=['for_testing_in_A100_80gb'],
           group=f"Sample_size:{sampling_size}"
           )



0,1
train_loss,█▅▄▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
train_loss,70.13124


In [124]:
class InteractionDataset(Dataset):
    def __init__(self, positive_users, positive_items, negative_items):
        assert len(positive_users) == len(positive_items) == len(negative_items)
        self.positive_users = positive_users
        self.positive_items = positive_items
        self.negative_items = negative_items

    def __len__(self):
        return len(self.positive_users)

    def __getitem__(self, idx):
        return self.positive_users[idx], self.positive_items[idx], self.negative_items[idx]


class GraphConvolutionLayer(nn.Module):
    def __init__(self, in_features, out_features):
        super(GraphConvolutionLayer, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = nn.Parameter(torch.FloatTensor(in_features, out_features))
        nn.init.xavier_uniform_(self.weight)

    def forward(self, input, adj):
        support = torch.mm(input, self.weight)
        output = torch.mm(adj, support)
        return output
class GCN(nn.Module):
    def __init__(self, input_dim, hidden_dim, embedding_dim):
        super(GCN, self).__init__()
        self.gc1 = GraphConvolutionLayer(input_dim, hidden_dim)
        self.gc2 = GraphConvolutionLayer(hidden_dim, embedding_dim)

    def forward(self, x, adj):
        x = F.relu(self.gc1(x, adj))
        x = self.gc2(x, adj)
        return x
class BPRLoss(nn.Module):
    def __init__(self):
        super(BPRLoss, self).__init__()

    def forward(self, positive_scores, negative_scores):
        loss = -torch.sum(F.logsigmoid(positive_scores - negative_scores))
        return loss

In [133]:
#Data prep.
ui = pd.read_csv('final/final_ui_interaction.csv')
ie = pd.read_csv('final/final_ie_interaction.csv')
ui = ui[['userID', 'itemID']].sample(sampling_size)
ie = ie[['itemID', 'entityID']].sample(sampling_size)
# 1. 유니크한 사용자, 아이템, 엔터티 리스트 생성
unique_users = ui['userID'].unique()
unique_items = pd.concat([ui['itemID'], ie['itemID']]).unique()  # ui와 ie에서 아이템 가져오기
unique_entities = ie['entityID'].unique()

# 2. 새로운 ID 부여
user_to_id = {user: idx for idx, user in enumerate(unique_users)}
item_to_id = {item: idx + len(unique_users) for idx, item in enumerate(unique_items)}  # 사용자 수만큼 오프셋
entity_to_id = {entity: idx + len(unique_users) + len(unique_items) for idx, entity in enumerate(unique_entities)}

# 3. 새로운 ID로 데이터 업데이트
ui['userID'] = ui['userID'].map(user_to_id)
ui['itemID'] = ui['itemID'].map(item_to_id)
ie['itemID'] = ie['itemID'].map(item_to_id)
ie['entityID'] = ie['entityID'].map(entity_to_id)

# 4. 인접 행렬 생성
num_total_nodes = len(unique_users) + len(unique_items) + len(unique_entities)
adj_matrix = np.zeros((num_total_nodes, num_total_nodes))

# ui 상호작용 추가
for _, row in ui.iterrows():
    adj_matrix[row['userID'], row['itemID']] = 1
    adj_matrix[row['itemID'], row['userID']] = 1

# ie 상호작용 추가
for _, row in ie.iterrows():
    adj_matrix[row['itemID'], row['entityID']] = 1
    adj_matrix[row['entityID'], row['itemID']] = 1

# 결과 리턴: 새로운 ID 부여된 데이터와 인접행렬
new_ui = ui
new_ie = ie
total_nodes_length = len(unique_users) + len(unique_items) + len(unique_entities)

# new_ui, new_ie, adj_matrix 를 사용

In [126]:
import random

# 긍정적 유저-아이템 데이터 추출
positive_samples = new_ui.reset_index(drop=True)
positive_users = positive_samples['userID'].tolist()
positive_items = positive_samples['itemID'].tolist()

# 부정적 유저-아이템 데이터 생성
all_items = set(new_ui['itemID'].unique())
negative_users = []
negative_items = []

while len(negative_users) < sampling_size:
    random_user = random.choice(positive_users)
    random_item = random.choice(list(all_items))

    # 해당 유저가 해당 아이템과 상호작용하지 않았을 경우만 부정적 데이터로 추가
    if not new_ui[(new_ui['userID'] == random_user) & (new_ui['itemID'] == random_item)].empty:
        continue

    negative_users.append(random_user)
    negative_items.append(random_item)

print(len(positive_users), len(positive_items), len(negative_items))

10000 10000 10000


In [127]:
num_nodes = total_nodes_length
node_features = torch.eye(num_nodes)  # One-Hot 인코딩
epochs = 1000
dataset = InteractionDataset(positive_users, positive_items, negative_items)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
node_features = torch.FloatTensor(node_features)
node_features = node_features.to(device)
adj = torch.FloatTensor(adj_matrix)
adj = adj.to(device)
model = GCN(input_dim=node_features.size(1), hidden_dim=64, embedding_dim=32)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

bpr_loss = BPRLoss()

In [128]:

# Group by userID and aggregate itemIDs into sets
grouped = ui.groupby('userID')['itemID'].apply(set)

# Convert the grouped Series into a dictionary
ground_truth = grouped.to_dict()

In [131]:
def generate_recommendations(model, node_features, adj, user_ids, num_recommendations):
    model.eval()
    with torch.no_grad():
        embeddings = model(node_features, adj)
    recommendations = {}
    # Correcting the range of item_ids
    item_ids = torch.arange(len(unique_users), len(unique_users) + len(unique_items)).to(device)
    for user_id in user_ids:
        # Ensure user_id is within valid range
        if user_id >= len(unique_users):
            print(f'Skipping user_id {user_id} as it is out of bounds')
            continue
        scores = torch.sum(embeddings[user_id] * embeddings[item_ids], dim=1)
        _, top_indices = torch.topk(scores, num_recommendations)
        recommended_items = [item_ids[i].item() for i in top_indices]  # Converting tensor indices to Python integers
        recommendations[user_id] = set(recommended_items)
    return recommendations



def calculate_recall(recommendations, ground_truth):
    total_relevant_items = 0
    total_recommended_relevant_items = 0
    for user_id, recommended_items in recommendations.items():
        relevant_items = ground_truth.get(user_id, set())
        total_relevant_items += len(relevant_items)
        total_recommended_relevant_items += len(relevant_items.intersection(recommended_items))
    recall = total_recommended_relevant_items / total_relevant_items if total_relevant_items > 0 else 0
    return recall


In [132]:
for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    for batch_users, batch_pos_items, batch_neg_items in dataloader:
        batch_users = batch_users.to(device)
        batch_pos_items = batch_pos_items.to(device)
        batch_neg_items = batch_neg_items.to(device)        
        
        optimizer.zero_grad()

        embeddings = model(node_features, adj)
        
        positive_scores = torch.sum(embeddings[batch_users] * embeddings[batch_pos_items], dim=1)
        negative_scores = torch.sum(embeddings[batch_users] * embeddings[batch_neg_items], dim=1)

        loss = bpr_loss(positive_scores, negative_scores)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        user_ids = list(unique_users)
        num_recommendations = 10
        recommendations = generate_recommendations(model, node_features, adj, user_ids, num_recommendations)
        recall = calculate_recall(recommendations, ground_truth)
        
    Loss = total_loss / len(dataloader)
    wandb.log({"train_loss": Loss})
    print(f"Epoch {epoch + 1}, Loss: {Loss}, Recall:{recall}")

Skipping user_id 9718 as it is out of bounds
Skipping user_id 20421 as it is out of bounds
Skipping user_id 9938 as it is out of bounds
Skipping user_id 14768 as it is out of bounds
Skipping user_id 28081 as it is out of bounds
Skipping user_id 19522 as it is out of bounds
Skipping user_id 14034 as it is out of bounds
Skipping user_id 13141 as it is out of bounds
Skipping user_id 25213 as it is out of bounds
Skipping user_id 19365 as it is out of bounds
Skipping user_id 17377 as it is out of bounds
Skipping user_id 18869 as it is out of bounds
Skipping user_id 8022 as it is out of bounds
Skipping user_id 10084 as it is out of bounds
Skipping user_id 21988 as it is out of bounds
Skipping user_id 25898 as it is out of bounds
Skipping user_id 18986 as it is out of bounds
Skipping user_id 21210 as it is out of bounds
Skipping user_id 24862 as it is out of bounds
Skipping user_id 12209 as it is out of bounds
Skipping user_id 24817 as it is out of bounds
Skipping user_id 12866 as it is out o

KeyboardInterrupt: 