# 📢 아마존북 데이터셋 구축과정(light-GCN 실습)

In [1]:
import os.path as osp
import pandas as pd
import openpyxl
import numpy as np

import torch
from tqdm import tqdm
import random
import copy

from torch_geometric.datasets import AmazonBook
from torch_geometric.nn import LightGCN
from torch_geometric.utils import degree

# Show every column and up to 10,000 rows when a DataFrame is displayed.
pd.options.display.max_columns = None
pd.options.display.max_rows = 10000

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

  from .autonotebook import tqdm as notebook_tqdm


cuda


In [2]:
# Dataset root. The nested join/dirname calls build a path that collapses to
# '/step1/data/Amazon' once the '..' components are resolved.
path = osp.join(
    osp.dirname(
        osp.join(
            osp.dirname('/step1/pytorch_geometric/examples'),
            '..', 'data', 'Amazon',
        )
    ),
    '..', 'data', 'Amazon',
)
dataset = AmazonBook(path)
data = dataset[0]
num_users = data['user'].num_nodes
num_books = data['book'].num_nodes
# Convert to a single homogeneous graph and move it to the selected device.
data = data.to_homogeneous().to(device)

# Use all message passing edges as training labels:
batch_size = 8192
# Keep only the edge columns whose source id is smaller than the target id.
mask = data.edge_index[0] < data.edge_index[1]
train_edge_label_index = data.edge_index[:, mask]
# The loader yields shuffled batches of column positions into
# train_edge_label_index rather than the edges themselves.
train_loader = torch.utils.data.DataLoader(
    dataset=range(train_edge_label_index.size(1)),
    batch_size=batch_size,
    shuffle=True,
)

In [3]:
# LightGCN over all nodes of the homogeneous graph:
# 64-dimensional embeddings, 2 layers.
model = LightGCN(num_nodes=data.num_nodes, embedding_dim=64, num_layers=2)
model = model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [4]:
def train():
    """Run one pass over ``train_loader`` and return the mean training loss.

    For every batch of positive edges, one negative edge per positive is
    built by keeping the positive's source node and drawing a target id
    uniformly from ``[num_users, num_users + num_books)``. The drawn target
    is not checked against the existing edges.

    Returns:
        The batch losses averaged over all positive edges seen, each batch
        weighted by its number of positive edges.
    """
    loss_sum = 0
    seen = 0

    for batch_idx in tqdm(train_loader):
        # Positive edges of this batch plus one random negative per positive.
        pos_edges = train_edge_label_index[:, batch_idx]
        random_targets = torch.randint(num_users, num_users + num_books,
                                       (batch_idx.numel(), ), device=device)
        neg_edges = torch.stack([pos_edges[0], random_targets], dim=0)
        # Positives occupy the first half of the columns, negatives the second.
        edge_label_index = torch.cat([pos_edges, neg_edges], dim=1)

        optimizer.zero_grad()
        # The model is applied here; its output is split back into the
        # positive half and the negative half.
        rankings = model(data.edge_index, edge_label_index)
        pos_rank, neg_rank = rankings.chunk(2)

        loss = model.recommendation_loss(
            pos_rank,
            neg_rank,
            node_id=edge_label_index.unique(),
        )
        loss.backward()
        optimizer.step()

        n_pos = pos_rank.numel()
        loss_sum += float(loss) * n_pos
        seen += n_pos

    return loss_sum / seen


@torch.no_grad()
def test(k: int):
    """Compute precision@k and recall@k against ``data.edge_label_index``.

    Users are scored against all books in slices of ``batch_size`` users.
    Books paired with a user in ``train_edge_label_index`` are removed from
    that user's ranking by setting their score to ``-inf``.

    Args:
        k: Number of top-scoring books taken per user.

    Returns:
        A ``(precision, recall)`` tuple; both sums are divided by the number
        of users that have at least one edge in ``data.edge_label_index``.
    """
    embeddings = model.get_embedding(data.edge_index)
    user_emb = embeddings[:num_users]
    book_emb = embeddings[num_users:]
    train_src, train_dst = train_edge_label_index[0], train_edge_label_index[1]
    eval_src, eval_dst = data.edge_label_index[0], data.edge_label_index[1]

    precision_sum = recall_sum = 0
    evaluated_users = 0
    for start in range(0, num_users, batch_size):
        stop = start + batch_size
        scores = user_emb[start:stop] @ book_emb.t()

        # Exclude training edges:
        in_slice = (train_src >= start) & (train_src < stop)
        scores[train_src[in_slice] - start,
               train_dst[in_slice] - num_users] = float('-inf')

        # Computing precision and recall:
        relevant = torch.zeros_like(scores, dtype=torch.bool)
        in_slice = (eval_src >= start) & (eval_src < stop)
        rows = eval_src[in_slice] - start
        relevant[rows, eval_dst[in_slice] - num_users] = True
        # Number of held-out edges per user row of this slice.
        relevant_per_user = degree(rows, num_nodes=scores.size(0))

        top_books = scores.topk(k, dim=-1).indices
        hits_per_user = relevant.gather(1, top_books).sum(dim=-1)

        precision_sum += float((hits_per_user / k).sum())
        # clamp keeps the division defined for users without held-out edges.
        recall_sum += float(
            (hits_per_user / relevant_per_user.clamp(1e-6)).sum())
        evaluated_users += int((relevant_per_user > 0).sum())

    return precision_sum / evaluated_users, recall_sum / evaluated_users

## 📌train_loader를 분해해서 패턴분석
* 여기서 edge_label_index는 pos + neg
* batchsize가 8,192라면 edge_label_index_list[0].T의 길이는 그 두배인 16,384
* neg_edge_label_index는 pos_edge_label_index[0]와 torch.randint로 구성된 랜덤 값

In [5]:
# Repeat the sampling steps used in train() and keep every batch so the
# positive, negative and combined edge tensors can be inspected afterwards.
positive_edge_label_idx_list, negative_edge_label_idx_list = [], []
edge_label_index_list = []

for index in tqdm(train_loader):
    # Positive edges selected by this batch's column positions.
    pos_edge_label_index = train_edge_label_index[:, index]
    # Negatives reuse the positive source row and pair it with random ids
    # drawn from [num_users, num_users + num_books).
    neg_edge_label_index = torch.stack(
        (
            pos_edge_label_index[0],
            torch.randint(
                low=num_users,
                high=num_users + num_books,
                size=(index.numel(), ),
                device=device,
            ),
        ),
        dim=0,
    )
    # Combined tensor: positive columns first, then the negative columns.
    edge_label_index = torch.cat(
        (pos_edge_label_index, neg_edge_label_index), dim=1)

    positive_edge_label_idx_list.append(pos_edge_label_index)
    negative_edge_label_idx_list.append(neg_edge_label_index)
    edge_label_index_list.append(edge_label_index)

100%|██████████| 291/291 [00:00<00:00, 682.21it/s]


In [6]:
# The transposed batch has one row per edge, so its length equals the number
# of columns of the untransposed [2, n] tensor.
print(f"edge_label_index_list 전치: {edge_label_index_list[0].shape[1]} (batchsize8192)")

edge_label_index_list 전치: 16384 (batchsize8192)


### 🍔 edge_label_index는 pos와 neg 합친 것이란 것을 알 수 있었다.
* positive는 train.txt와 비교했을 때 일치함을 알 수 있다.
* label은 1로 표기됨을 알 수 있다.

In [7]:
def _edge_batches_to_frame(edge_batches, columns):
    """Stack a list of ``[2, n]`` edge tensors into one DataFrame.

    Args:
        edge_batches: List of 2-row tensors, one per batch.
        columns: The two column names for the resulting frame.

    Returns:
        A DataFrame with one row per edge, in batch order, with a fresh
        0..N-1 index.
    """
    # Concatenate once on the tensor side and convert once. Growing a
    # DataFrame with pd.concat inside a loop re-copies all earlier rows on
    # every iteration.
    edges = torch.cat(edge_batches, dim=1).cpu().numpy().T
    return pd.DataFrame(edges, columns=columns)


# Positive edges get label 1.
df_positive = _edge_batches_to_frame(positive_edge_label_idx_list, ['user', 'book'])
df_positive['label'] = 1

# Randomly sampled negative edges get label 0.
df_negative = _edge_batches_to_frame(negative_edge_label_idx_list, ['user', 'book'])
df_negative['label'] = 0

# Combined positive + negative tensors, exactly as built per batch.
df_edge_label_index = _edge_batches_to_frame(edge_label_index_list, ['A', 'B'])

df_positive_negative = pd.concat([df_positive, df_negative], ignore_index=True)

# Sort df_positive_negative in ascending order.
df_positive_negative = df_positive_negative.sort_values(by=['user', 'book'], ascending=True)
df_positive_negative = df_positive_negative.reset_index(drop=True)

display(df_positive_negative)
display(df_edge_label_index)


Unnamed: 0,user,book,label
0,0,52643,1
1,0,52644,1
2,0,52645,1
3,0,52646,1
4,0,52647,1
...,...,...,...
4761455,52642,133851,0
4761456,52642,137837,0
4761457,52642,138240,0
4761458,52642,140543,0


Unnamed: 0,A,B
0,52384,140691
1,29873,100832
2,18648,77281
3,37448,106719
4,48894,91906
...,...,...
4761455,35455,81610
4761456,11745,74449
4761457,42933,73054
4761458,6791,138205


### 🍔 엑셀파일로 저장
* 실행시간: 약 2 ~ 3분

In [8]:
# 엑셀 파일로 저장
chunk_size = 1000000  # rows per sheet (an .xlsx sheet holds at most 1,048,576 rows)
# Ceiling division: an exact multiple of chunk_size must not add an empty
# trailing sheet. Keep at least one sheet so an empty frame still gets its
# header row written.
num_chunks = max(1, -(-len(df_positive_negative) // chunk_size))

with pd.ExcelWriter('df_positive_negative.xlsx', engine='xlsxwriter') as writer:
    for i in range(num_chunks):
        start_idx = i * chunk_size
        end_idx = start_idx + chunk_size
        # iloc clips end_idx to the frame length, so the last chunk may be shorter.
        chunk_df = df_positive_negative.iloc[start_idx:end_idx]
        chunk_df.to_excel(writer, sheet_name=f"TAB{i + 1}", index=False)

⬇⬇⬇ label 0은 negative 샘플로, 유저에 랜덤으로 뽑은 책을 짝지은 것 (실제 edge 존재 여부는 확인하지 않으므로 드물게 실제 edge와 겹칠 수 있음).
* train루틴 쪽에서 torch.randint를 이용해서 랜덤으로 생성

In [9]:
# Split the sorted table by label: tmp1 holds label 0 rows, tmp2 label 1 rows.
tmp1 = df_positive_negative.loc[df_positive_negative['label'].eq(0)]
tmp2 = df_positive_negative.loc[df_positive_negative['label'].eq(1)]

# Distinct book ids seen on each side, in order of first appearance.
print(f"negative edge: {tmp1['book'].unique()}")
print(f"positive edge: {tmp2['book'].unique()}")

display(tmp1)
display(tmp2)


negative edge: [ 53953  58727  62618 ...  68436  63359 137276]
positive edge: [ 52643  52644  52645 ... 144239 144240 144241]


Unnamed: 0,user,book,label
53,0,53953,0
54,0,58727,0
55,0,62618,0
56,0,63422,0
57,0,65189,0
...,...,...,...
4761455,52642,133851,0
4761456,52642,137837,0
4761457,52642,138240,0
4761458,52642,140543,0


Unnamed: 0,user,book,label
0,0,52643,1
1,0,52644,1
2,0,52645,1
3,0,52646,1
4,0,52647,1
...,...,...,...
4761436,52642,76784,1
4761438,52642,83953,1
4761439,52642,85251,1
4761440,52642,85419,1


### 🍔 data.pt를 보고 분석하기. 
* AmazonBook 클래스를 실행하면 데이터를 다운로드하면서 data.pt를 만든다.
* 불러온 객체는 튜플이며, 첫 번째 원소가 딕셔너리이다.

#### data는 train_loader 할 때 쓰이는 변수


In [15]:
# Display the homogeneous graph object that the training loader is built from.
data

Data(edge_index=[2, 4761460], edge_label_index=[2, 603378], node_type=[144242], edge_type=[4761460])

#### data.pt는 AmazonBook 클래스로 데이터셋을 구축할 때 만들어짐

In [10]:
# Load the processed dataset file directly to inspect its raw structure.
# NOTE(review): torch.load unpickles the file, so only load files from a
# trusted source. Newer PyTorch releases may need an explicit `weights_only`
# argument here - confirm against the installed version.
data_tmp = torch.load('/step1/data/Amazon/processed/data.pt')
data_tmp

({'_global_store': {},
  'user': {'num_nodes': 52643},
  'book': {'num_nodes': 91599},
  ('user',
   'rates',
   'book'): {'edge_index': tensor([[    0,     0,     0,  ..., 52642, 52642, 52642],
           [    0,     1,     2,  ..., 23186, 10690, 10874]]), 'edge_label_index': tensor([[    0,     0,     0,  ..., 52642, 52642, 52642],
           [ 7202,   114,  5612,  ..., 31782, 33163,   106]])},
  ('book',
   'rated_by',
   'user'): {'edge_index': tensor([[    0,     1,     2,  ..., 23186, 10690, 10874],
           [    0,     0,     0,  ..., 52642, 52642, 52642]])}},
 None)

In [11]:
# Analyze the structure of data_tmp: number of entries in its first element.
len(data_tmp[0])

5

In [12]:
# Show the 'user' and 'book' node stores and the ('user', 'rates', 'book') edge store.
data_tmp[0]['user'] , data_tmp[0]['book'], data_tmp[0][('user','rates','book')]

({'num_nodes': 52643},
 {'num_nodes': 91599},
 {'edge_index': tensor([[    0,     0,     0,  ..., 52642, 52642, 52642],
          [    0,     1,     2,  ..., 23186, 10690, 10874]]),
  'edge_label_index': tensor([[    0,     0,     0,  ..., 52642, 52642, 52642],
          [ 7202,   114,  5612,  ..., 31782, 33163,   106]])})

### 🍔 유저가 책 평가한 거 (유저아이디 - 북아이디)

In [13]:
# One row per column of the ('user', 'rates', 'book') edge_index:
# first value -> 'user', second value -> 'book'.
datauser_df = pd.DataFrame(
    data_tmp[0][('user', 'rates', 'book')]['edge_index'].T.tolist()
).set_axis(['user', 'book'], axis=1)
datauser_df

Unnamed: 0,user,book
0,0,0
1,0,1
2,0,2
3,0,3
4,0,4
...,...,...
2380725,52642,1435
2380726,52642,5406
2380727,52642,23186
2380728,52642,10690


### 🍔 평가된 책에 대한 유저 아이디 (북아이디 - 유저아이디)

In [14]:
# One row per column of the ('book', 'rated_by', 'user') edge_index:
# first value -> 'book', second value -> 'user'.
databook_df = pd.DataFrame(
    data_tmp[0][('book', 'rated_by', 'user')]['edge_index'].T.tolist()
).set_axis(['book', 'user'], axis=1)
databook_df

Unnamed: 0,book,user
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
2380725,1435,52642
2380726,5406,52642
2380727,23186,52642
2380728,10690,52642
