### Package Installation

In [1]:
# choose to install approach 1 or 2 based on embedding type
# following packages are compatible with gpu

# approach 1 - node embedding
# Install PyTorch 2.1.0 with CUDA 11.8 (GPU)
!pip install torch==2.1.0+cu118 torchvision==0.16.0+cu118 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118
# Install PyTorch Geometric (GPU version)
!pip install torch-scatter torch-sparse torch-geometric torch-cluster -f https://data.pyg.org/whl/torch-2.1.0+cu118.html
!pip uninstall -y numpy
!pip install numpy==1.24.4 --no-cache-dir --force-reinstall
#try another version of numpy
!pip install numpy==1.26.4 --force-reinstall --no-cache-dir


# approach 2 - embedding
# Install PyTorch with CUDA 11.8
!pip install torch==2.1.0+cu118 torchvision==0.16.0+cu118 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118
# Install PyTorch Geometric (GPU compatible)
!pip install torch-scatter torch-sparse torch-geometric torch-cluster -f https://data.pyg.org/whl/torch-2.1.0+cu118.html
# Reinstall compatible NumPy and transformers
!pip install numpy==1.24.4 --no-cache-dir --force-reinstall
!pip install -U huggingface-hub==0.23.0 transformers==4.41.0 sentence-transformers==2.6.1


Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch==2.1.0+cu118
  Downloading https://download.pytorch.org/whl/cu118/torch-2.1.0%2Bcu118-cp311-cp311-linux_x86_64.whl (2325.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 GB[0m [31m977.8 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision==0.16.0+cu118
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.16.0%2Bcu118-cp311-cp311-linux_x86_64.whl (6.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m114.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchaudio==2.1.0
  Downloading https://download.pytorch.org/whl/cu118/torchaudio-2.1.0%2Bcu118-cp311-cp311-linux_x86_64.whl (3.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m89.8 MB/s[0m eta [36m0:00:00[0m
Collecting triton==2.1.0 (from torch==2.1.0+cu118)
  Downloading https://download.pytorch.org/whl/triton-2.1.0-0-cp311-cp311-m

Collecting huggingface-hub==0.23.0
  Downloading huggingface_hub-0.23.0-py3-none-any.whl.metadata (12 kB)
Collecting transformers==4.41.0
  Downloading transformers-4.41.0-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence-transformers==2.6.1
  Downloading sentence_transformers-2.6.1-py3-none-any.whl.metadata (11 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers==4.41.0)
  Downloading tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading huggingface_hub-0.23.0-py3-none-any.whl (401 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading transformers-4.41.0-py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━

### Google Colab Setup

Before getting started we need to run some standard code to set up our environment. You'll need to execute this code again each time you start the notebook.

First, run this cell to load the [autoreload](https://ipython.readthedocs.io/en/stable/config/extensions/autoreload.html?highlight=autoreload) extension. This enables us to modify `.py` source files and reintegrate them into the notebook, ensuring a smooth editing and debugging experience.

In [1]:
%load_ext autoreload
%autoreload 2

Next we need to run a few commands to set up our environment on Google Colab. If you are running this notebook on a local machine you can skip this section.

Run the following cell to mount your Google Drive. Follow the link, sign in to your Google account (the same account you used to store this notebook!).

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os

# TODO: Fill in the Google Drive path where you uploaded assignment1
# Example: If you create a Fall2023 folder and put all the files under A1 folder, then 'Fall2023/A1'
GOOGLE_DRIVE_PATH_POST_MYDRIVE = 'Colab Notebooks/Group-Project'
GOOGLE_DRIVE_PATH = os.path.join('/content', 'drive', 'MyDrive', GOOGLE_DRIVE_PATH_POST_MYDRIVE)
print(os.listdir(GOOGLE_DRIVE_PATH))

['.git', 'dataset-screening', 'paper', 'data', 'model', 'notebooks', 'scripts', 'README.md', '.gitignore', 'requirements.txt', 'config']


### Local Setup or Google Colab
Run the cell below regardless of setup to set the path

In [4]:
# if running locally set GOOGLE PATH
import sys
if 'google.colab' in sys.modules:
  print(f'Running in google colab. Our path is `{GOOGLE_DRIVE_PATH}`')
else:
  GOOGLE_DRIVE_PATH = '.'
  print('Running locally.')

Running in google colab. Our path is `/content/drive/MyDrive/Colab Notebooks/Group-Project`


# 1. Data Prepration


### 1.1 Download Amazon Co-purchase Data

In [5]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import sys



In [6]:
# change directory to script folder
sys.path.append(GOOGLE_DRIVE_PATH + "/scripts")
# download data
from download_data import download_amazon_data
download_amazon_data(GOOGLE_DRIVE_PATH)

'/content/drive/MyDrive/Colab Notebooks/Group-Project/data/amazon-meta.txt' already exists. Skipping download and extraction.


### 1.2 Convert data to CSV format

In [None]:
# change directory to script folder
sys.path.append(GOOGLE_DRIVE_PATH + "/scripts")
# download data
from convert_to_csv import convert_amazon_meta_to_csv
convert_amazon_meta_to_csv(GOOGLE_DRIVE_PATH)

✔️ CSV files already exist. Skipping conversion.


### 1.3 Data Exploration


#### Summary

##### Item


1.  roughly 1% of missing values for title/ group/ salesrank/ reviews columns and missings for these columns often happen at the same time -> solution: drop rows with missings in any of these columns
2.   group distribution is highly imbalanced, most observations fall into book or music -> solution: since our target is not group classification, we could use group as feature in modeling & maybe run groupwise evaluation if needed
3. salesrank, reviews_total, reviews_downloaded are right skewed.

##### Category
4. category hiearchy is pretty sparse after first 4-5 levels -> solution: collapse all hierarchies into 1 string or only focus on the first several levels
5. category csv only has 519781 items, compared to 548552 items in item table -> solution: inner join
6. 90% of items have one than one category path, but we only need 1 category path in the final feature matrix  
-> solution:
  - multi-label one-hot encoding + dimenstion reduction PCA: collect all possible categories from all layers and run one-hot encoding (approximately 1500 categories for the first 4 levels)
  - pick top 4 level and top representative category in each layer
  

##### Review
7. review csv only has 402724 items, compared to 548552 items in item table -> solution: inner join
8. 75% of items have more than one review record, need to combine to 1 for future modeling






In [29]:
# utility functions
def check_missing(df):
  result = pd.DataFrame({'col_name':list(df.columns),
                                   'num_missing':list(df.isnull().sum()),
                                   'percent_missing':[round(k/len(df), 4) for k in list(df.isnull().sum())]})
  return result

# count unique values for each column
def value_count_percentage(col_name, data):
    df = pd.DataFrame(data[col_name].value_counts())
    df.reset_index(inplace = True)
    df.columns = [col_name, 'count']
    df['percentage'] = round(df['count']/len(data),4)
    return df

#### 1.3.1 Items

In [None]:
# load csv
items_df = pd.read_csv(GOOGLE_DRIVE_PATH + "/data/items.csv")
items_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 548552 entries, 0 to 548551
Data columns (total 10 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   item_Id             548552 non-null  int64  
 1   ASIN                548552 non-null  object 
 2   title               542684 non-null  object 
 3   group               542684 non-null  object 
 4   salesrank           542225 non-null  float64
 5   similar             379093 non-null  object 
 6   category_count      548552 non-null  int64  
 7   reviews_total       542684 non-null  float64
 8   reviews_downloaded  542684 non-null  float64
 9   reviews_avg_rating  542684 non-null  float64
dtypes: float64(4), int64(2), object(4)
memory usage: 41.9+ MB


In [None]:
items_df.head()

Unnamed: 0,item_Id,ASIN,title,group,salesrank,similar,category_count,reviews_total,reviews_downloaded,reviews_avg_rating
0,0,771044445,,,,,0,,,
1,1,827229534,Patterns of Preaching: A Sermon Sampler,Book,396585.0,"0804215715,156101074X,0687023955,0687074231,08...",2,2.0,2.0,5.0
2,2,738700797,Candlemas: Feast of Flames,Book,168596.0,"0738700827,1567184960,1567182836,0738700525,07...",2,12.0,12.0,4.5
3,3,486287785,World War II Allied Fighter Planes Trading Cards,Book,1270652.0,,1,1.0,1.0,5.0
4,4,842328327,Life Application Bible Commentary: 1 and 2 Tim...,Book,631289.0,"0842328130,0830818138,0842330313,0842328610,08...",5,1.0,1.0,4.0


In [None]:
items_df.shape

(548552, 10)

In [None]:
# check for missings
check_missing(items_df)

Unnamed: 0,col_name,num_missing,percent_missing
0,item_Id,0,0.0
1,ASIN,0,0.0
2,title,5868,0.0107
3,group,5868,0.0107
4,salesrank,6327,0.0115
5,similar,169459,0.3089
6,category_count,0,0.0
7,reviews_total,5868,0.0107
8,reviews_downloaded,5868,0.0107
9,reviews_avg_rating,5868,0.0107


In [None]:
# check group balance
value_count_percentage('group', items_df)

Unnamed: 0,group,count,percentage
0,Book,393561,0.7175
1,Music,103144,0.188
2,Video,26131,0.0476
3,DVD,19828,0.0361
4,Toy,8,0.0
5,Software,5,0.0
6,CE,4,0.0
7,Video Games,1,0.0
8,Baby Product,1,0.0
9,Sports,1,0.0


In [None]:
# check for skewness
items_df[['salesrank', 'category_count', 'reviews_total',
         'reviews_downloaded', 'reviews_avg_rating']].describe()

Unnamed: 0,salesrank,category_count,reviews_total,reviews_downloaded,reviews_avg_rating
count,542225.0,548552.0,542684.0,542684.0,542684.0
mean,489738.5,4.575134,14.339818,13.992017,3.209534
std,567784.6,4.452269,74.768369,71.908413,1.996296
min,0.0,0.0,0.0,0.0,0.0
25%,91153.0,2.0,0.0,0.0,0.0
50%,300963.0,3.0,2.0,2.0,4.0
75%,672481.0,6.0,8.0,7.0,5.0
max,3798351.0,116.0,5545.0,4995.0,5.0


#### 1.3.2 Categories

In [None]:
categories_df = pd.read_csv(GOOGLE_DRIVE_PATH + "/data/categories.csv")
categories_df.info()

  categories_df = pd.read_csv(GOOGLE_DRIVE_PATH + "/data/categories.csv")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2509699 entries, 0 to 2509698
Data columns (total 12 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   item_Id           int64 
 1   category_path_1   object
 2   category_path_2   object
 3   category_path_3   object
 4   category_path_4   object
 5   category_path_5   object
 6   category_path_6   object
 7   category_path_7   object
 8   category_path_8   object
 9   category_path_9   object
 10  category_path_10  object
 11  category_path_11  object
dtypes: int64(1), object(11)
memory usage: 229.8+ MB


In [None]:
categories_df.head()

Unnamed: 0,item_Id,category_path_1,category_path_2,category_path_3,category_path_4,category_path_5,category_path_6,category_path_7,category_path_8,category_path_9,category_path_10,category_path_11
0,1,Books[283155],Subjects[1000],Religion & Spirituality[22],Christianity[12290],Clergy[12360],Preaching[12368],,,,,
1,1,Books[283155],Subjects[1000],Religion & Spirituality[22],Christianity[12290],Clergy[12360],Sermons[12370],,,,,
2,2,Books[283155],Subjects[1000],Religion & Spirituality[22],Earth-Based Religions[12472],Wicca[12484],,,,,,
3,2,Books[283155],Subjects[1000],Religion & Spirituality[22],Earth-Based Religions[12472],Witchcraft[12486],,,,,,
4,3,Books[283155],Subjects[1000],Home & Garden[48],Crafts & Hobbies[5126],General[5144],,,,,,


In [None]:
# check for missing
check_missing(categories_df)

Unnamed: 0,col_name,num_missing,percent_missing
0,item_Id,0,0.0
1,category_path_1,0,0.0
2,category_path_2,0,0.0
3,category_path_3,18,0.0
4,category_path_4,7932,0.0032
5,category_path_5,328921,0.1311
6,category_path_6,1500645,0.5979
7,category_path_7,2192768,0.8737
8,category_path_8,2388197,0.9516
9,category_path_9,2456978,0.979


In [None]:
# count how many items have more than one category
tmp = categories_df.groupby('item_Id').size().reset_index(name = 'count').sort_values(by = 'count', ascending=False)
tmp[tmp['count'] > 1]

Unnamed: 0,item_Id,count
108750,115078,116
353462,374287,101
312952,331464,101
289079,305997,97
474599,502536,95
...,...,...
24,25,2
27,28,2
519760,548531,2
89388,94626,2


In [None]:
# dig into item with the most categories
categories_df[categories_df['item_Id'] == 115078]

Unnamed: 0,item_Id,category_path_1,category_path_2,category_path_3,category_path_4,category_path_5,category_path_6,category_path_7,category_path_8,category_path_9,category_path_10,category_path_11
526689,115078,Books[283155],Subjects[1000],Entertainment[86],Music[4507],Musical Genres[4512],Classical[1537],General[1620],,,,
526690,115078,Books[283155],Subjects[1000],Entertainment[86],Music[4507],History & Criticism[4511],,,,,,
526691,115078,Music[5174],Styles[301668],Classical[85],Ballets & Dances[5260],Polonaises[5305],,,,,,
526692,115078,Music[5174],Styles[301668],Classical[85],Chamber Music[5318],Quartets[5327],,,,,,
526693,115078,Music[5174],Styles[301668],Classical[85],Chamber Music[5318],Quintets[5330],,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
526800,115078,Music[5174],Styles[301668],Classical[85],"Featured Composers, A-Z[5338]",( R )[29612],"Rossini, Gioacchino[30703]",All Works by Rossini[573294],,,,
526801,115078,Music[5174],Styles[301668],Classical[85],"Featured Composers, A-Z[5338]",( P )[28166],"Prokofiev, Sergei[29355]",All Works by Prokofiev[573674],,,,
526802,115078,Music[5174],Styles[301668],Classical[85],"Featured Composers, A-Z[5338]",( R )[29612],"Rachmaninov, Sergei[29629]",All Works by Rachmaninov[574560],,,,
526803,115078,Books[283155],Formats[504358],Books on CD[69724],Music[1038314],,,,,,,


#### 1.3.3 Reviews

In [None]:
reviews_df = pd.read_csv(GOOGLE_DRIVE_PATH + "/data/reviews.csv")
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7593244 entries, 0 to 7593243
Data columns (total 6 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   item_Id      int64 
 1   date         object
 2   customer_Id  object
 3   rating       int64 
 4   votes        int64 
 5   helpful      int64 
dtypes: int64(4), object(2)
memory usage: 347.6+ MB


In [None]:
reviews_df.head()

Unnamed: 0,item_Id,date,customer_Id,rating,votes,helpful
0,1,2000-7-28,A2JW67OY8U6HHK,5,10,9
1,1,2003-12-14,A2VE83MZF98ITY,5,6,5
2,2,2001-12-16,A11NCO6YTE4BTJ,5,5,4
3,2,2002-1-7,A9CQ3PLRNIR83,4,5,5
4,2,2002-1-24,A13SG9ACZ9O5IM,5,8,8


In [None]:
# check missings
check_missing(reviews_df)

Unnamed: 0,col_name,num_missing,percent_missing
0,item_Id,0,0.0
1,date,0,0.0
2,customer_Id,0,0.0
3,rating,0,0.0
4,votes,0,0.0
5,helpful,0,0.0


In [None]:
# count how many items have more than 1 reviews
reviews_df.groupby('item_Id').size().reset_index(name = 'count').sort_values(by = 'count', ascending=False)

Unnamed: 0,item_Id,count
17197,23792,4995
186604,258380,4995
65712,91158,4995
307893,428073,4995
152564,211463,4995
...,...,...
3,4,1
2,3,1
402721,548549,1
402713,548541,1


### 1.4 Data Cleaning



##### Summary

1.   Drop rows with missing values from items table
2.   Deal with sparsity in category hierarchies: take top 4 category level from hierarchy and for each level, only take the most representative category (mode); pick top 4 levels based on the missing percentage


*   pros: keep hierarchical information, low dimensionality, clean
*   cons: lose some fine-grained multi-label signals


3. combine reviews
4. Standardized numerical features
5. Join 3 dataframes -> get 402691 records (about 73.4% of original items)

#### 1.4.1 drop missing from items

In [None]:
# drop rows with missing
# take care of similar column
items_df = items_df.dropna(subset=['title', 'group', 'salesrank', 'reviews_total',
                                   'reviews_downloaded', 'reviews_avg_rating'])
check_missing(items_df)

Unnamed: 0,col_name,num_missing,percent_missing
0,item_Id,0,0.0
1,ASIN,0,0.0
2,title,0,0.0
3,group,0,0.0
4,salesrank,0,0.0
5,similar,163136,0.3009
6,category_count,0,0.0
7,reviews_total,0,0.0
8,reviews_downloaded,0,0.0
9,reviews_avg_rating,0,0.0


In [None]:
items_df.shape

(542225, 10)

#### 1.4.2 Take top 4 level from category hierarchy

In [None]:
# take top 4 levels and pick the most representative category for each level
categories_df = categories_df.iloc[:, :5]
# fill na
categories_df['category_path_1'] = categories_df['category_path_1'].fillna('unknown1')
categories_df['category_path_2'] = categories_df['category_path_2'].fillna('unknown2')
categories_df['category_path_3'] = categories_df['category_path_3'].fillna('unknown3')
categories_df['category_path_4'] = categories_df['category_path_4'].fillna('unknown4')
categories_df.head()

Unnamed: 0,item_Id,category_path_1,category_path_2,category_path_3,category_path_4
0,1,Books[283155],Subjects[1000],Religion & Spirituality[22],Christianity[12290]
1,1,Books[283155],Subjects[1000],Religion & Spirituality[22],Christianity[12290]
2,2,Books[283155],Subjects[1000],Religion & Spirituality[22],Earth-Based Religions[12472]
3,2,Books[283155],Subjects[1000],Religion & Spirituality[22],Earth-Based Religions[12472]
4,3,Books[283155],Subjects[1000],Home & Garden[48],Crafts & Hobbies[5126]


In [None]:
categories_df.item_Id.nunique()

519781

In [None]:
# for those items with multiple category path, take the most representative one
# i.e. compute occurence of each category path, take the top one
categories_count = categories_df.groupby(['item_Id', 'category_path_1',
                                          'category_path_2', 'category_path_3',
                                          'category_path_4']).size().reset_index(name='count')
categories_df = categories_count.sort_values(by=['item_Id', 'count'], ascending=[True, False]).groupby('item_Id').head(1).reset_index(drop = True)
categories_df = categories_df.drop('count', axis = 1)

In [None]:
categories_df.head()

Unnamed: 0,item_Id,category_path_1,category_path_2,category_path_3,category_path_4
0,1,Books[283155],Subjects[1000],Religion & Spirituality[22],Christianity[12290]
1,2,Books[283155],Subjects[1000],Religion & Spirituality[22],Earth-Based Religions[12472]
2,3,Books[283155],Subjects[1000],Home & Garden[48],Crafts & Hobbies[5126]
3,4,Books[283155],Subjects[1000],Religion & Spirituality[22],Christianity[12290]
4,5,Books[283155],Subjects[1000],Religion & Spirituality[22],Christianity[12290]


In [None]:
categories_df.shape

(519781, 5)

#### 1.4.3 combine reviews

In [None]:
reviews_df = reviews_df.groupby('item_Id')[["rating", "votes", "helpful"]].mean().round(3).reset_index().rename(columns = {'rating':'reviews_avg_ratings', 'votes':'reviews_avg_votes', 'helpful':'reviews_avg_helpful'})

In [None]:
reviews_df.head()

Unnamed: 0,item_Id,reviews_avg_ratings,reviews_avg_votes,reviews_avg_helpful
0,1,5.0,8.0,7.0
1,2,4.333,7.0,6.333
2,3,5.0,2.0,2.0
3,4,4.0,1.0,1.0
4,6,4.235,12.118,8.941


In [None]:
reviews_df.shape

(402724, 4)

#### 1.4.4 numerical column cleaning

In [None]:
# log transformation on skewed numerical features
items_df['salesrank_log'] = round(np.log1p(items_df['salesrank']), 3)
items_df['reviews_total_log'] = round(np.log1p(items_df['reviews_total']), 3)
items_df['reviews_downloaded_log'] = round(np.log1p(items_df['reviews_downloaded']), 3)

In [None]:
items_df.head()

Unnamed: 0,item_Id,ASIN,title,group,salesrank,similar,category_count,reviews_total,reviews_downloaded,reviews_avg_rating,salesrank_log,reviews_total_log,reviews_downloaded_log
1,1,827229534,Patterns of Preaching: A Sermon Sampler,Book,396585.0,"0804215715,156101074X,0687023955,0687074231,08...",2,2.0,2.0,5.0,12.891,1.099,1.099
2,2,738700797,Candlemas: Feast of Flames,Book,168596.0,"0738700827,1567184960,1567182836,0738700525,07...",2,12.0,12.0,4.5,12.035,2.565,2.565
3,3,486287785,World War II Allied Fighter Planes Trading Cards,Book,1270652.0,,1,1.0,1.0,5.0,14.055,0.693,0.693
4,4,842328327,Life Application Bible Commentary: 1 and 2 Tim...,Book,631289.0,"0842328130,0830818138,0842330313,0842328610,08...",5,1.0,1.0,4.0,13.356,0.693,0.693
5,5,1577943082,Prayers That Avail Much for Business: Executive,Book,455160.0,"157794349X,0892749504,1577941829,0892749563,15...",2,0.0,0.0,0.0,13.028,0.0,0.0


In [None]:
items_df.shape

(542225, 13)

#### 1.4.5 combine item, category, review

In [None]:
df = pd.merge(items_df, categories_df, on='item_Id', how='inner')
df = pd.merge(df, reviews_df, on='item_Id', how='inner')
df.head()

Unnamed: 0,item_Id,ASIN,title,group,salesrank,similar,category_count,reviews_total,reviews_downloaded,reviews_avg_rating,salesrank_log,reviews_total_log,reviews_downloaded_log,category_path_1,category_path_2,category_path_3,category_path_4,reviews_avg_ratings,reviews_avg_votes,reviews_avg_helpful
0,1,827229534,Patterns of Preaching: A Sermon Sampler,Book,396585.0,"0804215715,156101074X,0687023955,0687074231,08...",2,2.0,2.0,5.0,12.891,1.099,1.099,Books[283155],Subjects[1000],Religion & Spirituality[22],Christianity[12290],5.0,8.0,7.0
1,2,738700797,Candlemas: Feast of Flames,Book,168596.0,"0738700827,1567184960,1567182836,0738700525,07...",2,12.0,12.0,4.5,12.035,2.565,2.565,Books[283155],Subjects[1000],Religion & Spirituality[22],Earth-Based Religions[12472],4.333,7.0,6.333
2,3,486287785,World War II Allied Fighter Planes Trading Cards,Book,1270652.0,,1,1.0,1.0,5.0,14.055,0.693,0.693,Books[283155],Subjects[1000],Home & Garden[48],Crafts & Hobbies[5126],5.0,2.0,2.0
3,4,842328327,Life Application Bible Commentary: 1 and 2 Tim...,Book,631289.0,"0842328130,0830818138,0842330313,0842328610,08...",5,1.0,1.0,4.0,13.356,0.693,0.693,Books[283155],Subjects[1000],Religion & Spirituality[22],Christianity[12290],4.0,1.0,1.0
4,6,486220125,How the Other Half Lives: Studies Among the Te...,Book,188784.0,"0486401960,0452283612,0486229076,0714840343,03...",5,17.0,17.0,4.0,12.148,2.89,2.89,Books[283155],Subjects[1000],Arts & Photography[1],Photography[2020],4.235,12.118,8.941


In [None]:
df = df.drop(['salesrank', 'reviews_total', 'reviews_downloaded', 'reviews_avg_rating'], axis = 1, inplace = False)
df['similar'] = df['similar'].fillna('')
df.head()

Unnamed: 0,item_Id,ASIN,title,group,similar,category_count,salesrank_log,reviews_total_log,reviews_downloaded_log,category_path_1,category_path_2,category_path_3,category_path_4,reviews_avg_ratings,reviews_avg_votes,reviews_avg_helpful
0,1,827229534,Patterns of Preaching: A Sermon Sampler,Book,"0804215715,156101074X,0687023955,0687074231,08...",2,12.891,1.099,1.099,Books[283155],Subjects[1000],Religion & Spirituality[22],Christianity[12290],5.0,8.0,7.0
1,2,738700797,Candlemas: Feast of Flames,Book,"0738700827,1567184960,1567182836,0738700525,07...",2,12.035,2.565,2.565,Books[283155],Subjects[1000],Religion & Spirituality[22],Earth-Based Religions[12472],4.333,7.0,6.333
2,3,486287785,World War II Allied Fighter Planes Trading Cards,Book,,1,14.055,0.693,0.693,Books[283155],Subjects[1000],Home & Garden[48],Crafts & Hobbies[5126],5.0,2.0,2.0
3,4,842328327,Life Application Bible Commentary: 1 and 2 Tim...,Book,"0842328130,0830818138,0842330313,0842328610,08...",5,13.356,0.693,0.693,Books[283155],Subjects[1000],Religion & Spirituality[22],Christianity[12290],4.0,1.0,1.0
4,6,486220125,How the Other Half Lives: Studies Among the Te...,Book,"0486401960,0452283612,0486229076,0714840343,03...",5,12.148,2.89,2.89,Books[283155],Subjects[1000],Arts & Photography[1],Photography[2020],4.235,12.118,8.941


In [None]:
# drop nodes only appear in similarity list but not in item table after removing discontinued items
def filter_similar(similar_str, unique_asin_ls):
  if pd.isna(similar_str):
    return ''
  similar_ls = similar_str.split(',')
  filtered_ls = [asin.strip() for asin in similar_ls if asin.strip() in unique_asin_ls]
  return ','.join(filtered_ls)


valid_asin = set(df['ASIN'].unique())
df['similar'] = df['similar'].apply(lambda x: filter_similar(x, valid_asin))

In [None]:
df.head()

Unnamed: 0,item_Id,ASIN,title,group,similar,category_count,salesrank_log,reviews_total_log,reviews_downloaded_log,category_path_1,category_path_2,category_path_3,category_path_4,reviews_avg_ratings,reviews_avg_votes,reviews_avg_helpful
0,1,827229534,Patterns of Preaching: A Sermon Sampler,Book,"0804215715,156101074X,0687023955,0687074231,08...",2,12.891,1.099,1.099,Books[283155],Subjects[1000],Religion & Spirituality[22],Christianity[12290],5.0,8.0,7.0
1,2,738700797,Candlemas: Feast of Flames,Book,"0738700827,1567184960,1567182836,0738700525,07...",2,12.035,2.565,2.565,Books[283155],Subjects[1000],Religion & Spirituality[22],Earth-Based Religions[12472],4.333,7.0,6.333
2,3,486287785,World War II Allied Fighter Planes Trading Cards,Book,,1,14.055,0.693,0.693,Books[283155],Subjects[1000],Home & Garden[48],Crafts & Hobbies[5126],5.0,2.0,2.0
3,4,842328327,Life Application Bible Commentary: 1 and 2 Tim...,Book,0842328610,5,13.356,0.693,0.693,Books[283155],Subjects[1000],Religion & Spirituality[22],Christianity[12290],4.0,1.0,1.0
4,6,486220125,How the Other Half Lives: Studies Among the Te...,Book,0486401960045228361204862290760714840343,5,12.148,2.89,2.89,Books[283155],Subjects[1000],Arts & Photography[1],Photography[2020],4.235,12.118,8.941


In [None]:
df.shape

(402691, 16)

In [None]:
# write cleaned data into csv
df.to_csv(GOOGLE_DRIVE_PATH + "/data/items_cleaned.csv", index = False)

### 1.5 Input Data Format for Each Model

✅ Link Prediction Model Requirements Summary

| Model             | Graph Structure (positive edges tensor)       | Node Features Needed?     | Precomputed Embedding?          | Edge Label Pairs Needed? |
|------------------|-------------------------------|---------------------------|----------------------------------|---------------------------|
| **MLP + Node2Vec** | ❌ (used for embedding only)   | ❌                         | ✅ Node2Vec embeddings           | ✅                        |
| **GraphSAGE**     | ✅                             | ✅                         | ❌                                | ✅                        |
| **LightGCN**      | ✅                             | ❌                         | ❌                                | ✅                        |
| **PinSAGE**       | ✅ (with random walks)         | ✅ (e.g., title/review embeddings) | ❌                        | ✅                        |
| **GAT**           | ✅                             | ✅                         | ❌                                | ✅                        |
| **TGN**           | ✅ (dynamic edges + time)      | Optional                   | ❌                                | ✅                        |


1. MLP + Node2Vec Inputs Required:

- Precomputed Node2Vec embeddings: X = [node2vec_src_embedding ⨁ node2vec_tgt_embedding]

- Label: 1 (positive edge), 0 (negative edge)

📥 Example Input Format:

| node2vec_src_embedding       | node2vec_tgt_embedding       | label |
|------------------------------|------------------------------|-------|
| [0.12, 0.33, ..., 0.09]      | [0.05, 0.89, ..., 0.01]      | 1     |
| [0.27, 0.45, ..., 0.13]      | [0.41, 0.18, ..., 0.77]      | 0     |

Node2Vec generates embeddings using graph structure. MLP then takes concatenated embeddings of source/target as input.





2. GraphSAGE Inputs Required:
- Graph structure (positive_edge tensor)

- Node features matrix X (from category, review, title, etc.)

- Link prediction test pairs (edge_label_index inclues both positive and negative links)

- Labels (edge_label)

📥 Input Format:
X: Tensor of shape [num_nodes, num_features], e.g.:

| node\_id | category\_one\_hot | title\_emb            | numerical\_features           |
| -------- | ------------------ | --------------------- | --------------------- |
| 0        | \[0,0,1,...]       | \[0.1, 0.2, ..., 0.4] | \[0.3, 0.5, ..., 0.6] |
| 1        | \[1,0,0,...]       | \[0.2, 0.4, ..., 0.1] | \[0.1, 0.7, ..., 0.8] |

positive_edge tensor (graph structure):
tensor([
  [0, 1, 2],
  [1, 0, 3]
])
- Shape: [2, num_edges]

- Each column represents a directed edge from source to target.

edge_label_index (which nodes are we predicting link):
tensor([
  [10, 17],
  [25, 22]
])
- Shape: [2, num_links]

- These are the pairs of node indices that you're asking the model to predict a link score (probability) for.

edge_label:
tensor([1, 0])
- Shape: [num_links]

- These are the true labels for each link in edge_label_index.

3. LightGCN
- edge_index only (user-item or item-item connections)
- No features or initial embeddings

📥 Input Format:

edge_index = tensor([
  [0, 1, 2],
  [1, 2, 3]
])

4. PinSAGE
- Graph structure (edge_index)

- Node feature matrix X (including embeddings from titles, reviews, categories)

5. GAT (Graph Attention Network)
- X: Node features

- edge_index: Graph structure

➡️ Learns attention-weighted neighbor information.

6. TGN (Temporal Graph Network)

- Time-ordered edge list with timestamps

- Optional node features

📥 Input Format:

| source | target | timestamp  | label |
| ------ | ------ | ---------- | ----- |
| 101    | 303    | 2020-01-01 | 1     |
| 122    | 411    | 2020-01-04 | 0     |


### 1.6 Graph Building + Negative Sampling

#### Summary
1. Build positive edge lists
2. Collect all nodes
3. Random sample negative edges
- there are 402691 items in cleaned data, so a fully connect graph should roughly have ~ 81 billion edges
- however, there only exist ~ 0.8 million positive edge, which implicate sparsity of 0.001% (most nodes are not connected)
- to reflect the true structure of the graph (mostly disconnected), it's often useful to include more negatives (e.g., 5 times more),  so the model learns to correctly say “no link” in most cases
- generate 1:N negative samples (N = 5 or 10) could give us more flexibility for real-world usecase like Ranking Evaluation (Top-K)
4. create training labels
5. create edge_index tensor for future modeling with PyTorch Geometric
- shape [2, num_edges]
- each column in this matrix represents an edge: edge_index[0, i] = source_node, edge_index[1, i] = target_node.

In [10]:
df = pd.read_csv(GOOGLE_DRIVE_PATH + "/data/items_cleaned.csv")
df.head()

Unnamed: 0,item_Id,ASIN,title,group,similar,category_count,salesrank_log,reviews_total_log,reviews_downloaded_log,category_path_1,category_path_2,category_path_3,category_path_4,reviews_avg_ratings,reviews_avg_votes,reviews_avg_helpful
0,1,827229534,Patterns of Preaching: A Sermon Sampler,Book,"0804215715,156101074X,0687023955,0687074231,08...",2,12.891,1.099,1.099,Books[283155],Subjects[1000],Religion & Spirituality[22],Christianity[12290],5.0,8.0,7.0
1,2,738700797,Candlemas: Feast of Flames,Book,"0738700827,1567184960,1567182836,0738700525,07...",2,12.035,2.565,2.565,Books[283155],Subjects[1000],Religion & Spirituality[22],Earth-Based Religions[12472],4.333,7.0,6.333
2,3,486287785,World War II Allied Fighter Planes Trading Cards,Book,,1,14.055,0.693,0.693,Books[283155],Subjects[1000],Home & Garden[48],Crafts & Hobbies[5126],5.0,2.0,2.0
3,4,842328327,Life Application Bible Commentary: 1 and 2 Tim...,Book,0842328610,5,13.356,0.693,0.693,Books[283155],Subjects[1000],Religion & Spirituality[22],Christianity[12290],4.0,1.0,1.0
4,6,486220125,How the Other Half Lives: Studies Among the Te...,Book,0486401960045228361204862290760714840343,5,12.148,2.89,2.89,Books[283155],Subjects[1000],Arts & Photography[1],Photography[2020],4.235,12.118,8.941


In [11]:
df.shape

(402691, 16)

In [None]:
# step 1: build positive edge lists

# model (Node2Vec/GraphSAGE) needs:
# Node indices from 0 to N-1, contiguous and dense, for edge_index, X, etc.
# build asin number to dense node index mapping
df['node_idx'] = df.index
asin_idx = dict(zip(df['ASIN'], df['node_idx']))
idx_asin = dict(zip(df['node_idx'], df['ASIN']))

positive_edges = set()
for _, row in df.iterrows():
    source = row['node_idx']
    # filter out missings in similar column
    if not isinstance(row['similar'], str):
        continue
    similar_asin = row['similar'].split(',')
    for asin in similar_asin:
      asin = asin.strip()
      target = asin_idx.get(asin)
      if target is not None:
        # create undirected edge
        edge = tuple(sorted((source, target)))
        positive_edges.add(edge)

positive_edges = list(positive_edges)

In [None]:
len(positive_edges)

808185

In [None]:
# save graph structure
import pickle
with open(GOOGLE_DRIVE_PATH + "/data/positive_edges.pkl", "wb") as f:
    pickle.dump(positive_edges, f)

print("Positive edges saved.")

Positive edges saved.


In [12]:
# updated step1 - used just for pinsage modeling
# pinsage model does not need separate negative sampling process
# will be taken care of in data_utis_inductive.py
import pickle
# Assign a contiguous node index (starts from 0)
df = df.reset_index(drop=True)
df["node_idx"] = df.index  # 0, 1, 2, ...

# Build ASIN → node index lookup
asin_to_idx = dict(zip(df["ASIN"], df["node_idx"]))

# Build positive edge set
positive_edges = set()

for _, row in df.iterrows():
    source = row["node_idx"]
    if not isinstance(row["similar"], str):
        continue
    similar_asins = row["similar"].split(",")
    for asin in similar_asins:
        asin = asin.strip()
        target = asin_to_idx.get(asin)
        if target is not None:
            edge = tuple(sorted((source, target)))  # undirected
            positive_edges.add(edge)

# Convert to list and save
positive_edges = list(positive_edges)

with open(GOOGLE_DRIVE_PATH + "/data/positive_edges.pkl", "wb") as f:
    pickle.dump(positive_edges, f)

print(f"Positive edges saved: {len(positive_edges)} edges.")


Positive edges saved: 808185 edges.


In [22]:
edge_df = pd.DataFrame(positive_edges, columns=['source', 'target'])
edge_df['label'] = 1  # Add label for positive edges

# Save to CSV
edge_df.to_csv(GOOGLE_DRIVE_PATH + "/data/edges.csv", index=False)

print("Saved to:", GOOGLE_DRIVE_PATH + "/data/edges.csv")

Saved to: /content/drive/MyDrive/Colab Notebooks/Group-Project/data/edges.csv


In [23]:
edge_df.head()

Unnamed: 0,source,target,label
0,52615,73940,1
1,112227,401139,1
2,221379,311246,1
3,316213,362655,1
4,253839,359125,1


In [None]:
# step2: collect all nodes from cleaned dataset
all_nodes = set(df['node_idx'].unique())

In [None]:
# step3: negative sampling with ratio n
import random
def generate_negative_edges(positive_edges, all_nodes, neg_ratio = 1):
    negative_edges = set()
    positive_edges = set(positive_edges)
    all_nodes = list(all_nodes)
    num_samples = len(positive_edges) * neg_ratio
    while len(negative_edges) < num_samples:
      source = random.choice(all_nodes)
      target = random.choice(all_nodes)
      if source != target:
        edge = tuple(sorted((source, target)))
        if edge not in positive_edges and edge not in negative_edges:
          negative_edges.add(edge)
    return list(negative_edges)


In [None]:
# step4: create training labels
negative_edges = generate_negative_edges(positive_edges, all_nodes, neg_ratio = 10)
positive_df = pd.DataFrame(positive_edges, columns=['source', 'target'])
positive_df['label'] = 1
negative_df = pd.DataFrame(negative_edges, columns=['source', 'target'])
negative_df['label'] = 0
edge_df = pd.concat([positive_df, negative_df], ignore_index=True)
edge_df = edge_df.sample(frac=1).reset_index(drop=True)


In [None]:
edge_df.head()

Unnamed: 0,source,target,label
0,133666,338395,1
1,185646,283312,0
2,46593,137703,0
3,46619,103582,0
4,128230,158838,1


In [None]:
edge_df.shape

(8890035, 3)

In [None]:
edge_df.to_csv(GOOGLE_DRIVE_PATH + "/data/pos_neg_edges_labeled.csv", index=False)

### 1.7 Node Embedding

In [None]:
import torch
import torch.nn.functional as F
import random
import copy
import numpy as np
from torch_geometric.nn import Node2Vec
from sklearn.model_selection import train_test_split

# Create graph tensor from all positive_edges
# edge_index = torch.tensor(positive_edges, dtype=torch.long).t().contiguous()

# Prepare positive and negative edges
all_edges = list(zip(edge_df['source'], edge_df['target']))
all_labels = edge_df['label'].tolist()

# Split into train and validation
# stratify to make sure positive and negative labels are balanced in train and valication sets
train_edges, val_edges, train_labels, val_labels = train_test_split(
    all_edges, all_labels, test_size=0.2, random_state=42, stratify=all_labels
)

# Create graph tensor from positive_edges from train set
train_pos_edges = [(u, v) for (u, v), label in zip(train_edges, train_labels) if label == 1]
edge_index = torch.tensor(train_pos_edges, dtype=torch.long).t().contiguous()

'''
Initializes a Node2Vec model with given hyperparameters.

Parameters:
    edge_index (Tensor): Graph connectivity with shape [2, num_edges].
    num_nodes (int): Number of nodes in the graph.
    embedding_dim (int): Dimension of the node embeddings.
    walk_length (int): Length of each random walk.
    context_size (int): Size of the context window used in Skip-Gram.
    walks_per_node (int): Number of walks per node.

Returns:
    Node2Vec: Initialized Node2Vec model.
'''
def build_node2vec(edge_index, num_nodes, embedding_dim, walk_length, context_size, walks_per_node):
    return Node2Vec(
        edge_index=edge_index,
        embedding_dim=embedding_dim,
        walk_length=walk_length,
        context_size=context_size,
        walks_per_node=walks_per_node,
        num_nodes=num_nodes,
        sparse=True
    )


'''
Trains the Node2Vec model for one epoch.

Parameters:
    model (Node2Vec): The Node2Vec model.
    optimizer (torch.optim.Optimizer): Optimizer to use for training.
    loader (DataLoader): Loader that generates random walks for training.
    device (str): Device to train on ('cpu' or 'cuda').

Returns:
    float: Average loss over the epoch.
'''
def train_node2vec(model, optimizer, loader, device):
    model.train()
    total_loss = 0
    # loop over batches
    for pos_rw, neg_rw in loader:
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

'''
Evaluates the quality of node embeddings for link prediction using ROC AUC.

Parameters:
    model (Node2Vec): The trained Node2Vec model.
    edge_list (List[Tuple[int, int]]): List of node index pairs to evaluate.
    labels (List[int]): Ground truth labels for edges (1 for positive, 0 for negative).
    device (str): Device to run evaluation on ('cpu' or 'cuda').

Returns:
    float: ROC AUC score measuring link prediction performance.
'''
def evaluate_link_prediction(model, edge_list, labels, device):
    model.eval()
    embeddings = model().detach().cpu()
    scores = []
    for (src, tgt) in edge_list:
        score = (embeddings[src] * embeddings[tgt]).sum().item()
        scores.append(score)
    preds = torch.sigmoid(torch.tensor(scores)).numpy()
    labels = np.array(labels)
    auc = roc_auc_score(labels, preds)
    return auc

from sklearn.metrics import roc_auc_score

# Define search space
search_space = []
while len(search_space) < 5:
    walk_length = random.choice([15, 20, 25, 30, 35, 40])
    context_size = random.choice([5, 10, 15, 20])

    if walk_length < 2 * context_size + 1:
        continue  # skip invalid config

    config = {
        'embedding_dim': random.choice([128, 256, 512]),
        'walk_length': walk_length,
        'context_size': context_size,
        'walks_per_node': random.choice([5, 10, 15]),
        'learning_rate': random.choice([1e-3, 1e-2]),
        'batch_size': random.choice([64, 128, 256]),
        'epochs': random.choice([5, 10, 15, 20])
    }

    search_space.append(config)

# best auc across all n random search configs
best_auc = 0
best_config = None
best_state = None

device = 'cuda' if torch.cuda.is_available() else 'cpu'

for i, config in enumerate(search_space):
    print(f"\n=== Trying config {i+1}: {config} ===")
    # initialize model
    model = build_node2vec(
        edge_index=edge_index,
        # total nodes in the whole graph
        num_nodes=df.shape[0],
        embedding_dim=config['embedding_dim'],
        walk_length=config['walk_length'],
        context_size=config['context_size'],
        walks_per_node=config['walks_per_node']
    ).to(device)

    loader = model.loader(batch_size=config['batch_size'], shuffle=True)
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=config['learning_rate'])

    # best auc under the current config
    best_val_auc = 0
    early_stop_counter = 0

    for epoch in range(config['epochs']):
        loss = train_node2vec(model, optimizer, loader, device)
        val_auc = evaluate_link_prediction(model, val_edges, val_labels, device)
        print(f"Epoch {epoch+1}, Loss: {loss:.4f}, Val AUC: {val_auc:.4f}")

        if val_auc > best_val_auc:
            best_val_auc = val_auc
            early_stop_counter = 0
            if val_auc > best_auc:
                best_auc = val_auc
                best_config = config
                best_state = copy.deepcopy(model.state_dict())
        else:
            early_stop_counter += 1

        if early_stop_counter >= 5:
            print("Early stopping triggered.")
            break


=== Trying config 1: {'embedding_dim': 128, 'walk_length': 35, 'context_size': 5, 'walks_per_node': 15, 'learning_rate': 0.001, 'batch_size': 256, 'epochs': 15} ===
Epoch 1, Loss: 5.1174, Val AUC: 0.5063
Epoch 2, Loss: 4.2303, Val AUC: 0.5115
Epoch 3, Loss: 3.5284, Val AUC: 0.5161
Epoch 4, Loss: 2.9668, Val AUC: 0.5205
Epoch 5, Loss: 2.5206, Val AUC: 0.5248
Epoch 6, Loss: 2.1638, Val AUC: 0.5294
Epoch 7, Loss: 1.8722, Val AUC: 0.5342
Epoch 8, Loss: 1.6296, Val AUC: 0.5394
Epoch 9, Loss: 1.4270, Val AUC: 0.5452
Epoch 10, Loss: 1.2594, Val AUC: 0.5517
Epoch 11, Loss: 1.1236, Val AUC: 0.5590
Epoch 12, Loss: 1.0161, Val AUC: 0.5673
Epoch 13, Loss: 0.9334, Val AUC: 0.5766
Epoch 14, Loss: 0.8714, Val AUC: 0.5871
Epoch 15, Loss: 0.8262, Val AUC: 0.5991

=== Trying config 2: {'embedding_dim': 512, 'walk_length': 35, 'context_size': 15, 'walks_per_node': 15, 'learning_rate': 0.01, 'batch_size': 64, 'epochs': 10} ===
Epoch 1, Loss: 8.0440, Val AUC: 0.6305
Epoch 2, Loss: 2.5866, Val AUC: 0.6805


In [None]:
# save best model into model folder
os.makedirs(GOOGLE_DRIVE_PATH + "/model", exist_ok=True)
save_path = os.path.join(GOOGLE_DRIVE_PATH, 'model/best_node2vec_model.pth')
torch.save(best_state, save_path)
print(f"\n Best model saved to: {save_path}")
print("Best model parameters:")
for key, value in best_config.items():
    print(f"  {key}: {value}")


 Best model saved to: /content/drive/MyDrive/Colab Notebooks/Group-Project/model/best_node2vec_model.pth
Best model parameters:
  embedding_dim: 512
  walk_length: 35
  context_size: 15
  walks_per_node: 15
  learning_rate: 0.01
  batch_size: 64
  epochs: 10


In [None]:
# get node embedding
from torch_geometric.nn import Node2Vec

# Rebuild the model architecture (same config as best one)
best_model = Node2Vec(
    edge_index=edge_index,
    embedding_dim=best_config['embedding_dim'],
    walk_length=best_config['walk_length'],
    context_size=best_config['context_size'],
    walks_per_node=best_config['walks_per_node'],
    num_nodes=df.shape[0],
    sparse=True
).to(device)

# Load trained weights
best_model.load_state_dict(torch.load(os.path.join(GOOGLE_DRIVE_PATH, "model/best_node2vec_model.pth")))
best_model.eval()

with torch.no_grad():
    node_embeddings = best_model().cpu()  # shape: [num_nodes, embedding_dim]

In [None]:
node_embeddings.shape

torch.Size([402691, 512])

### 1.8 Feature Embedding

In [24]:
df = pd.read_csv(GOOGLE_DRIVE_PATH + "/data/items_cleaned.csv")
df.head()

Unnamed: 0,item_Id,ASIN,title,group,similar,category_count,salesrank_log,reviews_total_log,reviews_downloaded_log,category_path_1,category_path_2,category_path_3,category_path_4,reviews_avg_ratings,reviews_avg_votes,reviews_avg_helpful
0,1,827229534,Patterns of Preaching: A Sermon Sampler,Book,"0804215715,156101074X,0687023955,0687074231,08...",2,12.891,1.099,1.099,Books[283155],Subjects[1000],Religion & Spirituality[22],Christianity[12290],5.0,8.0,7.0
1,2,738700797,Candlemas: Feast of Flames,Book,"0738700827,1567184960,1567182836,0738700525,07...",2,12.035,2.565,2.565,Books[283155],Subjects[1000],Religion & Spirituality[22],Earth-Based Religions[12472],4.333,7.0,6.333
2,3,486287785,World War II Allied Fighter Planes Trading Cards,Book,,1,14.055,0.693,0.693,Books[283155],Subjects[1000],Home & Garden[48],Crafts & Hobbies[5126],5.0,2.0,2.0
3,4,842328327,Life Application Bible Commentary: 1 and 2 Tim...,Book,0842328610,5,13.356,0.693,0.693,Books[283155],Subjects[1000],Religion & Spirituality[22],Christianity[12290],4.0,1.0,1.0
4,6,486220125,How the Other Half Lives: Studies Among the Te...,Book,0486401960045228361204862290760714840343,5,12.148,2.89,2.89,Books[283155],Subjects[1000],Arts & Photography[1],Photography[2020],4.235,12.118,8.941


In [27]:
df.shape

(402691, 17)

In [26]:
# Assign a contiguous node index (starts from 0)
df = df.reset_index(drop=True)
df["node_idx"] = df.index  # 0, 1, 2, ...
df.head()

Unnamed: 0,item_Id,ASIN,title,group,similar,category_count,salesrank_log,reviews_total_log,reviews_downloaded_log,category_path_1,category_path_2,category_path_3,category_path_4,reviews_avg_ratings,reviews_avg_votes,reviews_avg_helpful,node_idx
0,1,827229534,Patterns of Preaching: A Sermon Sampler,Book,"0804215715,156101074X,0687023955,0687074231,08...",2,12.891,1.099,1.099,Books[283155],Subjects[1000],Religion & Spirituality[22],Christianity[12290],5.0,8.0,7.0,0
1,2,738700797,Candlemas: Feast of Flames,Book,"0738700827,1567184960,1567182836,0738700525,07...",2,12.035,2.565,2.565,Books[283155],Subjects[1000],Religion & Spirituality[22],Earth-Based Religions[12472],4.333,7.0,6.333,1
2,3,486287785,World War II Allied Fighter Planes Trading Cards,Book,,1,14.055,0.693,0.693,Books[283155],Subjects[1000],Home & Garden[48],Crafts & Hobbies[5126],5.0,2.0,2.0,2
3,4,842328327,Life Application Bible Commentary: 1 and 2 Tim...,Book,0842328610,5,13.356,0.693,0.693,Books[283155],Subjects[1000],Religion & Spirituality[22],Christianity[12290],4.0,1.0,1.0,3
4,6,486220125,How the Other Half Lives: Studies Among the Te...,Book,0486401960045228361204862290760714840343,5,12.148,2.89,2.89,Books[283155],Subjects[1000],Arts & Photography[1],Photography[2020],4.235,12.118,8.941,4


In [30]:
check_missing(df)

Unnamed: 0,col_name,num_missing,percent_missing
0,item_Id,0,0.0
1,ASIN,0,0.0
2,title,0,0.0
3,group,0,0.0
4,similar,96036,0.2385
5,category_count,0,0.0
6,salesrank_log,0,0.0
7,reviews_total_log,0,0.0
8,reviews_downloaded_log,0,0.0
9,category_path_1,0,0.0


#### 1.8.1 text encoding for title and category_path variables

In [31]:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm


model = SentenceTransformer('all-MiniLM-L6-v2')
text_fields = ['title', 'category_path_1', 'category_path_2', 'category_path_3', 'category_path_4']

embeddings = []
for col in text_fields:
    print(f"Encoding {col}...")
    col_embeddings = model.encode(df[col].tolist(), show_progress_bar=True, normalize_embeddings=True)
    embeddings.append(col_embeddings)

text_embeddings = np.concatenate(embeddings, axis=1)  # shape: (num_rows, 5 * embed_dim)
print(f"Final text embedding shape: {text_embeddings.shape}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Encoding title...


Batches:   0%|          | 0/12585 [00:00<?, ?it/s]

Encoding category_path_1...


Batches:   0%|          | 0/12585 [00:00<?, ?it/s]

Encoding category_path_2...


Batches:   0%|          | 0/12585 [00:00<?, ?it/s]

Encoding category_path_3...


Batches:   0%|          | 0/12585 [00:00<?, ?it/s]

Encoding category_path_4...


Batches:   0%|          | 0/12585 [00:00<?, ?it/s]

Final text embedding shape: (402691, 1920)


#### 1.8.2 one hot encoding on group

In [32]:
from sklearn.preprocessing import OneHotEncoder

onehot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
group_onehot = onehot.fit_transform(df[['group']])
print("One-hot encoding shape:", group_onehot.shape)

One-hot encoding shape: (402691, 10)


In [33]:
group_onehot

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

#### 1.8.3 combine feature embeddings

In [34]:
from sklearn.preprocessing import StandardScaler

numeric_cols = [
    'category_count', 'salesrank_log', 'reviews_total_log',
    'reviews_downloaded_log', 'reviews_avg_ratings',
    'reviews_avg_votes', 'reviews_avg_helpful'
]

# normalize
# df[numeric_cols] = df[numeric_cols].fillna(0)  # or use df[numeric_cols].mean()
scaler = StandardScaler()
numerical_features = scaler.fit_transform(df[numeric_cols])
print(f"Numerical features shape: {numerical_features.shape}")

X_features = np.hstack([
    text_embeddings,        # (num_items, 5×embedding_dim)
    group_onehot,           # (num_items, num_unique_groups)
    numerical_features      # (num_items, 7)
])
print(f"Final feature shape: {X_features.shape}")

Numerical features shape: (402691, 7)
Final feature shape: (402691, 1937)


In [36]:
# save to file
np.save(GOOGLE_DRIVE_PATH + "/data/features.npy", X_features)