In [1]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.nn import Sequential as Seq, Linear, ReLU
from torch.nn.functional import binary_cross_entropy

import torch_geometric
from torch_geometric.data import HeteroData
from torch_geometric.transforms import RandomLinkSplit, ToUndirected
from torch_geometric.nn import HeteroConv, SAGEConv, NNConv

from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertModel

data = pd.read_csv('final_fantasy.csv')
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Change cpu to mps for Mac
# device = torch.device("cpu")  # Move computation to CPU

In [None]:
# Encoding data
user_encoder, game_encoder, genre_encoder, category_encoder = LabelEncoder(), LabelEncoder(), LabelEncoder(), LabelEncoder()
full_data = data.shape[0]
percent_data = .25
num_entries = int(full_data * percent_data) # Number of entries to use

smaller_df = data[:num_entries]
smaller_df = smaller_df[['user_steam_id', 'game_appid', 'genres_en', 'categories_en', 'user_playtime_forever']]
smaller_df['user_id'] = user_encoder.fit_transform(smaller_df['user_steam_id'])
smaller_df['game_id'] = game_encoder.fit_transform(smaller_df['game_appid'])

genre_map = dict()
for genres in data['genres_en']:
    if genres == 'Nan':
        continue
    for genre in eval(genres):
        if genre not in genre_map:
            genre_map[genre] = len(genre_map)

def genre_mapper(row):
    if row == 'Nan':
        return []
    row = eval(row)
    formatted_row = [genre_map[item] for item in row]
    return formatted_row

smaller_df['genre_ids'] = smaller_df['genres_en'].apply(genre_mapper).copy()

category_map = dict()
for categories in data['categories_en']:
    if categories == 'Nan':
        continue
    categories = categories.replace('s\' ', 's') # Screw you 'Players' Battles'
    for category in eval(categories):
        if category not in category_map:
            category_map[category] = len(category_map)

def category_mapper(row):
    if row == 'Nan':
        return []
    row = eval(row)
    formatted_row = [category_map[item] for item in row]
    return formatted_row

smaller_df['categories_en'] = smaller_df['categories_en'].apply(lambda x: x.replace('s\' ', 's'))
smaller_df['category_ids'] = smaller_df['categories_en'].apply(category_mapper).copy()
smaller_df.head()

Unnamed: 0,user_steam_id,game_appid,genres_en,categories_en,user_playtime_forever,user_id,game_id,genre_ids,category_ids
0,76561198060785055,240,['Action'],"['Multi-Player', 'Cross-Platform Multiplayer',...",7279,2553,10,[0],"[0, 1, 2, 3, 4, 5, 6, 7]"
1,76561198060785055,12210,"['Action', 'Adventure']","['Single-Player', 'Multi-Player', 'Partial Con...",0,2553,333,"[0, 1]","[8, 0, 9, 10, 11]"
2,76561198060785055,620,"['Action', 'Adventure']","['Single Person', 'Multiple People', 'Cooperat...",717,2553,18,"[0, 1]","[12, 13, 14, 15, 16, 17, 2, 18, 19, 20, 21, 21..."
3,76561198060785055,105600,"['Action', 'Adventure', 'Indie', 'Rpg']","['Single-Player', 'Multi-Player', 'Pvp', 'Onli...",4578,2553,1248,"[0, 1, 2, 3]","[8, 0, 29, 30, 31, 32, 2, 33, 34, 3, 10, 11, 3..."
4,76561198060785055,46520,"['Action', 'Indie']","['Single-Player', 'Steam Achievements', 'Parti...",0,2553,956,"[0, 2]","[8, 2, 9, 36, 7]"


In [3]:
# Setting up HeteroData

h_data = HeteroData() # The h is for hentai
h_data['user'].x = torch.arange(len(user_encoder.classes_)).to(device)
h_data['game'].x = torch.arange(len(game_encoder.classes_)).to(device)
h_data['genre'].x = torch.arange(len(genre_map)).to(device)
h_data['category'].x = torch.arange(len(category_map)).to(device)
h_data = h_data.to(device)

  h_data['user'].x = torch.arange(len(user_encoder.classes_)).to(device)


In [None]:
# Embedding Genres
from tqdm import tqdm
from torch.cuda.amp import autocast

def embed_texts(texts, batch_size=32, save_every_n_batches=100, checkpoint_path=None, batches_completed=0):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased').to(device)
    model.eval()
    all_embeddings = []
    name = texts.name
    texts = texts.tolist()

    if checkpoint_path and os.path.exists(checkpoint_path):
        all_embeddings = torch.cat(torch.load(checkpoint_path), dim=0).to(device)
        print(f"Loaded checkpoint from {checkpoint_path} with {len(all_embeddings)} batches.")

    with torch.no_grad():
        progress_bar = tqdm(total=len(texts) // batch_size, desc=f"Embedding {name}", unit="batch")
        
        for i in range(batches_completed * batch_size, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=512)
            inputs = {k: v.to(device) for k, v in inputs.items()}  # Move to GPU

            with autocast():
                outputs = model(**inputs)
            cls_embeddings = outputs.last_hidden_state[:, 0, :].to(device)
            all_embeddings.append(cls_embeddings)

            torch.cuda.empty_cache()  # Free up GPU memory
            progress_bar.update(1)

            if (i // batch_size) % save_every_n_batches == 0:
                torch.save(all_embeddings, f"{name}_embeddings.pth")
                tqdm.write(f"💾 Checkpoint saved after {i // batch_size} batches.")

        progress_bar.close()

    torch.save(all_embeddings, f"{name}_embeddings.pth")
    print("🎉 Embeddings saved successfully!")

    return torch.cat(all_embeddings, dim=0)

# genre_embeddings = embed_genre_texts(smaller_df['genres_en'].tolist(), batch_size = 4)  # [num_rows, 768]
# projector = torch.nn.Linear(768, 64).to(device)
# genre_embeddings_projected = projector(genre_embeddings)  # [num_rows, 64]

In [None]:
# Save/Load genre_embeddings
# torch.save(genre_embeddings_projected, "genre_en_embeddings.pth")
genre_embeddings_projected = torch.cat(torch.load("genres_en_embeddings.pth"), dim=0).to(device)

In [None]:
# Run category embeddings and save/load them
# category_embeddings = embed_texts(smaller_df['categories_en'], 100, checkpoint_path="categories_en_embeddings.pth", batches_completed=39901)
# projector = torch.nn.Linear(768, 64).to(device)
# category_embeddings_projected = projector(category_embeddings)
# torch.save(category_embeddings_projected, "categories_en_embeddings.pth")

In [7]:
# Normalizing playtime because some loser has 81k hours in a game
playtime = torch.tensor(smaller_df['user_playtime_forever'].values, dtype=torch.float).to(device)
playtime_norm = (playtime - playtime.min()) / (playtime.max() - playtime.min() + 1e-8)

In [None]:
# Set up indexes and attributes for h_data
user_game_edges = torch.tensor(np.array([
    smaller_df['user_id'].values,
    smaller_df['game_id'].values
]), dtype=torch.long)
h_data['user', 'plays', 'game'].edge_index = user_game_edges 
h_data['user', 'plays', 'game'].edge_attr = playtime_norm

# game → genre
game_genre_pairs = [(row['game_id'], genre_id) for _, row in smaller_df.iterrows()
    for genre_id in row['genre_ids']
]
game_genre_edges = torch.tensor(game_genre_pairs, dtype=torch.long).t().to(device)
h_data['game', 'has_genre', 'genre'].edge_index = game_genre_edges

# edge_index[1] gives genre_id for each edge
genre_ids_per_edge = game_genre_edges[1]  # shape: [2811]

# Pick the embedding for each genre node involved in each edge
h_data['game', 'has_genre', 'genre'].edge_attr = genre_embeddings_projected[genre_ids_per_edge]

# game → category
# game_cat_pairs = [
#     (row['game_id'], cat_id)
#     for _, row in smaller_df.iterrows()
#     for cat_id in row['category_ids']
# ]
# game_cat_edges = torch.tensor(game_cat_pairs, dtype=torch.long).t().to(device)
# h_data['game', 'has_category', 'category'].edge_index = game_cat_edges
# h_data['game', 'has_category', 'category']

# Do the same thing for categories
game_category_pairs = [(row['category_ids'], category_id) for _, row in smaller_df.itterown() 
    for category_id in row['category_ids']
]
game_category_edges = torch.tensor(game_category_pairs, dtype=torch.long).t().to(device)
h_data['game', 'has_category', 'category'].edge_index = game_category_edges
category_ids_per_page = game_category_edges[1]
h_data['game', 'has_category', 'category'].edge_attr = category_embeddings_projected[category_ids_per_page]

  user_game_edges = torch.tensor([


{'edge_index': tensor([[   10,    10,    10,  ..., 14727, 14727, 14727],
        [    0,     1,     2,  ...,    10,    11,     7]], device='cuda:0')}

In [9]:
# Adding Undirection to h_data
h_data = ToUndirected()(h_data)
h_data[('genre', 'rev_has_genre', 'game')].edge_attr = genre_embeddings_projected[genre_ids_per_edge]
h_data['game', 'rev_plays', 'user']

{'edge_index': tensor([[   10,   332,    18,  ...,  9620,  9973, 14727],
        [  618,   618,   618,  ...,   553,   553,   553]]), 'edge_attr': tensor([0.0059, 0.0000, 0.0006,  ..., 0.0004, 0.0013, 0.0005], device='cuda:0')}

In [10]:
# new architecture, different convolution approach singe SAGEConv does not support edge attributes
from torch_geometric.nn import HeteroConv, NNConv
import torch.nn.functional as F
from torch import nn

class GNN(nn.Module):
    def __init__(self, num_users, num_games, num_genres, num_categories, embedding_dimensions=64, edge_dims=None):
        super().__init__()
        self.user_embed = nn.Embedding(num_users, embedding_dimensions).to(device)
        self.game_embed = nn.Embedding(num_games, embedding_dimensions).to(device)
        self.genre_embed = nn.Embedding(num_genres, embedding_dimensions).to(device)
        self.category_embed = nn.Embedding(num_categories, embedding_dimensions).to(device)

        def edge_nn(in_dim):
            return nn.Sequential(
                nn.Linear(in_dim, embedding_dimensions),
                nn.ReLU(),
                nn.Linear(embedding_dimensions, embedding_dimensions * embedding_dimensions)
            ).to(device)

        self.convs = nn.ModuleList([
            HeteroConv({
                edge_type: NNConv(
                    in_channels=(embedding_dimensions, embedding_dimensions),
                    out_channels=embedding_dimensions,
                    nn=edge_nn(edge_dims[edge_type])
                ).to(device)
                for edge_type in edge_dims
            }, aggr='sum')
        ])

    def forward(self, x_dict, edge_index_dict, edge_attr_dict):
        x_dict = {
            'user': self.user_embed(x_dict['user'].to(device)),
            'game': self.game_embed(x_dict['game'].to(device)),
            'genre': self.genre_embed(x_dict['genre'].to(device)),
            'category': self.category_embed(x_dict['category'].to(device)),
        }
        x_dict = self.convs[0](x_dict, edge_index_dict, edge_attr_dict)
        return x_dict

    
#same LP as before
class LinkPredictor(torch.nn.Module):
    def __init__(self, embedding_dimensions):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(2 * embedding_dimensions, embedding_dimensions),
            nn.ReLU(),
            nn.Linear(embedding_dimensions, 1)
        ).to(device)
    def forward(self, i, j):
        X = torch.cat([i,j], dim=-1)
        return torch.sigmoid(self.net(X)).squeeze()

In [11]:
# Create attribute dictionary
edge_attr_dict = {
    ('user', 'plays', 'game'): h_data['user', 'plays', 'game'].edge_attr.view(-1, 1),
    ('game', 'has_genre', 'genre'): h_data['game', 'has_genre', 'genre'].edge_attr,
    ('game', 'has_category', 'category'): torch.ones(h_data['game', 'has_category', 'category'].edge_index.size(1), 1),
    ('game', 'rev_plays', 'user'): h_data['user', 'plays', 'game'].edge_attr.view(-1, 1),
    ('genre', 'rev_has_genre', 'game'): h_data['genre', 'rev_has_genre', 'game'].edge_attr,
    ('category', 'rev_has_category', 'game'): torch.ones(h_data['category', 'rev_has_category', 'game'].edge_index.size(1), 1),
}
edge_attr_dict = {k: v.to(device) for k, v in edge_attr_dict.items()} # Put everything on gpu 
edge_dims = {k: v.shape[1] for k, v in edge_attr_dict.items()}

In [12]:
# Sanity check for dimensions. Every 3rd thing matches
for k, v in edge_attr_dict.items():
    print(f"{k}: {v.shape}")

('user', 'plays', 'game'): torch.Size([499103, 1])
('game', 'has_genre', 'genre'): torch.Size([1251103, 64])
('game', 'has_category', 'category'): torch.Size([3864253, 1])
('game', 'rev_plays', 'user'): torch.Size([499103, 1])
('genre', 'rev_has_genre', 'game'): torch.Size([1251103, 64])
('category', 'rev_has_category', 'game'): torch.Size([3864253, 1])


In [13]:
# Instantiate model
model = GNN(
    num_users=h_data['user'].num_nodes,
    num_games=h_data['game'].num_nodes,
    num_genres=h_data['genre'].num_nodes,
    num_categories=h_data['category'].num_nodes,
    edge_dims = edge_dims,
    embedding_dimensions=64
).to(device)

In [14]:
# Generate predictor with batches
predictor = LinkPredictor(64).to(device)

optimizer = torch.optim.Adam(list(model.parameters()) + list(predictor.parameters()), lr=0.001)
h_data = h_data.to(device)

# === Create Positive and Negative Edge Pairs ===
positive_edges = h_data['user', 'plays', 'game'].edge_index.t().tolist()

def sample_batch(batch_size=256):
    pos_samples = random.sample(positive_edges, batch_size)
    user_nodes = h_data['user'].x.tolist()
    game_nodes = h_data['game'].x.tolist()
    neg_samples = [[random.choice(user_nodes), random.choice(game_nodes)]
                   for _ in range(batch_size)
                   if [random.choice(user_nodes), random.choice(game_nodes)] not in positive_edges]
    return pos_samples, neg_samples

In [15]:
# Train the model
num_epochs = 50
for epoch in tqdm(range(num_epochs), desc="Training"):
    model.train()
    predictor.train()

    pos_edges, neg_edges = sample_batch()
    edges = torch.tensor(pos_edges + neg_edges, dtype=torch.long).to(device)
    labels = torch.cat([
        torch.ones(len(pos_edges)),
        torch.zeros(len(neg_edges))
    ]).to(device)

    x_dict = model(h_data.x_dict, h_data.edge_index_dict, edge_attr_dict)
    users, games = edges[:, 0], edges[:, 1]
    user_emb, game_emb = x_dict['user'][users].to(device), x_dict['game'][games].to(device)
    preds = predictor(user_emb, game_emb)

    loss = F.binary_cross_entropy(preds, labels)
    acc = (preds >= 0.5).float().eq(labels).float().mean()

    optimizer.zero_grad()
    loss.backward(retain_graph=True)
    optimizer.step()

    print(f"Epoch {epoch+1:02d} | Loss: {loss.item():.4f} | Accuracy: {acc.item():.4f}")

Training:   0%|          | 0/50 [00:16<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 19.09 GiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Of the allocated memory 9.76 GiB is allocated by PyTorch, and 23.78 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)