In [1]:
import torch
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers import BertTokenizer, BertModel
import torch_geometric
from torch_geometric.nn import SAGEConv
from torch.nn import Sequential as Seq, Linear, ReLU
import torch.nn as nn
import torch.nn.functional as F
import requests

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
my_var = os.environ["STEAM_API_KEY"]  # Raises a KeyError if the variable isn't set
full_size = 1996413
sample_size = int(1 * full_size)

In [3]:
# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pre-trained BERT-base
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Move the model to GPU
bert_model.to(device)
bert_model.eval()  # Evaluation mode (no dropout)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [4]:
data = pd.read_csv('final_fantasy.csv')

In [5]:
data.shape

(1996413, 21)

In [6]:
def create_game_mapper(df):
    # Convert the two columns into a dictionary
    game_mapper = dict(zip(df['game_appid'], df['game_name']))
    return game_mapper

# Example usage
# Assuming you have a DataFrame named 'df' with columns 'game_appid' and 'game_name'
game_mapper = create_game_mapper(data)
game_mapper[240]

'Counter-Strike: Source'

Feature Embeddings:
- Start off with pure genre if it is good enough
- (3 approaches) Eval pure genre approach with pure short descriptons embeddings then try combining

In [7]:
data.columns

Index(['Unnamed: 0', 'user_steam_id', 'user_playtime_forever',
       'user_playtime_2weeks', 'user_has_leaderboards',
       'user_has_community_visible_stats', 'user_content_descriptorids',
       'game_appid', 'game_name', 'game_developer', 'game_publisher',
       'game_price', 'game_initialprice', 'game_discount', 'game_ccu',
       'game_owners_estimate', 'game_review_ratio', 'genres_en',
       'categories_en', 'short_description_en', 'detailed_description_en'],
      dtype='object')

In [8]:
# data['text_for_embedding'] = ( data['short_description_en'].fillna('') + ' ' +  data['genres_en'].astype(str) )
# data['full_description_embedding'] = ( data['short_description_en'].fillna('') + ' ' +  data['detailed_description_en'].fillna('') )
data['full_description_embedding'] = data.apply(lambda row: f"{row['short_description_en']} {row['detailed_description_en']} {row['genres_en']} {row['categories_en']}" if pd.notna(row['detailed_description_en']) else row['short_description_en'], axis=1)

Short Description + Genre == Token Size

In [9]:
# word_count = 0
# for text in data['text_for_embedding']:
#     word_count+=len(text)
# print(f'Total number of tokens: {word_count}')
data['full_description_embedding'] = data['full_description_embedding'].fillna('')
word_count = 0
for text in data['full_description_embedding']:
    word_count+=len(text)
print(f'Total number of tokens: {word_count}')

Total number of tokens: 4319879969


install transformer of choice (BeRT)

In [10]:
def embed_text(texts, tokenizer, model, batch_size=16, max_length=512):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    embeddings = []

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, max_length=max_length, padding=True)
        inputs = {key: val.to(device) for key, val in inputs.items()}
        
        with torch.no_grad():
            outputs = model(**inputs)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]
        embeddings.append(cls_embeddings)
        torch.cuda.empty_cache()

    return torch.cat(embeddings, dim=0)

I have the embeddings done separately!

In [11]:
# import torch
# torch.cuda.empty_cache()
# game_embeddings = embed_text(data['full_description_embedding'].tolist()[:sample_size], tokenizer, bert_model)
# print(game_embeddings.shape)
# torch.save(game_embeddings, "game_embeddings.pt")

## User/Game Node Mapping

### how to dedupe interactions
check composite keys (user/game), if all 1s then only unique interactions

In [12]:
data.groupby(by=['user_steam_id','game_appid']).count().reset_index()['user_playtime_forever'].value_counts()

user_playtime_forever
1    1996413
Name: count, dtype: int64

### init mappings

In [13]:
# Unique users and games
unique_users = data['user_steam_id'].unique()
unique_games = data['game_appid'].unique()

# Build mappings
user_id_map = {user_id: idx for idx, user_id in enumerate(unique_users)}
game_id_map = {game_id: idx + len(unique_users) for idx, game_id in enumerate(unique_games)}

# How many nodes?
num_users = len(user_id_map)
num_games = len(game_id_map)
num_nodes = num_users + num_games

In [14]:
num_users, num_games, num_nodes

(5842, 31397, 37239)

## Building out user/game nodes
- Approximately takes an hour to run, so we will have preloaded from our sample dataset.

In [15]:
# build edges, load a new user to recommend games (edges user to game)
user_nodes = []
game_nodes = []
for idx, row in tqdm(data.iterrows(), desc="Processing rows"):
    # get indices and append them
    user_idx = user_id_map[row['user_steam_id']]
    game_idx = game_id_map[row['game_appid']]
    # print(user_idx, game_idx)
    user_nodes.append(user_idx)
    game_nodes.append(game_idx)
    # print(f'User Node {user_idx} and Game Node {game_idx} were appended')
    if idx == sample_size:
        break

Processing rows: 1996413it [01:47, 18538.71it/s]


### Array Dimension Validation
- Also upon running initial mapping, storing list in .npy file

In [16]:
edge_index = torch.tensor([user_nodes, game_nodes], dtype=torch.long)

In [17]:
user_features = np.zeros((num_users, 768))  # dummy features for users
# game_features = game_embeddings.numpy()      # real features for games
# game_features = game_embeddings.cpu().numpy()      # For Cuda
game_features = torch.load("game_embeddings.pt").to('cpu') # This line of code takes 1.2 minutes roughly

# Stack vertically: first users, then games
X = np.vstack([user_features, game_features])
X = torch.tensor(X, dtype=torch.float)
print(X.shape)  # (num_nodes, 768)

torch.Size([2002255, 768])


## attribute selection for edges

if we want more features, add here

In [18]:
# edge_attrs = []

# for idx, row in tqdm(data.iterrows(), desc="Processing rows"):
#     playtime = row['user_playtime_forever']
#     edge_attrs.append([playtime])  # wrapping in list for 1-dim feature

# edge_attr = torch.tensor(edge_attrs, dtype=torch.float)
# print(edge_attr.shape)  # (num_edges, 1)
edge_attrs = [] # Roughly 1 minute
for row in tqdm(data.itertuples(index=False), desc="Processing rows"):
    playtime = row.user_playtime_forever
    played_games = row.game_appid
    
    edge_attrs.append([playtime, played_games])  # Wrapping in list for 1D feature

Processing rows: 1996413it [00:27, 73871.72it/s] 


create data for geometric graph

In [19]:
from torch_geometric.data import Data

edge_attr = torch.tensor(edge_attrs, dtype=torch.float)

graph = Data(
    x=X,
    edge_index=edge_index,
    edge_y=edge_attr
)

print(graph)

Data(x=[2002255, 768], edge_index=[2, 1996413], edge_y=[1996413, 2])


test/train/val split

In [20]:
from torch_geometric.transforms import RandomLinkSplit

transform = RandomLinkSplit(
    num_val=0.05,        # 5% for validation
    num_test=0.10,       # 10% for testing
    is_undirected=True,  # True if your graph is undirected (user <-> game is undirected)
    add_negative_train_samples=False  # We will do negative sampling manually during training
)

# Apply the transform
train_data, val_data, test_data = transform(graph)

print(train_data)
print(val_data)
print(test_data)

Data(x=[2002255, 768], edge_index=[2, 3393904], edge_y=[3393904, 2], edge_label=[1696952], edge_label_index=[2, 1696952])
Data(x=[2002255, 768], edge_index=[2, 3393904], edge_y=[3393904, 2], edge_label=[199640], edge_label_index=[2, 199640])
Data(x=[2002255, 768], edge_index=[2, 3593544], edge_y=[3593544, 2], edge_label=[399282], edge_label_index=[2, 399282])


Encoder

In [21]:
from torch_geometric.nn import SAGEConv
import torch.nn as nn
import torch.nn.functional as F

In [22]:
class GNNEncoder(nn.Module):
    def __init__(self, in_channels, hidden_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
    def forward(self, X, edge_index):
        X = self.conv1(X, edge_index)
        X = F.relu(X)
        X = self.conv2(X, edge_index)
        return X

In [23]:
class LinkPredictor(nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin = nn.Linear(hidden_channels*2, 1)
    def forward(self, i, j):
        X = torch.cat([i,j], dim=1)
        return torch.sigmoid(self.lin(X))

In [24]:
import torch.optim as optim

device = torch.device('cuda' if torch.cuda.is_available() else 'mps')

hidden_channels = 128
encoder = GNNEncoder(in_channels=graph.num_node_features, hidden_channels=hidden_channels).to(device)
predictor = LinkPredictor(hidden_channels=hidden_channels).to(device)

optimizer = optim.Adam(list(encoder.parameters()) + list(predictor.parameters()), lr=0.001)

train_data = train_data.to(device)
val_data = val_data.to(device)
test_data = test_data.to(device)

In [25]:
test_data

Data(x=[2002255, 768], edge_index=[2, 3593544], edge_y=[3593544, 2], edge_label=[399282], edge_label_index=[2, 399282])

In [26]:
def train():
    encoder.train()
    predictor.train()
    optimizer.zero_grad()
    M = encoder(train_data.x, train_data.edge_index)
    preds = predictor(M[train_data.edge_label_index[0]], M[train_data.edge_label_index[1]])
    loss = F.binary_cross_entropy(preds.squeeze(), train_data.edge_label.float())
    loss.backward()
    optimizer.step()
    return loss.item()

@torch.no_grad()
def test(test_data):
    encoder.eval()
    predictor.eval()
    M = encoder(train_data.x, train_data.edge_index)
    preds = predictor(
        M[test_data.edge_label_index[0]], 
        M[test_data.edge_label_index[1]]
    )
    labels = test_data.edge_label.float()

    pred_labels = (preds.squeeze() > 0.5).float()

    acc = (pred_labels == labels).sum().item() / labels.size(0)
    return acc

In [27]:
torch.cuda.empty_cache()
epochs = 5

In [28]:
for epoch in tqdm(range(1, epochs + 1)):
    loss = train()
    val_acc = test(val_data)
    print(f'Epoch {epoch:02d}, Loss: {loss:.4f}, Val Acc: {val_acc:.4f}')

test_acc = test(test_data)
print(f'\nTest Accuracy: {test_acc:.4f}')

  0%|          | 0/5 [00:07<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 9.71 GiB. GPU 

In [None]:
@torch.no_grad()
def recommend_new_games_for_user(user_idx, train_data, top_k=5):
    encoder.eval()
    predictor.eval()

    # Step 1: Get node embeddings
    z = encoder(train_data.x, train_data.edge_index)

    user_embeddings = z[:num_users]
    game_embeddings = z[num_users:]

    user_emb = user_embeddings[user_idx].unsqueeze(0)
    user_emb_expanded = user_emb.expand(game_embeddings.size(0), -1)

    # Step 2: Predict scores for all games
    scores = predictor(user_emb_expanded, game_embeddings).squeeze()

    # Step 3: Find already played games
    # Find edges where source == user_idx
    user_edges = train_data.edge_index[0] == user_idx
    played_game_indices = train_data.edge_index[1][user_edges] - num_users  # get game indices

    played_game_indices = played_game_indices.tolist()

    # Step 4: Mask scores of already played games
    scores[played_game_indices] = -1e9  # set played games' scores very low so they won't be selected

    # Step 5: Pick top-k games
    top_scores, top_game_indices = torch.topk(scores, k=top_k)

    recommendations = []
    for i, game_idx in enumerate(top_game_indices):
        game_id = unique_games[game_idx.item()]  # map back to real game ID
        score = top_scores[i].item()
        recommendations.append((game_id, score))
    return recommendations

In [None]:
user_idx = 999
top_k = 10
recommendations = recommend_new_games_for_user(user_idx=user_idx, train_data=train_data, top_k=top_k)

print(f"\nTop {top_k} new game recommendations for User {user_idx}:")
for rank, (game_id, score) in enumerate(recommendations, start=1):
    print(f"Rank {rank}: Game ID = {game_mapper[game_id]}, Score = {score:.4f}")


Top 10 new game recommendations for User 999:
Rank 1: Game ID = Ranch Simulator: Build, Hunt, Farm, Score = 0.9994
Rank 2: Game ID = VRocker, Score = 0.9994
Rank 3: Game ID = Tormentum - Dark Sorrow, Score = 0.9994
Rank 4: Game ID = Mytheon, Score = 0.9994
Rank 5: Game ID = Gravity Cat, Score = 0.9994
Rank 6: Game ID = Lost in Spice, Score = 0.9994
Rank 7: Game ID = Gray Skies, Dark Waters, Score = 0.9994
Rank 8: Game ID = Acorns Above: A World Gone Nuts, Score = 0.9994
Rank 9: Game ID = Aero GPX, Score = 0.9994
Rank 10: Game ID = EverQuest II, Score = 0.9994


In [None]:
# from torch_geometric.utils import to_networkx
# import networkx as nx

# # Convert to NetworkX graph
# G = to_networkx(data, to_undirected=True)

# import matplotlib.pyplot as plt

# plt.figure(figsize=(12, 12))
# pos = nx.spring_layout(G, seed=42)  # spring layout (force-directed)

# nx.draw(
#     G,
#     pos,
#     node_size=10, 
#     edge_color="gray", 
#     alpha=0.7,
#     node_color="blue"
# )
# plt.title("User-Game Bipartite Graph Visualization")
# plt.show()

Actionable Steps

# Cold Start Recommendation if the user is not in the graph

In [None]:
import requests

STEAM_API_KEY = os.getenv("STEAM_API_KEY")

# 👇 Build this once when you load your graph
game_idx_map = {appid: idx for idx, appid in enumerate(unique_games)}

@torch.no_grad()
def recommend_games_by_steam_id(steam_id, user_id_map, train_data, top_k=5):
    encoder.eval()
    predictor.eval()

    z = encoder(train_data.x, train_data.edge_index)

    user_embeddings = z[:num_users]
    game_embeddings = z[num_users:]

    if steam_id in user_id_map:
        print(f"Steam ID {steam_id} found in graph ✅")
        user_idx = user_id_map[steam_id]
        user_emb = user_embeddings[user_idx].unsqueeze(0)
    else:
        print(f"Steam ID {steam_id} not found. Trying Steam API cold-start fallback...")

        owned_game_ids = get_owned_games_from_steam(steam_id)
        if not owned_game_ids:
            print(f"No games found for Steam ID {steam_id} Cannot recommend.")
            return []

        known_owned_games = [appid for appid in owned_game_ids if appid in unique_games]
        if not known_owned_games:
            print(f"No known games found for Steam ID {steam_id} Cannot recommend.")
            return []

        game_idxs = [game_id_map[appid] - num_users for appid in known_owned_games]
        user_emb = game_embeddings[game_idxs].mean(dim=0).unsqueeze(0)
        print(f"Created cold-start embedding from {len(game_idxs)} owned games")

    user_emb_expanded = user_emb.expand(game_embeddings.size(0), -1)
    scores = predictor(user_emb_expanded, game_embeddings).squeeze()

    if steam_id in user_id_map:
        user_idx = user_id_map[steam_id]
        user_edges = train_data.edge_index[0] == user_idx
        played_game_indices = train_data.edge_index[1][user_edges] - num_users
        played_game_indices = played_game_indices.tolist()
        scores[played_game_indices] = -1e9
    else:
        played_game_indices = [game_idx_map[appid] for appid in known_owned_games if appid in game_idx_map]
        scores[played_game_indices] = -1e9

    top_scores, top_game_indices = torch.topk(scores, k=top_k)

    recommendations = []
    for i, game_idx in enumerate(top_game_indices):
        game_id = unique_games[game_idx.item()]
        score = top_scores[i].item()
        recommendations.append((game_id, score))

    return recommendations

# --- Helper ---
def get_owned_games_from_steam(steam_id):
    url = "http://api.steampowered.com/IPlayerService/GetOwnedGames/v0001/"
    params = {
        'key': STEAM_API_KEY,
        'steamid': steam_id,
        'format': 'json',
        'include_appinfo': False,
        'include_played_free_games': True
    }
    try:
        response = requests.get(url, params=params, timeout=5)
        data = response.json()

        if 'response' in data and 'games' in data['response']:
            games = data['response']['games']
            owned_game_ids = [game['appid'] for game in games]
            return owned_game_ids
        else:
            return []
    except Exception as e:
        print(f"Error fetching games for Steam ID {steam_id}: {e}")
        return []


In [None]:
# Before: Build mapping once
appid_to_game_name = dict(zip(data['game_appid'], data['game_name']))
# Recommend
# https://steamcommunity.com/profiles/76561198005754455/
steam_id = 76561199383582027
recommendations = recommend_games_by_steam_id(steam_id, user_id_map, train_data, top_k=10)

# Print results nicely
print(f"\nTop 5 recommendations for Steam ID {steam_id}:")
for rank, (game_id, score) in enumerate(recommendations, start=1):
    game_name = appid_to_game_name.get(game_id, f"Unknown Game ({game_id})")
    print(f"Rank {rank}: {game_name} (Score: {score:.4f})")


Steam ID 76561199383582027 found in graph ✅

Top 5 recommendations for Steam ID 76561199383582027:
Rank 1: Ranch Simulator: Build, Hunt, Farm (Score: 0.9994)
Rank 2: VRocker (Score: 0.9994)
Rank 3: Tormentum - Dark Sorrow (Score: 0.9994)
Rank 4: Mytheon (Score: 0.9994)
Rank 5: Gravity Cat (Score: 0.9994)
Rank 6: Lost in Spice (Score: 0.9994)
Rank 7: Gray Skies, Dark Waters (Score: 0.9994)
Rank 8: Acorns Above: A World Gone Nuts (Score: 0.9994)
Rank 9: Aero GPX (Score: 0.9994)
Rank 10: EverQuest II (Score: 0.9994)
