In [11]:
import torch
from torch.nn import Sequential as Seq, Linear, ReLU
import torch_geometric
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

In [58]:
sample_size = 6000

In [59]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained BERT-base
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

bert_model.eval()  # Evaluation mode (no dropout)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [60]:
data = pd.read_csv('final_fantasy.csv')

In [61]:
data.head()

Unnamed: 0.1,Unnamed: 0,user_steam_id,user_playtime_forever,user_playtime_2weeks,user_has_leaderboards,user_has_community_visible_stats,user_content_descriptorids,game_appid,game_name,game_developer,...,game_price,game_initialprice,game_discount,game_ccu,game_owners_estimate,game_review_ratio,genres_en,categories_en,short_description_en,detailed_description_en
0,0,76561198060785055,7279,0.0,False,True,"[2, 5]",240,Counter-Strike: Source,Valve,...,9.99,9.99,0.0,15002,10000000,0.962647,['Action'],"['Multi-Player', 'Cross-Platform Multiplayer',...",Counter-Strike: Source blends Counter-Strike's...,THE NEXT INSTALLMENT OF THE WORLD'S # 1 ONLINE...
1,1,76561198060785055,0,0.0,False,True,[0],12210,Grand Theft Auto IV: The Complete Edition,"Rockstar North, Rockstar Toronto",...,19.99,19.99,0.0,3464,5000000,0.823607,"['Action', 'Adventure']","['Single-Player', 'Multi-Player', 'Partial Con...","Niko Bellic, Johnny Klebitz and Luis Lopez all...",Important Updates To Grand Theft Auto IV and E...
2,2,76561198060785055,717,0.0,True,True,[0],620,Portal 2,Valve,...,9.99,9.99,0.0,3194,10000000,0.986921,"['Action', 'Adventure']","['Single Person', 'Multiple People', 'Cooperat...",The Lifetime Test Plan is now upgraded and you...,Portal 2 creates another successor to the gran...
3,3,76561198060785055,4578,0.0,False,True,[0],105600,Terraria,Re-Logic,...,9.99,9.99,0.0,38390,20000000,0.974917,"['Action', 'Adventure', 'Indie', 'Rpg']","['Single-Player', 'Multi-Player', 'Pvp', 'Onli...","Dig, fight, explore, build! Nothing is impossi...","Dig, Fight, Explore, Build: The very world is ..."
4,4,76561198060785055,0,0.0,True,True,[0],46520,Wasteland Angel,Octane Games,...,4.99,4.99,0.0,0,50000,0.587302,"['Action', 'Indie']","['Single-Player', 'Steam Achievements', 'Parti...",Wasteland Angel is a top-down twin-stick shoot...,Bummer! World War III happened and killed most...


In [62]:
def create_game_mapper(df):
    # Convert the two columns into a dictionary
    game_mapper = dict(zip(df['game_appid'], df['game_name']))
    return game_mapper

# Example usage
# Assuming you have a DataFrame named 'df' with columns 'game_appid' and 'game_name'
game_mapper = create_game_mapper(data)
game_mapper[240]

'Counter-Strike: Source'

Feature Embeddings:
- Start off with pure genre if it is good enough
- (3 approaches) Eval pure genre approach with pure short descriptons embeddings then try combining

In [63]:
data['text_for_embedding'] = ( data['short_description_en'].fillna('') + ' ' +  data['genres_en'].astype(str) )

Short Description + Genre == Token Size

In [64]:
word_count = 0
for text in data['text_for_embedding']:
    word_count+=len(text)
print(f'Total number of tokens: {word_count}')

Total number of tokens: 472731179


install transformer of choice (BeRT)

In [65]:
def embed_text(texts, tokenizer, model, max_length=128):
    embeddings = []
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length, padding='max_length')
        with torch.no_grad():
            outputs = model(**inputs)
        # CLS pooling (using [CLS] token embedding)
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        embeddings.append(cls_embedding.squeeze(0))
    embeddings = torch.stack(embeddings)
    return embeddings

In [66]:
game_embeddings = embed_text(data['text_for_embedding'].tolist()[:sample_size], tokenizer, bert_model)
print(game_embeddings.shape)

KeyboardInterrupt: 

## User/Game Node Mapping

### how to dedupe interactions
check composite keys (user/game), if all 1s then only unique interactions

In [None]:
data.groupby(by=['user_steam_id','game_appid']).count().reset_index()['user_playtime_forever'].value_counts()

user_playtime_forever
1    1996413
Name: count, dtype: int64

### init mappings

In [31]:
# Unique users and games
unique_users = data['user_steam_id'].unique()
unique_games = data['game_appid'].unique()

# Build mappings
user_id_map = {user_id: idx for idx, user_id in enumerate(unique_users)}
game_id_map = {game_id: idx + len(unique_users) for idx, game_id in enumerate(unique_games)}

# How many nodes?
num_users = len(user_id_map)
num_games = len(game_id_map)
num_nodes = num_users + num_games

In [32]:
num_users, num_games, num_nodes

(5842, 31397, 37239)

## Building out user/game nodes
- Approximately takes an hour to run, so we will have preloaded from our sample dataset.

In [None]:
# build edges, load a new user to recommend games (edges user to game)
user_nodes = []
game_nodes = []
for idx, row in tqdm(data.iterrows(), desc="Processing rows"):
    # get indices and append them
    user_idx = user_id_map[row['user_steam_id']]
    game_idx = game_id_map[row['game_appid']]
    # print(user_idx, game_idx)
    user_nodes.append(user_idx)
    game_nodes.append(game_idx)
    # print(f'User Node {user_idx} and Game Node {game_idx} were appended')
    if idx == sample_size:
        break

Processing rows: 2000it [00:00, 2269.01it/s]


### Array Dimension Validation
- Also upon running initial mapping, storing list in .npy file

In [34]:
len(game_nodes)==len(user_nodes)

True

In [35]:
edge_index = torch.tensor([user_nodes, game_nodes], dtype=torch.long)

In [None]:
user_features = np.zeros((num_users, 768))  # dummy features for users
game_features = game_embeddings.numpy()      # real features for games

# Stack vertically: first users, then games
X = np.vstack([user_features, game_features])
X = torch.tensor(X, dtype=torch.float)
print(X.shape)  # (num_nodes, 768)

torch.Size([7842, 768])


## attribute selection for edges

if we want more features, add here

In [None]:
edge_attrs = []

for idx, row in tqdm(data.iterrows(), desc="Processing rows"):
    playtime = row['user_playtime_forever']
    edge_attrs.append([playtime])  # wrapping in list for 1-dim feature

edge_attr = torch.tensor(edge_attrs, dtype=torch.float)
print(edge_attr.shape)  # (num_edges, 1)

Processing rows: 1996413it [00:24, 81540.41it/s]


torch.Size([1996413, 1])


create data for geometric graph

In [38]:
from torch_geometric.data import Data

graph = Data(
    x=X,
    edge_index=edge_index,
    edge_y=edge_attr
)

print(graph)

Data(x=[7842, 768], edge_index=[2, 2001], edge_y=[1996413, 1])


test/train/val split

In [39]:
from torch_geometric.transforms import RandomLinkSplit

transform = RandomLinkSplit(
    num_val=0.05,        # 5% for validation
    num_test=0.10,       # 10% for testing
    is_undirected=True,  # True if your graph is undirected (user <-> game is undirected)
    add_negative_train_samples=False  # We will do negative sampling manually during training
)

# Apply the transform
train_data, val_data, test_data = transform(graph)

print(train_data)
print(val_data)
print(test_data)

Data(x=[7842, 768], edge_index=[2, 3402], edge_y=[1996413, 1], edge_label=[1701], edge_label_index=[2, 1701])
Data(x=[7842, 768], edge_index=[2, 3402], edge_y=[1996413, 1], edge_label=[200], edge_label_index=[2, 200])
Data(x=[7842, 768], edge_index=[2, 3602], edge_y=[1996413, 1], edge_label=[400], edge_label_index=[2, 400])


Encoder

In [40]:
from torch_geometric.nn import SAGEConv
import torch.nn as nn
import torch.nn.functional as F

In [41]:
class GNNEncoder(nn.Module):
    def __init__(self, in_channels, hidden_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
    def forward(self, X, edge_index):
        X = self.conv1(X, edge_index)
        X = F.relu(X)
        X = self.conv2(X, edge_index)
        return X

In [42]:
class LinkPredictor(nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin = nn.Linear(hidden_channels*2, 1)
    def forward(self, i, j):
        X = torch.cat([i,j], dim=1)
        return torch.sigmoid(self.lin(X))

In [43]:
import torch.optim as optim

device = torch.device('cuda' if torch.cuda.is_available() else 'mps')

hidden_channels = 128
encoder = GNNEncoder(in_channels=graph.num_node_features, hidden_channels=hidden_channels).to(device)
predictor = LinkPredictor(hidden_channels=hidden_channels).to(device)

optimizer = optim.Adam(list(encoder.parameters()) + list(predictor.parameters()), lr=0.001)

train_data = train_data.to(device)
val_data = val_data.to(device)
test_data = test_data.to(device)

In [44]:
test_data

Data(x=[7842, 768], edge_index=[2, 3602], edge_y=[1996413, 1], edge_label=[400], edge_label_index=[2, 400])

In [45]:
def train():
    encoder.train()
    predictor.train()
    optimizer.zero_grad()
    M = encoder(train_data.x, train_data.edge_index)
    preds = predictor(M[train_data.edge_label_index[0]], M[train_data.edge_label_index[1]])
    loss = F.binary_cross_entropy(preds.squeeze(), train_data.edge_label.float())
    loss.backward()
    optimizer.step()
    return loss.item()

@torch.no_grad()
def test(test_data):
    encoder.eval()
    predictor.eval()
    M = encoder(train_data.x, train_data.edge_index)
    preds = predictor(
        M[test_data.edge_label_index[0]], 
        M[test_data.edge_label_index[1]]
    )
    labels = test_data.edge_label.float()

    pred_labels = (preds.squeeze() > 0.5).float()

    acc = (pred_labels == labels).sum().item() / labels.size(0)
    return acc

In [46]:
epochs = 50

In [47]:
for epoch in tqdm(range(1, epochs + 1)):
    loss = train()
    val_acc = test(val_data)
    print(f'Epoch {epoch:02d}, Loss: {loss:.4f}, Val Acc: {val_acc:.4f}')

test_acc = test(test_data)
print(f'\nTest Accuracy: {test_acc:.4f}')

 30%|███       | 15/50 [00:03<00:05,  6.88it/s]

Epoch 01, Loss: 0.6585, Val Acc: 0.5000
Epoch 02, Loss: 0.3596, Val Acc: 0.5000
Epoch 03, Loss: 0.1637, Val Acc: 0.5000
Epoch 04, Loss: 0.0548, Val Acc: 0.5000
Epoch 05, Loss: 0.0145, Val Acc: 0.5000
Epoch 06, Loss: 0.0035, Val Acc: 0.5000
Epoch 07, Loss: 0.0008, Val Acc: 0.5000
Epoch 08, Loss: 0.0002, Val Acc: 0.5000
Epoch 09, Loss: 0.0001, Val Acc: 0.5000
Epoch 10, Loss: 0.0000, Val Acc: 0.5000
Epoch 11, Loss: 0.0000, Val Acc: 0.5000
Epoch 12, Loss: 0.0000, Val Acc: 0.5000
Epoch 13, Loss: 0.0000, Val Acc: 0.5000
Epoch 14, Loss: 0.0000, Val Acc: 0.5000
Epoch 15, Loss: 0.0000, Val Acc: 0.5000
Epoch 16, Loss: 0.0000, Val Acc: 0.5000
Epoch 17, Loss: 0.0000, Val Acc: 0.5000
Epoch 18, Loss: 0.0000, Val Acc: 0.5000
Epoch 19, Loss: 0.0000, Val Acc: 0.5000
Epoch 20, Loss: 0.0000, Val Acc: 0.5000
Epoch 21, Loss: 0.0000, Val Acc: 0.5000
Epoch 22, Loss: 0.0000, Val Acc: 0.5000
Epoch 23, Loss: 0.0000, Val Acc: 0.5000
Epoch 24, Loss: 0.0000, Val Acc: 0.5000
Epoch 25, Loss: 0.0000, Val Acc: 0.5000


100%|██████████| 50/50 [00:03<00:00, 15.48it/s]

Epoch 31, Loss: 0.0000, Val Acc: 0.5000
Epoch 32, Loss: 0.0000, Val Acc: 0.5000
Epoch 33, Loss: 0.0000, Val Acc: 0.5000
Epoch 34, Loss: 0.0000, Val Acc: 0.5000
Epoch 35, Loss: 0.0000, Val Acc: 0.5000
Epoch 36, Loss: 0.0000, Val Acc: 0.5000
Epoch 37, Loss: 0.0000, Val Acc: 0.5000
Epoch 38, Loss: 0.0000, Val Acc: 0.5000
Epoch 39, Loss: 0.0000, Val Acc: 0.5000
Epoch 40, Loss: 0.0000, Val Acc: 0.5000
Epoch 41, Loss: 0.0000, Val Acc: 0.5000
Epoch 42, Loss: 0.0000, Val Acc: 0.5000
Epoch 43, Loss: 0.0000, Val Acc: 0.5000
Epoch 44, Loss: 0.0000, Val Acc: 0.5000
Epoch 45, Loss: 0.0000, Val Acc: 0.5000
Epoch 46, Loss: 0.0000, Val Acc: 0.5000
Epoch 47, Loss: 0.0000, Val Acc: 0.5000
Epoch 48, Loss: 0.0000, Val Acc: 0.5000
Epoch 49, Loss: 0.0000, Val Acc: 0.5000
Epoch 50, Loss: 0.0000, Val Acc: 0.5000

Test Accuracy: 0.5000





In [48]:
@torch.no_grad()
def recommend_new_games_for_user(user_idx, train_data, top_k=5):
    encoder.eval()
    predictor.eval()

    # Step 1: Get node embeddings
    z = encoder(train_data.x, train_data.edge_index)

    user_embeddings = z[:num_users]
    game_embeddings = z[num_users:]

    user_emb = user_embeddings[user_idx].unsqueeze(0)
    user_emb_expanded = user_emb.expand(game_embeddings.size(0), -1)

    # Step 2: Predict scores for all games
    scores = predictor(user_emb_expanded, game_embeddings).squeeze()

    # Step 3: Find already played games
    # Find edges where source == user_idx
    user_edges = train_data.edge_index[0] == user_idx
    played_game_indices = train_data.edge_index[1][user_edges] - num_users  # get game indices

    played_game_indices = played_game_indices.tolist()

    # Step 4: Mask scores of already played games
    scores[played_game_indices] = -1e9  # set played games' scores very low so they won't be selected

    # Step 5: Pick top-k games
    top_scores, top_game_indices = torch.topk(scores, k=top_k)

    recommendations = []
    for i, game_idx in enumerate(top_game_indices):
        game_id = unique_games[game_idx.item()]  # map back to real game ID
        score = top_scores[i].item()
        recommendations.append((game_id, score))
    return recommendations

In [None]:
user_idx = 0
top_k = 5
recommendations = recommend_new_games_for_user(user_idx=user_idx, train_data=train_data, top_k=top_k)

print(f"\nTop {top_k} new game recommendations for User {user_idx}:")
for rank, (game_id, score) in enumerate(recommendations, start=1):
    print(f"Rank {rank}: Game ID = {game_mapper[game_id]}, Score = {score:.4f}")


Top 5 new game recommendations for User 0:
Rank 1: Game ID = Portal 2, Score = 1.0000
Rank 2: Game ID = Path of Exile, Score = 1.0000
Rank 3: Game ID = Cubetractor, Score = 1.0000
Rank 4: Game ID = The Forest, Score = 1.0000
Rank 5: Game ID = Rocket League, Score = 1.0000


In [202]:
# from torch_geometric.utils import to_networkx
# import networkx as nx

# # Convert to NetworkX graph
# G = to_networkx(data, to_undirected=True)

# import matplotlib.pyplot as plt

# plt.figure(figsize=(12, 12))
# pos = nx.spring_layout(G, seed=42)  # spring layout (force-directed)

# nx.draw(
#     G,
#     pos,
#     node_size=10, 
#     edge_color="gray", 
#     alpha=0.7,
#     node_color="blue"
# )
# plt.title("User-Game Bipartite Graph Visualization")
# plt.show()

Actionable Steps

# Cold Start Recommendation if the user is not in the graph

In [67]:
import requests

STEAM_API_KEY = os.getenv("STEAM_API_KEY")

# 👇 Build this once when you load your graph
game_idx_map = {appid: idx for idx, appid in enumerate(unique_games)}

@torch.no_grad()
def recommend_games_by_steam_id(steam_id, user_id_map, train_data, top_k=5):
    encoder.eval()
    predictor.eval()

    z = encoder(train_data.x, train_data.edge_index)

    user_embeddings = z[:num_users]
    game_embeddings = z[num_users:]

    if steam_id in user_id_map:
        print(f"Steam ID {steam_id} found in graph ✅")
        user_idx = user_id_map[steam_id]
        user_emb = user_embeddings[user_idx].unsqueeze(0)
    else:
        print(f"Steam ID {steam_id} not found. Trying Steam API cold-start fallback...")

        owned_game_ids = get_owned_games_from_steam(steam_id)
        if not owned_game_ids:
            print(f"No games found for Steam ID {steam_id} Cannot recommend.")
            return []

        known_owned_games = [appid for appid in owned_game_ids if appid in unique_games]
        if not known_owned_games:
            print(f"No known games found for Steam ID {steam_id} Cannot recommend.")
            return []

        game_idxs = [game_id_map[appid] - num_users for appid in known_owned_games]
        user_emb = game_embeddings[game_idxs].mean(dim=0).unsqueeze(0)
        print(f"Created cold-start embedding from {len(game_idxs)} owned games")

    user_emb_expanded = user_emb.expand(game_embeddings.size(0), -1)
    scores = predictor(user_emb_expanded, game_embeddings).squeeze()

    if steam_id in user_id_map:
        user_idx = user_id_map[steam_id]
        user_edges = train_data.edge_index[0] == user_idx
        played_game_indices = train_data.edge_index[1][user_edges] - num_users
        played_game_indices = played_game_indices.tolist()
        scores[played_game_indices] = -1e9
    else:
        played_game_indices = [game_idx_map[appid] for appid in known_owned_games if appid in game_idx_map]
        scores[played_game_indices] = -1e9

    top_scores, top_game_indices = torch.topk(scores, k=top_k)

    recommendations = []
    for i, game_idx in enumerate(top_game_indices):
        game_id = unique_games[game_idx.item()]
        score = top_scores[i].item()
        recommendations.append((game_id, score))

    return recommendations

# --- Helper ---
def get_owned_games_from_steam(steam_id):
    url = "http://api.steampowered.com/IPlayerService/GetOwnedGames/v0001/"
    params = {
        'key': STEAM_API_KEY,
        'steamid': steam_id,
        'format': 'json',
        'include_appinfo': False,
        'include_played_free_games': True
    }
    try:
        response = requests.get(url, params=params, timeout=5)
        data = response.json()

        if 'response' in data and 'games' in data['response']:
            games = data['response']['games']
            owned_game_ids = [game['appid'] for game in games]
            return owned_game_ids
        else:
            return []
    except Exception as e:
        print(f"Error fetching games for Steam ID {steam_id}: {e}")
        return []


In [72]:
# Before: Build mapping once
appid_to_game_name = dict(zip(data['game_appid'], data['game_name']))

# Recommend
steam_id = 76561198030330010
recommendations = recommend_games_by_steam_id(steam_id, user_id_map, train_data, top_k=10)

# Print results nicely
print(f"\nTop 5 recommendations for Steam ID {steam_id}:")
for rank, (game_id, score) in enumerate(recommendations, start=1):
    game_name = appid_to_game_name.get(game_id, f"Unknown Game ({game_id})")
    print(f"Rank {rank}: {game_name} (Score: {score:.4f})")


Steam ID 76561198030330010 not found. Trying Steam API cold-start fallback...
Created cold-start embedding from 1206 owned games ✅

Top 5 recommendations for Steam ID 76561198030330010:
Rank 1: Wasteland Angel (Score: 1.0000)
Rank 2: TrackMania Nations Forever (Score: 1.0000)
Rank 3: Path of Exile (Score: 1.0000)
Rank 4: Batman: Arkham Origins (Score: 1.0000)
Rank 5: Trove (Score: 1.0000)
Rank 6: Unturned (Score: 1.0000)
Rank 7: Heroes & Generals (Score: 1.0000)
Rank 8: DRAGON BALL XENOVERSE (Score: 1.0000)
Rank 9: Batman: Arkham Knight (Score: 1.0000)
Rank 10: VRChat (Score: 1.0000)
