In [170]:
import torch
from torch.nn import Sequential as Seq, Linear, ReLU
import torch_geometric
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

In [171]:
sample_size = 8000

In [172]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained BERT-base
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

bert_model.eval()  # Evaluation mode (no dropout)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [173]:
data = pd.read_csv('final_fantasy.csv')

In [174]:
data.head()

Unnamed: 0.1,Unnamed: 0,user_steam_id,user_playtime_forever,user_playtime_2weeks,user_has_leaderboards,user_has_community_visible_stats,user_content_descriptorids,game_appid,game_name,game_developer,...,game_price,game_initialprice,game_discount,game_ccu,game_owners_estimate,game_review_ratio,genres_en,categories_en,short_description_en,detailed_description_en
0,0,76561198060785055,7279,0.0,False,True,"[2, 5]",240,Counter-Strike: Source,Valve,...,9.99,9.99,0.0,15002,10000000,0.962647,['Action'],"['Multi-Player', 'Cross-Platform Multiplayer',...",Counter-Strike: Source blends Counter-Strike's...,THE NEXT INSTALLMENT OF THE WORLD'S # 1 ONLINE...
1,1,76561198060785055,0,0.0,False,True,[0],12210,Grand Theft Auto IV: The Complete Edition,"Rockstar North, Rockstar Toronto",...,19.99,19.99,0.0,3464,5000000,0.823607,"['Action', 'Adventure']","['Single-Player', 'Multi-Player', 'Partial Con...","Niko Bellic, Johnny Klebitz and Luis Lopez all...",Important Updates To Grand Theft Auto IV and E...
2,2,76561198060785055,717,0.0,True,True,[0],620,Portal 2,Valve,...,9.99,9.99,0.0,3194,10000000,0.986921,"['Action', 'Adventure']","['Single Person', 'Multiple People', 'Cooperat...",The Lifetime Test Plan is now upgraded and you...,Portal 2 creates another successor to the gran...
3,3,76561198060785055,4578,0.0,False,True,[0],105600,Terraria,Re-Logic,...,9.99,9.99,0.0,38390,20000000,0.974917,"['Action', 'Adventure', 'Indie', 'Rpg']","['Single-Player', 'Multi-Player', 'Pvp', 'Onli...","Dig, fight, explore, build! Nothing is impossi...","Dig, Fight, Explore, Build: The very world is ..."
4,4,76561198060785055,0,0.0,True,True,[0],46520,Wasteland Angel,Octane Games,...,4.99,4.99,0.0,0,50000,0.587302,"['Action', 'Indie']","['Single-Player', 'Steam Achievements', 'Parti...",Wasteland Angel is a top-down twin-stick shoot...,Bummer! World War III happened and killed most...


In [175]:
def create_game_mapper(df):
    # Convert the two columns into a dictionary
    game_mapper = dict(zip(df['game_appid'], df['game_name']))
    return game_mapper

# Example usage
# Assuming you have a DataFrame named 'df' with columns 'game_appid' and 'game_name'
game_mapper = create_game_mapper(data)
game_mapper[240]

'Counter-Strike: Source'

Feature Embeddings:
- Start off with pure genre if it is good enough
- (3 approaches) Eval pure genre approach with pure short descriptons embeddings then try combining

In [176]:
data['text_for_embedding'] = ( data['short_description_en'].fillna('') + ' ' +  data['genres_en'].astype(str) )

Short Description + Genre == Token Size

In [177]:
word_count = 0
for text in data['text_for_embedding']:
    word_count+=len(text)
print(f'Total number of tokens: {word_count}')

Total number of tokens: 472731179


install transformer of choice (BeRT)

In [178]:
def embed_text(texts, tokenizer, model, max_length=128):
    embeddings = []
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length, padding='max_length')
        with torch.no_grad():
            outputs = model(**inputs)
        # CLS pooling (using [CLS] token embedding)
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        embeddings.append(cls_embedding.squeeze(0))
    embeddings = torch.stack(embeddings)
    return embeddings

In [179]:
game_embeddings = embed_text(data['text_for_embedding'].tolist()[:sample_size], tokenizer, bert_model)
print(game_embeddings.shape)

torch.Size([8000, 768])


## User/Game Node Mapping

### how to dedupe interactions
check composite keys (user/game), if all 1s then only unique interactions

In [180]:
data.groupby(by=['user_steam_id','game_appid']).count().reset_index()['user_playtime_forever'].value_counts()

user_playtime_forever
1    1996413
Name: count, dtype: int64

### init mappings

In [181]:
# Unique users and games
unique_users = data['user_steam_id'].unique()
unique_games = data['game_appid'].unique()

# Build mappings
user_id_map = {user_id: idx for idx, user_id in enumerate(unique_users)}
game_id_map = {game_id: idx + len(unique_users) for idx, game_id in enumerate(unique_games)}

# How many nodes?
num_users = len(user_id_map)
num_games = len(game_id_map)
num_nodes = num_users + num_games

In [182]:
num_users, num_games, num_nodes

(5842, 31397, 37239)

## Building out user/game nodes
- Approximately takes an hour to run, so we will have preloaded from our sample dataset.

In [183]:
# build edges, load a new user to recommend games (edges user to game)
user_nodes = []
game_nodes = []
for idx, row in tqdm(data.iterrows(), desc="Processing rows"):
    # get indices and append them
    user_idx = user_id_map[row['user_steam_id']]
    game_idx = game_id_map[row['game_appid']]
    # print(user_idx, game_idx)
    user_nodes.append(user_idx)
    game_nodes.append(game_idx)
    # print(f'User Node {user_idx} and Game Node {game_idx} were appended')
    if idx == sample_size:
        break

Processing rows: 8000it [00:07, 1037.68it/s]


### Array Dimension Validation
- Also upon running initial mapping, storing list in .npy file

In [184]:
len(game_nodes)==len(user_nodes)

True

In [185]:
edge_index = torch.tensor([user_nodes, game_nodes], dtype=torch.long)

In [186]:
user_features = np.zeros((num_users, 768))  # dummy features for users
game_features = game_embeddings.numpy()      # real features for games

# Stack vertically: first users, then games
X = np.vstack([user_features, game_features])
X = torch.tensor(X, dtype=torch.float)
print(X.shape)  # (num_nodes, 768)

torch.Size([13842, 768])


## attribute selection for edges

if we want more features, add here

In [187]:
edge_attrs = []

for idx, row in tqdm(data.iterrows(), desc="Processing rows"):
    playtime = row['user_playtime_forever']
    edge_attrs.append([playtime])  # wrapping in list for 1-dim feature

edge_attr = torch.tensor(edge_attrs, dtype=torch.float)
print(edge_attr.shape)  # (num_edges, 1)

Processing rows: 1996413it [00:21, 92839.49it/s] 


torch.Size([1996413, 1])


create data for geometric graph

In [188]:
from torch_geometric.data import Data

graph = Data(
    x=X,
    edge_index=edge_index,
    edge_y=edge_attr
)

print(graph)

Data(x=[13842, 768], edge_index=[2, 8001], edge_y=[1996413, 1])


test/train/val split

In [189]:
from torch_geometric.transforms import RandomLinkSplit

transform = RandomLinkSplit(
    num_val=0.05,        # 5% for validation
    num_test=0.10,       # 10% for testing
    is_undirected=True,  # True if your graph is undirected (user <-> game is undirected)
    add_negative_train_samples=False  # We will do negative sampling manually during training
)

# Apply the transform
train_data, val_data, test_data = transform(graph)

print(train_data)
print(val_data)
print(test_data)

Data(x=[13842, 768], edge_index=[2, 13602], edge_y=[1996413, 1], edge_label=[6801], edge_label_index=[2, 6801])
Data(x=[13842, 768], edge_index=[2, 13602], edge_y=[1996413, 1], edge_label=[800], edge_label_index=[2, 800])
Data(x=[13842, 768], edge_index=[2, 14402], edge_y=[1996413, 1], edge_label=[1600], edge_label_index=[2, 1600])


Encoder

In [190]:
from torch_geometric.nn import SAGEConv
import torch.nn as nn
import torch.nn.functional as F

In [191]:
class GNNEncoder(nn.Module):
    def __init__(self, in_channels, hidden_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
    def forward(self, X, edge_index):
        X = self.conv1(X, edge_index)
        X = F.relu(X)
        X = self.conv2(X, edge_index)
        return X

In [192]:
class LinkPredictor(nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin = nn.Linear(hidden_channels*2, 1)
    def forward(self, i, j):
        X = torch.cat([i,j], dim=1)
        return torch.sigmoid(self.lin(X))

In [195]:
import torch.optim as optim

device = torch.device('cuda' if torch.cuda.is_available() else 'mps')

hidden_channels = 128
encoder = GNNEncoder(in_channels=graph.num_node_features, hidden_channels=hidden_channels).to(device)
predictor = LinkPredictor(hidden_channels=hidden_channels).to(device)

optimizer = optim.Adam(list(encoder.parameters()) + list(predictor.parameters()), lr=0.001)

train_data = train_data.to(device)
val_data = val_data.to(device)
test_data = test_data.to(device)

In [196]:
test_data

Data(x=[13842, 768], edge_index=[2, 14402], edge_y=[1996413, 1], edge_label=[1600], edge_label_index=[2, 1600])

In [197]:
def train():
    encoder.train()
    predictor.train()
    optimizer.zero_grad()
    M = encoder(train_data.x, train_data.edge_index)
    preds = predictor(M[train_data.edge_label_index[0]], M[train_data.edge_label_index[1]])
    loss = F.binary_cross_entropy(preds.squeeze(), train_data.edge_label.float())
    loss.backward()
    optimizer.step()
    return loss.item()

@torch.no_grad()
def test(test_data):
    encoder.eval()
    predictor.eval()
    M = encoder(train_data.x, train_data.edge_index)
    preds = predictor(
        M[test_data.edge_label_index[0]], 
        M[test_data.edge_label_index[1]]
    )
    labels = test_data.edge_label.float()

    pred_labels = (preds.squeeze() > 0.5).float()

    acc = (pred_labels == labels).sum().item() / labels.size(0)
    return acc

In [198]:
epochs = 50

In [199]:
for epoch in tqdm(range(1, epochs + 1)):
    loss = train()
    val_acc = test(val_data)
    print(f'Epoch {epoch:02d}, Loss: {loss:.4f}, Val Acc: {val_acc:.4f}')

test_acc = test(test_data)
print(f'\nTest Accuracy: {test_acc:.4f}')

 10%|█         | 5/50 [00:00<00:05,  8.81it/s]

Epoch 01, Loss: 0.7209, Val Acc: 0.5613
Epoch 02, Loss: 0.3528, Val Acc: 0.5312
Epoch 03, Loss: 0.1432, Val Acc: 0.5012
Epoch 04, Loss: 0.0426, Val Acc: 0.5000
Epoch 05, Loss: 0.0103, Val Acc: 0.5000
Epoch 06, Loss: 0.0023, Val Acc: 0.5000
Epoch 07, Loss: 0.0005, Val Acc: 0.5000


 26%|██▌       | 13/50 [00:00<00:01, 19.80it/s]

Epoch 08, Loss: 0.0001, Val Acc: 0.5000
Epoch 09, Loss: 0.0000, Val Acc: 0.5000
Epoch 10, Loss: 0.0000, Val Acc: 0.5000
Epoch 11, Loss: 0.0000, Val Acc: 0.5000
Epoch 12, Loss: 0.0000, Val Acc: 0.5000
Epoch 13, Loss: 0.0000, Val Acc: 0.5000
Epoch 14, Loss: 0.0000, Val Acc: 0.5000


 42%|████▏     | 21/50 [00:01<00:01, 26.53it/s]

Epoch 15, Loss: 0.0000, Val Acc: 0.5000
Epoch 16, Loss: 0.0000, Val Acc: 0.5000
Epoch 17, Loss: 0.0000, Val Acc: 0.5000
Epoch 18, Loss: 0.0000, Val Acc: 0.5000
Epoch 19, Loss: 0.0000, Val Acc: 0.5000
Epoch 20, Loss: 0.0000, Val Acc: 0.5000
Epoch 21, Loss: 0.0000, Val Acc: 0.5000


 50%|█████     | 25/50 [00:01<00:00, 28.85it/s]

Epoch 22, Loss: 0.0000, Val Acc: 0.5000
Epoch 23, Loss: 0.0000, Val Acc: 0.5000
Epoch 24, Loss: 0.0000, Val Acc: 0.5000
Epoch 25, Loss: 0.0000, Val Acc: 0.5000
Epoch 26, Loss: 0.0000, Val Acc: 0.5000
Epoch 27, Loss: 0.0000, Val Acc: 0.5000
Epoch 28, Loss: 0.0000, Val Acc: 0.5000


 66%|██████▌   | 33/50 [00:01<00:00, 31.65it/s]

Epoch 29, Loss: 0.0000, Val Acc: 0.5000
Epoch 30, Loss: 0.0000, Val Acc: 0.5000
Epoch 31, Loss: 0.0000, Val Acc: 0.5000
Epoch 32, Loss: 0.0000, Val Acc: 0.5000
Epoch 33, Loss: 0.0000, Val Acc: 0.5000
Epoch 34, Loss: 0.0000, Val Acc: 0.5000
Epoch 35, Loss: 0.0000, Val Acc: 0.5000


 82%|████████▏ | 41/50 [00:01<00:00, 33.01it/s]

Epoch 36, Loss: 0.0000, Val Acc: 0.5000
Epoch 37, Loss: 0.0000, Val Acc: 0.5000
Epoch 38, Loss: 0.0000, Val Acc: 0.5000
Epoch 39, Loss: 0.0000, Val Acc: 0.5000
Epoch 40, Loss: 0.0000, Val Acc: 0.5000
Epoch 41, Loss: 0.0000, Val Acc: 0.5000
Epoch 42, Loss: 0.0000, Val Acc: 0.5000


 98%|█████████▊| 49/50 [00:01<00:00, 33.63it/s]

Epoch 43, Loss: 0.0000, Val Acc: 0.5000
Epoch 44, Loss: 0.0000, Val Acc: 0.5000
Epoch 45, Loss: 0.0000, Val Acc: 0.5000
Epoch 46, Loss: 0.0000, Val Acc: 0.5000
Epoch 47, Loss: 0.0000, Val Acc: 0.5000
Epoch 48, Loss: 0.0000, Val Acc: 0.5000
Epoch 49, Loss: 0.0000, Val Acc: 0.5000


100%|██████████| 50/50 [00:02<00:00, 24.75it/s]

Epoch 50, Loss: 0.0000, Val Acc: 0.5000

Test Accuracy: 0.5000





In [200]:
@torch.no_grad()
def recommend_new_games_for_user(user_idx, train_data, top_k=5):
    encoder.eval()
    predictor.eval()

    # Step 1: Get node embeddings
    z = encoder(train_data.x, train_data.edge_index)

    user_embeddings = z[:num_users]
    game_embeddings = z[num_users:]

    user_emb = user_embeddings[user_idx].unsqueeze(0)
    user_emb_expanded = user_emb.expand(game_embeddings.size(0), -1)

    # Step 2: Predict scores for all games
    scores = predictor(user_emb_expanded, game_embeddings).squeeze()

    # Step 3: Find already played games
    # Find edges where source == user_idx
    user_edges = train_data.edge_index[0] == user_idx
    played_game_indices = train_data.edge_index[1][user_edges] - num_users  # get game indices

    played_game_indices = played_game_indices.tolist()

    # Step 4: Mask scores of already played games
    scores[played_game_indices] = -1e9  # set played games' scores very low so they won't be selected

    # Step 5: Pick top-k games
    top_scores, top_game_indices = torch.topk(scores, k=top_k)

    recommendations = []
    for i, game_idx in enumerate(top_game_indices):
        game_id = unique_games[game_idx.item()]  # map back to real game ID
        score = top_scores[i].item()
        recommendations.append((game_id, score))
    return recommendations

In [203]:
user_idx = 0
top_k = 5
recommendations = recommend_new_games_for_user(user_idx=user_idx, train_data=train_data, top_k=top_k)

print(f"\nTop {top_k} new game recommendations for User {user_idx}:")
for rank, (game_id, score) in enumerate(recommendations, start=1):
    print(f"Rank {rank}: Game ID = {game_mapper[game_id]}, Score = {score:.4f}")


Top 5 new game recommendations for User 0:
Rank 1: Game ID = TrackMania Nations Forever, Score = 1.0000
Rank 2: Game ID = Tomb Raider, Score = 1.0000
Rank 3: Game ID = Batman: Arkham Origins, Score = 1.0000
Rank 4: Game ID = Rhiannon: Curse of the Four Branches, Score = 1.0000
Rank 5: Game ID = Unturned, Score = 1.0000


In [202]:
# from torch_geometric.utils import to_networkx
# import networkx as nx

# # Convert to NetworkX graph
# G = to_networkx(data, to_undirected=True)

# import matplotlib.pyplot as plt

# plt.figure(figsize=(12, 12))
# pos = nx.spring_layout(G, seed=42)  # spring layout (force-directed)

# nx.draw(
#     G,
#     pos,
#     node_size=10, 
#     edge_color="gray", 
#     alpha=0.7,
#     node_color="blue"
# )
# plt.title("User-Game Bipartite Graph Visualization")
# plt.show()

Actionable Steps