In [1]:
import pandas as pd
import numpy as np
from collections import Counter

In [2]:
df = pd.read_csv(r"C:\Work\Programing Language\task4\Game-Recommendation-System\data\raw\rawg_games_20k.csv")

# Keep only required columns
df_clean = df[['id', 'name', 'platforms', 'genres', 'tags']].copy()

# Remove rows with missing values in critical columns
df_clean = df_clean.dropna(subset=['platforms', 'genres', 'tags'])


In [3]:
print(f"Total games after cleaning: {len(df_clean)}")
print(f"\nFirst 5 rows:")
df_clean.head()


Total games after cleaning: 18981

First 5 rows:


Unnamed: 0,id,name,platforms,genres,tags
0,3498,Grand Theft Auto V,PC|PlayStation 5|Xbox Series S/X|PlayStation 4...,Action,Singleplayer|Steam Achievements|Multiplayer|Fu...
1,3328,The Witcher 3: Wild Hunt,PlayStation 5|Xbox Series S/X|macOS|PlayStatio...,Action|RPG,Singleplayer|Full controller support|Atmospher...
2,4200,Portal 2,PlayStation 3|PC|Xbox 360|Linux|macOS|Xbox One,Shooter|Puzzle,Singleplayer|Steam Achievements|Multiplayer|Fu...
3,4291,Counter-Strike: Global Offensive,PC|Linux|Xbox 360|PlayStation 3,Shooter,Steam Achievements|Multiplayer|Full controller...
4,5286,Tomb Raider (2013),PlayStation 3|Xbox 360|macOS|PC|Xbox One|PlayS...,Action,Singleplayer|Multiplayer|Full controller suppo...


In [4]:
df_clean['platforms_list'] = df_clean['platforms'].str.split('|')
df_clean['genres_list'] = df_clean['genres'].str.split('|')
df_clean['tags_list'] = df_clean['tags'].str.split('|')

print("\nExample of split values:")
print(f"Game: {df_clean.iloc[0]['name']}")
print(f"Platforms: {df_clean.iloc[0]['platforms_list']}")
print(f"Genres: {df_clean.iloc[0]['genres_list']}")
print(f"Tags (first 5): {df_clean.iloc[0]['tags_list'][:5]}")



Example of split values:
Game: Grand Theft Auto V
Platforms: ['PC', 'PlayStation 5', 'Xbox Series S/X', 'PlayStation 4', 'PlayStation 3', 'Xbox 360', 'Xbox One']
Genres: ['Action']
Tags (first 5): ['Singleplayer', 'Steam Achievements', 'Multiplayer', 'Full controller support', 'Atmospheric']


In [5]:
all_genres = []
for genres in df_clean['genres_list']:
    all_genres.extend(genres)
genre_counts = Counter(all_genres)
print(f"\nTotal unique genres: {len(genre_counts)}")
print(f"All genres: {list(genre_counts.keys())}")

# Count all platforms
all_platforms = []
for platforms in df_clean['platforms_list']:
    all_platforms.extend(platforms)
platform_counts = Counter(all_platforms)
print(f"\nTotal unique platforms: {len(platform_counts)}")
print(f"All platforms: {list(platform_counts.keys())}")

# Count all tags and select top 75
all_tags = []
for tags in df_clean['tags_list']:
    all_tags.extend(tags)
tag_counts = Counter(all_tags)

print(f"\nTotal unique tags: {len(tag_counts)}")
print(f"Selecting top 75 most frequent tags...")

# Get top 75 tags
top_75_tags = [tag for tag, count in tag_counts.most_common(75)]

print(f"\nTop 20 most frequent tags:")
for i, (tag, count) in enumerate(tag_counts.most_common(20), 1):
    print(f"{i}. {tag:} - appears in {count} games")

# ============================================
# STEP 4: CREATE BINARY COLUMNS


Total unique genres: 19
All genres: ['Action', 'RPG', 'Shooter', 'Puzzle', 'Adventure', 'Indie', 'Platformer', 'Massively Multiplayer', 'Sports', 'Racing', 'Simulation', 'Arcade', 'Casual', 'Strategy', 'Fighting', 'Family', 'Educational', 'Card', 'Board Games']

Total unique platforms: 51
All platforms: ['PC', 'PlayStation 5', 'Xbox Series S/X', 'PlayStation 4', 'PlayStation 3', 'Xbox 360', 'Xbox One', 'macOS', 'Nintendo Switch', 'Linux', 'Android', 'Xbox', 'PS Vita', 'iOS', 'Web', 'Wii U', 'Nintendo 3DS', 'PlayStation 2', 'Dreamcast', 'Classic Macintosh', 'GameCube', 'Wii', 'Nintendo DS', 'Nintendo 64', 'PlayStation', 'SEGA Saturn', 'Game Boy Advance', 'PSP', 'Nintendo DSi', 'Commodore / Amiga', 'Game Boy Color', 'Neo Geo', 'SEGA CD', 'Apple II', 'Atari ST', 'SEGA Master System', 'Atari 8-bit', 'NES', 'Game Boy', 'SNES', 'Jaguar', 'Genesis', '3DO', 'Game Gear', 'SEGA 32X', 'Atari Lynx', 'Atari 5200', 'Atari 2600', 'Atari 7800', 'Atari XEGS', 'Atari Flashback']

Total unique tags: 259

In [6]:
# ============================================
# STEP 4: CREATE BINARY COLUMNS (OPTIMIZED)
# ============================================
print("\n" + "=" * 60)
print("STEP 4: CREATING BINARY FEATURE COLUMNS")
print("=" * 60)

# Start with id and name
df_encoded = df_clean[['id', 'name']].copy()

# Create all binary columns at once using dictionaries (MUCH FASTER!)
all_binary_cols = {}

# Create binary columns for GENRES
print("\nCreating genre columns...")
for genre in sorted(genre_counts.keys()):
    col_name = f'genre_{genre.replace(" ", "_").replace("-", "_")}'
    all_binary_cols[col_name] = df_clean['genres_list'].apply(
        lambda x: 1 if genre in x else 0
    )

# Create binary columns for PLATFORMS
print("Creating platform columns...")
for platform in sorted(platform_counts.keys()):
    col_name = f'platform_{platform.replace(" ", "_").replace("/", "_").replace("-", "_")}'
    all_binary_cols[col_name] = df_clean['platforms_list'].apply(
        lambda x: 1 if platform in x else 0
    )

# Create binary columns for TOP 75 TAGS
print("Creating tag columns (top 75 only)...")
for tag in sorted(top_75_tags):
    # Clean the tag name for column naming
    col_name = tag.replace(" ", "_").replace("-", "_").replace("'", "").replace("/", "_").replace("&", "and")
    col_name = f'tag_{col_name}'
    all_binary_cols[col_name] = df_clean['tags_list'].apply(
        lambda x: 1 if tag in x else 0
    )

# Concatenate all columns at once (FAST!)
print("\nCombining all feature columns...")
df_binary = pd.DataFrame(all_binary_cols)
df_encoded = pd.concat([df_encoded, df_binary], axis=1)


STEP 4: CREATING BINARY FEATURE COLUMNS

Creating genre columns...
Creating platform columns...
Creating tag columns (top 75 only)...

Combining all feature columns...


In [7]:
print(f"\n shape: {df_encoded.shape}")
output_file = 'games_encoded_for_knn.csv'
df_encoded.to_csv(output_file, index=False)

# Get feature columns only (exclude id and name)
feature_cols = [col for col in df_encoded.columns if col not in ['id', 'name']]

feature_sums = df_encoded[feature_cols].sum()
print(feature_sums.sort_values(ascending=False).head(10))

gta_row = df_encoded[df_encoded['name'] == 'Grand Theft Auto V']
if not gta_row.empty:
    active_features = []
    for col in feature_cols:
        if gta_row[col].values[0] == 1:
            active_features.append(col)
    for feat in active_features:
        print(f"{feat}")


 shape: (18981, 147)
platform_PC                17362
tag_Singleplayer           16010
genre_Indie                10069
genre_Action                9382
tag_Steam_Achievements      8933
genre_Adventure             7414
platform_macOS              5719
tag_Multiplayer             5279
tag_Steam_Cloud             5241
tag_steam_trading_cards     5207
dtype: int64
genre_Action
platform_PC
platform_PlayStation_3
platform_PlayStation_4
platform_PlayStation_5
platform_Xbox_360
platform_Xbox_One
platform_Xbox_Series_S_X
tag_Atmospheric
tag_Co_op
tag_Comedy
tag_First_Person
tag_Full_controller_support
tag_Funny
tag_Great_Soundtrack
tag_Multiplayer
tag_Open_World
tag_RPG
tag_Sandbox
tag_Singleplayer
tag_Steam_Achievements
tag_Third_Person
tag_cooperative


In [8]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize

df = pd.read_csv('games_encoded_for_knn.csv')
print(f" Length: {len(df)}")




 Length: 18981


In [9]:
game_info = df[['id', 'name']].copy()
feature_columns = df.drop(columns=['id', 'name']).columns.tolist()

X = df[feature_columns].values

print(f"\nshape: {X.shape}")

# Normalization for better cosine sim
X_normalized = normalize(X, norm='l2')
# study about l2 norm


shape: (18981, 145)


c

In [10]:
#KNN
knn_model = NearestNeighbors(
    n_neighbors=6,
    metric='cosine',
    algorithm='brute'
)

# Fit
knn_model.fit(X_normalized)

0,1,2
,n_neighbors,6
,radius,1.0
,algorithm,'brute'
,leaf_size,30
,metric,'cosine'
,p,2
,metric_params,
,n_jobs,


### Working
* Impliemnted Game name search (not that good yet)
* Applied KNN
* 'distances' = store how close two similar datapoints really are
* 'index' = location of actual game
*

In [11]:
def recommend_games(game_name, n_recommendations=10):
    # removed case sensative input
    game_matches = game_info[game_info['name'].str.contains(game_name, case=False, na=False)]
    # error handling
    if game_matches.empty:
        print(f"Game '{game_name}' not found in dataset.")
        return None

    game_idx = game_matches.index[0]
    game_name_exact = game_matches.iloc[0]['name']
    game_features = X_normalized[game_idx].reshape(1, -1)
    distances, index = knn_model.kneighbors(game_features, n_neighbors=6)

    # output - the first neighbour
    results = []
    for i in range(1, len(index[0])):
        idx = index[0][i]
        distance = distances[0][i]
        similarity = 1 - distance
        results.append({
            'Rank': i,
            'Game': game_info.iloc[idx]['name'],
            # similarity = 1-distances
            'Similarity': round(similarity * 100, 2)
        })

    results_df = pd.DataFrame(results)

    # op result

    print(f"More Games like : {game_name_exact}")
    print("======================================================================")
    print(results_df.to_string(index=False))
    print("======================================================================")

    return results_df


def recommend_by_features(platforms=None, genres=None, tags=None, n_recommendations=6):
    # make a temp features having length same as all the features in our dataset
    # it will contain all zero val which we will update by taking user input
    custom_features = np.zeros(X.shape[1])
    if platforms:
        for platform in platforms:
            col_name = f'platform_{platform.replace(" ", "_").replace("/", "_").replace("-", "_")}'
            if col_name in feature_columns:
                col_idx = feature_columns.index(col_name)
                custom_features[col_idx] = 1

    if genres:
        for genre in genres:
            col_name = f'genre_{genre.replace(" ", "_").replace("-", "_")}'
            if col_name in feature_columns:
                col_idx = feature_columns.index(col_name)
                custom_features[col_idx] = 1

    if tags:
        for tag in tags:
            # Clean tag name for column matching
            clean_tag = tag.replace(" ", "_").replace("-", "_").replace("'", "").replace("/", "_").replace("&", "and")
            col_name = f'tag_{clean_tag}'
            if col_name in feature_columns:
                col_idx = feature_columns.index(col_name)
                custom_features[col_idx] = 1

    # Normalize the custom feature vector
    custom_features_normalized = normalize(custom_features.reshape(1, -1), norm='l2')

    # Find nearest neighbors
    distances, index = knn_model.kneighbors(custom_features_normalized, n_neighbors=n_recommendations)

    # Prepare results
    results = []
    for i in range(len(index[0])):
        idx = index[0][i]
        distance = distances[0][i]
        similarity = 1 - distance

        results.append({
            'Rank': i + 1,
            'Game': game_info.iloc[idx]['name'],
            'Similarity': round(similarity * 100, 2)
        })

    results_df = pd.DataFrame(results)

    # op
    print("======================================================================")
    print(f"Games by tags")
    print("======================================================================")
    if platforms:
        print(f"Platforms: {', '.join(platforms)}")
    if genres:
        print(f"Genres: {', '.join(genres)}")
    if tags:
        print(f"Tags: {', '.join(tags)}")
    print("======================================================================")
    print(results_df.to_string(index=False))
    print("======================================================================")

    return results_df

In [12]:
# GTA V
print("\nMinecraft")
print("======================================================================")
recommendations = recommend_games('MineCraft', n_recommendations=6)

print("======================================================================")
print("ghost of tsushima")
recommendations = recommend_games('Valorant',n_recommendations=6)

# witcher 3
print("\nThe Witcher 3")
print("======================================================================")
recommendations = recommend_games('Witcher 3', n_recommendations=6)

# by tags
print("\nPC RPG games with Open World and Singleplayer")
print("======================================================================")
recommendations = recommend_by_features(
    platforms=['PC'],
    genres=['RPG', 'Action'],
    tags=['Singleplayer', 'Open World'],
    n_recommendations=10
)


Minecraft
More Games like : Minecraft
 Rank                          Game  Similarity
    1            XCOM: Enemy Within       64.28
    2         Angry Birds Star Wars       64.28
    3                      Terraria       63.96
    4 Shovel Knight: Shovel of Hope       63.59
    5              Bombing Bastards       62.68
ghost of tsushima
More Games like : Valorant
 Rank                                   Game  Similarity
    1                               Enlisted       77.15
    2           Project I.G.I.: I’m Going In       75.59
    3                            ÜberSoldier       71.43
    4            Chernobyl: Terrorist Attack       67.61
    5 Maelstrom: The Battle for Earth Begins       67.61

The Witcher 3
More Games like : The Witcher 3: Wild Hunt
 Rank                                         Game  Similarity
    1                                    Lies Of P       83.15
    2                              Hogwarts Legacy       80.02
    3                                  