In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_score

In [2]:
game = pd.read_csv(
    r'C:\Work\Programing Language\task4\Game-Recommendation-System\data\raw\rawg_games_20k.csv'
)
print(game.head())

     id                              name                             slug  \
0  3498                Grand Theft Auto V               grand-theft-auto-v   
1  3328          The Witcher 3: Wild Hunt          the-witcher-3-wild-hunt   
2  4200                          Portal 2                         portal-2   
3  4291  Counter-Strike: Global Offensive  counter-strike-global-offensive   
4  5286                Tomb Raider (2013)                      tomb-raider   

     released  metacritic  rating  ratings_count  \
0  2013-09-17        92.0    4.47           7236   
1  2015-05-18        92.0    4.64           7052   
2  2011-04-18        95.0    4.58           5992   
3  2012-08-21        81.0    3.57           3603   
4  2013-03-05        86.0    4.06           4048   

                                           platforms          genres  \
0  PC|PlayStation 5|Xbox Series S/X|PlayStation 4...          Action   
1  PlayStation 5|Xbox Series S/X|macOS|PlayStatio...      Action|RPG   
2 

In [3]:
game.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             20000 non-null  int64  
 1   name           20000 non-null  object 
 2   slug           20000 non-null  object 
 3   released       19328 non-null  object 
 4   metacritic     5406 non-null   float64
 5   rating         20000 non-null  float64
 6   ratings_count  20000 non-null  int64  
 7   platforms      19998 non-null  object 
 8   genres         19652 non-null  object 
 9   stores         18842 non-null  object 
 10  added          20000 non-null  int64  
 11  tags           19323 non-null  object 
dtypes: float64(2), int64(3), object(7)
memory usage: 1.8+ MB


In [4]:
game = game[['id', 'name', 'platforms', 'genres', 'stores', 'tags']].dropna()
print("\nRemaining rows after dropping NA:", len(game))



Remaining rows after dropping NA: 18285


In [5]:
print("\nMissing values per column:")
print(game.isnull().sum())


Missing values per column:
id           0
name         0
platforms    0
genres       0
stores       0
tags         0
dtype: int64


In [6]:
def clean_text(series):
    series = series.str.replace(' ', '', regex=False)
    series = series.str.replace('|', ' ', regex=False)
    series = series.str.lower()
    return series

In [7]:
game[['platforms', 'genres', 'stores', 'tags']] = game[['platforms', 'genres', 'stores', 'tags']].apply(clean_text)

* combining important coloumns
*

In [8]:
game['combined'] = game['platforms'] + ' ' + game['genres'] + ' ' + game['stores'] + ' ' + game['tags']


In [9]:
game['combined'].head()

0    pc playstation5 xboxseriess/x playstation4 pla...
1    playstation5 xboxseriess/x macos playstation4 ...
2    playstation3 pc xbox360 linux macos xboxone sh...
3    pc linux xbox360 playstation3 shooter playstat...
4    playstation3 xbox360 macos pc xboxone playstat...
Name: combined, dtype: object

In [None]:
tfidf = TfidfVectorizer(max_features=500, stop_words='english')
X = tfidf.fit_transform(game['combined'])
X = normalize(X)

In [11]:
sample_idx = np.random.choice(len(game), 2000, replace=False)
X_sample = X[sample_idx]
sampled_game = game.iloc[sample_idx].copy()

In [12]:
cluster_model = AgglomerativeClustering(
    n_clusters=15,
    metric='cosine',
    linkage='average'
)

In [13]:
X_sample_dense = X_sample.toarray()
sampled_game['cluster'] = cluster_model.fit_predict(X_sample_dense)

* trying K-means

In [14]:
# %% KMeans clustering
from sklearn.cluster import KMeans

# Run KMeans on the same sample data
kmeans_model = KMeans(n_clusters=15, random_state=42, n_init=10)
sampled_game['cluster_kmeans'] = kmeans_model.fit_predict(X_sample_dense)

print("\nKMeans Cluster distribution:\n", sampled_game['cluster_kmeans'].value_counts())

# Evaluate clustering quality using silhouette score
sil_score_kmeans = silhouette_score(X_sample_dense, sampled_game['cluster_kmeans'], metric='cosine')
print(f"\nSilhouette Score (KMeans Cluster quality): {sil_score_kmeans:.3f}")



KMeans Cluster distribution:
 cluster_kmeans
14    292
1     213
3     192
0     187
2     142
11    141
10    134
7     131
4     115
9     114
6     103
12     92
13     56
5      55
8      33
Name: count, dtype: int64

Silhouette Score (KMeans Cluster quality): 0.077


In [15]:
print(sampled_game['cluster'].value_counts())

cluster
14    1561
6      114
2       99
5       63
4       49
3       36
0       18
8       14
12      13
13       9
11       9
1        6
9        4
10       3
7        2
Name: count, dtype: int64


In [16]:
sil_score = silhouette_score(X_sample_dense, sampled_game['cluster'], metric='cosine')
print(f"\nSilhouette Score (cluster quality): {sil_score:.3f}")


Silhouette Score (cluster quality): 0.015


In [17]:
print("\n--- Comparison ---")
print(f"Hierarchical Silhouette Score: {sil_score:.3f}")
print(f"KMeans Silhouette Score: {sil_score_kmeans:.3f}")

if sil_score_kmeans > sil_score:
    print("KMeans performed better based on silhouette score.")
else:
    print("Hierarchical clustering performed better based on silhouette score.")



--- Comparison ---
Hierarchical Silhouette Score: 0.015
KMeans Silhouette Score: 0.077
KMeans performed better based on silhouette score.


In [18]:
for c in range(3):
    print(f"\nCluster {c} examples:")
    print(sampled_game[sampled_game['cluster'] == c]['name'].head(5).to_list())



Cluster 0 examples:
['The Church in the Darkness', 'Signs of the Sojourner: Prologue', 'Pillars of Eternity - Definitive Edition', "Assassin's Creed Unity", 'Neptunia x Senran Kagura: Ninja Wars']

Cluster 1 examples:
['Delta Force — Black Hawk Down: Team Sabre', 'Kraken Academy!!', 'Hitman HD Pack', 'Unreal Tournament (2014)', 'Punch Club 2: Fast Forward']

Cluster 2 examples:
['Avorion', 'Star Chef 2: Cooking Game', 'Endless Sky', 'Project CARS', 'Roots of Pacha']


In [19]:
def recommend_by_input(user_input, top_n=5):
    user_input = user_input.lower()
    user_vector = tfidf.transform([user_input]).toarray()
    user_vector = normalize(user_vector)

    similarities = cosine_similarity(user_vector, X_sample)
    closest_game_idx = np.argmax(similarities)
    user_cluster = sampled_game.iloc[closest_game_idx]['cluster']

    cluster_mask = sampled_game['cluster'] == user_cluster
    cluster_games = sampled_game[cluster_mask]
    cluster_vectors = X_sample[cluster_mask.to_numpy()]

    cluster_sim = cosine_similarity(user_vector, cluster_vectors).flatten()
    cluster_games = cluster_games.copy()
    cluster_games['similarity'] = cluster_sim

    recommendations = cluster_games.sort_values(by='similarity', ascending=False).head(top_n)

    print(f"\nTop {top_n} games for input: '{user_input}' (Cluster {user_cluster})")
    for i, row in recommendations.iterrows():
        print(f"- {row['name']} | Similarity: {row['similarity']:.3f}")

In [20]:
# %% Test GTA V with KMeans clusters

def recommend_by_input_kmeans(user_input, top_n=5):
    user_input = user_input.lower()
    user_vector = tfidf.transform([user_input]).toarray()
    user_vector = normalize(user_vector)

    # Find the nearest game in TF-IDF space
    similarities = cosine_similarity(user_vector, X_sample)
    closest_game_idx = np.argmax(similarities)
    user_cluster = sampled_game.iloc[closest_game_idx]['cluster_kmeans']

    # Recommend from the same KMeans cluster
    cluster_mask = sampled_game['cluster_kmeans'] == user_cluster
    cluster_games = sampled_game[cluster_mask]
    cluster_vectors = X_sample[cluster_mask.to_numpy()]

    cluster_sim = cosine_similarity(user_vector, cluster_vectors).flatten()

    cluster_games = cluster_games.copy()
    cluster_games['similarity'] = cluster_sim
    recommendations = cluster_games.sort_values(by='similarity', ascending=False).head(top_n)

    print(f"\nTop {top_n} games for input: '{user_input}' (KMeans Cluster {user_cluster})")
    for _, row in recommendations.iterrows():
        print(f"- {row['name']} | Similarity: {row['similarity']:.3f}")

gta_input = "PC Action Open World Crime Multiplayer"
recommend_by_input_kmeans(gta_input, top_n=5)



Top 5 games for input: 'pc action open world crime multiplayer' (KMeans Cluster 14)
- Kingpin — Life of Crime | Similarity: 0.482
- Batla | Similarity: 0.352
- Delta Force: Task Force Dagger | Similarity: 0.338
- Kraken Academy!! | Similarity: 0.302
- Thief Simulator VR | Similarity: 0.298


In [21]:
test_queries_kmeans = [
    "story",
    "open world",
    "single player",
    "pc",
    "sandbox"
]

for query in test_queries:
    print("\n" + "="*70)
    recommend_by_input(query, top_n=5)

# %% Example: GTA V
print("\n" + "="*70)
gta_input = "PC Action Open World Crime Multiplayer"
recommend_by_input(gta_input, top_n=5)