In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the raw reviews table; the path is relative to the notebook's working directory.
game = pd.read_csv("video_game_reviews.csv")

In [4]:
# Preview the first rows to see the available columns and sample values.
game.head()

Unnamed: 0,Game Title,User Rating,Age Group Targeted,Price,Platform,Requires Special Device,Developer,Publisher,Release Year,Genre,Multiplayer,Game Length (Hours),Graphics Quality,Soundtrack Quality,Story Quality,User Review Text,Game Mode,Min Number of Players
0,Grand Theft Auto V,36.4,All Ages,41.41,PC,No,Game Freak,Innersloth,2015,Adventure,No,55.3,Medium,Average,Poor,"Solid game, but too many bugs.",Offline,1
1,The Sims 4,38.3,Adults,57.56,PC,No,Nintendo,Electronic Arts,2015,Shooter,Yes,34.6,Low,Poor,Poor,"Solid game, but too many bugs.",Offline,3
2,Minecraft,26.8,Teens,44.93,PC,Yes,Bungie,Capcom,2012,Adventure,Yes,13.9,Low,Good,Average,"Great game, but the graphics could be better.",Offline,5
3,Bioshock Infinite,38.4,All Ages,48.29,Mobile,Yes,Game Freak,Nintendo,2015,Sports,No,41.9,Medium,Good,Excellent,"Solid game, but the graphics could be better.",Online,4
4,Half-Life: Alyx,30.1,Adults,55.49,PlayStation,Yes,Game Freak,Epic Games,2022,RPG,Yes,13.2,High,Poor,Good,"Great game, but too many bugs.",Offline,1


In [6]:
# (number of rows, number of columns) of the loaded frame.
game.shape

(47774, 18)

In [7]:
# Per-column non-null counts and dtypes.
game.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47774 entries, 0 to 47773
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Game Title               47774 non-null  object 
 1   User Rating              47774 non-null  float64
 2   Age Group Targeted       47774 non-null  object 
 3   Price                    47774 non-null  float64
 4   Platform                 47774 non-null  object 
 5   Requires Special Device  47774 non-null  object 
 6   Developer                47774 non-null  object 
 7   Publisher                47774 non-null  object 
 8   Release Year             47774 non-null  int64  
 9   Genre                    47774 non-null  object 
 10  Multiplayer              47774 non-null  object 
 11  Game Length (Hours)      47774 non-null  float64
 12  Graphics Quality         47774 non-null  object 
 13  Soundtrack Quality       47774 non-null  object 
 14  Story Quality         

In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
game.describe()

Unnamed: 0,User Rating,Price,Release Year,Game Length (Hours),Min Number of Players
count,47774.0,47774.0,47774.0,47774.0,47774.0
mean,29.719329,39.951371,2016.480952,32.481672,5.116758
std,7.550131,11.520342,4.027276,15.872508,2.769521
min,10.1,19.99,2010.0,5.0,1.0
25%,24.3,29.99,2013.0,18.8,3.0
50%,29.7,39.845,2016.0,32.5,5.0
75%,35.1,49.9575,2020.0,46.3,7.0
max,49.5,59.99,2023.0,60.0,10.0


In [9]:
# Missing-value count per column (`isna` is the method `isnull` aliases).
game.isna().sum()

Game Title                 0
User Rating                0
Age Group Targeted         0
Price                      0
Platform                   0
Requires Special Device    0
Developer                  0
Publisher                  0
Release Year               0
Genre                      0
Multiplayer                0
Game Length (Hours)        0
Graphics Quality           0
Soundtrack Quality         0
Story Quality              0
User Review Text           0
Game Mode                  0
Min Number of Players      0
dtype: int64

In [10]:
# Number of rows that are exact duplicates (across all columns) of an earlier row.
game.duplicated().sum()

np.int64(0)

In [11]:
# Derive a descriptive play-mode label from the Yes/No 'Multiplayer' flag:
# 'Yes' -> 'MultiPlayer', anything else -> 'Solo'.
game['Play_Mode'] = game['Multiplayer'].eq('Yes').map({True: 'MultiPlayer', False: 'Solo'})

In [12]:
# Keep only the descriptive columns used further down.
# `.copy()` makes the subset an independent frame, so the column assignments in
# later cells write to `game` itself and cannot raise SettingWithCopyWarning.
game = game[['Game Title', 'Age Group Targeted', 'Platform', 'Developer','Publisher', 'Genre', 'User Review Text', 'Game Mode', 'Play_Mode']].copy()

In [13]:
def bland_space_remover(obj):
    """Return ``obj`` with every literal space character removed from its strings.

    Parameters
    ----------
    obj : pandas.Series
        Any object exposing the pandas ``.str`` accessor; here it is called
        once per column through ``DataFrame.apply``.

    Returns
    -------
    Same kind of object as ``obj``, with ``' '`` deleted from each string
    (plain substring replacement, not a regular expression).
    """
    despaced = obj.str.replace(' ', '', regex=False)
    return despaced

In [14]:
# Strip the spaces out of these three columns so that a multi-word value
# (e.g. "All Ages") is written as a single word ("AllAges").
space_free_columns = ['Age Group Targeted', 'Developer', 'Publisher']
game[space_free_columns] = game[space_free_columns].apply(bland_space_remover)

In [15]:
# Build one space-separated tag string per row, in this fixed column order:
# age group, platform, developer, publisher, genre, game mode, play mode.
tag_tail_columns = ['Platform', 'Developer', 'Publisher', 'Genre', 'Game Mode', 'Play_Mode']
game['Tags'] = game['Age Group Targeted'].str.cat(game[tag_tail_columns], sep=" ")

In [16]:
# Reduce the frame to the title and its tag string.
# `.copy()` makes the subset an independent frame, so the later in-place update
# of 'Tags' writes to `game` itself and cannot raise SettingWithCopyWarning.
game = game[['Game Title', 'Tags']].copy()

In [17]:
# Normalise the tag strings to lower case.
game['Tags'] = game['Tags'].str.lower()

In [18]:
# Inspect the resulting frame (pandas abbreviates the display of long frames).
game

Unnamed: 0,Game Title,Tags
0,Grand Theft Auto V,allages pc gamefreak innersloth adventure offl...
1,The Sims 4,adults pc nintendo electronicarts shooter offl...
2,Minecraft,teens pc bungie capcom adventure offline multi...
3,Bioshock Infinite,allages mobile gamefreak nintendo sports onlin...
4,Half-Life: Alyx,adults playstation gamefreak epicgames rpg off...
...,...,...
47769,Minecraft,allages playstation valve innersloth party onl...
47770,The Legend of Zelda: Breath of the Wild,kids xbox valve electronicarts strategy offlin...
47771,Animal Crossing: New Horizons,allages playstation gamefreak squareenix sport...
47772,The Legend of Zelda: Breath of the Wild,kids xbox epicgames epicgames simulation offli...


In [19]:
# Bag-of-words encoder for the tag strings: the vocabulary is capped at the 50
# most frequent terms, and scikit-learn's built-in English stop words are dropped.
cv = CountVectorizer(max_features=50, stop_words='english')
# fit_transform learns the vocabulary and returns a sparse count matrix;
# .toarray() converts it to a dense NumPy array with one row per game.
vectors = cv.fit_transform(game['Tags']).toarray()