In [2]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd

In [3]:
!unzip game_cards.zip
!ls

Archive:  game_cards.zip
  inflating: game_cards.csv          
cosine_similarity.pkl  metacritic_parser   requirements.txt	  venv
game_cards.csv	       prepare_data.ipynb  telegram_bot.py
game_cards.zip	       README.md	   unique_game_cards.csv


In [4]:
df = pd.read_csv('./game_cards.csv')

In [5]:
df.head()

Unnamed: 0,name,platform,date,summary,metascore,userscore,href
0,Sid Meier s Civilization II,PC,"February 29, 1996",An empire-building turn-based strategy game. T...,94,8.8,/game/pc/sid-meiers-civilization-ii
1,Quake,PC,"June 22, 1996",Rage through 32 single player levels and 6 dea...,94,8.7,/game/pc/quake
2,Diablo,PC,"December 31, 1996",The kingdom of Khandaras has fallen into chaos...,94,8.6,/game/pc/diablo
3,Super Mario 64,Nintendo 64,"September 26, 1996",Mario is super in a whole new way! Combining t...,94,9.1,/game/nintendo-64/super-mario-64
4,Wipeout XL,PlayStation,"September 30, 1996",The original scorched the game world and becam...,93,8.6,/game/playstation/wipeout-xl


In [6]:
df.shape

(19837, 7)

In [7]:
unique_game_cards = df.drop_duplicates('name')
unique_game_cards.shape

(12829, 7)

In [8]:
!rm game_cards.csv

In [9]:
unique_game_cards.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12829 entries, 0 to 19836
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   name       12829 non-null  object
 1   platform   12829 non-null  object
 2   date       12829 non-null  object
 3   summary    12748 non-null  object
 4   metascore  12829 non-null  int64 
 5   userscore  12829 non-null  object
 6   href       12829 non-null  object
dtypes: int64(1), object(6)
memory usage: 801.8+ KB


In [10]:
userscore_mean = unique_game_cards[unique_game_cards['userscore'] != 'tbd']['userscore'].astype('float').mean()
unique_game_cards['userscore'] = unique_game_cards['userscore'].replace('tbd', userscore_mean).astype('float')

In [11]:
unique_game_cards.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12829 entries, 0 to 19836
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   name       12829 non-null  object 
 1   platform   12829 non-null  object 
 2   date       12829 non-null  object 
 3   summary    12748 non-null  object 
 4   metascore  12829 non-null  int64  
 5   userscore  12829 non-null  float64
 6   href       12829 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 801.8+ KB


In [13]:
unique_game_cards.to_csv('unique_game_cards.csv', index=False)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
unique_game_cards['summary'] = unique_game_cards['summary'].astype('U')

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(unique_game_cards['summary'])

tfidf_matrix.shape

(12829, 37840)

In [14]:
cos_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cos_sim.shape

(12829, 12829)

In [15]:
indices = pd.Series(unique_game_cards.index, index=unique_game_cards['name'])

cos_df = pd.DataFrame(cos_sim)

cos_df.columns = indices.index

cos_df['name'] = indices.index
cos_df = cos_df.set_index('name')

cos_df.head()

name,Sid Meier s Civilization II,Quake,Diablo,Super Mario 64,Wipeout XL,Wave Race 64,Tomb Raider,Resident Evil,Command & Conquer: Red Alert,Tekken 2,...,Dragon Ball: The Breakers,Blind Fate: Edo no Yami,In Nightmare,Blade Runner: Enhanced Edition,The Waylanders,Zorro: The Chronicles,Babylon s Fall,LEGO Brawls,CrossfireX,POSTAL 4: No Regerts
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Sid Meier s Civilization II,1.0,0.0,0.0,0.012226,0.015223,0.0,0.0,0.0,0.00284,0.01824,...,0.003517,0.030888,0.003381,0.012914,0.041194,0.055747,0.01193,0.006644,0.007067,0.007686
Quake,0.0,1.0,0.0,0.019436,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.021389,0.0,0.018832,0.0,0.0,0.0,0.0
Diablo,0.0,0.0,1.0,0.010456,0.002868,0.003003,0.018073,0.005576,0.032633,0.0,...,0.005751,0.030192,0.026296,0.013454,0.0,0.030582,0.034874,0.001294,0.001903,0.01272
Super Mario 64,0.012226,0.019436,0.010456,1.0,0.034077,0.055182,0.016893,0.0,0.005262,0.008448,...,0.024315,0.015385,0.015973,0.031407,0.0,0.002568,0.006149,0.021765,0.0,0.006013
Wipeout XL,0.015223,0.0,0.002868,0.034077,1.0,0.0145,0.0,0.0,0.010276,0.010518,...,0.009373,0.019424,0.004594,0.023171,0.017006,0.003198,0.007656,0.002449,0.013219,0.009803


In [16]:
cos_df.to_pickle('cosine_similarity.pkl') # ~ 1.2 gb