In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import pickle

In [2]:
# Load Dataset
# Read the app catalogue; low_memory=False makes pandas parse the file in one
# pass instead of in chunks.
metadata = pd.read_csv('test.csv', low_memory=False)
# Replace missing descriptions with empty strings so that every entry in the
# column is a str.
metadata['description'] = metadata['description'].fillna('')

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None".
metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           1030 non-null   object 
 1   title        1030 non-null   object 
 2   developBy    1030 non-null   object 
 3   genre        1030 non-null   object 
 4   price        1030 non-null   object 
 5   rating       1030 non-null   float64
 6   description  1030 non-null   object 
 7   logoApp      1030 non-null   object 
 8   Url          1030 non-null   object 
dtypes: float64(1), object(8)
memory usage: 72.5+ KB
None


In [3]:
# Word Vectorize
# Turn every app description into a TF-IDF weighted term vector, ignoring
# scikit-learn's built-in English stop-word list. The result is a sparse
# matrix with one row per row of `metadata`.
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(raw_documents=metadata["description"])

print(tfidf_matrix)

  (0, 4618)	0.06454701795742221
  (0, 8767)	0.03671577912483391
  (0, 11064)	0.03497202116452975
  (0, 5329)	0.053313554685802345
  (0, 186)	0.05737270145359951
  (0, 7679)	0.022954419986426337
  (0, 1155)	0.035639023180177476
  (0, 5653)	0.05035439439108734
  (0, 4399)	0.06359962017083642
  (0, 17289)	0.04674452678769062
  (0, 10158)	0.04569639517052804
  (0, 11849)	0.03481098257628915
  (0, 1689)	0.04491997565600369
  (0, 5991)	0.03769453097915857
  (0, 16506)	0.043843219711347255
  (0, 14680)	0.04765544949564158
  (0, 6245)	0.024522337565550185
  (0, 7444)	0.04401601334440151
  (0, 17499)	0.04742106255629037
  (0, 13191)	0.03843124051055994
  (0, 5251)	0.03282755451350397
  (0, 3523)	0.05737270145359951
  (0, 13089)	0.08095541101976175
  (0, 15903)	0.05737270145359951
  (0, 2643)	0.042544806471461895
  :	:
  (1029, 8865)	0.02015249230469032
  (1029, 5322)	0.012458376206850847
  (1029, 14824)	0.020092331161681756
  (1029, 11801)	0.029588394864598147
  (1029, 19094)	0.1106933776123904

In [4]:
# Create app list and pandas series
# Lookup table: app title (index) -> row label of that app in `metadata`.
# NOTE: Series.drop_duplicates() compares the *values* (the row labels), not
# the titles in the index, so repeated titles are NOT removed by this call.
app = (
    pd.Series(data=metadata.index, index=metadata["title"])
    .drop_duplicates()
)

print(app)

title
Yatzy - Dice Game                               0
Real Chess 3D                                   1
Ludo King™                                      2
Checkers                                        3
Ludo Club - Fun Dice Game                       4
                                             ... 
NFT Creator - NinjaFT                        1025
Concepts: Sketch Note Draw                   1026
SketchAR Create Art and get NFT instantly    1027
Pango Paper Color : colouring                1028
ibis Paint                                   1029
Length: 1030, dtype: int64


In [5]:
# Compute cosine (similarity)
# Pairwise dot products between all description vectors: entry [i, j] is the
# similarity between app i and app j. TfidfVectorizer L2-normalises rows by
# default, so the plain dot product equals the cosine similarity.
cosine_overview = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)
print(cosine_overview)

[[1.         0.06369625 0.14925818 ... 0.02107304 0.0042477  0.01039856]
 [0.06369625 1.         0.07382188 ... 0.03501472 0.00460854 0.01667988]
 [0.14925818 0.07382188 1.         ... 0.0129729  0.01060844 0.01300657]
 ...
 [0.02107304 0.03501472 0.0129729  ... 1.         0.09229257 0.10972916]
 [0.0042477  0.00460854 0.01060844 ... 0.09229257 1.         0.01395419]
 [0.01039856 0.01667988 0.01300657 ... 0.10972916 0.01395419 1.        ]]


In [6]:
# Dump apps list and cosine to pickle
# The context manager guarantees the file is flushed and closed even if
# pickling fails part-way, instead of leaving the handle to the garbage collector.
with open('app.p', 'wb') as pickle_file:
    pickle.dump((app, cosine_overview), pickle_file)

In [7]:
# Load apps list and cosine from pickle
# SECURITY: pickle.load can execute arbitrary code while deserialising; only
# load an 'app.p' that was produced by this notebook / a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open('app.p', 'rb') as pickle_file:
    apps, cosine_overview = pickle.load(pickle_file)

def get_recommendations_returnList(title, cosine_sim=cosine_overview, top_n=10):
    """Return the titles of the apps most similar to ``title``.

    Parameters
    ----------
    title : str
        Title of the app to find recommendations for. It must be present in
        the index of the ``apps`` lookup Series.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix in which row/column ``i`` corresponds to
        the ``i``-th entry of ``apps``. Defaults to the matrix loaded from
        the pickle file.
    top_n : int, optional
        Maximum number of recommendations to return (non-negative,
        default 10).

    Returns
    -------
    list
        Up to ``top_n`` app titles ordered from most to least similar. The
        queried app itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not a known app title.
    """
    # Use the lookup table that was loaded together with the similarity
    # matrix (`apps`), not a variable left over from an earlier cell.
    idx = apps[title]
    # A title that occurs more than once yields a Series of row numbers;
    # fall back to its first occurrence instead of failing later on.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Drop the queried app explicitly. Relying on it sorting first breaks
    # when another app has an identical description (tie at the top) or when
    # the description is empty (self-similarity of 0).
    sim_scores = [
        (row, score)
        for row, score in enumerate(cosine_sim[idx])
        if row != idx
    ]
    # list.sort is stable, so equally similar apps keep their original order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    app_indices = [row for row, _ in sim_scores[:top_n]]

    # Map row positions back to titles. Indexing the Index object is always
    # positional, unlike `apps[list_of_ints]` on a string-indexed Series.
    return list(apps.index[app_indices])

In [8]:
# Example query: list the apps whose descriptions are closest to this title.
text_recomend = get_recommendations_returnList(title='Nobodies: Murder Cleaner')

print(text_recomend)

['Critical Strike CS: Counter Terrorist Online FPS', 'Midsomer Murders: Mysteries', 'Unholy Adventure: point and click story game', 'Art Puzzle - picture art games', 'Criminal Case', 'Words of Wonders: Search', 'Wonder Word - A Fun Free Word Search Puzzle Game', 'Empires and Allies', 'Amnesia - Room Escape Games', 'Toy Blast']
