In [11]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/naucno-izracunavanje/pivot_table.csv
/kaggle/input/naucno-izracunavanje/test_data.csv
/kaggle/input/pivot-table-csv/pivot_table.csv
/kaggle/input/movie-titles-csv/movie_titles.csv


In [12]:
from surprise import Dataset, Reader, KNNBasic, accuracy
from surprise.model_selection import train_test_split, GridSearchCV
import pickle

In [13]:
import pandas as pd
import numpy as np
import time
df_pivot = pd.read_csv('/kaggle/input/naucno-izracunavanje/pivot_table.csv', index_col=0)
print(df_pivot.columns)

Index(['3', '8', '16', '17', '18', '26', '28', '30', '32', '33',
       ...
       '4472', '4474', '4478', '4479', '4485', '4488', '4490', '4492', '4493',
       '4496'],
      dtype='object', length=1350)


In [14]:
df_melt = df_pivot.stack().reset_index().rename(columns={'level_1': 'Movie_Id', 0: 'Rating'})
print(df_melt.columns)
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_melt[['Cust_Id', 'Movie_Id', 'Rating']], reader)

Index(['Cust_Id', 'Movie_Id', 'Rating'], dtype='object')


In [15]:
trainset, validset = train_test_split(data, test_size=0.25)

In [16]:
similarity_options = {'name': 'cosine', 'user_based': False}
best_rmse = float('inf')
best_model = None

In [17]:
for k in [25, 30, 35, 40, 45]:
    algo = KNNBasic(k=k, sim_options=similarity_options)
    start_time = time.time()
    algo.fit(trainset)
    end_time = time.time()
    print(f"Vreme fitovanja: {end_time - start_time:.2f} sekundi")
    start_time = time.time()
    predictions = algo.test(validset)
    end_time = time.time()
    print(f"Vreme predvidjanja: {end_time - start_time:.2f} sekundi")
    rmse = accuracy.rmse(predictions, verbose=False)
    print(f'Cosine similarity, k={k}, RMSE={rmse}')
    
    if rmse < best_rmse:
        best_rmse = rmse
        best_model = algo
        best_params = {'similarity': 'cosine', 'k': k}


Computing the cosine similarity matrix...
Done computing similarity matrix.
Vreme fitovanja: 50.78 sekundi
Vreme predvidjanja: 635.23 sekundi
Cosine similarity, k=25, RMSE=0.9660164188205965
Computing the cosine similarity matrix...
Done computing similarity matrix.
Vreme fitovanja: 51.43 sekundi
Vreme predvidjanja: 661.47 sekundi
Cosine similarity, k=30, RMSE=0.9657651833087967
Computing the cosine similarity matrix...
Done computing similarity matrix.
Vreme fitovanja: 53.60 sekundi
Vreme predvidjanja: 693.43 sekundi
Cosine similarity, k=35, RMSE=0.9662320384445655
Computing the cosine similarity matrix...
Done computing similarity matrix.
Vreme fitovanja: 52.95 sekundi
Vreme predvidjanja: 703.04 sekundi
Cosine similarity, k=40, RMSE=0.9671032990517309
Computing the cosine similarity matrix...
Done computing similarity matrix.
Vreme fitovanja: 52.35 sekundi
Vreme predvidjanja: 717.72 sekundi
Cosine similarity, k=45, RMSE=0.9680750419831743


In [18]:
similarity_options['name'] = 'pearson'
for k in [10, 15, 18, 20, 25]:
    algo = KNNBasic(k=k, sim_options=similarity_options)
    start_time = time.time()
    algo.fit(trainset)
    end_time = time.time()
    print(f"Vreme fitovanja: {end_time - start_time:.2f} sekundi")
    start_time = time.time()
    predictions = algo.test(validset)
    end_time = time.time()
    print(f"Vreme predvidjanja: {end_time - start_time:.2f} sekundi")
    rmse = accuracy.rmse(predictions, verbose=False)
    print(f'Pearson similarity, k={k}, RMSE={rmse}')
    
    if rmse < best_rmse:
        best_rmse = rmse
        best_model = algo
        best_params = {'similarity': 'pearson', 'k': k}

Computing the pearson similarity matrix...
Done computing similarity matrix.
Vreme fitovanja: 64.09 sekundi
Vreme predvidjanja: 507.93 sekundi
Pearson similarity, k=10, RMSE=0.9325130250507827
Computing the pearson similarity matrix...
Done computing similarity matrix.
Vreme fitovanja: 62.57 sekundi
Vreme predvidjanja: 548.46 sekundi
Pearson similarity, k=15, RMSE=0.9267968734613501
Computing the pearson similarity matrix...
Done computing similarity matrix.
Vreme fitovanja: 61.24 sekundi
Vreme predvidjanja: 578.52 sekundi
Pearson similarity, k=18, RMSE=0.9262457375031656
Computing the pearson similarity matrix...
Done computing similarity matrix.
Vreme fitovanja: 61.71 sekundi
Vreme predvidjanja: 587.67 sekundi
Pearson similarity, k=20, RMSE=0.9264013583605124
Computing the pearson similarity matrix...
Done computing similarity matrix.
Vreme fitovanja: 60.99 sekundi
Vreme predvidjanja: 631.99 sekundi
Pearson similarity, k=25, RMSE=0.9277665644010181


In [19]:
with open('best_knn_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

print(f'Najbolji model koristi {best_params["similarity"]} sličnost sa k={best_params["k"]} i RMSE={best_rmse}')


Najbolji model koristi pearson sličnost sa k=18 i RMSE=0.9262457375031656


In [20]:
test_data = pd.read_csv('test_data.csv')
test_data = Dataset.load_from_df(test_data[['Cust_Id', 'Movie_Id', 'Rating']], reader)
testset = test_data.build_full_trainset().build_testset()

with open('best_knn_model.pkl', 'rb') as f:
    best_model = pickle.load(f)

predictions = best_model.test(testset)
final_rmse = accuracy.rmse(predictions)

FileNotFoundError: [Errno 2] No such file or directory: 'test_data.csv'