In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/naucno-izracunavanje/pivot_table.csv
/kaggle/input/naucno-izracunavanje/test_data.csv
/kaggle/input/netflix-models/best_knn_model.pkl
/kaggle/input/netflix-models/best_svd_model.pkl


In [2]:
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import time

In [3]:
with open('/kaggle/input/netflix-models/best_knn_model.pkl', 'rb') as f:
    best_knn_model = pickle.load(f)

with open('/kaggle/input/netflix-models/best_svd_model.pkl', 'rb') as f:
    best_svd_model = pickle.load(f)

In [4]:
df_pivot = pd.read_csv('/kaggle/input/naucno-izracunavanje/pivot_table.csv', index_col=0)
print(df_pivot.head())

df_melt = df_pivot.stack().reset_index().rename(columns={'level_1': 'Movie_Id', 0: 'Rating'})
print(df_melt.head())

          3    8  16  17  18  26   28   30  32  33  ...  4472  4474  4478  \
Cust_Id                                             ...                     
6       NaN  NaN NaN NaN NaN NaN  NaN  3.0 NaN NaN  ...   NaN   NaN   NaN   
7       NaN  5.0 NaN NaN NaN NaN  4.0  5.0 NaN NaN  ...   3.0   NaN   NaN   
79      NaN  NaN NaN NaN NaN NaN  NaN  NaN NaN NaN  ...   4.0   NaN   NaN   
97      NaN  NaN NaN NaN NaN NaN  NaN  NaN NaN NaN  ...   NaN   NaN   NaN   
134     NaN  NaN NaN NaN NaN NaN  NaN  NaN NaN NaN  ...   NaN   NaN   NaN   

         4479  4485  4488  4490  4492  4493  4496  
Cust_Id                                            
6         NaN   NaN   NaN   NaN   NaN   NaN   NaN  
7         NaN   NaN   NaN   NaN   NaN   NaN   NaN  
79        NaN   NaN   NaN   4.0   NaN   NaN   NaN  
97        NaN   NaN   NaN   NaN   NaN   NaN   NaN  
134       NaN   NaN   NaN   NaN   NaN   NaN   NaN  

[5 rows x 1350 columns]
   Cust_Id Movie_Id  Rating
0        6       30     3.0
1        6     

In [5]:
train_data, val_data = train_test_split(df_melt, test_size=0.25, random_state=42)

print(f"Trening podaci: {len(train_data)} redova")
print(f"Validacioni podaci: {len(val_data)} redova")

Trening podaci: 9102165 redova
Validacioni podaci: 3034055 redova


In [6]:
# Funkcija za generisanje dodatnih atributa koje ce koristiti hibridni modeli
# Funkcija generise: 
#     knn_pred - Predvidjanje KNN modela
#     svd_pred - Predvidjanje SVD modela
#     movie_mean_rating - Prosecna ocena filma
#     user_mean_rating - Prosecna ocena koju korisnik daje
#     user_review_count - Ukupan broj ocena koje je korisnik dao
#     movie_review_count - Ukupan broj ocena koje film ima
#     user_rating_variability - Varijansa ocena korisnika
    
def generate_features_targets(data, knn_model, svd_model, pivot_table):
    df_pivot = pivot_table.copy()
    
    knn_preds = np.array([knn_model.predict(row['Cust_Id'], row['Movie_Id']).est for _, row in data.iterrows()])
    svd_preds = np.array([svd_model.predict(row['Cust_Id'], row['Movie_Id']).est for _, row in data.iterrows()])
    
    movie_mean_ratings = df_pivot.mean()
    user_mean_ratings = df_pivot.mean(axis=1)
    user_review_counts = df_pivot.notna().sum(axis=1)
    movie_review_counts = df_pivot.notna().sum(axis=0)
    user_rating_variabilities = df_pivot.std(axis=1)
    
    data['knn_pred'] = data.apply(lambda row: knn_model.predict(row['Cust_Id'], row['Movie_Id']).est, axis=1)
    data['svd_pred'] = data.apply(lambda row: svd_model.predict(row['Cust_Id'], row['Movie_Id']).est, axis=1)
    data['movie_mean_rating'] = data['Movie_Id'].map(movie_mean_ratings)
    data['user_mean_rating'] = data['Cust_Id'].map(user_mean_ratings)
    data['user_review_count'] = data['Cust_Id'].map(user_review_counts)
    data['movie_review_count'] = data['Movie_Id'].map(movie_review_counts)
    data['user_rating_variability'] = data['Cust_Id'].map(user_rating_variabilities)
    
    features = data[['knn_pred', 'svd_pred', 'movie_mean_rating', 'user_mean_rating', 'user_review_count', 'movie_review_count', 'user_rating_variability']]
    targets = data['Rating']
    
    return features, targets

In [7]:
start_time = time.time()
X_train, y_train = generate_features_targets(train_data, best_knn_model, best_svd_model, df_pivot)
end_time = time.time()
print(f"Vreme generisanja feature-a: {end_time - start_time:.2f} sekundi")

Vreme generisanja feature-a: 4619.67 sekundi


In [8]:
start_time = time.time()
X_val, y_val = generate_features_targets(val_data, best_knn_model, best_svd_model, df_pivot)
end_time = time.time()
print(f"Vreme generisanja feature-a: {end_time - start_time:.2f} sekundi")

Vreme generisanja feature-a: 1547.39 sekundi


In [9]:
test_data = pd.read_csv('/kaggle/input/naucno-izracunavanje/test_data.csv')
print(test_data.head())

   Cust_Id  Rating  Movie_Id
0   524869     3.0      4330
1  1726226     3.0      1467
2   310055     3.0      3522
3   680917     4.0      3650
4  2631815     4.0      1885


In [10]:
start_time = time.time()
X_test, y_test = generate_features_targets(test_data, best_knn_model, best_svd_model, df_pivot)
end_time = time.time()
print(f"Vreme generisanja feature-a: {end_time - start_time:.2f} sekundi")

Vreme generisanja feature-a: 977.78 sekundi


In [13]:
X_train.loc[:, 'target'] = y_train
X_val.loc[:, 'target'] = y_val
X_test.loc[:, 'target'] = y_test

X_train.to_csv('train_features.csv', index=False)
X_val.to_csv('val_features.csv', index=False)
X_test.to_csv('test_features.csv', index=False)

In [15]:
import shutil

shutil.make_archive('/kaggle/working/features', 'zip', '/kaggle/working', 
                    'train_features.csv')
shutil.make_archive('/kaggle/working/features', 'zip', '/kaggle/working', 
                    'test_features.csv')
shutil.make_archive('/kaggle/working/features', 'zip', '/kaggle/working', 
                    'val_features.csv')

!cp /kaggle/working/features.zip /kaggle/input/

cp: cannot create regular file '/kaggle/input/features.zip': Read-only file system


In [21]:
for dirname, _, filenames in os.walk('/kaggle/working'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/working/train_features.csv
/kaggle/working/test_features.csv
/kaggle/working/features.zip
/kaggle/working/val_features.csv


aaa
