In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix

import random
import os

# MovieLens 100k

In [13]:
# Root of the MovieLens 100k data plus its raw-input and cleaned-output subfolders.
PATH = '../data/movielens/100k/'
LOAD_PATH, CLEAN_PATH = (os.path.join(PATH, subdir) for subdir in ('raw', 'clean'))

In [14]:
# Ratings file: tab-separated, no header row, first four columns only.
r_cols = ['userId', 'movieId', 'rating', 'timestamp']
ratings_file = os.path.join(LOAD_PATH, 'u.data')
ratings = pd.read_csv(ratings_file, sep='\t', names=r_cols, usecols=range(4), encoding="ISO-8859-1")

# Movies file: pipe-separated; five descriptive columns followed by 19 feature columns.
m_cols = ['movie_id', 'title', 'date', 'idk', "url"] + [f'feature{i + 1}' for i in range(19)]
movies_file = os.path.join(LOAD_PATH, 'u.item')
movies = pd.read_csv(movies_file, sep='|', names=m_cols, encoding="ISO-8859-1")
# The 'idk' and 'url' columns are not carried into the cleaned data.
movies = movies.drop(['idk', 'url'], axis=1)

ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [15]:
# Quick sanity summary: (distinct users, distinct movies, mean rating, rating std).
rating_values = ratings['rating']
(
    ratings['userId'].nunique(),
    ratings['movieId'].nunique(),
    rating_values.mean(),
    rating_values.std(),
)

(943, 1682, 3.52986, 1.125673599144316)

In [16]:
# Dimensions of the ratings table as (rows, columns).
ratings.shape

(100000, 4)

In [17]:
# Drop the last seven characters of every title
# (presumably the trailing " (YYYY)" year suffix - confirm against the raw file).
movies['title'] = movies['title'].map(lambda title: title[:-7])
# Parse the date strings and store them as the integer representation divided by
# 10**9 (i.e. seconds, assuming the default nanosecond resolution).
# NOTE(review): rows without a date would parse to NaT; check what value they end up with.
parsed_dates = pd.to_datetime(movies['date'].values)
movies['date'] = parsed_dates.astype(int) / 10**9

In [18]:
# Persist the cleaned movies table without the row index.
movies_csv = os.path.join(CLEAN_PATH, 'movies.csv')
movies.to_csv(movies_csv, index=False)

In [19]:
# Persist the ratings table without the row index.
ratings_csv = os.path.join(CLEAN_PATH, 'ratings.csv')
ratings.to_csv(ratings_csv, index=False)

In [20]:
# One row per user, one column per movie; movies a user did not rate are filled with 0.
user_profiles = (
    ratings
    .pivot_table(index=['userId'], columns=['movieId'], values='rating')
    .fillna(0)
)
user_profiles

movieId,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Hold out 20% of the user profiles (fixed seed for a repeatable split).
# The held-out part has to be called `test_data`: the following cells save and
# iterate over `test_data`, which the former name `rest_data` left undefined on
# a fresh kernel.
train_data, test_data = train_test_split(user_profiles, test_size=0.2, shuffle=True, random_state=42)
# Legacy alias so code written against the old name keeps working.
rest_data = test_data

In [23]:
# bound = len(rest_data) // 2
# val_data, test_data = rest_data.iloc[:bound], rest_data.iloc[bound:]

In [24]:
# Write both splits as CSV (row index omitted) and as protocol-2 pickles.
csv_targets = [(train_data, 'train_data.csv'), (test_data, 'test_data.csv')]
pickle_targets = [(train_data, 'train_data.pickle'), (test_data, 'test_data.pickle')]

for split_frame, file_name in csv_targets:
    split_frame.to_csv(os.path.join(CLEAN_PATH, file_name), index=False)

for split_frame, file_name in pickle_targets:
    split_frame.to_pickle(os.path.join(CLEAN_PATH, file_name), protocol=2)

## Test data prepared for recommendations

In [28]:
# Build the evaluation profiles: for every test user pick one "control" movie and
# hide it, together with everything the user rated after it, from the profile.
fr = 2  # lowest 0-based position (in time order) that may become the control item
holdout_rng = random.Random(42)  # dedicated seeded generator -> reproducible control items
user_profile_for_recom = []

# Sort once by timestamp so "later" really means later in time; mergesort is
# stable, so ratings sharing a timestamp keep their file order.
ratings_by_time = ratings.sort_values('timestamp', kind='mergesort')

# The row is bound to `user_row` (not `movies`) so the movies DataFrame is not overwritten.
for user, user_row in test_data.iterrows():
    # The user's original ratings in chronological order.
    user_orig_items = ratings_by_time[ratings_by_time['userId'] == user]
    n_rated = user_orig_items['movieId'].nunique()
    if n_rated < 2:
        # Position 1 does not exist, so nothing can be held out for this user.
        continue
    # Random position of the control item; with exactly two movies it is always position 1.
    del_item_index = holdout_rng.randint(fr, n_rated - 1) if n_rated > 2 else 1

    # The movie the recommender will later be queried for.
    control_item = user_orig_items['movieId'].iloc[del_item_index]
    # The control item and every rating after it have to be hidden.
    items_to_remove = user_orig_items['movieId'].iloc[del_item_index:].values

    # Zero the hidden movies by label. (Series.replace works on the stored
    # rating values, not on movie labels, so it cannot be used for this.)
    user_movies_profile_simple = user_row.copy()
    user_movies_profile_simple.loc[items_to_remove] = 0.0

    # Movies that remain visible in the profile.
    user_keep_movies = user_movies_profile_simple[user_movies_profile_simple > 0.0].index.values

    # Matrix factorisation needs the profile as a one-row DataFrame;
    # copy it so zeroing the hidden ratings cannot touch test_data.
    user_movies_profile_dataframe = test_data.loc[[user]].copy()
    user_movies_profile_dataframe.loc[:, items_to_remove] = 0.0

    user_profile_for_recom.append([user, user_movies_profile_simple, user_movies_profile_dataframe, control_item, user_keep_movies])

retDf = pd.DataFrame(user_profile_for_recom, columns=['user','user_movies_profile_simple', 'user_movies_profile_dataframe', 'control_item', 'user_keep_movies'])
retDf = retDf.set_index('user')
# retDf.to_pickle(os.path.join(CLEAN_PATH, 'test_data_prepare_for_recom.pickle'), protocol=2)
# retDf.to_csv('test_data_prepared.csv', index=False)

In [29]:
# Split the prepared profiles into a validation half and a test half.
bound = len(retDf) // 2
val_df = retDf.iloc[:bound]
test_df = retDf.iloc[bound:]

# Save each half to its own file (previously the full retDf was written to
# both files, which made the validation and test sets identical).
val_df.to_pickle(os.path.join(CLEAN_PATH, 'val_data_prepare_for_recom.pickle'), protocol=2)
test_df.to_pickle(os.path.join(CLEAN_PATH, 'test_data_prepare_for_recom.pickle'), protocol=2)

## Per-item sum of rating differences (rating minus the user's mean rating)

In [30]:
# Reload the training profiles written earlier by this notebook.
train_data = pd.read_pickle(os.path.join(CLEAN_PATH, 'train_data.pickle'))

# One record per training user: [userId, row position, mean of the non-zero ratings].
user_avg_rating = []
for row_position, (user_id, profile) in enumerate(train_data.iterrows()):
    profile_values = profile.values
    given_ratings = profile_values[profile_values != 0]  # zeros mark unrated movies
    user_avg_rating.append([user_id, row_position, given_ratings.sum() / len(given_ratings)])

user_avg_ratingDf = pd.DataFrame(user_avg_rating, columns=['userId', 'index', 'avg_rating'])

In [31]:
# For every movie, sum (rating - that user's mean rating) over the training
# users who rated it. Computed on the whole matrix at once rather than running
# one DataFrame lookup per individual rating.

# Mean rating per training row, ordered by the row position stored in 'index'.
user_means = user_avg_ratingDf.sort_values('index')['avg_rating'].values
rating_matrix = train_data.values
was_rated = rating_matrix != 0  # zeros are the fill value for unrated movies
# Deviation from the rater's own mean; unrated cells contribute exactly 0.
deviations = np.where(was_rated, rating_matrix - user_means[:, np.newaxis], 0.0)
item_totals = deviations.sum(axis=0)

# One record per movie: [movieId, column position, summed deviation].
item_suma_rating = [
    [movie_id, column_position, total]
    for column_position, (movie_id, total) in enumerate(zip(train_data.columns, item_totals))
]

item_suma_ratingDf = pd.DataFrame(item_suma_rating, columns=['movieId', 'index', 'sum_rating'])

In [32]:
# Persist the per-movie deviation sums as a protocol-2 pickle.
item_sum_pickle = os.path.join(CLEAN_PATH, 'item_sum_dif_rating.pickle')
item_suma_ratingDf.to_pickle(item_sum_pickle, protocol=2)

# MovieLens 1m

In [2]:
# Root of the MovieLens 1m data plus its raw-input and cleaned-output subfolders.
PATH = '../data/movielens/1m/'
LOAD_PATH, CLEAN_PATH = (os.path.join(PATH, subdir) for subdir in ('raw', 'clean'))

In [3]:
# Both .dat files use '::' as the separator, which requires the python parser engine.
dat_options = dict(sep='::', encoding="ISO-8859-1", engine='python')

r_cols = ['userId', 'movieId', 'rating', 'timestamp']
ratings_file = os.path.join(LOAD_PATH, 'ratings.dat')
ratings = pd.read_csv(ratings_file, names=r_cols, usecols=range(4), **dat_options)

m_cols = ['movieId', 'title', 'genres']
movies_file = os.path.join(LOAD_PATH, 'movies.dat')
movies = pd.read_csv(movies_file, names=m_cols, header=None, **dat_options)

ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [4]:
# Quick sanity summary: (distinct users, distinct movies, mean rating, rating std).
rating_values = ratings['rating']
(
    ratings['userId'].nunique(),
    ratings['movieId'].nunique(),
    rating_values.mean(),
    rating_values.std(),
)

(6040, 3706, 3.581564453029317, 1.117101845373258)

In [5]:
# Dimensions of the ratings table as (rows, columns).
ratings.shape

(1000209, 4)

In [6]:
# Load the external movie metadata table.
# NOTE(review): `movies_metadata` is not referenced again in the visible cells - confirm it is still needed.
movies_metadata_file = '../data/movies_metadata.csv'
movies_metadata = pd.read_csv(movies_metadata_file, low_memory=False)

In [7]:
# Order matters: the date is read from the title before the title is shortened.
# Characters -5..-2 of the title (presumably the four-digit year inside the
# trailing parentheses - confirm) become a "<year>-01-01" date string.
movies['date'] = movies['title'].map(lambda title: f'{title[-5:-1]}-01-01')
# Drop the last seven characters of the title (presumably the " (YYYY)" suffix).
movies['title'] = movies['title'].map(lambda title: title[:-7])

In [8]:
# Parse the date strings and store them as the integer representation divided by
# 10**9 (i.e. seconds, assuming the default nanosecond resolution).
parsed_dates = pd.to_datetime(movies['date'].values)
movies['date'] = parsed_dates.astype(int) / 10**9

In [9]:
# Split the '|'-separated genres string into one 0/1 indicator column per genre,
# then replace the genre names with positional names feature1..featureN.
genre_flags = movies['genres'].str.get_dummies(sep='|')
feature_names = [f'feature{i + 1}' for i in range(len(genre_flags.columns))]
genre_flags.columns = feature_names
dummies = genre_flags

In [10]:
# The raw genres string is no longer needed once the indicator columns exist.
movies = movies.drop(columns='genres')

In [11]:
# Append the feature indicator columns to the movies table, side by side.
movie_parts = (movies, dummies)
movies = pd.concat(movie_parts, axis='columns')

In [12]:
# Display the assembled movies table (id, title, date and feature columns) for a visual check.
movies

Unnamed: 0,movieId,title,date,feature1,feature2,feature3,feature4,feature5,feature6,feature7,...,feature9,feature10,feature11,feature12,feature13,feature14,feature15,feature16,feature17,feature18
0,1,Toy Story,788918400.0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,788918400.0,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,788918400.0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,788918400.0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II,788918400.0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents,946684800.0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3879,3949,Requiem for a Dream,946684800.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3880,3950,Tigerland,946684800.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3881,3951,Two Family House,946684800.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Persist the cleaned movies table without the row index.
movies_csv = os.path.join(CLEAN_PATH, 'movies.csv')
movies.to_csv(movies_csv, index=False)

In [14]:
# Persist the ratings table without the row index.
ratings_csv = os.path.join(CLEAN_PATH, 'ratings.csv')
ratings.to_csv(ratings_csv, index=False)

In [15]:
# One row per user, one column per movie; movies a user did not rate are filled with 0.
user_profiles = (
    ratings
    .pivot_table(index=['userId'], columns=['movieId'], values='rating')
    .fillna(0)
)
user_profiles

movieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0.0,0.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Hold out 20% of the user profiles for evaluation (shuffled, fixed seed for a repeatable split).
split_options = dict(test_size=0.2, shuffle=True, random_state=42)
train_data, test_data = train_test_split(user_profiles, **split_options)

In [18]:
# Write both splits as CSV (row index omitted) and as protocol-2 pickles.
csv_targets = [(train_data, 'train_data.csv'), (test_data, 'test_data.csv')]
pickle_targets = [(train_data, 'train_data.pickle'), (test_data, 'test_data.pickle')]

for split_frame, file_name in csv_targets:
    split_frame.to_csv(os.path.join(CLEAN_PATH, file_name), index=False)

for split_frame, file_name in pickle_targets:
    split_frame.to_pickle(os.path.join(CLEAN_PATH, file_name), protocol=2)

## Test data prepared for recommendations

In [19]:
# Build the evaluation profiles: for every test user pick one "control" movie and
# hide it, together with everything the user rated after it, from the profile.
fr = 2  # lowest 0-based position (in time order) that may become the control item
holdout_rng = random.Random(42)  # dedicated seeded generator -> reproducible control items
user_profile_for_recom = []

# Sort once by timestamp so "later" really means later in time; mergesort is
# stable, so ratings sharing a timestamp keep their file order.
ratings_by_time = ratings.sort_values('timestamp', kind='mergesort')

# The row is bound to `user_row` (not `movies`) so the movies DataFrame is not overwritten.
for user, user_row in test_data.iterrows():
    # The user's original ratings in chronological order.
    user_orig_items = ratings_by_time[ratings_by_time['userId'] == user]
    n_rated = user_orig_items['movieId'].nunique()
    if n_rated < 2:
        # Position 1 does not exist, so nothing can be held out for this user.
        continue
    # Random position of the control item; with exactly two movies it is always position 1.
    del_item_index = holdout_rng.randint(fr, n_rated - 1) if n_rated > 2 else 1

    # The movie the recommender will later be queried for.
    control_item = user_orig_items['movieId'].iloc[del_item_index]
    # The control item and every rating after it have to be hidden.
    items_to_remove = user_orig_items['movieId'].iloc[del_item_index:].values

    # Zero the hidden movies by label. (Series.replace works on the stored
    # rating values, not on movie labels, so it cannot be used for this.)
    user_movies_profile_simple = user_row.copy()
    user_movies_profile_simple.loc[items_to_remove] = 0.0

    # Movies that remain visible in the profile.
    user_keep_movies = user_movies_profile_simple[user_movies_profile_simple > 0.0].index.values

    # Matrix factorisation needs the profile as a one-row DataFrame;
    # copy it so zeroing the hidden ratings cannot touch test_data.
    user_movies_profile_dataframe = test_data.loc[[user]].copy()
    user_movies_profile_dataframe.loc[:, items_to_remove] = 0.0

    user_profile_for_recom.append([user, user_movies_profile_simple, user_movies_profile_dataframe, control_item, user_keep_movies])

retDf = pd.DataFrame(user_profile_for_recom, columns=['user','user_movies_profile_simple', 'user_movies_profile_dataframe', 'control_item', 'user_keep_movies'])
retDf = retDf.set_index('user')
# retDf.to_pickle(os.path.join(CLEAN_PATH, 'test_data_prepare_for_recom.pickle'), protocol=2)
# retDf.to_csv('test_data_prepared.csv', index=False)

In [20]:
# Split the prepared profiles into a validation half and a test half.
bound = len(retDf) // 2
val_df = retDf.iloc[:bound]
test_df = retDf.iloc[bound:]

# Save each half to its own file (previously the full retDf was written to
# both files, which made the validation and test sets identical).
val_df.to_pickle(os.path.join(CLEAN_PATH, 'val_data_prepare_for_recom.pickle'), protocol=2)
test_df.to_pickle(os.path.join(CLEAN_PATH, 'test_data_prepare_for_recom.pickle'), protocol=2)

## Per-item sum of rating differences (rating minus the user's mean rating)

In [21]:
# Reload the training profiles written earlier by this notebook.
train_data = pd.read_pickle(os.path.join(CLEAN_PATH, 'train_data.pickle'))

# One record per training user: [userId, row position, mean of the non-zero ratings].
user_avg_rating = []
for row_position, (user_id, profile) in enumerate(train_data.iterrows()):
    profile_values = profile.values
    given_ratings = profile_values[profile_values != 0]  # zeros mark unrated movies
    user_avg_rating.append([user_id, row_position, given_ratings.sum() / len(given_ratings)])

user_avg_ratingDf = pd.DataFrame(user_avg_rating, columns=['userId', 'index', 'avg_rating'])

In [22]:
# For every movie, sum (rating - that user's mean rating) over the training
# users who rated it. Computed on the whole matrix at once rather than running
# one DataFrame lookup per individual rating.

# Mean rating per training row, ordered by the row position stored in 'index'.
user_means = user_avg_ratingDf.sort_values('index')['avg_rating'].values
rating_matrix = train_data.values
was_rated = rating_matrix != 0  # zeros are the fill value for unrated movies
# Deviation from the rater's own mean; unrated cells contribute exactly 0.
deviations = np.where(was_rated, rating_matrix - user_means[:, np.newaxis], 0.0)
item_totals = deviations.sum(axis=0)

# One record per movie: [movieId, column position, summed deviation].
item_suma_rating = [
    [movie_id, column_position, total]
    for column_position, (movie_id, total) in enumerate(zip(train_data.columns, item_totals))
]

item_suma_ratingDf = pd.DataFrame(item_suma_rating, columns=['movieId', 'index', 'sum_rating'])

In [23]:
# Persist the per-movie deviation sums as a protocol-2 pickle.
item_sum_pickle = os.path.join(CLEAN_PATH, 'item_sum_dif_rating.pickle')
item_suma_ratingDf.to_pickle(item_sum_pickle, protocol=2)