In [2]:
import sys
sys.path.append("..") # add the parent directory to the module search path so `utils.helpers` (imported further down) can be found

In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import csr_matrix

import random
import os

In [4]:
from utils.helpers import get_movies_by_profile

# MovieLens 10M

In [7]:
# Dataset root; raw downloads are read from `raw/` and processed files are written to `clean/`.
PATH = '../data/movielens/10m/'
LOAD_PATH, CLEAN_PATH = os.path.join(PATH, 'raw'), os.path.join(PATH, 'clean')

In [41]:
# Parser options shared by both .dat files: '::' is a multi-character separator,
# so the files are read with the python engine.
dat_format = dict(sep='::', encoding="ISO-8859-1", engine='python')

r_cols = ['userId', 'movieId', 'rating', 'timestamp']
ratings_file = os.path.join(LOAD_PATH, 'ratings.dat')
ratings = pd.read_csv(ratings_file, names=r_cols, usecols=range(4), **dat_format)

m_cols = ['movieId', 'title', 'genres']
movies_file = os.path.join(LOAD_PATH, 'movies.dat')
movies = pd.read_csv(movies_file, names=m_cols, header=None, **dat_format)

# Show the ratings table as the cell output.
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,122,5.0,838985046
1,1,185,5.0,838983525
2,1,231,5.0,838983392
3,1,292,5.0,838983421
4,1,316,5.0,838983392
...,...,...,...,...
10000049,71567,2107,1.0,912580553
10000050,71567,2126,2.0,912649143
10000051,71567,2294,5.0,912577968
10000052,71567,2338,2.0,912578016


In [42]:
# Split "Some Title (1995)" into the bare title and a "1995-01-01" date string.
# The pattern is anchored to a trailing "(YYYY)", so:
#   * titles without a trailing year are left untouched instead of being cut at fixed offsets;
#   * re-running this cell is a no-op, because already-cleaned titles no longer match.
title_parts = movies['title'].str.extract(r'^(?P<title>.*?)\s*\((?P<year>\d{4})\)\s*$')
has_year = title_parts['year'].notna()

# Keep dates produced by an earlier run of this cell; on the first run there are none yet.
if 'date' in movies.columns:
    previous_dates = movies['date']
else:
    previous_dates = pd.Series(np.nan, index=movies.index, dtype=object)

movies['date'] = (title_parts['year'] + '-01-01').where(has_year, previous_dates)
movies['title'] = title_parts['title'].where(has_year, movies['title'])

In [43]:
# Turn the pipe-delimited genre string of every movie into a list of genre labels.
s = movies['genres'].str.split('|')

# One-hot encode the genre lists: one 0/1 column per distinct genre label.
mlb = MultiLabelBinarizer()
genre_indicator_matrix = mlb.fit_transform(s)
dummies = pd.DataFrame(genre_indicator_matrix, columns=mlb.classes_)

In [44]:
# The raw genre string is no longer needed once it has been one-hot encoded into `dummies`.
movies.drop(columns=['genres'], inplace=True)

In [48]:
# Replace the genre labels with generic, 1-based column names: feature1, feature2, ...
n_features = dummies.shape[1]
dummies.columns = [f'feature{number}' for number in range(1, n_features + 1)]

In [50]:
# Attach the feature columns to the movie table side by side (rows are matched on the index).
movies = pd.concat(objs=[movies, dummies], axis='columns')

In [51]:
# Display the movie table as the cell output to inspect the title, date and feature columns.
movies

Unnamed: 0,movieId,title,date,feature1,feature2,feature3,feature4,feature5,feature6,feature7,...,feature11,feature12,feature13,feature14,feature15,feature16,feature17,feature18,feature19,feature20
0,1,Toy Story,1995-01-01,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,1995-01-01,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,1995-01-01,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,1995-01-01,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II,1995-01-01,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10676,65088,Bedtime Stories,2008-01-01,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
10677,65091,Manhattan Melodrama,1934-01-01,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
10678,65126,Choke,2008-01-01,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10679,65130,Revolutionary Road,2008-01-01,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [52]:
# Create the output folder on first use so the export does not fail when it is missing.
os.makedirs(CLEAN_PATH, exist_ok=True)
# Write the processed movie table to CLEAN_PATH without the positional row index.
movies.to_csv(os.path.join(CLEAN_PATH, 'movies.csv'), index=False)

In [36]:
# Write the ratings table to CLEAN_PATH without the positional row index.
ratings_csv = os.path.join(CLEAN_PATH, 'ratings.csv')
ratings.to_csv(ratings_csv, index=False)

In [37]:
# User x movie rating matrix: one row per userId, one column per movieId.
user_profiles = ratings.pivot_table(values='rating', index='userId', columns='movieId')
# Movies a user did not rate come out of the pivot as NaN; store them as 0 instead.
user_profiles.fillna(value=0, inplace=True)
user_profiles

movieId,1,2,3,4,5,6,7,8,9,10,...,65006,65011,65025,65027,65037,65088,65091,65126,65130,65133
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
71564,0.0,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
71565,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
71566,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# Sorted array of the distinct user ids that appear in the ratings table.
user_ids = np.unique(ratings['userId'].to_numpy())

In [39]:
# Hold out 20% of the user profiles (rows) as test data; the fixed seed makes the split repeatable.
train_data, test_data = train_test_split(
    user_profiles,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [40]:
# Write both splits to CLEAN_PATH; the userId index is kept as the first CSV column.
for split_file, split_frame in (('train_data.csv', train_data), ('test_data.csv', test_data)):
    split_frame.to_csv(os.path.join(CLEAN_PATH, split_file))

## Per-item sum of rating differences from each user's average rating

In [53]:
# Average rating of every training user, taken over the movies the user actually rated.
# A 0 in `train_data` marks "not rated", so zeros add nothing to the row sum and are
# excluded from the count. Computed on the whole matrix at once instead of row by row.
profile_matrix = train_data.to_numpy()
rated_counts = np.count_nonzero(profile_matrix, axis=1)
rating_totals = profile_matrix.sum(axis=1)

# A user without any rating gets NaN rather than triggering a divide-by-zero warning.
avg_ratings = np.full(len(rated_counts), np.nan)
np.divide(rating_totals, rated_counts, out=avg_ratings, where=rated_counts > 0)

# One record per user: [userId, positional row index within train_data, average rating].
user_avg_rating = [
    [user_id, position, avg]
    for position, (user_id, avg) in enumerate(zip(train_data.index, avg_ratings))
]

user_avg_ratingDf = pd.DataFrame(user_avg_rating, columns=['userId', 'index', 'avg_rating'])

In [None]:
# For every movie: the sum, over the training users who rated it, of
# (rating - that user's average rating). A 0 in `train_data` means "not rated".
#
# Done in one pass over the non-zero cells instead of filtering `user_avg_ratingDf`
# once per (movie, rater) pair.
n_users, n_items = train_data.shape

# Average rating of each user, ordered by the user's row position in train_data
# (the 'index' column of user_avg_ratingDf holds that position).
avg_by_position = (
    user_avg_ratingDf.set_index('index')['avg_rating']
    .loc[np.arange(n_users)]
    .to_numpy()
)

# CSR stores only the non-zero (rated) cells, row by row:
#   .data    -> the ratings
#   .indices -> the movie column position of each rating
#   .indptr  -> where each user's run of ratings starts and ends
ratings_csr = csr_matrix(train_data.to_numpy())
rater_positions = np.repeat(np.arange(n_users), np.diff(ratings_csr.indptr))
deviations = ratings_csr.data - avg_by_position[rater_positions]

# Add up the deviations per movie column; movies nobody rated stay at 0.
sum_by_item = np.bincount(ratings_csr.indices, weights=deviations, minlength=n_items)

# One record per movie: [movieId, positional column index within train_data, summed deviation].
item_suma_rating = [
    [movie_id, position, total]
    for position, (movie_id, total) in enumerate(zip(train_data.columns, sum_by_item))
]

item_suma_ratingDf = pd.DataFrame(item_suma_rating, columns=['movieId', 'index', 'sum_rating'])

In [None]:
# Persist the per-item sums as a pickle written with pickle protocol 2.
item_sum_pickle = os.path.join(CLEAN_PATH, 'item_sum_dif_rating.pickle')
item_suma_ratingDf.to_pickle(item_sum_pickle, protocol=2)