In [None]:
import pandas as pd
import numpy as np
from keras.utils import to_categorical
import keras
from keras.models import Sequential, Input, Model
from keras.layers import Dense, Embedding, Concatenate, Lambda, Flatten, Multiply, Lambda
from random import shuffle
import keras.backend as K

In [None]:
df_movie = pd.read_csv('movie/movies.dat', sep='::', header=None)

In [None]:
data_movie = df_movie.values

In [None]:
category_set = set()
movie_id_set = set()
for movie in data_movie:
    movie_id = movie[0]
    movie_id_set.add(movie_id)
    categories = movie[-1].split('|')
    for category in categories:
        category_set.add(category)

In [None]:
category_map = {category : i  for i, category in enumerate(list(category_set))}
movie_id_map = {movie_id : i  for i, movie_id in enumerate(list(movie_id_set))}

In [None]:
movie_categories = {}
for movie in data_movie:
    movie_id = movie[0]
    categories = movie[-1].split('|')
    vec = np.zeros(len(category_map))
    for category in categories:
        vec[category_map[category]] = 1
        
    movie_categories[movie_id] = vec
    

In [None]:
df_user = pd.read_csv('movie/users.dat', sep='::', header=None)

In [None]:
data_user = df_user.values

In [None]:
age_set = set()
job_set = set()

In [None]:
for user in data_user:
    user_id, gender, age_id, job_id, _ = user
    
    age_set.add(age_id)
    job_set.add(job_id)

In [None]:
age_map = {age_id : i for i, age_id in enumerate(list(age_set))}
job_map = {job_id : i for i, job_id in enumerate(list(job_set))}

In [None]:
users_info = {}
users_id_set = set()
for user in data_user:
    user_id, gender, age_id, job_id, _ = user
    users_id_set.add(user_id)
    users_info[user_id] = [0.0 if gender == 'M' else 1.0] + \
        list(to_categorical(age_map[age_id], len(age_set))) + \
        list(to_categorical(job_map[job_id], len(job_set)))

In [None]:
user_id_map = {user_id : i  for i, user_id in enumerate(list(users_id_set))}

In [None]:
df_rating = pd.read_csv('movie/train.csv', sep=',')
data_rating = df_rating.values

In [None]:
data = []

for item in data_rating:
    user_id, movie_id, rating, _ = item    
    if user_id in user_id_map:
        x1 = list(movie_categories[movie_id]) + users_info[user_id]
        data.append((x1, user_id_map[user_id], movie_id_map[movie_id], rating))

In [None]:
df_rating.head()

In [440]:
Ntrain = int(0.7*len(data))
train_data = data#[:Ntrain]
test_data = data[Ntrain:]

In [442]:
X1train = np.array([item[0] for item in train_data])
X2train = np.array([item[1] for item in train_data])
X3train = np.array([item[2] for item in train_data])
Ytrain = np.array([item[3]**2 for item in train_data])

X1test = np.array([item[0] for item in test_data])
X2test = np.array([item[1] for item in test_data])
X3test = np.array([item[2] for item in test_data])
Ytest = np.array([item[3]**2 for item in test_data])

In [443]:
X1train.shape, X2train.shape, X3train.shape

((500100, 47), (500100,), (500100,))

In [444]:
x1 = Input(shape=(47, ))

x2 = Input(shape=(1,))
y2 = Embedding(len(user_id_map), 1)(x2)
y2 = Flatten()(y2)

x3 = Input(shape=(1,))
y3 = Embedding(len(movie_id_map), 1)(x3)
y3 = Flatten()(y3)

#y = Multiply()([y2, y3])
#y = Dense(1, activation='linear')(y)

y = Concatenate()([x1, y3])
y = Dense(5, activation='relu')(y)
y = Dense(1, activation='linear')(y)

model = Model(inputs=[x1, x2, x3], outputs=y)

model.compile(loss='mse', metrics=['mae'], optimizer='adam')    

model.summary()


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_137 (InputLayer)          (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_106 (Embedding)       (None, 1, 1)         3883        input_137[0][0]                  
__________________________________________________________________________________________________
input_135 (InputLayer)          (None, 47)           0                                            
__________________________________________________________________________________________________
flatten_106 (Flatten)           (None, 1)            0           embedding_106[0][0]              
__________________________________________________________________________________________________
concatenat

In [445]:
len(user_id_map) * len(movie_id_map) * 0.05

1172666.0

In [448]:
model.fit([X1train, X2train, X3train], Ytrain, epochs=1,  verbose=True)

Epoch 1/1


<keras.callbacks.History at 0x1b503e1b668>

In [449]:
model.evaluate([X1test, X2test, X3test], Ytest)



[44.17054431948083, 5.474001065265463]

In [433]:
Y = model.predict([X1test, X2test, X3test]).reshape((-1,))

In [434]:
indexes = np.argsort(-Y)

In [435]:
scores = []
for i in range(7500):
    scores.append(Ytest[indexes[i]] ** 0.5)

In [436]:
print(np.mean(scores))

4.383066666666667


In [481]:
def get_score(user_id, movie_id):
    x1 = np.array([list(movie_categories[movie_id]) + users_info[user_id]])
    x2 = np.array([user_id_map[user_id]])
    x3 = np.array([movie_id_map[movie_id]])
    return (model.predict([x1,x2,x3])[0][0]) ** 0.5
    

In [482]:
print(get_score(2783,2640))

3.5920744597315033


In [484]:
out_data = []
with open('movie/sample_submission.csv') as f:
    for line in f:
        user_id, _, user_movie = line.strip().split(',')
        if user_id != 'user':
            _, movie_id = user_movie.split('_')
            user_id = int(user_id)
            movie_id = int(movie_id)
            score = get_score(user_id, movie_id)
            out_data.append((user_id, movie_id, score))
            if len(out_data) % 10000 == 0:
                print(len(out_data))

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000


In [486]:
with open('submisson.csv', 'w') as f:
    f.write('user,rating,id\n')
    for user_id, movie_id, score in out_data:
        f.write(f'{user_id},{round(score,1)},{user_id}_{movie_id}\n')

In [None]:
users_info = {}
for user in data_user:
    user_id, gender, age_id, job_id, _ = user
    users_info[user_id] = {'gender' : gender, 'age_id' : age_id, 'job_id' : job_id}

In [None]:
movie_categories = {}
for movie in data_movie:
    movie_id = movie[0]        
    movie_categories[movie_id] = list(map(lambda x : category_map[x],  movie[-1].split('|')))


In [None]:
df_rating.head()

In [None]:
M = np.zeros

In [None]:
df_rating['age_id'] = df_rating['user'].apply(lambda x : users_info[x]['age_id'])
df_rating['job_id'] = df_rating['user'].apply(lambda x : users_info[x]['job_id'])
df_rating['gender'] = df_rating['user'].apply(lambda x : users_info[x]['gender'])

In [None]:
scores = {}

for age_id, df in  df_rating.groupby('age_id'):
    score = [[] for _ in range(len(category_map))]
        
    for movie, rating in df[['movie','rating']].values:
        for categ in movie_categories[movie]:
            score[categ].append(rating)
            
    score = [np.mean(x) for x in score]
    scores[age_id] = score
    

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
from numpy import dot
from numpy.linalg import norm

cos_sim = lambda a,b : dot(a, b)/(norm(a)*norm(b))

In [None]:
scores.keys()

In [None]:
scores[56]