In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
import os
directory = '/home/g40/PycharmProjects/finaltask-project/dataset/'

rating_headers = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table(directory + 'ml-1m/ratings_min_30.dat', sep='::', header=None, names=rating_headers, engine='python')

movie_headers = ['movie_id', 'title', 'genres']
movies = pd.read_table(directory + 'ml-1m/movies.dat', engine='python',
                       sep='::', header=None, names=movie_headers)

movie_titles = movies.title.tolist()

In [3]:
movie_titles

['Toy Story (1995)',
 'Jumanji (1995)',
 'Grumpier Old Men (1995)',
 'Waiting to Exhale (1995)',
 'Father of the Bride Part II (1995)',
 'Heat (1995)',
 'Sabrina (1995)',
 'Tom and Huck (1995)',
 'Sudden Death (1995)',
 'GoldenEye (1995)',
 'American President, The (1995)',
 'Dracula: Dead and Loving It (1995)',
 'Balto (1995)',
 'Nixon (1995)',
 'Cutthroat Island (1995)',
 'Casino (1995)',
 'Sense and Sensibility (1995)',
 'Four Rooms (1995)',
 'Ace Ventura: When Nature Calls (1995)',
 'Money Train (1995)',
 'Get Shorty (1995)',
 'Copycat (1995)',
 'Assassins (1995)',
 'Powder (1995)',
 'Leaving Las Vegas (1995)',
 'Othello (1995)',
 'Now and Then (1995)',
 'Persuasion (1995)',
 'City of Lost Children, The (1995)',
 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)',
 'Dangerous Minds (1995)',
 'Twelve Monkeys (1995)',
 'Wings of Courage (1995)',
 'Babe (1995)',
 'Carrington (1995)',
 'Dead Man Walking (1995)',
 'Across the Sea of Time (1995)',
 'It Takes Two (1995)',
 'Clueless (

In [4]:
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
df = ratings.join(movies, on=['movie_id'], rsuffix='_r')
del df['movie_id_r']

In [6]:
df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres
0,1,1193,5,978300760,Wings of Desire (Der Himmel �ber Berlin) (1987),Comedy|Drama|Romance
1,1,661,3,978302109,Bloodsport 2 (1995),Action
2,1,914,3,978301968,All About Eve (1950),Drama
3,1,3408,4,978300275,Empire Records (1995),Comedy|Drama
4,1,2355,5,978824291,You've Got Mail (1998),Comedy|Romance


In [7]:
rp = df.pivot_table(columns=['movie_id'],index=['user_id'],values='rating')
rp.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,3937,3943,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,2.0,,,,,...,,,,,,,,,,
6,4.0,,,,,,,,,,...,,,,,,,,,,


In [8]:
rp = rp.fillna(0); # Replace NaN

In [9]:
Q = rp.values
Q.shape

(5289, 2836)

In [10]:
W = Q>0.5
W[W == True] = 1
W[W == False] = 0
# To be consistent with our Q matrix
W = W.astype(np.float64, copy=False)

In [11]:
W.shape

(5289, 2836)

In [12]:
# params
lambda_ = 0.1
n_factors = 100
m, n = Q.shape
n_iterations = 20

In [13]:
X = 5 * np.random.rand(m, n_factors) # latent user 
Y = 5 * np.random.rand(n_factors, n) # latent item

In [14]:
def get_error(Q, X, Y, W):
    return np.sum((W * (Q - np.dot(X, Y)))**2)

In [15]:
errors = []
for ii in range(n_iterations):
    X = np.linalg.solve(np.dot(Y, Y.T) + lambda_ * np.eye(n_factors), 
                        np.dot(Y, Q.T)).T
    Y = np.linalg.solve(np.dot(X.T, X) + lambda_ * np.eye(n_factors),
                        np.dot(X.T, Q))
    if ii % 10 == 0:
        print('{}th iteration is completed'.format(ii))
    errors.append(get_error(Q, X, Y, W))
Q_hat = np.dot(X, Y)
print('Error of rated movies: {}'.format(get_error(Q, X, Y, W)))

0th iteration is completed


KeyboardInterrupt: 

In [None]:
plt.plot(errors);
plt.ylim([0, 20000]);

In [None]:
Q_hat.shape

In [None]:
movie_ids = np.argmax(Q_hat - 5 * W, axis=1)

In [None]:
movie_ids

In [None]:
movie_ids.shape

In [None]:
Q_out = np.array(Q_hat)
Q_out -= np.min(Q_hat)
Q_out *= float(5) / np.max(Q_hat)

In [None]:
Q_out

In [None]:
Q_out.max()

In [None]:
Q_out.min()

In [None]:
Q_hat

In [None]:
list(zip(range(m), movie_ids))

In [None]:
np(Q_hat - 5 * W)

In [None]:
my_array = np.array([[1,20,3,4],[50,6,7,8],[90,10,11,12],[13,14,15,17]])

In [None]:
my_array

In [None]:
np.argmax(my_array, axis=1)

In [None]:
np.argmax(my_array, axis=0)

In [None]:
R = np.array([
    [4, 3, 0, 4, 5, 0, 3, 3, 0, 5, 4, 3, 3, 4, 0, 4, 3, 3, 4, 0],
    # [4, 3, 3, 4, 5, 4, 3, 3, 4, 5, 4, 3, 3, 4, 5, 4, 3, 3, 4, 0],
    [4, 0, 0, 1, 0, 0, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [1, 1, 0, 5, 0, 5, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [1, 0, 0, 4, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [1, 0, 5, 4, 0, 0, 5, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [5, 3, 4, 3, 3, 0, 4, 1, 5, 0, 2, 0, 5, 0, 5, 5, 0, 4, 5, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 5],
    [5, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 4, 3, 3, 0, 4, 1, 5, 0, 2, 5, 0, 5, 0, 5, 4, 0, 0, 5]
])

In [None]:
user_id = 1 # get recommendation first user

In [None]:
R[user_id-1]

In [None]:
row, col = R.shape
for i in range(row):
    transformed = map(lambda idx: idx, enumerate(R[i]))
#     a = np.array([('a', 2), ('c', 1)], dtype=[('x', 'S1'), ('y', int)])
#     a.sort(order='y')
    final = np.array(list(transformed), dtype=[('x', int), ('y', float)])
    print(np.sort(final, order='y')[-5:])
    break

In [None]:
np.array(list(reversed(np.argsort(np.array([1,10,20,3,4,66,7]))[-5:]))) + 1

In [None]:
import math

In [None]:
list(enumerate([1,2,3]))

In [None]:
math.log2(4)

In [None]:
R

In [None]:
real_rating_u1 = R[1-1, :]

In [None]:
real_rating_u1[[1,2,3]]

In [61]:
R = df.pivot_table(columns=['movie_id'],index=['user_id'],values='rating')
R = R.fillna(0).as_matrix()

In [67]:
R[1].shape

(2836,)