In [None]:
import re, os

# Directory holding the combined_data_*.txt rating dumps. Kept as a single
# named constant so the notebook can be pointed at another location in one place.
DATA_DIR = '/Users/abhijitmondal/Downloads/Netflix'

# A header line starts with "<digits>:" and names the movie that the following
# rating lines belong to.
MOVIE_HEADER = re.compile(r'([0-9]+):')

movie_id = None
triplets = []

files = ['combined_data_1.txt', 'combined_data_2.txt', 'combined_data_3.txt', 'combined_data_4.txt']

for file in files:
    with open(os.path.join(DATA_DIR, file)) as f:
        # Iterating the handle streams the file one line at a time.
        for line in f:
            header = MOVIE_HEADER.match(line)
            if header:
                # Read the id from the regex group rather than slicing off the
                # last two characters: the slice is only correct when the line
                # ends in exactly ":\n" and silently corrupts the id when the
                # final newline is missing or trailing whitespace is present.
                movie_id = header.group(1)
            else:
                # Rating lines are comma separated; the first two fields are
                # the user id and the rating. Blank/short lines are skipped.
                d = line.split(',')
                if len(d) >= 2:
                    user_id = d[0]
                    rating = d[1]
                    triplets.append((user_id, movie_id, float(rating)))

In [None]:
# Sanity check: number of (user_id, movie_id, rating) triplets collected.
len(triplets)

In [None]:
import pickle

# Cache the parsed triplets on disk so a later session can reload them
# instead of re-reading the raw text files.
with open('data.pkl', 'wb') as cache_file:
    pickle.dump(triplets, cache_file, protocol=pickle.HIGHEST_PROTOCOL)

In [1]:
import pickle

# Restore the cached triplets. pickle can execute arbitrary code while loading,
# so only load a data.pkl that this notebook produced itself.
with open('data.pkl', 'rb') as cache_file:
    triplets = pickle.load(cache_file)

In [2]:
import random

# Fixed seed so that restarting the kernel and re-running draws the same
# sample; without it every downstream number changes from run to run.
SAMPLE_SEED = 42
SAMPLE_SIZE = 10**3

# A private Random instance leaves the global `random` state untouched.
# min() avoids the ValueError random.sample raises when asked for more items
# than the population holds.
sampled_data = random.Random(SAMPLE_SEED).sample(triplets, min(SAMPLE_SIZE, len(triplets)))

In [3]:
# Persist the sample so experiments can be repeated on exactly the same ratings.
with open('sampled_data.pkl', 'wb') as sample_file:
    pickle.dump(sampled_data, sample_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
import mat_facto_no_bias.matrixfactorization as matrixfactorization
import importlib
# Re-execute the module so that edits made to its source since the first
# import are picked up without restarting the kernel.
importlib.reload(matrixfactorization)
# Import the class after the reload so the name refers to the refreshed definition.
from mat_facto_no_bias.matrixfactorization import MatrixFactorization

In [None]:
# First model instance. 50 is presumably the number of latent factors —
# TODO confirm against MatrixFactorization.__init__.
facto1 = MatrixFactorization(50)

In [None]:
# Fit the first model on the sampled ratings for 100 epochs. alpha and beta are
# presumably the learning rate and regularisation weight — TODO confirm against
# MatrixFactorization.train.
facto1.train(sampled_data, alpha=0.0002, beta=0.02, epochs=100)

In [None]:
# Save the fitted model next to its source package so it can be reloaded
# without retraining.
with open('mat_facto_no_bias/facto.pkl', 'wb') as model_file:
    pickle.dump(facto1, model_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Reload the saved model. Unpickling needs the MatrixFactorization class to be
# importable, and should only be done on files this notebook wrote itself.
with open('mat_facto_no_bias/facto.pkl', 'rb') as model_file:
    facto1 = pickle.load(model_file)

In [None]:
# Show the first two entries of the model's user_ids attribute, e.g. to pick
# an id to query for recommendations.
facto1.user_ids[:2]

In [None]:
# Ask the first model for recommendations for user '100093'; the second
# argument is presumably the number of results — TODO confirm.
# NOTE(review): the id is hard-coded while sampled_data is drawn at random, so
# this user may be absent from the model on a fresh run.
facto1.get_recommendations('100093', 10)

In [None]:
import mat_facto_biases.training.matrixfactorization as matrixfactorization
import importlib
# Re-execute the bias-aware module so recent source edits are picked up.
importlib.reload(matrixfactorization)
# Import the class from the module that was just reloaded. The previous line
# imported from mat_facto_no_bias.training.matrixfactorization, which is not the
# reloaded module, so facto2 would not have used the bias-aware implementation.
from mat_facto_biases.training.matrixfactorization import MatrixFactorization

In [None]:
# Second model instance, built from whichever MatrixFactorization class was
# imported most recently. 50 is presumably the number of latent factors —
# TODO confirm.
facto2 = MatrixFactorization(50)

In [None]:
# Fit the second model on the same sample, this time for 1000 epochs.
# alpha / beta are presumably learning rate and regularisation weight —
# TODO confirm against the class.
facto2.train(sampled_data, alpha=0.0002, beta=0.02, epochs=1000)

In [None]:
# Distinct user ids in sorted order; a user's position in this list is the row
# used for that user in the parameter arrays. The *_inv dict maps id -> row.
user_ids = sorted({uid for uid, _, _ in sampled_data})
user_ids_inv = {uid: row for row, uid in enumerate(user_ids)}

# Same construction for movies.
movie_ids = sorted({mid for _, mid, _ in sampled_data})
movie_ids_inv = {mid: row for row, mid in enumerate(movie_ids)}

In [None]:
# Number of distinct users present in the sample.
len(user_ids)

In [None]:
import numpy as np

# Number of latent factors per user / movie.
k = 50

# Seeded generator so the random starting point (and therefore the training
# run) is reproducible; a private RandomState leaves numpy's global RNG alone.
init_rng = np.random.RandomState(42)

# Uniform [0, 1) initialisation, one row per user / movie.
p = init_rng.rand(len(user_ids), k)    # user factors,  (n_users,  k)
q = init_rng.rand(len(movie_ids), k)   # movie factors, (n_movies, k)
bu = init_rng.rand(len(user_ids))      # per-user bias
bm = init_rng.rand(len(movie_ids))     # per-movie bias

In [None]:
def loss(sampled_data, mu, p, q, bu, bm, beta=0.02):
    """Regularised squared error of the biased factorisation on ``sampled_data``.

    For every ``(user_id, movie_id, rating)`` triplet the prediction is
    ``mu + bu[i] + bm[j] + p[i, :] . q[:, j]`` where ``i`` / ``j`` come from the
    notebook-level ``user_ids_inv`` / ``movie_ids_inv`` lookups.

    Args:
        sampled_data: iterable of ``(user_id, movie_id, rating)`` triplets.
        mu: global rating offset.
        p: user factors indexed as ``p[i, :]``.
        q: movie factors indexed as ``q[:, j]``, i.e. factors along axis 0.
        bu: per-user biases.
        bm: per-movie biases.
        beta: L2 regularisation weight.

    Returns:
        Sum over all triplets of the squared residual plus
        ``beta/2 * (|p_i|^2 + |q_j|^2 + bu_i^2 + bm_j^2)``.
    """
    e = 0
    for u, m, r in sampled_data:
        i = user_ids_inv[u]
        j = movie_ids_inv[m]
        residual = r - mu - bu[i] - bm[j] - np.dot(p[i, :], q[:, j])
        # The bias terms are added once, outside np.sum. Previously they sat
        # inside np.sum(q[:, j]**2 + ...), where broadcasting added them to
        # every factor and so counted them once per latent dimension.
        reg = beta / 2.0 * (np.sum(p[i, :] ** 2) + np.sum(q[:, j] ** 2) + bu[i] ** 2 + bm[j] ** 2)
        e += residual ** 2 + reg

    return e
            
def train(sampled_data, p, q, bu, bm, alpha=0.0002, beta=0.02,
          epochs=1000, batch_size=10000, log_every=10):
    """Fit the biased matrix factorisation with stochastic gradient descent.

    The model predicts ``mu + bu[i] + bm[j] + p[i] . q[j]`` for user row ``i``
    and movie row ``j`` (rows come from the notebook-level ``user_ids_inv`` /
    ``movie_ids_inv`` lookups).

    Args:
        sampled_data: sequence of ``(user_id, movie_id, rating)`` triplets.
        p: user factors, shape ``(n_users, k)``. Updated in place.
        q: movie factors, shape ``(n_movies, k)``. Updated in place (the
            function works on a transposed view of the same memory).
        bu: per-user biases. Updated in place.
        bm: per-movie biases. Updated in place.
        alpha: learning rate.
        beta: L2 regularisation weight.
        epochs: number of passes; defaults to the previously hard-coded 1000.
        batch_size: when ``sampled_data`` is longer than this, each epoch
            trains on a fresh random subset of this size (previously the
            hard-coded 10000).
        log_every: print ``epoch, loss`` every this many epochs; a falsy value
            disables logging.

    Returns:
        ``(mu, p, q, bu, bm)`` with ``mu`` the mean rating of ``sampled_data``
        and ``q`` back in its ``(n_movies, k)`` orientation.
    """
    mu = np.mean([z for x, y, z in sampled_data])
    # Work with factors along axis 0 so a movie's vector is q[:, j]; this is a
    # view, so writes land in the caller's array.
    q = q.T

    for x in range(epochs):
        if len(sampled_data) > batch_size:
            g = random.sample(sampled_data, batch_size)
        else:
            g = sampled_data

        for u, m, r in g:
            i = user_ids_inv[u]
            j = movie_ids_inv[m]
            e = r - mu - bu[i] - bm[j] - np.dot(p[i, :], q[:, j])

            # Update every latent factor at once, whatever k is (the old loop
            # was fixed at range(50)). Snapshot p_i first so the q step uses
            # the same pre-update user vector as the p step, as the gradient
            # requires; the old loop fed the freshly updated p into q.
            p_i = p[i, :].copy()
            q_j = q[:, j].copy()
            p[i, :] = p_i + alpha * (2 * e * q_j - beta * p_i)
            q[:, j] = q_j + alpha * (2 * e * p_i - beta * q_j)

            bu[i] = bu[i] + alpha * (2 * e - beta * bu[i])
            bm[j] = bm[j] + alpha * (2 * e - beta * bm[j])

        if log_every and x % log_every == 0:
            current_loss = loss(sampled_data, mu, p, q, bu, bm, beta)
            print(x, current_loss)

    return mu, p, q.T, bu, bm

def get_recommendations(user, mu, p, q, bu, bm, num_preds=10):
    """Return the ``num_preds`` movies with the highest predicted rating.

    Every movie in the notebook-level ``movie_ids`` list is scored as
    ``mu + bu[i] + bm[j] + p[i, :] . q[:, j]``.

    Args:
        user: user id; must be a key of ``user_ids_inv`` (KeyError otherwise).
        mu: global rating offset (the value returned by ``train``).
        p: user factors indexed as ``p[i, :]``.
        q: movie factors indexed as ``q[:, j]`` — factors along axis 0, i.e.
            the transpose of the ``q`` that ``train`` returns.
        bu: per-user biases.
        bm: per-movie biases.
        num_preds: how many movies to return; 0 yields an empty list.

    Returns:
        List of movie ids, best predicted rating first.
    """
    # Imported here because no cell of the notebook imports heapq, so the
    # function would otherwise fail with NameError on a fresh kernel.
    import heapq

    i = user_ids_inv[user]

    scored = (
        (mu + bu[i] + bm[j] + np.dot(p[i, :], q[:, j]), j)
        for j in (movie_ids_inv[m] for m in movie_ids)
    )
    # nlargest keeps only num_preds candidates in memory and returns them
    # sorted from highest to lowest score.
    top = heapq.nlargest(num_preds, scored)

    # Show the (predicted_rating, movie_row) pairs behind the returned ids.
    print(top)
    return [movie_ids[j] for _, j in top]

In [None]:
# Run SGD training. train() writes into p, q, bu and bm in place, so re-running
# this cell continues from the current parameters rather than from the initial
# random values. The returned q has the same (n_movies, k) orientation as the input.
mu, p, q, bu, bm = train(sampled_data, p, q, bu, bm)

In [None]:
# Use the global mean returned by train() instead of a hand-typed 3.6, and query
# a user that is guaranteed to be in the sample: a hard-coded id raises KeyError
# whenever the random sample does not happen to contain it.
# q is transposed because get_recommendations indexes movie factors as q[:, j].
get_recommendations(user_ids[0], mu, p, q.T, bu, bm, 10)