In [1]:
import random

import numpy as np
import pandas as pd

from scipy.linalg import norm
from scipy.sparse import coo_matrix, dia_matrix
from scipy.sparse.linalg import svds

random.seed(42)

MAX_NEIGHBORS = 20   #
MAX_COMPONENTS = 20  # 
MEAN_SCORE = 2.75    # (MaxScore - MinScore) / 2, for normalization
TRAIN_RATIO = 0.8    # what percentage of users will be used for training

movies_df = pd.read_csv('ml-latest-small/processed_movies.csv')
movies_df.drop("genres", axis=1, inplace=True)
n_movies = max(movies_df['movieId'])
movies_df['movieId'] = movies_df['movieId'] - 1

ratings_df = pd.read_csv('ml-latest-small/processed_ratings.csv')
ratings_df.drop("timestamp", axis=1, inplace=True)
ratings_df['movieId'] = ratings_df['movieId'] - 1
ratings_df['rating'] = ratings_df['rating'] - MEAN_SCORE

In [2]:
# Function
user_ids = list(set(ratings_df["userId"]))
random.shuffle(user_ids)
number_of_train_ids = round(len(user_ids) * TRAIN_RATIO)
train_ids = set(user_ids[:number_of_train_ids])

train_df = ratings_df[ratings_df["userId"].isin(train_ids)]
test_df = ratings_df[~ratings_df["userId"].isin(train_ids)]

In [3]:
def rekey_df(df, colname):
    old_ids = set(df['userId'])
    old_to_new_id = {k:v for (v,k) in enumerate(old_ids)}
    df['userId'] = df['userId'].apply(old_to_new_id.get)
    
rekey_df(train_df, 'userId')
rekey_df(test_df, 'userId')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [4]:
def ratings_to_matrix(ratings_df, columns):
    return coo_matrix((ratings_df['rating'],
                        (ratings_df['userId'], ratings_df['movieId'])),
                        shape=(max(ratings_df['userId'])+1, columns))

train_matrix = ratings_to_matrix(train_df, n_movies).tocsr()
test_matrix = ratings_to_matrix(test_df, n_movies).tocsr()

In [5]:
class Recommender:
    def __init__(self, train_matrix, dimension, neighbors=10):
        u, s, vt = svds(train_matrix, k=dimension)
        self._neighbors = neighbors
        self._vt = vt
        self._original_matrix = train_matrix
        self._reduced_matrix = u.dot(np.diag(s))
        # normalize the rows in the reduced matrix
        for i, row in enumerate(self._reduced_matrix):
            norm_i = norm(self._reduced_matrix[i])
            self._reduced_matrix[i] = self._reduced_matrix[i] / norm_i
                
    def get_estimated_vector(self, user_vector):
        # project and normalize user vector
        reduced_vector = (self._vt).dot(user_vector)
        reduced_vector = reduced_vector / norm(reduced_vector)
        cos_sims = self._reduced_matrix.dot(reduced_vector.T)
        
        neighbors = np.argpartition(cos_sims, -self._neighbors)[-(self._neighbors):]
        estimated_vector = np.zeros(self._original_matrix.shape[1])
        for neighbor in neighbors:
            estimated_vector += self._original_matrix[neighbor].toarray()[0]
        estimated_vector /= len(neighbors)
        return estimated_vector
    
    def get_recommendations(self, user_vector, n=10):
        ev = self.get_estimated_vector(user_vector)
        ev[user_vector.nonzero()[0]] = -2.5
        top_scores = np.argpartition(ev, -n)[-n:]
        return(top_scores)        

In [6]:
def drop_entries(vec, ratio):
    nonzero_idxs = vec.nonzero()[0]
    random.shuffle(nonzero_idxs)
    n_drop = round(len(nonzero_idxs) * (1-ratio))
    drop_idxs = nonzero_idxs[:n_drop]
    vec[drop_idxs] = 0

def get_error(rec, test_matrix, norm_ord=2):
    total_error = 0
    for vec in test_matrix.toarray():
        dropped_vec = vec
        drop_entries(dropped_vec, 0.75)
        ev = rec.get_estimated_vector(dropped_vec)
        total_error += norm(vec - ev, norm_ord) 
    return total_error

errors = pd.DataFrame(index=range(1, MAX_NEIGHBORS+1), columns=range(1, MAX_COMPONENTS+1))
for dim in range(1, MAX_COMPONENTS+1):
    print(dim)
    s = Recommender(train_matrix, dim, 1)
    for k in range(1, MAX_NEIGHBORS+1):
        s._neighbors = k
        errors.loc[dim, k] = get_error(s, test_matrix)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20


In [9]:
errors

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
1,5677.95,3223.55,2472.14,2124.48,1949.0,1832.56,1795.76,1740.42,1733.81,1755.11,1714.47,1728.2,1698.78,1697.99,1680.17,1666.77,1658.48,1647.44,1637.67,1635.54
2,2426.2,2070.3,1880.33,1804.11,1755.48,1746.55,1753.61,1706.99,1696.6,1683.75,1663.74,1651.05,1640.11,1638.21,1624.98,1634.08,1622.16,1620.24,1607.32,1608.37
3,2381.87,1954.09,1840.93,1789.13,1734.69,1708.29,1678.09,1665.63,1649.18,1640.68,1628.91,1622.04,1611.29,1603.7,1603.55,1609.55,1600.23,1581.78,1588.35,1585.67
4,2259.09,2031.19,1865.97,1774.37,1720.12,1688.46,1679.78,1644.65,1640.5,1618.82,1619.35,1607.41,1602.7,1583.51,1597.96,1586.09,1585.55,1579.76,1577.65,1571.91
5,2330.67,1960.6,1822.78,1747.45,1710.71,1684.58,1660.0,1648.71,1622.01,1603.2,1597.99,1588.3,1593.35,1577.99,1581.61,1570.52,1574.52,1559.63,1575.21,1561.53
6,2239.71,1942.36,1803.65,1771.25,1713.28,1676.09,1661.83,1637.52,1616.98,1621.45,1606.13,1609.93,1568.62,1588.54,1578.36,1585.04,1567.92,1565.27,1571.38,1562.7
7,2310.1,1983.4,1828.54,1748.58,1704.42,1673.52,1646.34,1609.63,1618.57,1612.01,1601.74,1604.35,1585.16,1578.5,1567.47,1566.92,1568.82,1571.48,1571.27,1554.28
8,2374.69,1967.28,1822.27,1728.96,1690.7,1673.38,1651.47,1635.95,1637.13,1627.83,1599.16,1598.46,1590.15,1584.2,1598.42,1579.99,1563.55,1573.79,1567.77,1566.02
9,2265.03,1970.82,1807.36,1732.74,1713.67,1662.75,1651.63,1641.56,1611.68,1610.54,1609.04,1599.68,1603.58,1587.86,1575.35,1560.24,1564.22,1561.84,1561.05,1556.09
10,2316.93,1932.17,1806.6,1759.47,1709.77,1656.43,1650.82,1633.29,1620.9,1601.18,1600.37,1587.14,1590.66,1570.39,1581.49,1561.26,1573.64,1563.14,1559.56,1564.19


In [8]:
t

NameError: name 't' is not defined

In [None]:
max(t.toarray()[0])

In [None]:
max(t)

In [None]:
k[[1, 2, 3]] = 4

In [None]:
k

In [None]:
t = np.array([3.5, 4.3, 5.3, 2.5, 0, 0, 0, 0, 0, 0, 2.3, 5.4])
drop_entries(t, 0.75)

In [None]:
t