In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from numpy.linalg import solve
import scipy.optimize as optimize
from numpy.linalg import solve

In [3]:
df_gender = pd.read_csv("../data/external/gender.dat",header=None,names=["UserID","Gender"])

In [4]:
df_gender.shape

(220970, 2)

In [5]:
df_ratings = pd.read_csv("../data/external/ratings.dat",header=None,error_bad_lines=False,names=["UserID","ProfileID","Rating"])

In [6]:
df_gender.head()

Unnamed: 0,UserID,Gender
0,1,F
1,2,F
2,3,U
3,4,F
4,5,F


In [7]:
df_ratings.head()

Unnamed: 0,UserID,ProfileID,Rating
0,1,133,8
1,1,720,6
2,1,971,10
3,1,1095,7
4,1,1616,10


In [8]:
rater_count = df_ratings["UserID"].unique().shape[0]

In [9]:
ratee_count = df_ratings["ProfileID"].unique().shape[0]

In [10]:
df_ratings.shape

(18242674, 3)

In [11]:
sparsity = (rater_count * ratee_count -df_ratings.shape[1])/(rater_count * ratee_count)

In [12]:
print("sparsity of matrix is" ,sparsity*100 )

sparsity of matrix is 99.99999998686947


In [13]:
ratings_matrix_size = df_gender.shape[0]

In [14]:
#ratings_matrix = csr_matrix((df_ratings_new["Rating"],(df_ratings_new["UserID"]-1,df_ratings_new["ProfileID"])))

In [246]:
class matrix_factor():
    def __init__(self,ratings_matrix,latent_vectors,regularization_user=0,regularization_item=0,non_negative=0,solver = "BFGS"):
        self.ratings_matrix = ratings_matrix
        self.latent_vectors = latent_vectors
        self.regularization_user = regularization_user
        self.regularization_item = regularization_item
        self.non_negative = non_negative
        self.ratings_matrix_shape = ratings_matrix.shape
        self.user = np.random.rand(self.ratings_matrix_shape[0],latent_vectors)
        self.item = np.random.rand(self.ratings_matrix_shape[1],latent_vectors)
        self.solver = solver
    
    def __optimization_function_generator(self,constant = "user"):
        if constant=="user":
            def optimize(x):
                x = np.reshape(x, (-1, self.latent_vectors))
                prediction = np.dot(self.user,x.T)
                error = 0
                for ((i,j),real) in np.ndenumerate(self.ratings_matrix):
                    if real != np.nan:
                        error = error + (self.ratings_matrix[i][j] - prediction[i,j])**2
                print(error)
                return error
            return optimize
        elif constant =="item":
            def optimize(x):
                x = np.reshape(x, (-1, self.latent_vectors))
                prediction = np.dot(x,self.item.T)
                error = 0
                for ((i,j),real) in np.ndenumerate(self.ratings_matrix):
                    if real != np.nan:
                        error = error + (self.ratings_matrix[i][j] - prediction[i,j])**2
                print(error)
                return error
            return optimize
        else:
            raise Exception("constant should be either 'user' or 'item'")
            
    def als(self,constant = "user"):
        if self.non_negative == 0:
            cons = ({'type': 'eq', 'fun': lambda x: 1})
        else :
            cons = ({'type': 'eq', 'fun': lambda x: 1})
        optimize_function = self.__optimization_function_generator(constant)
        if constant == "user":
            ans = optimize.minimize(optimize_function,self.item,method=self.solver,jac="cs")
            self.item = np.reshape(ans['x'],(-1,self.latent_vectors))
        elif constant == "item":
            ans = optimize.minimize(optimize_function,self.user,method=self.solver, options={'disp': False})
            self.user = np.reshape(ans['x'],(-1,self.latent_vectors))
    def train(self,n_iter = 10,verbose_iter=1):
        for i in range(n_iter):
            if i%verbose_iter == 0:
                print("On iteration ",i)
            self.als("user")
            self.als("item")
            


In [200]:
class matrix_factorization():
    def __init__(self,ratings_matrix,latent_vectors,regularization_user=0,regularization_item=0,non_negative=0):
        self.ratings_matrix = ratings_matrix
        self.latent_vectors = latent_vectors
        self.regularization_user = regularization_user
        self.regularization_item = regularization_item
        self.non_negative = non_negative
        self.ratings_matrix_shape = ratings_matrix.shape
        self.user = np.random.rand(self.ratings_matrix_shape[0],latent_vectors)
        self.item = np.random.rand(self.ratings_matrix_shape[1],latent_vectors)
    
            
    def als(self,constant = "user"):
        if constant == "user":
            RU = np.dot(self.ratings_matrix,self.user)
            UTU = self.user.T.dot(self.user)
            reg = self.regularization_user*np.eye(UTU.shape[0])
            IT = solve(UTU+reg,RU.T)
            self.item = IT.T
        elif constant == "item":
            RI = np.dot(self.ratings_matrix,self.item)
            ITI = self.item.T.dot(self.item)
            reg = self.regularization_item*np.eye(ITI.shape[0])
            UT = solve(ITI+reg,RI.T)
            self.user = UT.T

        else:
            raise Exception("constant should be either 'user' or 'item'")
    def train(self,n_iter = 10,verbose_iter=1):
        for i in range(n_iter):
            if i%verbose_iter == 0:
                print("On iteration ",i)
            self.als("user")
            self.als("item")
            


In [62]:
class matrix_factorization_sgd():
    def __init__(self,ratings_matrix,latent_vectors,regularization_user=0,regularization_item=0,non_negative=0,alpha = 0.1):
        self.ratings_matrix = ratings_matrix
        self.latent_vectors = latent_vectors
        self.regularization_user = regularization_user
        self.regularization_item = regularization_item
        self.non_negative = non_negative
        self.ratings_matrix_shape = ratings_matrix.shape
        self.user = np.random.rand(self.ratings_matrix_shape[0],latent_vectors)
        self.item = np.random.rand(self.ratings_matrix_shape[1],latent_vectors)
        self.aplha = alpha
    
            
    def sgd(self):
        prediction = np.dot(self.user,self.item.T)
        error = self.ratings_matrix - np.dot(self.user,self.item.T)
        #update item
        item_temp = self.item
        for ((i,q),Iiq) in np.ndenumerate(self.user):
                error_correc = 0
                for j in range(self.ratings_matrix.shape[1]):
                    if self.ratings_matrix[i,j] != 0:
                        error_correc = error_correc + self.aplha * error[i,j]*self.item[j,q]
                        print(error_correc)
                Iiq = Iiq + error_correc
                item_temp[i,q] = Iiq
        #update user
        user_temp = self.user
        for ((i,q),Uiq) in np.ndenumerate(self.user):
                error_correc = 0
                for j in range(self.ratings_matrix.shape[1]):
                    if self.ratings_matrix[i,j] != 0:
                        error_correc = error_correc + self.aplha * error[i,j]*self.user[j,q]
                Iiq = Iiq + error_correc
                item_temp[i,q] = Iiq
        self.item = item_temp
        self.user = user_temp
        
    def train(self,n_iter = 10,verbose_iter=1):
        for i in range(n_iter):
            if i%verbose_iter == 0:
                print("On iteration ",i)
            self.sgd()
            


In [63]:
a = matrix_factorization_sgd(rating100,2,aplha=0.001)

In [64]:
a.train(n_iter=1)

On iteration  0
-0.005247980459562071
0.3437303794636566
0.6339870039728948
-0.005406206639560206
0.4939557801924833
0.7679058243240786
0.1762367873211818
0.6025280711318636
1.448017066795615
1.6957159271384583
2.445507371273411
3.0880915266435514
3.1359739922159955
3.291168173391088
3.312392370323256
3.6227886255654247
0.7826529428319534
0.9744199098986882
1.8975977639039874
1.9911852045853171
2.0653793211411213
2.696902892137971
3.0380202511280325
3.1761042778988373
3.2143801494899154
3.2720407656215538
0.4705984599882678
1.0675290368232648
1.1849157473361842
1.1955945353609274
1.5012952005894042
0.8428295230587105
1.429485335172381
1.5339301639299436
1.5531883681621101
1.6099767101966276
0.6651316109793455
0.7038977679143931
1.5600153861772639
1.8126837164553802
2.347200813905909
2.5931203613414575
3.3825791660975435
0.21380888874664672
0.27006713551083394
1.1903617684910153
1.4223647176144136
1.6008548026959497
1.8879700345380988
2.53009696010789
0.1581234181668727
0.28082245648854

In [61]:
calculate_rmse(rating100,a.user,a.item)

6.850174406819894e+231

In [204]:
a = matrix_factorization(rating100,2)

In [205]:
a.train(n_iter=10000,verbose_iter=1000)

On iteration  0
On iteration  1000
On iteration  2000
On iteration  3000
On iteration  4000
On iteration  5000
On iteration  6000
On iteration  7000
On iteration  8000
On iteration  9000


In [56]:
calculate_rmse(rating100,a.user,a.item)

754671662.1875356

In [None]:
np.eye

In [264]:
a = matrix_factor(rating100,2,solver="trust-krylov")

In [265]:
a.train()

On iteration  0


  "options for 'jac'. Using '2-point' instead." % jac)


ValueError: ('Jacobian is required for trust region ', 'exact minimization.')

In [None]:
solver = ["L-BFGS-B"]

In [38]:
rating100[np.isnan(rating100)] = 0

In [40]:
rating100 = rating100.values

In [42]:
rating100.sum()

3688.0

In [153]:
np.__version__


'1.15.3'

In [93]:
rating100.sum().sum()/rating100.count_nonzero()

AttributeError: 'DataFrame' object has no attribute 'count_nonzero'

In [53]:
def calculate_rmse(ratings_matrix,user_latent_vector,item_latent_vector):
    prediction = np.dot(user_latent_vector,item_latent_vector.T)
    error = 0
    for ((i,j),real) in np.ndenumerate(ratings_matrix):
        if real != np.nan:
            error = error + (ratings_matrix[i,j] - prediction[i,j])**2
    return error

In [184]:
np.square(a.user.dot(a.item.T) - rating100).sum().sum()

29544.518393498096

In [55]:
calculate_rmse(rating100,a.user,a.item)

754671662.1875356

In [73]:
rating100[0][0]

nan

In [None]:
BFGS start 28549.376750684434

In [None]:
rmse for bfgs 4101.890669183573

In [115]:
?np.reshape

In [16]:
a = np.array([[1,1],[1,-1]])

In [40]:
b = np.array([-2,0])

In [41]:
solve(a,b)

array([-1., -1.])

In [42]:
def f(x):
    y = np.dot(a, x) - b
    return np.dot(y, y)

In [57]:
cons = ({'type': 'ineq', 'fun': lambda x: 1})
res = optimize.minimize(f, [0, 0], method='COBYLA', constraints = cons,
                        options={'disp': False})
xbest = res['x']

In [58]:
xbest

array([-0.99990899, -0.99998043])

In [None]:
efef

In [None]:
ratings_matrix.dot(ratings_matrix.T)

In [None]:
er243r

## subsetting code

In [14]:
a = df_ratings.groupby("ProfileID")

In [15]:
x = a.count()

In [16]:
subset_ids = x.sort_values(by="UserID",ascending=False).reset_index()["ProfileID"][:1000]

In [17]:
df_ratings_new = df_ratings[df_ratings["ProfileID"].isin(subset_ids)]

In [18]:
df_ratings_new.shape

(4061536, 3)

In [19]:
mapping = {}
for i in range(len(subset_ids)):
    mapping[subset_ids[i]] = i

In [20]:
df_ratings_new["ProfileID"] = df_ratings_new["ProfileID"].map(mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [21]:
df_ratings_new["UserID"].unique().shape

(135179,)

In [22]:
df_ratings_new = df_ratings_new[df_ratings_new["UserID"].isin(subset_ids)]

In [23]:
df_ratings_new["UserID"] = df_ratings_new["UserID"].map(mapping)

In [24]:
df_ratings_new["UserID"].unique().shape

(803,)

In [25]:
ratings_matrix = pd.crosstab(index = df_ratings_new["UserID"],columns = df_ratings_new["ProfileID"],values =df_ratings_new["Rating"],aggfunc="mean")

In [26]:
ratings_matrix

ProfileID,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,9.0,,,,10.0,,,,,...,,,,,,,,,,
5,,,,,,,10.0,,,,...,,,,,,,,,,
6,10.0,,,,,,,,,,...,,,,,,,,,,
7,,8.0,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,
11,,,,,,,10.0,,,,...,,,,,,,,,,
12,,,9.0,,,,,,,,...,,,,,,,,,,


In [27]:
ratings_matrix.shape

(803, 1000)

In [28]:
ratings_matrix = ratings_matrix[:][ratings_matrix.index]

In [29]:
rating100 = ratings_matrix.iloc[:100,:100]

In [30]:
rating100.shape

(100, 100)

In [31]:
mapping2 = rating100.index

In [32]:
rating100 = rating100.reset_index(drop=True)

In [33]:
rating100.columns = range(len(rating100))

In [34]:
subset_ids[mapping2[1]]

121859

In [35]:
df_ratings_new["UserID"].unique().shape

(803,)

In [36]:
rating100.iloc[4,5]

nan

In [44]:
#ratings_matrix = ratings_matrix.todense()

In [45]:
df_ratings_new = df_ratings_new.reset_index(drop=True)

In [46]:
sparsity = ((10000 - 418) / 10000)*100

In [47]:
print("subsetted matrix sparsity is ",sparsity,"%")

subsetted matrix sparsity is  95.82000000000001 %


In [61]:
df_ratings_new["ProfileID"].unique().shape

(86,)

In [60]:
(df_ratings_new["Rating"] > 10).sum()

0

In [13]:
import cvxpy as cp
m = 30
n = 20
np.random.seed(1)
A = np.random.randn(m, n)
b = np.random.randn(m)

# Construct the problem.
x = cp.Variable(n)
objective = cp.Minimize(cp.sum_squares(A*x - b))
constraints = [0 <= x, x <= 1]
prob = cp.Problem(objective, constraints)

# The optimal objective value is returned by `prob.solve()`.
result = prob.solve()
# The optimal value for x is stored in `x.value`.
print(x.value)
# The optimal Lagrange multiplier for a constraint is stored in
# `constraint.dual_value`.
print(constraints[0].dual_value)

[5.43267146e-09 2.85089711e-02 5.77683865e-09 1.01879549e-08
 5.99024543e-10 1.49303949e-01 2.09315550e-08 4.29272686e-09
 2.46714252e-01 5.78213740e-01 1.12980210e-09 9.70973777e-04
 1.88470383e-09 2.26753961e-01 2.74209340e-09 4.13051310e-09
 8.63796936e-09 9.81879034e-10 5.56051260e-09 1.37169102e-09]
[2.50937507e+00 4.96670260e-07 2.78354360e+00 1.79424254e+00
 1.30858209e+01 1.14293262e-07 7.37122988e-01 3.35346143e+00
 7.19157240e-08 3.30389650e-08 8.93825650e+00 6.67651662e-05
 7.02956358e+00 8.31516656e-08 4.71065828e+00 3.18872127e+00
 2.06086135e+00 1.00816916e+01 3.04811647e+00 8.53267461e+00]


In [20]:
cp.Variable(20,20)

Variable((20,))

In [3]:
import sys

In [4]:
sys.executable

'/home/roxor/bin/anaconda3/bin/python'

In [2]:
import cvxpy

In [1]:
import sys
sys.executable

'/home/roxor/bin/anaconda3/envs/personalization/bin/python'

In [86]:
from time import time
import numpy as np
import scipy.sparse as sp
from sys import argv, float_info


def prepare_matrices(l1=1000, l2=1000, alpha=0.1):
    # indexes and values of non-zero components
    # ii = np.random.randint(0, l, (int(l * l * alpha), 2))
    num = int(l1 * l2 * alpha)
    r = np.random.rand(num)
    i1 = np.random.randint(0, l1, size=num)
    i2 = np.random.randint(0, l2, size=num)

    # create a lil sparse matrix
    print("prepare..")
    t1 = time()
    # A = sp.coo_matrix((r, (i1, i2)), shape=(l, l))
    A = sp.csr_matrix((r, (i1, i2)), shape=(l1, l2))
    # A = sp.lil_matrix((l, l))
    # for n, i in enumerate(ii):
    #     A[i] = r[n]
    t2 = time()
    print("%8.3f sec" % (t2 - t1))
    print("%f Mbytes" % ((float)(A.data.nbytes) / 1000000))
    return A


def tests_sparse_matmul(A, vec):
    """
    compare speed of matrix product
    """
    print("compute..")
    t1 = time()
    A * vec
    t2 = time()
    print("%8.3f sec" % (t2 - t1))


l1 = 100
l2 = 100
l3 = 100
alpha = float(0.1)
A = prepare_matrices(l1, l2, alpha)
vec = np.random.rand(l2, l3)
#tests_sparse_matmul(A, vec)

prepare..
   0.001 sec
0.007656 Mbytes
