In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge

#### Just create a tiny dataset here to illustrate the concepts

In [3]:
# Toy table: one row per movie, one column per user holding that user's rating,
# followed by two content-feature columns (Romantic, Action).
# A rating of -1 appears to mark "not rated": cells further down drop the rows
# where a user's rating equals -1 before fitting.
movie_titles = ["Big", "The Notebook", "Barefoot in the Park",
                "The Bourne Legacy", "The International"]

user_ratings = {
    "Susan": [5, 5, -1, 0, 0],
    "Mary": [5, -1, 4, 0, 0],
    "Phil": [0, -1, 0, 5, 5],
    "Greg": [0, 0, -1, 4, -1],
}
movie_features = {
    "Romantic": [0.9, 1.0, 0.99, 0.1, 0.0],
    "Action": [0, 0.01, 0, 1.0, 0.9],
}

# Merge ratings and features into one mapping; `columns` pins the column order.
table = dict(user_ratings)
table.update(movie_features)
column_order = ["Susan", "Mary", "Phil", "Greg", "Romantic", "Action"]

data = pd.DataFrame(table, index=movie_titles, columns=column_order)
data.index.name = "Movie"
data.columns.name = "User"

In [4]:
# Show only the user-rating columns (all rows), leaving out the movie features.
data.loc[:, ['Susan', 'Mary', 'Phil', 'Greg']]

User,Susan,Mary,Phil,Greg
Movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Big,5,5,0,0
The Notebook,5,-1,-1,0
Barefoot in the Park,-1,4,0,-1
The Bourne Legacy,0,0,5,4
The International,0,0,5,-1


##### But this is content-based filtering, so we also have some information about the movies themselves

In [5]:
# Show the whole frame: the four user-rating columns plus the Romantic and Action feature columns.
data

User,Susan,Mary,Phil,Greg,Romantic,Action
Movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Big,5,5,0,0,0.9,0.0
The Notebook,5,-1,-1,0,1.0,0.01
Barefoot in the Park,-1,4,0,-1,0.99,0.0
The Bourne Legacy,0,0,5,4,0.1,1.0
The International,0,0,5,-1,0.0,0.9


##### Suppose we know the parameters for Susan (intercept, Romantic weight, Action weight)

In [8]:
# Assumed parameter vector for Susan, ordered as
# [intercept, Romantic weight, Action weight].
s_theta = np.array([0.0, 5.0, 0.0])
# print() call instead of the Python 2 print statement: same output, and it
# also runs on Python 3.
print(s_theta)

[ 0.  5.  0.]


##### The feature vector for any given movie is built from its feature values; remember that the first entry is the constant 1 that multiplies the intercept

In [6]:
# Feature vector for "Barefoot in the Park": a leading 1 (multiplies the
# intercept) followed by the movie's Romantic and Action scores.
barefoot_fv = np.ones((3,))
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; `.loc` performs the
# same label-based row/column lookup.
barefoot_fv[1:] = data.loc['Barefoot in the Park', ['Romantic', 'Action']].values
print(barefoot_fv)

[ 1.    0.99  0.  ]


##### We need to manipulate theta into the correct shape
##### $\theta^{T}$ must be a 1 x 3 matrix so that multiplying it by the 3 x 1 feature vector gives a 1 x 1 matrix, i.e. a scalar

In [13]:
# Store theta as a 3 x 1 column vector so that its transpose is the 1 x 3 row
# matrix used in the prediction. Reshaping an array that is already (3, 1) is
# a no-op, so this cell is safe to re-run.
s_theta = s_theta.reshape(3, 1)
# print() calls instead of Python 2 print statements (valid on Python 2 and 3).
print(s_theta.shape)
print(s_theta.T)
print(s_theta.T.shape)

(3, 1)
[[ 0.  5.  0.]]
(1, 3)


##### To predict, we simply compute $\theta^{T}x$

In [11]:
# theta^T x: the 1 x 3 row matrix times the length-3 feature vector yields a
# length-1 array holding Susan's predicted rating.
barefoot_prediction_susan = np.dot(s_theta.T, barefoot_fv)
# print() call instead of the Python 2 print statement (valid on Python 2 and 3).
print(barefoot_prediction_susan)

[ 4.95]


---
## To learn $\theta^{j}$
---

### $$\min_{\theta^{j}} \frac{1}{2} \sum_{i:r(i, j)==1}(\theta^{jT}x^{i} - y^{i, j})^{2} + \frac{\lambda}{2}\sum_{k=1}^{N}(\theta_{k}^{j})^{2}$$

### where:
### - $j$ is the user
### - $i$ is the movie
### - $r(i,j) == 1$ if user $j$ has rated movie $i$
### - $\theta^{j}$ is the parameter vector for user $j$
### - $x^{i}$ is the feature vector for movie $i$
### - $y^{i,j}$ is the rating by user $j$ for movie $i$, if user $j$ has rated that movie
### - $N$ is the number of features; the intercept $\theta_{0}^{j}$ is excluded from the penalty
### - $\frac{\lambda}{2}\sum_{k=1}^{N}(\theta_{k}^{j})^{2}$ is the regularization term
---

---
## To learn $\theta^{j}$ for all users (i.e. $\theta^{1}, \theta^{2}, \ldots, \theta^{nu}$)
---

### $$\min_{\theta^{1},\ldots,\theta^{nu}} \frac{1}{2} \sum_{j=1}^{nu}\sum_{i:r(i, j)==1}(\theta^{jT}x^{i} - y^{i, j})^{2} + \frac{\lambda}{2}\sum_{j=1}^{nu}\sum_{k=1}^{N}(\theta_{k}^{j})^{2}$$

### where:
### - $j$ is the user
### - $nu$ is the number of users
### - $i$ is the movie
### - $r(i,j) == 1$ if user $j$ has rated movie $i$
### - $\theta^{j}$ is the parameter vector for user $j$
### - $x^{i}$ is the feature vector for movie $i$
### - $y^{i,j}$ is the rating by user $j$ for movie $i$, if user $j$ has rated that movie
### - $N$ is the number of features; the intercepts $\theta_{0}^{j}$ are excluded from the penalty
### - $\frac{\lambda}{2}\sum_{j=1}^{nu}\sum_{k=1}^{N}(\theta_{k}^{j})^{2}$ is the regularization term
---

---
## Gradient descent update equations:
---

### For the intercept term (not regularized):

$\theta_{k}^{j} = \theta_{k}^{j} - \alpha \sum_{i:r(i, j)==1}(\theta^{jT}x^{i} - y ^{i, j})x_{k}^{i}\text{  }
(\text{for }k=0)$


### For the other terms (regularized):
$\theta_{k}^{j} = \theta_{k}^{j} - \alpha \left[ \sum_{i:r(i, j)==1}(\theta^{jT}x^{i} - y ^{i, j})x_{k}^{i} + \lambda\theta_{k}^{j} \right]\text{  }
(\text{for }k\neq0)$


### $\alpha$ is the learning rate
---

---
## To predict a rating for a new movie:
---

### Determine the feature values for the movie, e.g. romantic content vs. action content, etc.
### Hence, we have a feature vector $x^{i}$ for the new movie
### We have the parameter vector $\theta^{j}$ for the user
### The predicted rating is just $\hat{y}^{i, j} = \theta^{jT}x^{i}$
---

---
I have NOT implemented gradient descent to solve for all the values of $\theta$ simultaneously
=====
***

#### Let's find the parameters $\theta$ for a single user using (ridge-regularized) linear regression

##### Take Susan's ratings (only the movies she has actually rated, i.e. rating != -1)

In [14]:
# Susan's ratings, restricted to the rows where her rating is not -1.
# One .loc call selects the rows and the column together instead of chaining
# two indexing operations.
data.loc[data['Susan'] != -1, 'Susan']

Movie
Big                  5
The Notebook         5
The Bourne Legacy    0
The International    0
Name: Susan, dtype: int64

##### Obtain the features of the movies that Susan has rated

In [15]:
# Design matrix: the Romantic and Action scores of the movies whose Susan
# rating is not -1, one row per movie. The intercept column is not included
# here.
X = data.loc[data['Susan'] != -1, ['Romantic', 'Action']].values
# print() call instead of the Python 2 print statement (valid on Python 2 and 3).
print(X)

[[ 0.9   0.  ]
 [ 1.    0.01]
 [ 0.1   1.  ]
 [ 0.    0.9 ]]


In [16]:
# Targets: Susan's ratings for the same rows used to build X
# (rows where her rating is not -1).
y = data.loc[data['Susan'] != -1, 'Susan'].values
# print() call instead of the Python 2 print statement (valid on Python 2 and 3).
print(y)

[5 5 0 0]


#### Now fit a ridge regression model (linear regression with an L2 penalty, matching the $\lambda$ term above)

In [17]:
# `alpha` is scikit-learn's name for the L2 penalty strength, i.e. the lambda
# of the cost function in the markdown above. With the default
# fit_intercept=True the intercept is estimated separately from the penalised
# coefficients.
ridge_alpha = 0.1
clf = Ridge(alpha=ridge_alpha)
clf.fit(X, y)

Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, solver='auto', tol=0.001)

In [18]:
# Inspect the fitted model: one coefficient per feature column of X
# (Romantic, Action), with the intercept stored separately.
# print() calls instead of Python 2 print statements (valid on Python 2 and 3).
print(len(clf.coef_))
print(clf.coef_)
print(clf.intercept_)

2
[ 2.44222689 -2.6664512 ]
2.55211700065


#### Now predict Susan's rating for Barefoot in the Park

In [19]:
# Romantic and Action scores of the movie to score, as a 1-D array.
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; `.loc` performs the
# same label-based row/column lookup.
X_new = data.loc['Barefoot in the Park', ['Romantic', 'Action']].values
# print() call instead of the Python 2 print statement (valid on Python 2 and 3).
print(X_new)

[ 0.99  0.  ]


In [20]:
# scikit-learn estimators expect a 2-D array of shape (n_samples, n_features);
# current versions raise an error for a 1-D sample. reshape(1, -1) turns the
# single movie into a one-row matrix, and is harmless if X_new is already
# one row.
clf.predict(X_new.reshape(1, -1))

array([ 4.96992162])