### Testing our code on the MovieLens 100k dataset

#### Downloading dataset

In [25]:
# -nc (no-clobber): skip the download when ml-100k.zip already exists, so re-running
# the notebook does not fetch the archive again or leave a stray ml-100k.zip.1 behind.
!wget -nc http://files.grouplens.org/datasets/movielens/ml-100k.zip

--2017-04-04 15:39:17--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org... 128.101.34.146
Connecting to files.grouplens.org|128.101.34.146|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: 'ml-100k.zip'


2017-04-04 15:39:29 (469 KB/s) - 'ml-100k.zip' saved [4924029/4924029]



In [26]:
# -o: overwrite already-extracted files without prompting; a plain `unzip` asks
# interactively on re-run, which cannot be answered from a notebook cell.
!unzip -o ml-100k.zip

Archive:  ml-100k.zip
   creating: ml-100k/
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base         
  inflating: ml-100k/u3.test         
  inflating: ml-100k/u4.base         
  inflating: ml-100k/u4.test         
  inflating: ml-100k/u5.base         
  inflating: ml-100k/u5.test         
  inflating: ml-100k/ua.base         
  inflating: ml-100k/ua.test         
  inflating: ml-100k/ub.base         
  inflating: ml-100k/ub.test         


In [141]:
def nnmf(A, rank, lr=0.001, steps=1000, display=False):
    """
    Non-negative matrix factorisation of a partially observed matrix.

    Learns W (M, rank) and H (rank, N) so that W.dot(H) approximates A on its
    observed (non-NaN) entries. Training is plain gradient descent on the mean
    absolute error over the observed entries; after every step both factors
    are clipped at zero to keep them non-negative.

    A: np.array (M, N); NaN marks a missing entry. Cast to float32 internally.
    rank: int, inner dimension of the factorisation.
    lr: float, learning rate handed to the gradient-descent optimizer.
    steps: int, number of optimisation steps.
    display: bool; when true the cost is printed about ten times during
        training. The strings 'True'/'False' (any case) are also accepted,
        because the flag used to be passed as a string.

    Returns (learnt_W, learnt_H) as numpy arrays.
    Raises ValueError if A has no observed entries.

    Side effect: reseeds NumPy's global random generator with 0.

    NOTE(review): the cost is divided by the number of observed entries, so
    gradients shrink as that count grows; a large, densely observed A
    presumably needs a larger lr than the default -- confirm by watching the
    printed cost.
    """
    # A non-empty string such as 'False' is truthy, so string flags are parsed
    # explicitly instead of being used directly in a boolean test.
    if isinstance(display, str):
        display = display.strip().lower() in ('true', '1', 'yes')

    # Report roughly ten times; never 0, which would make the modulo below fail
    # for steps < 10.
    display_factor = max(1, steps // 10)

    A = np.asarray(A, dtype=np.float32)
    observed = ~np.isnan(A)

    # Number of non-missing entries
    non_missing = int(np.sum(observed))
    if non_missing == 0:
        raise ValueError("A has no observed (non-NaN) entries to fit")

    np.random.seed(0)
    shape = A.shape

    # Initializing random H and W
    temp_H = np.random.randn(rank, shape[1]).astype(np.float32)
    temp_W = np.random.randn(shape[0], rank).astype(np.float32)

    # Build in a private graph so repeated calls do not keep adding nodes to
    # the process-wide default graph.
    with tf.Graph().as_default():
        # The mask is never trained, so it is a constant rather than a variable.
        tf_mask = tf.constant(observed)

        H = tf.Variable(temp_H)
        W = tf.Variable(temp_W)
        WH = tf.matmul(W, H)

        # Mean absolute error over the observed entries only
        residual = tf.boolean_mask(A, tf_mask) - tf.boolean_mask(WH, tf_mask)
        cost = tf.reduce_sum(tf.abs(residual)) / float(non_missing)

        # Clipping operation. This ensures that the learnt W and H are non-negative
        clip_W = W.assign(tf.maximum(tf.zeros_like(W), W))
        clip_H = H.assign(tf.maximum(tf.zeros_like(H), H))
        clip = tf.group(clip_W, clip_H)

        train_step = tf.train.GradientDescentOptimizer(lr).minimize(cost)
        init = tf.global_variables_initializer()

        with tf.Session() as sess:
            sess.run(init)
            for i in range(steps):
                sess.run(train_step)
                sess.run(clip)
                if display and i % display_factor == 0:
                    print("Cost after: %d is %0.2f" % (i, sess.run(cost)))

            learnt_W = sess.run(W)
            learnt_H = sess.run(H)
    return learnt_W, learnt_H

#### Parsing dataset

Code borrowed from [Greg Reda's blog](http://www.gregreda.com/2013/10/26/using-pandas-on-the-movielens-dataset/)

In [30]:
# Load the tab-separated ratings file; column names are supplied explicitly via `names`.
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)

In [34]:
# Number of distinct users appearing in the ratings
len(ratings['user_id'].unique())

943

In [35]:
# Number of distinct movies appearing in the ratings
len(ratings['movie_id'].unique())

1682

In [65]:
# Dense (user x movie) rating matrix; NaN marks pairs that have no rating.
# IDs are treated as 1-based, hence the -1 when turning them into array positions.
# The matrix is sized by the largest ID rather than the number of distinct IDs, so
# every index stays in bounds even if some IDs never occur in `ratings`.
movielens_matrix = np.full((ratings.user_id.max(), ratings.movie_id.max()), np.nan)

# One vectorised assignment replaces the row-by-row iterrows() loop. If a
# (user, movie) pair were repeated, the last occurrence wins, as in the loop.
movielens_matrix[ratings.user_id.values - 1, ratings.movie_id.values - 1] = ratings.rating.values

In [142]:
# Factorise the rating matrix with rank-10 factors. The progress flag is passed as a
# real boolean: a string flag is always truthy, so even 'False' would enable printing.
W_ml, H_ml = nnmf(movielens_matrix.astype('float32'), rank=10, lr=0.001, steps=1000, display=True)

Cost after: 0 is 2.32
Cost after: 100 is 2.32
Cost after: 200 is 2.32
Cost after: 300 is 2.32
Cost after: 400 is 2.32
Cost after: 500 is 2.32
Cost after: 600 is 2.32
Cost after: 700 is 2.32
Cost after: 800 is 2.32
Cost after: 900 is 2.32


In [139]:
# Per-column summary statistics of the learnt W factor
pd.DataFrame(data=W_ml).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
count,943.0,943.0,943.0,943.0,943.0,943.0,943.0,943.0,943.0,943.0
mean,0.415726,0.452973,0.42068,0.452552,0.400191,0.392183,0.460405,0.405014,0.398967,0.432511
std,0.565105,0.595632,0.578464,0.578352,0.552117,0.565254,0.58329,0.527027,0.551142,0.597876
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.022062,0.019174,0.018607,0.022829,0.018399,0.017678,0.023362,0.021117,0.020859,0.023536
50%,0.098475,0.108631,0.100834,0.129862,0.077781,0.086862,0.123513,0.102703,0.094134,0.105125
75%,0.6641,0.781905,0.669389,0.759872,0.641438,0.61218,0.764506,0.698207,0.643386,0.653523
max,3.134036,3.059459,3.348774,2.804953,2.909727,3.633679,3.400108,3.442057,3.788363,3.594876


In [140]:
# First five rows of the reconstruction W.H, rounded to whole numbers
pd.DataFrame(W_ml.dot(H_ml)).head().round()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,2.0,1.0,1.0,5.0,3.0,0.0,3.0,2.0,1.0,1.0,...,1.0,2.0,2.0,1.0,0.0,1.0,3.0,3.0,1.0,2.0
1,1.0,3.0,1.0,4.0,3.0,0.0,4.0,1.0,1.0,2.0,...,5.0,1.0,3.0,0.0,0.0,2.0,4.0,2.0,3.0,3.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
3,6.0,5.0,3.0,5.0,6.0,2.0,5.0,2.0,1.0,3.0,...,3.0,2.0,2.0,2.0,0.0,2.0,7.0,2.0,3.0,3.0
4,2.0,1.0,0.0,1.0,2.0,2.0,3.0,2.0,1.0,1.0,...,2.0,1.0,0.0,0.0,0.0,1.0,4.0,1.0,0.0,0.0


In [125]:
# First five rows of the original rating matrix, for comparison with the reconstruction
pd.DataFrame(movielens_matrix).head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
1,4.0,,,,,,,,,2.0,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,4.0,3.0,,,,,,,,,...,,,,,,,,,,


#### 5-fold cross-validation

In [37]:
from sklearn.model_selection import KFold

In [41]:
# 5-fold splitter; shuffle=False is scikit-learn's default, spelled out here
kf = KFold(n_splits=5, shuffle=False)

In [45]:
for train_index, test_index in kf.split(ratings):
    # KFold yields positional indices, so rows are selected by position with .iloc
    # (.ix is deprecated and was removed in pandas 1.0).
    train_df = ratings.iloc[train_index]
    

In [85]:
# np.ma only skips *masked* entries; the NaNs in this plain array are not masked,
# so the maximum comes back as nan.
np.ma.asanyarray(movielens_matrix).max()

nan

In [88]:
# pandas skips NaN by default: take the per-column maxima, then the largest of those
pd.DataFrame(movielens_matrix).max(axis=0).max()

5.0