In [1]:
import math
import numpy as np
from scipy.sparse import csr_matrix, lil_matrix
from sklearn.feature_extraction.text import CountVectorizer


In [2]:
movie_names = ['Avatar', 'Get Shorty', 'Princess Bride', 'Goonies']
movie_descriptions = [
    'an epic science fiction film',
    'a crime thriller comedy film; adapted from a book',
    'a fantasy comedy adventure film; adapted from a book',
    'an adventure comedy film'
]
# ratings: row=user, col=movie
# All ratings are between 0 and 1, with 1 meaning the user really liked the movie.
# A 0 value means the user has not rated the movie.
ratings = csr_matrix([
    [.1, 0, .2, 0],
    [0, .9, 0, .3],
    [.3, 0, .9, 0],
    [0, 0, 0, .4],
    [0, 0, .3, .4]
    ])


In [3]:
# Do not modify.
vectorizer = CountVectorizer(token_pattern=r'(?u)\b\w+\b', dtype=np.float)
movie_term_matrix = vectorizer.fit_transform(movie_descriptions)
vocabulary = np.array(vectorizer.get_feature_names())
print('movie_term_matrix:')
print(movie_term_matrix.todense())
print('vocabulary')
print(vocabulary)

# movie_term_matrix: value i,j is the frequency of term j in the description of movie i.

movie_term_matrix:
[[ 0.  0.  0.  1.  0.  0.  0.  1.  0.  1.  1.  0.  1.  0.]
 [ 2.  1.  0.  0.  1.  1.  1.  0.  0.  0.  1.  1.  0.  1.]
 [ 2.  1.  1.  0.  1.  1.  0.  0.  1.  0.  1.  1.  0.  0.]
 [ 0.  0.  1.  1.  0.  1.  0.  0.  0.  0.  1.  0.  0.  0.]]
vocabulary
[u'a' u'adapted' u'adventure' u'an' u'book' u'comedy' u'crime' u'epic'
 u'fantasy' u'fiction' u'film' u'from' u'science' u'thriller']


In [4]:
type(movie_term_matrix)


scipy.sparse.csr.csr_matrix

In [5]:
from collections import Counter
def document_frequencies(movie_term_matrix):
    """ Compute the number of different documents that each term appears in.
    Params:
      movie_term_matrix...csr_matrix where entry i,j is the number
                          of times term j appears in document i
    Returns:
      a numpy array with one element per term in the vocabulary."""
    ###TODO
    ###
    #print movie_term_matrix.shape[1]
    #movie_term_matrix.transpose()
    #num_documents = movie_term_matrix.sum(axis =0).tolist()[0]
    #print movie_term_matrix.todense()
    #print num_documents
    #for i in range(movie_term_matrix.shape[1]):
        #print movie_term_matrix.data[i]
        #X= movie_term_matrix.getcol(i)
    #return np.array([len((movie_term_matrix.getcol(i)).nonzero()[0]) for i in range(movie_term_matrix.shape[1])])
    #return np.array(Counter(num_documents))
    l_list =[]
    for i in range(movie_term_matrix.shape[1]):
        count = 0
        for j in range(movie_term_matrix.shape[0]):
            if movie_term_matrix[j,i] != 0:
                count +=1.0
        l_list.append(count)
    return np.array(l_list)
    
    
dfs = document_frequencies(movie_term_matrix)
dfs

array([ 2.,  2.,  2.,  2.,  2.,  3.,  1.,  1.,  1.,  1.,  4.,  2.,  1.,  1.])

In [6]:
def tfidf(movie_term_matrix, dfs):
    """ Create a new matrix that transforms movie_term_matrix using tfidf.
    Simply divide each value by the document frequency for that term.
    
    Params:
      movie_term_matrix...csr_matrix where entry i,j is the number
                          of times term j appears in document i
      dfs.................document frequencies for each term.
    Returns:
      A csr_matrix that is a copy of movie_term_matrix where value
      i,j is divided by the document frequency of term j"""
    ###TODO    
    ###
    X= movie_term_matrix.copy()
    for i in range(movie_term_matrix.shape[1]):
        d_freq = dfs[i]
        for j in range(movie_term_matrix.shape[0]):
             X[j,i] =  movie_term_matrix[j,i]/d_freq
    return X

# tfidf matrix: row=movie, col=term
tfidf_matrix = tfidf(movie_term_matrix, dfs)
tfidf_matrix.todense()



matrix([[ 0.        ,  0.        ,  0.        ,  0.5       ,  0.        ,
          0.        ,  0.        ,  1.        ,  0.        ,  1.        ,
          0.25      ,  0.        ,  1.        ,  0.        ],
        [ 1.        ,  0.5       ,  0.        ,  0.        ,  0.5       ,
          0.33333333,  1.        ,  0.        ,  0.        ,  0.        ,
          0.25      ,  0.5       ,  0.        ,  1.        ],
        [ 1.        ,  0.5       ,  0.5       ,  0.        ,  0.5       ,
          0.33333333,  0.        ,  0.        ,  1.        ,  0.        ,
          0.25      ,  0.5       ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.5       ,  0.5       ,  0.        ,
          0.33333333,  0.        ,  0.        ,  0.        ,  0.        ,
          0.25      ,  0.        ,  0.        ,  0.        ]])

In [7]:
def make_user_profiles(ratings, tfidf_matrix):
    """
    Create a user profile matrix by computing the weighted average of the tfidf
    vectors of each movie he has rated. E.g., if a person has rated 
    one movie .2 with tfidf vector ([.1, .3]) and rated another movie
    .6 with tfidf vector([.2, .4]), then the weighted average is:
    [(.2*.1 + .6*.2) / (.2 + .6), (.2*.3 + .6*.4) / (.2 + .6)]
    Params:
      ratings........the user/movie ratings matrix
      tfidf_matrix...the movie/term tfidf matrix
    Returns:
      A csr matrix where each row represents a user and the columns represent terms.
    """
    ###TODO
    ###
    user_ratings = []
    M = ratings.shape[0]
    N = movie_term_matrix.shape[1]
    Y = lil_matrix((M,N))
    #print M, N
    X= ratings.copy()
    #print X.todense()
    #print X[0].nonzero()[1]
    for i in range(X.shape[0]):
        data = []
        for j in range(X.indptr[i],X.indptr[i+1]):
            row = (X.indices[j],X.data[j])
            data.append(row)
        user_ratings.append(data)
    user_ratings_np = np.array(user_ratings)
    #print user_ratings_np
    count_users= len(user_ratings_np)
    #print count_users
    indptr = []
    data_csr = []
    for p in range(0,count_users):
        indices  = []
        for r in range(0,movie_term_matrix.shape[1]):
            numr = 0
            denr = 0
            for q in range(len(user_ratings_np[p])):            
                movie_id = user_ratings_np[p][q][0]
                movie_rate = user_ratings_np[p][q][1]
                tfidf_wt = tfidf_matrix[movie_id,r]
                numr += movie_rate*tfidf_wt
                denr += movie_rate
            wtd_avg = numr/denr
            Y[p,r] = wtd_avg
            data_csr.append(wtd_avg)
            indices.append(r)
        #indptr.append(len(indices)-1) 
    return Y.tocsr()

user_profiles = make_user_profiles(ratings, tfidf_matrix)
dummy = csr_matrix([float(user_profiles[0,i]) for i in range(user_profiles.shape[1])])
print "dummy" , dummy
print user_profiles[0].toarray
user_profiles.todense()

dummy   (0, 0)	0.666666666667
  (0, 1)	0.333333333333
  (0, 2)	0.333333333333
  (0, 3)	0.166666666667
  (0, 4)	0.333333333333
  (0, 5)	0.222222222222
  (0, 7)	0.333333333333
  (0, 8)	0.666666666667
  (0, 9)	0.333333333333
  (0, 10)	0.25
  (0, 11)	0.333333333333
  (0, 12)	0.333333333333
<bound method csr_matrix.toarray of <1x14 sparse matrix of type '<type 'numpy.float64'>'
	with 12 stored elements in Compressed Sparse Row format>>


matrix([[ 0.66666667,  0.33333333,  0.33333333,  0.16666667,  0.33333333,
          0.22222222,  0.        ,  0.33333333,  0.66666667,  0.33333333,
          0.25      ,  0.33333333,  0.33333333,  0.        ],
        [ 0.75      ,  0.375     ,  0.125     ,  0.125     ,  0.375     ,
          0.33333333,  0.75      ,  0.        ,  0.        ,  0.        ,
          0.25      ,  0.375     ,  0.        ,  0.75      ],
        [ 0.75      ,  0.375     ,  0.375     ,  0.125     ,  0.375     ,
          0.25      ,  0.        ,  0.25      ,  0.75      ,  0.25      ,
          0.25      ,  0.375     ,  0.25      ,  0.        ],
        [ 0.        ,  0.        ,  0.5       ,  0.5       ,  0.        ,
          0.33333333,  0.        ,  0.        ,  0.        ,  0.        ,
          0.25      ,  0.        ,  0.        ,  0.        ],
        [ 0.42857143,  0.21428571,  0.5       ,  0.28571429,  0.21428571,
          0.33333333,  0.        ,  0.        ,  0.42857143,  0.        ,
          0.

In [8]:
import math
def norm(vector):
    """
    Compute the Euclidean norm of one row of a csr_matrix.
    https://en.wikipedia.org/wiki/Norm_(mathematics)#Euclidean_norm
    Input:
      vector...one row from a csr_matrix
    Returns:
      a float, the Euclidean norm of the vector.
    """
    ###TODO
    ###
    vec = vector.toarray()
    return np.sqrt(np.sum([vec[i]**2 for i in range(0,len(vec))]))
    
    
norm(csr_matrix([3,4]))
#norm(csr_matrix([3,8]))

5.0

In [20]:

import math
def cosine(v1, v2):
    """
    Compute the cosine similarity between two vectors (rows from a csr_matrix).
    https://en.wikipedia.org/wiki/Cosine_similarity
    Params:
      v1...one vector
      v2...another vector
    Returns:
      a float representing the cosine similarity/
    """
    ###TODO
    ###
    #numr = 0
    #denr_1 =0
    #denr_2 = 0
    #v1_list = v1.data[1]
    #v2_list = v2.data[1]
    v1_array = v1.toarray()
    v2_array = v2.toarray()
    print v1.shape[1]
    print v2.shape[1]
    if(v1.shape[1] == v2.shape[1]):                     
        numr = sum([x*y for x,y in zip(v1.data,v2.data)]) 
    else:
        print " error"
    denr_1 = np.sqrt(np.sum([v1_array[i]**2 for i in range(0,len(v1_array))]))
    denr_2 = np.sqrt(np.sum([v2_array[i]**2 for i in range(0,len(v2_array))]))   
    return numr/(denr_1*denr_2)
                
round(cosine(csr_matrix([2,4]), csr_matrix([3,8])), 5)

2


0.99451

In [31]:
def predict_ratings_w_user_profiles(ratings, user_profiles, tfidf_matrix):
    """
    Make a copy of the ratings matrix. Replace each 0 entry with a predicted score
    based on user_profile. Specifically, the ratings of user i for movie j is the 
    cosine similarity between user i's profile and movie's j tfidf vector.
    
    Params:
      ratings.........the user x movie ratings matrix.
      user_profiles...the user x term profile matrix
      tfidf_matrix....the move x term tfidf matrix
    Returns:
      a user x movie csr_matrix of ratings. It should be a copy of the original
      ratings matrix, where 0 values have been replaced by the prediced rating.
    """
    ###TODO
    ###
    R = ratings.copy()
    for row in range(R.shape[0]):
        for col in range(R.shape[1]):
            tfidf_temp =[]
            for col_idx in range(tfidf_matrix.shape[1]):
                if col_idx in user_profiles.indices[user_profiles.indptr[row]:user_profiles.indptr[row+1]]:
                    tfidf_temp.append(tfidf_matrix[col,col_idx]) 
            print tfidf_temp
            print "user prof", user_profiles[row]
            R[row,col]=round(cosine(csr_matrix(tfidf_temp),user_profiles[row]),5)
    return R
    """for row in range(R.shape[0]):
        for col in range(R.shape[1]):            
            #dummy_tfidf = csr_matrix([tfidf_matrix[col,col_idx])
            #print "tfidf" , dummy_tfidf
            print "user prof", user_profiles[row]
            R[row,col]=round(cosine(tfidf_matrix[col],user_profiles[row]),5)
    return R"""
predicted = predict_ratings_w_user_profiles(ratings, user_profiles, tfidf_matrix)
predicted.todense()

[0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 1.0, 0.0, 1.0, 0.25, 0.0, 1.0]
user prof   (0, 0)	0.666666666667
  (0, 1)	0.333333333333
  (0, 2)	0.333333333333
  (0, 3)	0.166666666667
  (0, 4)	0.333333333333
  (0, 5)	0.222222222222
  (0, 7)	0.333333333333
  (0, 8)	0.666666666667
  (0, 9)	0.333333333333
  (0, 10)	0.25
  (0, 11)	0.333333333333
  (0, 12)	0.333333333333
12
 error


UnboundLocalError: local variable 'numr' referenced before assignment