# Movie Recommendation Engine — Su, Chun Wen — Model 1: Collaborative Filtering from Users' Ratings

## Step 1: Import packages and load the data files (CSV files from the MovieLens dataset)

In [1]:
import pandas as pd
import numpy as np
import os
import requests
import json
from sklearn.metrics import mean_squared_error


In [2]:
# Number of movie columns kept in the rating matrix (movieIds 1..MAX_MOVIE_ID).
MAX_MOVIE_ID = 9000

# Load the ratings and attach the IMDB/TMDB ids from links.csv to every rating.
df = pd.read_csv('ratings.csv', sep=',')
df_id = pd.read_csv('links.csv', sep=',')
df = pd.merge(df, df_id, on=['movieId'])

# Dense user x movie matrix: row = userId - 1, column = movieId - 1, 0 = no rating.
# The row index assumes userIds run contiguously from 1 to the number of users.
# Only the columns that are kept are allocated, instead of allocating
# max(movieId) columns and slicing the matrix down afterwards.
n_users = df.userId.unique().shape[0]
n_movies = min(int(df.movieId.max()), MAX_MOVIE_ID)
rating_matrix = np.zeros((n_users, n_movies))

# Vectorised fill; ratings for movieIds beyond the kept columns are ignored.
kept = df[df.movieId <= n_movies]
rating_matrix[kept.userId.values - 1, kept.movieId.values - 1] = kept.rating.values

In [3]:
# Eyeball the user x movie matrix; cells that were never assigned a rating stay 0.
print(rating_matrix)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 [5. 0. 0. ... 0. 0. 0.]]


In [4]:
# Inspect the ratings table after merging in the ids from links.csv.
print(df)

        userId  movieId  rating   timestamp   imdbId    tmdbId
0            1       31     2.5  1260759144   112792    9909.0
1            7       31     3.0   851868750   112792    9909.0
2           31       31     4.0  1273541953   112792    9909.0
3           32       31     4.0   834828440   112792    9909.0
4           36       31     3.0   847057202   112792    9909.0
5           39       31     3.0   832525157   112792    9909.0
6           73       31     3.5  1255591860   112792    9909.0
7           88       31     3.0  1239755559   112792    9909.0
8           96       31     2.5  1223256331   112792    9909.0
9          110       31     4.0   840100695   112792    9909.0
10         111       31     3.5  1097429230   112792    9909.0
11         150       31     2.5  1130905954   112792    9909.0
12         161       31     3.0   837629820   112792    9909.0
13         165       31     3.5  1111981801   112792    9909.0
14         186       31     3.0  1276205768   112792   

In [5]:
# Share of matrix cells that hold a (non-zero) rating, expressed in percent.
filled_cells = np.count_nonzero(rating_matrix)
total_cells = rating_matrix.size
sparsity = float(filled_cells) / total_cells * 100
print ('Sparsity: {:4.2f}%'.format(sparsity))

Sparsity: 1.40%


## Step 2: Create training and test datasets

In [6]:
# Number of ratings held out per user for evaluation.
N_TEST_PER_USER = 10
# Fixed seed so the split, and every number derived from it, is reproducible.
split_rng = np.random.RandomState(42)

train_matrix = rating_matrix.copy()
test_matrix = np.zeros(rating_matrix.shape)

for i in range(rating_matrix.shape[0]):
    rated_idx = rating_matrix[i, :].nonzero()[0]
    if rated_idx.size == 0:
        # Nothing to hold out for this user (choice() raises on an empty array).
        continue
    # Sample WITHOUT replacement so the same rating cannot be drawn twice;
    # with replacement a user could end up with fewer distinct test ratings.
    rating_idx = split_rng.choice(
        rated_idx,
        size=min(N_TEST_PER_USER, rated_idx.size),
        replace=False)
    train_matrix[i, rating_idx] = 0.0
    test_matrix[i, rating_idx] = rating_matrix[i, rating_idx]

# Every rating must live in exactly one of the two matrices.
assert np.all(train_matrix * test_matrix == 0)

## Step 3: Use cosine similarity to find users and movies with similar rating patterns

In [7]:
def _cosine_similarity(ratings):
    """Return the pairwise cosine similarity between the rows of ``ratings``.

    A small constant (1e-9) is added to every dot product, including the
    diagonal the norms are taken from, so all-zero rows do not cause a
    division by zero.
    """
    dot_products = ratings.dot(ratings.T) + 1e-9
    row_norms = np.sqrt(np.diagonal(dot_products))[np.newaxis, :]
    return dot_products / (row_norms * row_norms.T)


# Rows of train_matrix are users; rows of its transpose are movies.
similarity_user = _cosine_similarity(train_matrix)
similarity_movie = _cosine_similarity(train_matrix.T)

In [8]:
# Inspect the movie-movie similarity matrix (the diagonal is 1 by construction).
print(similarity_movie)

[[1.00000000e+00 3.91963000e-01 2.64434732e-01 ... 1.22191928e-01
  6.74271699e-02 6.74271699e-02]
 [3.91963000e-01 1.00000000e+00 2.06926651e-01 ... 4.30014695e-02
  7.20338466e-12 8.23243961e-12]
 [2.64434732e-01 2.06926651e-01 1.00000000e+00 ... 5.70248563e-12
  1.00301357e-11 1.14630122e-11]
 ...
 [1.22191928e-01 4.30014695e-02 5.70248563e-12 ... 1.00000000e+00
  5.68535244e-01 5.68535244e-01]
 [6.74271699e-02 7.20338466e-12 1.00301357e-11 ... 5.68535244e-01
  1.00000000e+00 1.00000000e+00]
 [6.74271699e-02 8.23243961e-12 1.14630122e-11 ... 5.68535244e-01
  1.00000000e+00 1.00000000e+00]]


## Step 4: Evaluate the model by predicting the held-out ratings and computing the MSE

In [9]:
from sklearn.metrics import mean_squared_error

# Predicted rating = similarity-weighted average over all users' training rows.
# Unrated cells are 0 in train_matrix, so they enter the average as zeros.
weight_totals = np.abs(similarity_user).sum(axis=1)[:, np.newaxis]
predicted_matrix = similarity_user.dot(train_matrix) / weight_totals

# Score only the cells that were held out for testing.
held_out = test_matrix.nonzero()
prediction = predicted_matrix[held_out]
test_vector = test_matrix[held_out]
mse = mean_squared_error(test_vector, prediction)

print ("MSE = " + str(mse))

MSE = 9.800587917753155


## Step 5: Define the `get_poster` function to fetch movie posters from the TMDB API

In [10]:
import requests
import json
# Get base url filepath structure. w185 corresponds to size of movie poster.
# The API key is read from the TMDB_API_KEY environment variable so that no
# credential is stored in the notebook (set it before starting the kernel,
# or with `%env TMDB_API_KEY=...` in a cell that is not committed).
TMDB_API_KEY = os.environ.get('TMDB_API_KEY')
if not TMDB_API_KEY:
    raise RuntimeError("Set the TMDB_API_KEY environment variable to your themoviedb.org API key")

headers = {'Accept': 'application/json'}
payload = {'api_key': TMDB_API_KEY}
# https keeps the key out of plain-text traffic; the timeout stops a stalled
# connection from hanging the notebook.
response = requests.get("https://api.themoviedb.org/3/configuration",
                        params=payload, headers=headers, timeout=10)
# Fail here with the HTTP error rather than with a KeyError on the lookup below.
response.raise_for_status()
response = json.loads(response.text)
base_url = response['images']['base_url'] + 'w185'

def get_poster(imdbid, base_url):
    """Look up the first poster listed by themoviedb.org for an IMDB id.

    Parameters
    ----------
    imdbid : int or str
        Numeric part of the IMDB title id, without the "tt" prefix and
        without leading zeros.
    base_url : str
        Image base URL that the poster's file path is appended to.

    Returns
    -------
    tuple
        ``(poster_url, imdbid)``. ``poster_url`` equals ``base_url`` when the
        response contains no usable poster entry.

    Raises
    ------
    RuntimeError
        If the TMDB_API_KEY environment variable is not set.
    """
    # Read the key from the environment so no credential lives in the notebook.
    api_key = os.environ.get('TMDB_API_KEY')
    if not api_key:
        raise RuntimeError("Set the TMDB_API_KEY environment variable to your themoviedb.org API key")

    # IMDB title ids are "tt" + the number zero-padded to at least 7 digits.
    # A fixed "tt0" prefix is only right for 6-digit numbers.
    movie_id = "tt" + str(imdbid).zfill(7)

    # Query themoviedb.org API for movie poster path.
    movie_url = 'https://api.themoviedb.org/3/movie/{:}/images'.format(movie_id)
    headers = {'Accept': 'application/json'}
    payload = {'api_key': api_key}
    response = requests.get(movie_url, params=payload, headers=headers, timeout=10)
    try:
        file_path = json.loads(response.text)['posters'][0]['file_path']
    except (ValueError, KeyError, IndexError, TypeError):
        # Non-JSON body, error payload without 'posters', or an empty poster list.
        file_path = ""

    return (base_url + file_path, imdbid)

## Step 6: Make Recommendations

In [12]:
import requests
import json

from IPython.display import Image
from IPython.display import display
from IPython.display import HTML

# Map matrix column index (movieId - 1) -> IMDB id, taken from links.csv.
idx_to_movie = {row.movieId - 1: row.imdbId for row in df_id.itertuples()}

k = 6
idx = 0  #what movie to use as the base

# Column indices of the k highest similarities to the base movie, best first.
top_k = np.argsort(similarity_movie[idx, :])[:-k-1:-1]

# Columns with no row in links.csv have no IMDB id; skip them instead of
# failing with a KeyError.
movies = [idx_to_movie[x] for x in top_k if x in idx_to_movie]
# Keep only 6-digit IMDB ids.
movies = [imdb for imdb in movies if len(str(imdb)) == 6]

n_display = 6
# Fetch one poster per remaining movie. The lists hold exactly the posters that
# were fetched, so no placeholder entry is rendered as a broken <img src='0'>.
posters = [get_poster(movie, base_url) for movie in movies[:n_display]]
URL = [poster_url for poster_url, _ in posters]
IMDB = [imdb for _, imdb in posters]

images = ''
for poster_url in URL:
    images += "<img style='width: 100px; margin: 0px; \
                float: left; border: 1px solid black;' src='%s' />" \
                % poster_url

display(HTML(images))

## Step 7: Make another set of recommendations from another movie

In [13]:
import requests
import json

from IPython.display import Image
from IPython.display import display
from IPython.display import HTML

# Map matrix column index (movieId - 1) -> IMDB id, taken from links.csv.
idx_to_movie = {row.movieId - 1: row.imdbId for row in df_id.itertuples()}

k = 6
idx = 20  #what movie to use as the base

# Column indices of the k highest similarities to the base movie, best first.
top_k = np.argsort(similarity_movie[idx, :])[:-k-1:-1]

# Columns with no row in links.csv have no IMDB id; skip them instead of
# failing with a KeyError.
movies = [idx_to_movie[x] for x in top_k if x in idx_to_movie]
# Keep only 6-digit IMDB ids.
movies = [imdb for imdb in movies if len(str(imdb)) == 6]

n_display = 6
# Fetch one poster per remaining movie. The lists hold exactly the posters that
# were fetched, so no placeholder entry is rendered as a broken <img src='0'>.
posters = [get_poster(movie, base_url) for movie in movies[:n_display]]
URL = [poster_url for poster_url, _ in posters]
IMDB = [imdb for _, imdb in posters]

images = ''
for poster_url in URL:
    images += "<img style='width: 100px; margin: 0px; \
                float: left; border: 1px solid black;' src='%s' />" \
                % poster_url

display(HTML(images))

## End of Model 1