In [1]:
import pandas as pd
import scipy as sp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

In [2]:
# Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
genome_scores = pd.read_csv('genome_scores.csv')
genome_tags = pd.read_csv('genome_tags.csv')
imdb_data = pd.read_csv('imdb_data.csv')
links = pd.read_csv('links.csv')
tags = pd.read_csv('tags.csv')
movie = pd.read_csv('movies.csv')

In [None]:
train_movie_ids = set(train['movieId'])
test_movie_ids = set(test['movieId'])
movieIds = set(movie['movieId'])
# Get the movie ids that are in the test set but not in the train set
movie_ids_in_test_not_in_train = train_movie_ids - movieIds

movie_ids_in_test_not_in_train 

In [None]:
movie.head(5)

In [3]:
tags.drop(['timestamp'], axis=1, inplace = True)
tags.head(5)

Unnamed: 0,userId,movieId,tag
0,3,260,classic
1,3,260,sci-fi
2,4,1732,dark comedy
3,4,1732,great dialogue
4,4,7569,so bad it's good


In [4]:
genome_scores = pd.read_csv('genome_scores.csv')
genome_tags = pd.read_csv('genome_tags.csv')

In [5]:
genome_scores = genome_scores.merge(genome_tags, on='tagId', how='left')
genome_scores.drop(['tagId','relevance'],axis = 1, inplace = True)

In [20]:
genome_scores

Unnamed: 0,movieId,tag
0,1,007
1,1,007 (series)
2,1,18th century
3,1,1920s
4,1,1930s
...,...,...
15584443,206499,writing
15584444,206499,wuxia
15584445,206499,wwii
15584446,206499,zombie


In [22]:
tags.shape

(1093360, 3)

In [19]:
imdb_data.drop(['runtime','budget'], axis=1, inplace = True)
imdb_data.head(5)

Unnamed: 0,movieId,title_cast,director,plot_keywords
0,1,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter,toy|rivalry|cowboy|cgi animation
1,2,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Jonathan Hensleigh,board game|adventurer|fight|game
2,3,Walter Matthau|Jack Lemmon|Sophia Loren|Ann-Ma...,Mark Steven Johnson,boat|lake|neighbor|rivalry
3,4,Whitney Houston|Angela Bassett|Loretta Devine|...,Terry McMillan,black american|husband wife relationship|betra...
4,5,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Albert Hackett,fatherhood|doberman|dog|mansion


In [23]:
#combine all important feautures from other datasets in a new column in movie dataset based on movie Id
sets = [links, imdb_data]
for x in sets:
    movie = movie.merge(x, on='movieId', how='left')
movie.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,title_cast,director,plot_keywords
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter,toy|rivalry|cowboy|cgi animation
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Jonathan Hensleigh,board game|adventurer|fight|game
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0,Walter Matthau|Jack Lemmon|Sophia Loren|Ann-Ma...,Mark Steven Johnson,boat|lake|neighbor|rivalry
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0,Whitney Houston|Angela Bassett|Loretta Devine|...,Terry McMillan,black american|husband wife relationship|betra...
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Albert Hackett,fatherhood|doberman|dog|mansion


In [24]:
movie['features'] = movie['genres']+' '+movie['title_cast']+ ' ' + movie['director']+' ' + movie['plot_keywords']
movie.drop(['genres' ,	'title_cast'	,'director'	,'plot_keywords'], axis = 1, inplace = True)


Unnamed: 0,movieId,title,imdbId,tmdbId,features
0,1,Toy Story (1995),114709,862.0,Adventure|Animation|Children|Comedy|Fantasy To...
1,2,Jumanji (1995),113497,8844.0,Adventure|Children|Fantasy Robin Williams|Jona...
2,3,Grumpier Old Men (1995),113228,15602.0,Comedy|Romance Walter Matthau|Jack Lemmon|Soph...
3,4,Waiting to Exhale (1995),114885,31357.0,Comedy|Drama|Romance Whitney Houston|Angela Ba...
4,5,Father of the Bride Part II (1995),113041,11862.0,Comedy Steve Martin|Diane Keaton|Martin Short|...


In [27]:
movie['features'] = movie['features'].fillna('')

movie.head()

Unnamed: 0,movieId,title,imdbId,tmdbId,features
0,1,Toy Story (1995),114709,862.0,Adventure|Animation|Children|Comedy|Fantasy To...
1,2,Jumanji (1995),113497,8844.0,Adventure|Children|Fantasy Robin Williams|Jona...
2,3,Grumpier Old Men (1995),113228,15602.0,Comedy|Romance Walter Matthau|Jack Lemmon|Soph...
3,4,Waiting to Exhale (1995),114885,31357.0,Comedy|Drama|Romance Whitney Houston|Angela Ba...
4,5,Father of the Bride Part II (1995),113041,11862.0,Comedy Steve Martin|Diane Keaton|Martin Short|...


In [29]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,2),
                     min_df=0.0, stop_words='english')

# Produce a feature matrix, where each row corresponds to a book,
# with TF-IDF features as columns 
tf_movieFeatures_matrix = tf.fit_transform(movie['features'])
cosine_sim= cosine_similarity(tf_movieFeatures_matrix, 
                                        tf_movieFeatures_matrix)
print (cosine_sim.shape) #now we have a sim matrix between different movies

(62423, 62423)


In [42]:
titles = movie['title']
indices = pd.Series(movie.index, index=movie['movieId'])

In [43]:
def content_generate_top_N_recommendations(movieId, N=10):
    # Convert the string book title to a numeric index for our 
    # similarity matrix
    m_idx = indices[movieId]
    # Extract all similarity values computed with the reference book title
    sim_scores = list(enumerate(cosine_sim[m_idx]))
    # Sort the values, keeping a copy of the original index of each value
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Select the top-N values for recommendation
    sim_scores = sim_scores[1:N]
    # Collect indexes 
    movie_indices = [i[0] for i in sim_scores]
    # Convert the indexes back into titles 
    return titles.iloc[movie_indices]

In [45]:
content_generate_top_N_recommendations(2)

10415                                       Zathura (2005)
3414                                   Return to Me (2000)
4681                                Little Man Tate (1991)
4970                   Adventures of Huck Finn, The (1993)
24158    Robin Williams: Weapons of Self Destruction (2...
19484                             Unspeakable Acts (1990) 
2017     Halloween H20: 20 Years Later (Halloween 7: Th...
4063                                     15 Minutes (2001)
2264                                  Bug's Life, A (1998)
Name: title, dtype: object

In [59]:
import operator # <-- Convenient item retrieval during iteration 
import heapq # <-- Efficient sorting of large lists
import numpy as np

In [61]:
def content_generate_rating_estimate(movieId, user, rating_data, k=20, threshold=0.0):
    # Convert the book title to a numeric index for our 
    # similarity matrix
    m_idx = indices[movieId]
    neighbors = [] # <-- Stores our collection of similarity values 
     
    # Gather the similarity ratings between each book the user has rated
    # and the reference book 
    for index, row in rating_data[rating_data['userId']==user].iterrows():
        sim = cosine_sim[m_idx-1, indices[row['movieId']]-1]
        neighbors.append((sim, row['rating']))
    # Select the top-N values from our collection
    k_neighbors = heapq.nlargest(k, neighbors, key=lambda t: t[0])

    # Compute the weighted average using similarity scores and 
    # user item ratings. 
    simTotal, weightedSum = 0, 0
    for (simScore, rating) in k_neighbors:
        # Ensure that similarity ratings are above a given threshold
        if (simScore > threshold):
            simTotal += simScore
            weightedSum += simScore * rating
    try:
        predictedRating = weightedSum / simTotal
    except ZeroDivisionError:
        # Cold-start problem - No ratings given by user. 
        # We use the average rating for the reference item as a proxy in this case 
        predictedRating = np.mean(rating_data[rating_data['movieId']==movieId]['rating'])
    return predictedRating

In [57]:
train.head(20)

Unnamed: 0,userId,movieId,rating,timestamp
0,5163,57669,4.0,1518349992
1,106343,5,4.5,1206238739
2,146790,5459,5.0,1076215539
3,106362,32296,2.0,1423042565
4,9041,366,3.0,833375837
5,120949,81768,3.0,1289595242
6,19630,62049,4.0,1246729817
7,21066,2282,1.0,945785907
8,117563,120474,4.0,1515108225
9,144018,1997,5.0,1109967647


In [64]:
content_generate_rating_estimate(4144, 1, train)

4.218727388863374

In [None]:
def predict_rating(row):
    return content_generate_rating_estimate(row['movieId'], row['userId'], train)

# Apply the helper function to the test DataFrame and store the results
test['rating'] = test.apply(predict_rating, axis=1)


In [54]:
test.head(2) 

Unnamed: 0,userId,movieId
0,1,2011
1,1,4144
