In [2]:
import pandas as pd
from math import sqrt, pow

In [31]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('data/ratings.dat', sep = '::', names = r_cols, encoding = 'latin-1', engine = 'python')

In [32]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


#### Getting no of users and movies in the dataset

In [33]:
user_ids = ratings.user_id.unique().tolist()
movie_ids = ratings.movie_id.unique().tolist()
print('No of users: {}'.format(len(user_ids)))
print('No of movies: {}'.format(len(movie_ids)))

No of users: 6040
No of movies: 3706


In [34]:
m_cols = ['movie_id', 'movie_title', 'genre']
movies = pd.read_csv('data/movies.dat', sep = '::', names = m_cols, encoding = 'latin-1', engine = 'python')

In [35]:
movies.head()

Unnamed: 0,movie_id,movie_title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


### Data Preprocessing

In [36]:
movies.genre = movies.genre.str.split('|')

In [45]:
genre_columns = list(set([j for i in movies['genre'].tolist() for j in i]))
len(genre_columns)

18

In [39]:
for j in genre_columns:
    movies[j] = 0
for i in range(movies.shape[0]):
    for j in genre_columns:
        if j in movies['genre'].iloc[i]:
            movies.loc[i, j] = 1

In [40]:
movies.head()

Unnamed: 0,movie_id,movie_title,genre,Crime,Musical,Film-Noir,Drama,Mystery,Sci-Fi,Comedy,...,Horror,Action,Fantasy,War,Romance,Thriller,Adventure,Western,Children's,Animation
0,1,Toy Story (1995),"[Animation, Children's, Comedy]",0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,1
1,2,Jumanji (1995),"[Adventure, Children's, Fantasy]",0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,1,0
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
3,4,Waiting to Exhale (1995),"[Comedy, Drama]",0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),[Comedy],0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [47]:
split_values = movies['movie_title'].str.split("(", n=1, expand = True)
split_values.head()

Unnamed: 0,0,1
0,Toy Story,1995)
1,Jumanji,1995)
2,Grumpier Old Men,1995)
3,Waiting to Exhale,1995)
4,Father of the Bride Part II,1995)


In [48]:
movies.movie_title = split_values[0]

In [49]:
movies['release_year'] = split_values[1]

In [51]:
movies['realease_year'] = movies.release_year.str.replace(')', '')

In [52]:
movies.drop('genre', axis = 1, inplace=True)

In [53]:
movies.head()

Unnamed: 0,movie_id,movie_title,Crime,Musical,Film-Noir,Drama,Mystery,Sci-Fi,Comedy,Documentary,...,Fantasy,War,Romance,Thriller,Adventure,Western,Children's,Animation,release_year,realease_year
0,1,Toy Story,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,1,1995),1995
1,2,Jumanji,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,1,0,1995),1995
2,3,Grumpier Old Men,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,1995),1995
3,4,Waiting to Exhale,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,1995),1995
4,5,Father of the Bride Part II,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1995),1995


In [55]:
def get_rating_(userid, movieid):
    return ratings.loc[(ratings.user_id == userid) & (ratings.movie_id == movieid), 'rating'].iloc[0]

In [56]:
def get_movieids_(userid):
    return ratings.loc[ratings.user_id == userid, 'movie_id'].tolist()

In [91]:
def get_movie_title_(movieid):
    return (movies.loc[(movies.movie_id == movieid),'movie_title'].iloc[0])

In [61]:
def distance_similarity_score_(user1, user2):
    '''params --  user1, user2: user ids of two users between which similarity score is to be claculated'''
    #no of movies watched by both users
    both_watch_count = 0
    for element in get_movieids_(user1):
        if element in get_movieids_(user2):
            both_watch_count += 1
    if both_watch_count == 0:
        return 0
    
    # Calculating distance based similarity between both the users.
    distance = []
    for element in get_movieids_(user1):
        if element in get_movieids_(user2):
            rating1 = get_rating_(user1, element)
            rating2 = get_rating_(user2, element)
            distance.append(pow(rating1 - rating2, 2))
            total_distance = sum(distance)
        
        # Adding one to the denominator to avoid divide by zero error.
    return 1/(1+sqrt(total_distance))

In [66]:
#testing the above function
print('Distance based similarity between user ids 10 & 240: {}'.format(distance_similarity_score_(10, 240)))

Distance based similarity between user ids 10 & 240: 0.08240267256634183


In [85]:
def pearson_correlation_score_(user1,user2):
    '''
    user1 & user2 : user ids of two users between which similarity score is to be calculated.
    '''
    # A list of movies watched by both the users.
    both_watch_count = []
    
    # Finding movies watched by both the users.
    for element in ratings.loc[ratings.user_id==user1,'movie_id'].tolist():
        if element in ratings.loc[ratings.user_id==user2,'movie_id'].tolist():
            both_watch_count.append(element)
    
    # Returning '0' correlation for bo common movies.
    if len(both_watch_count) == 0 :
        return 0
    
    # Calculating Co-Variances.
    rating_sum_1 = sum([get_rating_(user1,element) for element in both_watch_count])
    rating_sum_2 = sum([get_rating_(user2,element) for element in both_watch_count])
    rating_squared_sum_1 = sum([pow(get_rating_(user1,element),2) for element in both_watch_count])
    rating_squared_sum_2 = sum([pow(get_rating_(user2,element),2) for element in both_watch_count])
    product_sum_rating = sum([get_rating_(user1,element) * get_rating_(user2,element) 
                              for element in both_watch_count])
    
    # Returning pearson correlation between both the users.
    numerator = product_sum_rating - ((rating_sum_1 * rating_sum_2) / len(both_watch_count))
    denominator = sqrt((rating_squared_sum_1 - pow(rating_sum_1,2) / len(both_watch_count)) * 
                       (rating_squared_sum_2 - pow(rating_sum_2,2) / len(both_watch_count)))
    
    # Handling 'Divide by Zero' error.
    if denominator == 0:
        return 0
    return numerator/denominator

In [86]:
#testing the above function
print('Pearson correlation between user 63 & 92: {}'.format(pearson_correlation_score_(63, 92)))

Pearson correlation between user 63 & 92: 0.1664062283752735


In [75]:
def most_similar_users_(user1, number_of_users, metric = 'pearson'):
    '''
    params -- 
    user1 : Targeted User
    number_of_users : number of most similar users you want to user1.
    metric : metric to be used to calculate inter-user similarity score.(default = 'pearson')
    '''
    # Getting distinct users ids
    user_ids = ratings.user_id.unique().tolist()
    
    # Getting similarity score between targeted user and all other users
    if(metric == 'pearson'):
        similarity_score = [(pearson_correlation_score_(user1, nth_user), nth_user)
                           for nth_user in user_ids[:100] if nth_user != user1]
    else:
         similarity_score = [(distance_similarity_score_(user1, nth_user), nth_user)
                           for nth_user in user_ids[:100] if nth_user != user1]
            
    similarity_score.sort()
    similarity_score.reverse()
    # Returning the top most 'number_of_users' similar users. 
    return similarity_score[:number_of_users]

In [78]:
#testing the above function
print(most_similar_users_(23, 5))

[(0.936585811581694, 61), (0.7076731463403717, 41), (0.6123724356957956, 21), (0.5970863767331771, 25), (0.5477225575051661, 64)]


In [89]:
def get_recommendation_(userid):
    user_ids = ratings.user_id.unique().tolist()
    total = {}
    similariy_sum = {}
    
    # Iterating over subset of user ids.
    for user in user_ids[:100]:
        
        # not comparing the user to itself
        if user == userid:
            continue
        
        # Getting similarity score between the users.
        score = pearson_correlation_score_(userid,user)
        
        # not considering users having zero or less similarity score.
        if score <= 0:
            continue
        
        # Getting weighted similarity score and sum of similarities between both the users.
        for movieid in get_movieids_(user):
            # Only considering not watched/rated movies
            if movieid not in get_movieids_(userid) or get_rating_(userid,movieid) == 0:
                total[movieid] = 0
                total[movieid] += get_rating_(user,movieid) * score
                similariy_sum[movieid] = 0
                similariy_sum[movieid] += score
    
    # Normalizing ratings
    ranking = [(tot/similariy_sum[movieid],movieid) for movieid,tot in total.items()]
    ranking.sort()
    ranking.reverse()
    
    # Getting movie titles against the movie ids.
    recommendations = [get_movie_title_(movieid) for score,movieid in ranking]
    return recommendations[:10]

In [92]:
print(get_recommendation_(32))

['Invisible Man, The ', 'Creature From the Black Lagoon, The ', 'Hellraiser ', 'Almost Famous ', 'Way of the Gun, The ', 'Shane ', 'Naked Gun 2 1/2: The Smell of Fear, The ', "Kelly's Heroes ", 'Official Story, The ', 'Everything You Always Wanted to Know About Sex ']


In [95]:
def get_recommendation_(userid):
    user_ids = ratings.user_id.unique().tolist()
    total = {}
    similariy_sum = {}
    
    # Iterating over subset of user ids.
    for user, score in most_similar_users_(userid, 100):
        
         
        # not considering users having zero or less similarity score.
        if score <= 0:
            continue
        
        # Getting weighted similarity score and sum of similarities between both the users.
        for movieid in get_movieids_(user):
            # Only considering not watched/rated movies
            if movieid not in get_movieids_(userid) or get_rating_(userid,movieid) == 0:
                total[movieid] = 0
                total[movieid] += get_rating_(user,movieid) * score
                similariy_sum[movieid] = 0
                similariy_sum[movieid] += score
    
    # Normalizing ratings
    ranking = [(tot/similariy_sum[movieid],movieid) for movieid,tot in total.items()]
    ranking.sort()
    ranking.reverse()
    
    # Getting movie titles against the movie ids.
    recommendations = [get_movie_title_(movieid) for score,movieid in ranking]
    return recommendations[:10]

In [96]:
print(get_recommendation_(32))

['Awakenings ', 'Christmas Story, A ', "Bug's Life, A ", 'Rain Man ', 'Last Days of Disco, The ', 'Ben-Hur ', 'Back to the Future ', "One Flew Over the Cuckoo's Nest ", 'Sound of Music, The ', 'Dumbo ']
