In [25]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

# import the csv files into dataframes
users_frame = DataFrame(pd.read_csv('users.csv'))
ratings_frame = DataFrame(pd.read_csv('ratings.csv'))
movies_frame = DataFrame(pd.read_csv('movies.csv', encoding='latin-1'))

In [26]:
# get the genre:count or age_group:count 
def getStatsGroup(column, value):
    genderGrpUsers = users_frame[users_frame[column]== value]
    movieRating = ratings_frame[ratings_frame["UserID"].isin(genderGrpUsers["UserID"])]
    movieFrameGender = movies_frame[movies_frame["MovieID"].isin(movieRating["MovieID"])]
    genderGenres = movieFrameGender["Genre"]
    genderGenre_set = set()    
    for row_genre in genderGenres:
        for genre in row_genre.split("|"):
            genderGenre_set.add(genre)
    genre_count = {} # dict to store genre:movie_count_watched
    for genre in genderGenre_set:
        count = pd.value_counts(genderGenres.str.contains(genre))[True]
        genre_count[genre] = count
        
    gender_genre_frame = DataFrame.from_dict(genre_count, orient='index')
    
    return gender_genre_frame

In [44]:
def getFavGenre(UserID, topN):
    # get ratings of the specified user
    user_ratings = ratings_frame[ratings_frame["UserID"] == UserID]
    # get the user's gender and his/her age group
    user_gender = users_frame[users_frame["UserID"] == UserID]["Gender"].values[0]
    user_age_group = users_frame[users_frame["UserID"] == UserID]["Age-Group"].values[0]
    
    # get genres of the user's watched movies
    genre_movie = movies_frame[movies_frame["MovieID"].isin(user_ratings["MovieID"])]

    genre_all = genre_movie["Genre"] 
    # to get all the fav genre of the user (unique)
    genre_set = set()    
    for row_genre in genre_all:
        for genre in row_genre.split("|"):
            genre_set.add(genre)
    
    
    # count no. of movies watched by the user to get his/her fav genre 
    genre_count = {} # dict to store genre:movie_count_watched
    for genre in genre_set:
        count = pd.value_counts(genre_all.str.contains(genre))[True]
        genre_count[genre] = count
    # convert genre_count from dict to fram
    genre_count_frame = DataFrame.from_dict(genre_count, orient='index')
    # get the wt. avg 
    # 60% for genre
    updateDictValues(genre_count_frame, 0.60)
    # 25% for genre
    gender_count_frame = getStatsGroup("Gender", user_gender)
    updateDictValues(gender_count_frame, 0.25)
    # 15% for age-group
    age_group_count_frame = getStatsGroup("Age-Group", user_age_group)
    updateDictValues(age_group_count_frame, 0.15)
    
    all_genre_frame = genre_count_frame.add(gender_count_frame, fill_value = 0).add(age_group_count_frame,
                                                                                           fill_value = 0)
    top_genres = getTopN(all_genre_frame, 3, 0)
    
    return sorted(genre_count.items(), key=lambda x: x[1])[:topN] #top_genres.index.values
#sorted(genre_count.items(), key=lambda x: x[1])[:topN]
    
    

      

In [47]:
def weightedAvg(a):
    weightAvgGenre={}
    maxVal = max(a.values())
    for val in a.keys():  
        weightAvgGenre[val] = a[val]/maxVal
    return weightAvgGenre

In [48]:
def weightAvgRating(frame, average_rating):
    max_rating = max(frame["Rating"])
    average_rating.ix[1][0] = average_rating.ix[1][0]/max_rating
    return average_rating

In [42]:
def calcRecommendationParams(UserID, genre_count = 3):
    fav_genre = getFavGenre(UserID, genre_count)
    genre = set()
    for i in fav_genre:
        genre.add(i[0])

    # check the complete dataset where users fav genre matches
    movie_genre_wt_dict = {}    
    for i in range(len(movies_frame.index)):
        movie_id = movies_frame.ix[i]["MovieID"]
        movie_genre = set(movies_frame.ix[i]["Genre"].split("|"))
        movie_genre_wt_dict[movie_id] = len(movie_genre & genre)

    # get the wt.average of movie_genre_wt_dict
    movie_genre_wt_dict = weightedAvg(movie_genre_wt_dict)
    
    
    #STEP 2a. calculate the average rating per movie. 
    ratings_data = ratings_frame[["MovieID", "Rating"]].groupby('MovieID')
    average_rating = ratings_data.mean()
    # get the wt.average of average_rating
    popularity_ratings_count = ratings_data.count()
    popularity_ratings_wt_avg = weightAvgRating(popularity_ratings_count, average_rating)
    
    return movie_genre_wt_dict, average_rating, popularity_ratings_wt_avg

In [31]:
def updateDictValues(dictionary, factor):
    dictionary.update({n: factor * dictionary[n] for n in dictionary.keys()})

In [32]:
# this function returns the final movie data frame
def calcRecommendation(userId):
    # get all the required parameters
    movie_genre_wt_dict, average_rating, popularity_ratings_wt_avg = calcRecommendationParams(userId)
    # apply the formula to get top 10 recommended movies
    #STEP 5a.   recommendationScore = (MovieGenreScore*.6) + (AveRatingScore*.25)+(PopularityScore*.15)
    #step 5.1 : get weitage of 60% for movie genre and convert the output into data frame from dict
    updateDictValues(movie_genre_wt_dict, 0.6)
    movie_genre_frame = DataFrame.from_dict(movie_genre_wt_dict, orient='index')
    #step 5.1 : get weitage of 25% for average rating
    updateDictValues(average_rating, 0.25)
    #step 5.1 : get weitage of 25% for popularity score
    updateDictValues(popularity_ratings_wt_avg, 0.15)

    # get the final movie ratings based on addition of the above values
    #movie_recommendation_frame = movie_genre_frame + average_rating + popularity_ratings_wt_avg
    # to add values make the same coulmn names which are supposed to be added
    average_rating = average_rating.rename(columns={'Rating': 'wt_avg'})
    movie_genre_frame = movie_genre_frame.rename(columns={0: 'wt_avg'})
    movie_recommendation_frame = movie_genre_frame.add(average_rating, fill_value = 0).add(popularity_ratings_wt_avg,
                                                                                           fill_value = 0)
    return movie_recommendation_frame

In [33]:
def getTopN(dataframe, topN, column):
    return dataframe.nlargest(topN, column)

In [34]:
def getBestRecommendation(userId):
    movie_recommendation_frame = calcRecommendation(userId)
    best_recommendation = getTopN(movie_recommendation_frame, 10, "wt_avg")
    best_recommendation_movies = best_recommendation.join(movies_frame)
    return best_recommendation_movies[["Title", "Released on", "Genre"]]

In [49]:
best_recommendated_movies = getBestRecommendation(9999)
best_recommendated_movies

Unnamed: 0,Title,Released on,Genre
578,Metisse (Café au Lait),1993,Comedy
3730,Criminal Lovers (Les Amants Criminels),1999,Drama|Romance
3801,Suddenly: Last Summer,1959,Drama
1950,Dangerous Liaisons,1988,Drama|Romance
800,Kazaam,1996,Childrens|Comedy|Fantasy
924,Ninotchka,1939,Comedy|Romance
1264,Big Sleep: The,1946,Film-Noir|Mystery
2686,Wanted: Dead or Alive,1987,Action
2810,Operation Condor 2 (Longxiong hudi),1990,Action|Adventure|Comedy
123,Flirting With Disaster,1996,Comedy
