In [1]:
import json
import os

import numpy as np
import pandas as pd

In [2]:
# Directory holding the MovieLens CSV files; used as the default `path` of loadMovieLens().
PATH='datasets/'
# CSV file names in the order loadMovieLens() reads them: links, movies, ratings.
csvfiles = ['links.csv','movies.csv','ratings.csv']

In [3]:
# load movielens datasets

def loadMovieLens(path=PATH):
    '''
    Read the three MovieLens CSV files into DataFrames.

    ARGS:
        path : directory containing the files named in `csvfiles`.
               A trailing path separator is optional.

    RETURN:
        (linksdf, moviesdf, ratingsdf), read from csvfiles[0], csvfiles[1]
        and csvfiles[2] respectively.
    '''
    # os.path.join adds a separator only when `path` does not already end with one,
    # so both 'datasets' and 'datasets/' resolve to the same files.
    linksdf = pd.read_csv(os.path.join(path, csvfiles[0]))
    moviesdf = pd.read_csv(os.path.join(path, csvfiles[1]))
    ratingsdf = pd.read_csv(os.path.join(path, csvfiles[2]))

    return linksdf, moviesdf, ratingsdf

In [4]:
# read the three CSV files from the default PATH directory
linksdf, moviesdf, ratingsdf = loadMovieLens()


## Raw data transformation

### 1. Merging IMDb linksdf with moviesdf

In [5]:
# Look up each movie's IMDb id by movieId instead of relying on linksdf and moviesdf
# sharing the same row order; a movie with no links entry gets NaN.
imdb_by_movie = linksdf.drop_duplicates('movieId').set_index('movieId')['imdbId']
moviesdf['imdbid'] = moviesdf['movieId'].map(imdb_by_movie)

### 2. movies.json

In [6]:
# preview the movies table after the imdbid column was added
moviesdf.head()

Unnamed: 0,movieId,title,genres,imdbid
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497
2,3,Grumpier Old Men (1995),Comedy|Romance,113228
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885
4,5,Father of the Bride Part II (1995),Comedy,113041


In [7]:
# extract years: take the four-digit number in the parentheses that close the title.
# Anchoring at the end of the string keeps an earlier parenthesised number
# (e.g. the "(2)" in "Some Title (2) (1994)") from being picked up as the year.
# Titles without such a suffix yield NaN.
moviesdf['year'] = moviesdf.title.str.extract(r'\((\d{4})\)\s*$', expand=False)

In [8]:
# clean title: drop the trailing "(year)" suffix, then trim surrounding whitespace.
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave every title untouched.
# Only the closing four-digit group is removed, so other parenthesised numbers
# that are part of the title survive.
moviesdf['title'] = moviesdf.title.str.replace(r'\(\d{4}\)\s*$', '', regex=True).str.strip()

In [9]:
# cleaning genres: turn the pipe-delimited string into a list of genre names
moviesdf['genres'] = moviesdf['genres'].str.split('|')

In [10]:
# preview the movies table after the year, title and genres cells above
moviesdf.head()

Unnamed: 0,movieId,title,genres,imdbid,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",114709,1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",113497,1995
2,3,Grumpier Old Men,"[Comedy, Romance]",113228,1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",114885,1995
4,5,Father of the Bride Part II,[Comedy],113041,1995


In [11]:
# Movies whose title carried no year get the sentinel -1.
# The result is assigned back to the column: calling fillna(inplace=True) on the
# `moviesdf.year` accessor is a chained in-place update, which under pandas
# copy-on-write only modifies a temporary and leaves the frame unchanged.
moviesdf['year'] = moviesdf['year'].fillna(-1)

In [12]:
# Build {movieId: {'title', 'genres', 'imdbid', 'year'}} from the cleaned movies table.
# Fields are read by column name, so the loop does not depend on column order or on
# integer keys being treated as positions on a label-indexed row (deprecated in pandas 2.x).
movies = {}
for row in moviesdf.itertuples(index=False):
    movies[int(row.movieId)] = dict(
        title=row.title,
        genres=row.genres,
        imdbid=row.imdbid,
        year=row.year,
    )
    
    

In [13]:
import json
# the context manager closes (and so flushes) the file as soon as the dump finishes;
# this file is read back later in the notebook
with open('movielens/movies.json', 'w+') as movies_file:
    json.dump(movies, movies_file)

__format :: movies.json__:

{

    _movieId_: {

        'title': _movie title_,

        'genres': [ _genre1_, _genre2_, ..],

        'year': _year of release_,

        'imdbid': _IMDb id of movie_
     },
     .
     .
     .
}

movies.json is a dictionary of movie objects formatted as above

### 3. genres.json

In [14]:
# Index movies by genre: genre name -> list of movieIds carrying that genre,
# in the order the movies appear in `movies`.
genres = {}
for movie_id, details in movies.items():
    for genre_name in details['genres']:
        # create the bucket on first sight of the genre, then file the movie under it
        genres.setdefault(genre_name, []).append(int(movie_id))



In [15]:
# write through a context manager so the file handle is closed right after the dump
with open('movielens/genres.json', 'w') as genres_file:
    json.dump(genres, genres_file)

__format :: genres.json__

{

    genre1: [ _movieId1_, _movieId2_, ..],
    genre2: [ _movieId1_, _movieId2_, ..],
    .
    .
    .

}

key: genre(string)

item: list of movieIds of key genre

### 4. ratings.json

In [16]:
# preview the ratings table as loaded from ratings.csv
ratingsdf.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [17]:
# Build {userId: {movieId: {'rating', 'timestamp'}}} from the ratings table.
# Columns are selected by name, so the loop does not rely on integer keys being
# treated as positions on a label-indexed row (deprecated in pandas 2.x).
# If a (user, movie) pair appears more than once, the last row wins.
ratings = {}
user_history = {}  # not filled in this cell; kept so the name stays defined
rating_rows = ratingsdf[['userId', 'movieId', 'rating', 'timestamp']].itertuples(index=False, name=None)
for raw_user, raw_movie, raw_rating, raw_timestamp in rating_rows:
    userId = int(raw_user)
    movieId = int(raw_movie)

    ratings.setdefault(userId, {})[movieId] = dict(
        rating=float(raw_rating),
        timestamp=int(raw_timestamp),
    )
    
    

In [18]:
# the context manager closes (and so flushes) the file as soon as the dump finishes;
# this file is read back later in the notebook
with open('movielens/ratings.json', 'w+') as ratings_file:
    json.dump(ratings, ratings_file)

__format :: ratings.json__

{

    userId: {

        movieId1: {
            'rating': 0.0-5.0,
            'timestamp': when rated
        },
        .
        .
        .
    },
    .
    .
    .
}

__desc__

1. ratings.json is a dictionary of all movie ratings
2. each user who ever rated a movie has an object in this dictionary
3. each user object is a dictionary of rating objects
4. each rating object has movieId as key and two properties:
    1. user rating of that movie
    2. timestamp of rating

### 5. Transposed Ratings

Interchange movieId keys with userId keys

In [19]:
def transpose(prefs):
    '''
    Swap the two key levels of a nested dictionary.

    ARGS:
        prefs : mapping outer -> { inner -> value }

    RETURN:
        a new mapping inner -> { outer -> value }. The innermost values are
        the same objects as in `prefs` (they are not copied).
    '''
    flipped = {}
    for outer_key, inner_map in prefs.items():
        for inner_key, value in inner_map.items():
            # create the bucket for this inner key on first use, then file the value
            flipped.setdefault(inner_key, {})[outer_key] = value
    return flipped

In [20]:
# movieId -> userId view of the ratings
trans_ratings = transpose(ratings)
# the context manager closes (and so flushes) the file as soon as the dump finishes;
# this file is read back later in the notebook
with open('movielens/ratings-transpose.json', 'w') as transposed_file:
    json.dump(trans_ratings, transposed_file)

__format :: ratings-transpose.json__

{

    movieId: {

        userId: {
            'rating': 0.0-5.0,
            'timestamp': when rated
        },
        .
        .
        .
    },
    .
    .
    .
}

## User - Genre Preference Model

Calculates each user's preference score for every genre.

In [21]:
import math

### Genre Frequency in User Ratings

In [22]:
def user_genre_occurence_frequency(prefs, genres, userId, genre):
    '''
    Rating-weighted occurrence frequency of a genre within one user's ratings.

    FORMULA:
        gf(u, g) = sum( R(u, m) for m in Mu if m is of genre g ) / T(u, M)

        R(u, m) : rating given by user 'u' to movie 'm'
        Mu      : set of movies rated by 'u'
        T(u, M) : number of movies rated by 'u'

    ARGS:
        prefs  : mapping userId -> { movieId -> {'rating': ...} }
        genres : mapping genre -> list of movieIds of that genre
        userId : key of the user in prefs
        genre  : key of the genre in genres

    RETURN:
        gf(u, g); 0 when the user has rated nothing.

    NOTES:
        1. plays the role of Term Frequency in a TF-IDF model
        2. a higher value means the genre shows up often in the user's ratings
           and/or the user rates its movies highly
    '''
    rated = prefs[userId]

    # T(u, M): how many movies this user rated
    total = len(rated)
    if total == 0:
        return 0

    # walk the genre's movie list in order, adding up the ratings this user gave
    weighted = 0
    for movie in genres[genre]:
        if movie in rated:
            weighted += rated[movie]['rating']

    # gf(u, g)
    return weighted / total

### Log of Inverse Genre Occurrence Frequency

In [23]:
def inverse_genre_occurence_frequency(movies, genres, genre):
    '''
    Log of the inverse occurrence frequency of a genre across all movies.

    FORMULA:
        IGF(g, M) = log( T(M) / T(g, M) )

        T(M)    : total number of movies in the universal movie set 'M'
        T(g, M) : number of movies in 'M' that belong to genre 'g'

    ARGS:
        movies : mapping with one entry per movie (only its size is used)
        genres : mapping genre -> list of movieIds of that genre
        genre  : key of the genre in genres

    RETURN:
        IGF(g, M), using the natural logarithm.

    NOTES:
        1. plays the role of Inverse Document Frequency in a TF-IDF model
        2. the value shrinks as a genre covers more of the catalogue: a genre
           found in most movies is not distinctive and so carries little
           weight for recommendations
    '''
    # T(M)
    catalogue_size = len(movies)

    # T(g, M)
    genre_size = len(genres[genre])

    ratio = catalogue_size / genre_size
    return math.log(ratio)
    

### Genre Preference Score

In [24]:
def user_genre_preference_score(prefs, movies, genres, userId, genre):
    '''
    Preference score of one user for one genre.

    FORMULA:
        p(u, g) = gf(u, g) * IGF(g, M)

        gf(u, g)  : rating-weighted frequency of genre 'g' in user 'u's ratings
                    (user_genre_occurence_frequency)
        IGF(g, M) : log inverse frequency of genre 'g' in the universal movie
                    set 'M' (inverse_genre_occurence_frequency)

    ARGS:
        prefs  : mapping userId -> { movieId -> {'rating': ...} }
        movies : mapping with one entry per movie
        genres : mapping genre -> list of movieIds of that genre
        userId : key of the user in prefs
        genre  : key of the genre in genres

    RETURN:
        p(u, g)

    HOW TO READ IT (relative to how many movies of 'g' the user has rated):
        1. many rated, high score : strong preference for 'g';
           recommend more movies of this genre
        2. many rated, low score  : interested in 'g' but not satisfied so far;
           recommend top rated movies of 'g' filtered by similar users
        3. few rated, high score  : not obviously inclined towards 'g' but liked
           what they saw; recommend top rated movies of 'g' filtered by similar users
        4. few rated, low score   : not interested in 'g' and did not like the
           movies they saw

    NOTES: analogous to the TF-IDF model
    '''
    # tf-like factor first, idf-like factor second, then their product
    return (
        user_genre_occurence_frequency(prefs, genres, userId, genre)
        * inverse_genre_occurence_frequency(movies, genres, genre)
    )
    

### Creating Genre Preference Dictionary

In [25]:
def create_user_genre_preference_dictionary(prefs, movies, genres):
    '''
    Build the normalized genre preference scores of all users for all genres.

    ARGS:
        prefs  : mapping userId -> { movieId -> {'rating': ...} }
        movies : mapping with one entry per movie
        genres : mapping genre -> list of movieIds of that genre

    RETURN:
        mapping userId -> { genre -> score }. Each user's scores are divided by
        their sum, so they add up to 1. When the sum is 0 (for example a user
        with no ratings) the raw zero scores are kept instead of dividing by
        zero. Every user in `prefs` gets an entry, even when `genres` is empty.
    '''
    gps = {}

    for userId in prefs:
        # raw preference score of this user for every genre
        scores = {
            genre: user_genre_preference_score(prefs, movies, genres, userId, genre)
            for genre in genres
        }

        # normalization; skipped when there is nothing to scale
        total = sum(scores.values())
        if total:
            scores = {genre: score / total for genre, score in scores.items()}

        gps[userId] = scores

    return gps


In [26]:
# normalized genre preference scores for every user
gps = create_user_genre_preference_dictionary(ratings, movies, genres)
# the context manager closes (and so flushes) the file as soon as the dump finishes;
# this file is read back later in the notebook
with open('movielens/user-genre-preferences.json', 'w') as gps_file:
    json.dump(gps, gps_file)

__format :: user-genre-preferences.json__

{

    userId: {    
        genre: preference score,
        genre: preference score,
        .
        .
        .
     }
     .
     .
     .
}

__desc__
1. dictionary of genre preference scores for all genres of all users

## Item-Genre Importance Model

Calculates the weight of each genre within a movie.

In [27]:
# reload the movieId -> userId ratings from disk;
# JSON object keys are strings, so the ids come back as str
with open('movielens/ratings-transpose.json', 'r') as transposed_file:
    trans_prefs = json.loads(transposed_file.read())

In [28]:
# reload the userId -> movieId ratings from disk;
# JSON object keys are strings, so the ids come back as str
with open('movielens/ratings.json', 'r') as ratings_file:
    prefs = json.loads(ratings_file.read())

In [29]:
# reload the per-user genre preference scores
with open('movielens/user-genre-preferences.json', 'r') as genre_prefs_file:
    genre_prefs = json.loads(genre_prefs_file.read())

# reload the movie metadata; this rebinds `movies` to the JSON version,
# whose movieId keys are strings
with open('movielens/movies.json', 'r') as movies_file:
    movies = json.loads(movies_file.read())

In [30]:
def movie_genre_occurance_weight(prefs, trans_prefs, movies, genre_prefs, movie, genre):
    '''
    How much of a genre factor is present in a movie.

    FORMULA:
        sum( rating(u, movie) * genre_prefs[u][genre] for every user u who rated movie )
        divided by the number of distinct genres found across all movies rated
        by those users.

    ARGS:
        prefs       : mapping userId -> { movieId -> {'rating': ...} }
        trans_prefs : mapping movieId -> { userId -> {'rating': ...} }
        movies      : mapping movieId -> {'genres': [...], ...}
        genre_prefs : mapping userId -> { genre -> preference score }
        movie       : key of the movie in trans_prefs
        genre       : genre whose weight in the movie is wanted

    RETURN:
        the weight described above; 0 when no genres are found for the movie's
        raters (for example when nobody rated the movie), instead of dividing
        by zero.
    '''
    # all the users who rated this movie, with their rating entries
    raters = trans_prefs[movie]

    weighted_sum = 0
    rater_genres = set()
    for user, entry in raters.items():
        # movie rating by this user times the user's preference score for the genre
        weighted_sum += entry['rating'] * genre_prefs[user][genre]

        # collect the genres of every movie this user rated; update() grows the
        # set in place rather than building a new set per movie
        for mid in prefs[user]:
            rater_genres.update(movies[mid]['genres'])

    if not rater_genres:
        return 0

    return weighted_sum / len(rater_genres)
        