In [1]:
import math
import json
import os
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()
sns.set_style('darkgrid')

In [2]:
# movielens datasets

PATH='datasets/'
MOVIELENS='movielens/'

linksdf = pd.read_csv(PATH+'links.csv')
moviesdf = pd.read_csv(PATH+'movies.csv')
ratingsdf = pd.read_csv(PATH+'ratings.csv')

### linksdf + moviesdf

In [3]:
moviesdf['imdbId']=linksdf.imdbId.astype('int64')

In [4]:
moviesdf['year']=moviesdf.title.str.extract(r'\((\d\d\d\d)\)', expand=False)


In [5]:
moviesdf.title = moviesdf.title.str.replace(r'\((\d\d\d\d)\)','')
moviesdf.title = moviesdf.title.str.strip()

In [6]:
moviesdf.year.loc[moviesdf.year.isna()]=moviesdf[moviesdf.year.isna()].title.str.extract(r'\((\d+)-', expand=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [7]:
moviesdf.year.fillna(-1, inplace=True)

In [8]:
def ensure_dir(file_path):
    directory = os.path.dirname(file_path)
    if not os.path.exists(directory):
        os.makedirs(directory)

def saveJson(obj, path):
    ensure_dir(path)
    with open(path, 'w+') as f:
        json.dump(obj, f)

In [24]:
movies_dict={}
movies_query_dict={}
for row in moviesdf.iterrows():
    data=row[1]
    movieId = str(data[0])
    title = data[1]
    genres = data[2].split('|')
    imdbId = data[3]
    year = data[4]
    
    movies_dict.setdefault('data', {})
    movies_dict.setdefault('meta', {})
    
    movies_dict['data'].setdefault(movieId, {})
    movies_query_dict.setdefault(title, {})
    
    movies_dict['data'][movieId]['title']=title
    movies_dict['data'][movieId]['genres']=genres
    movies_dict['data'][movieId]['imdbId']=imdbId
    movies_dict['data'][movieId]['year']=year
    movies_dict['data'][movieId].setdefault('meta', {})
    
    movies_query_dict[title]['movieId']=movieId
    movies_query_dict[title]['genres']=genres
    movies_query_dict[title]['imdbId']=imdbId
    movies_query_dict[title]['year']=year

movies_dict['meta']['number_of_movies']=len(movies_dict['data'].keys())

In [27]:
genres={}
for movie in movies_dict['data'].keys():
    
    # genres of this movie
    genrelist=movies_dict['data'][movie]['genres']
    
    
    # each genre in genrelist
    for g in genrelist:
        genres.setdefault(g,{})
        genres[g].setdefault('data',[])
        genres[g].setdefault('meta',{})
        # classifing movies_dict against genres
        genres[g]['data'].append(movie)
    
    for g in genres.keys():
        genres[g]['meta']['number_of_movies']=len(genres[g]['data'])
    


In [28]:
genres

{'(no genres listed)': {'data': ['151307',
   '132952',
   '140753',
   '143410',
   '83829',
   '149532',
   '128616',
   '132549',
   '141866',
   '134025',
   '126106',
   '128620',
   '122888',
   '160590',
   '117192',
   '140763',
   '136592',
   '129250'],
  'meta': {'number_of_movies': 18}},
 'Action': {'data': ['3771',
   '5152',
   '8811',
   '65802',
   '5782',
   '8665',
   '4958',
   '434',
   '1681',
   '90249',
   '67295',
   '66785',
   '90403',
   '73323',
   '547',
   '72308',
   '94864',
   '112006',
   '541',
   '2582',
   '4614',
   '44245',
   '4142',
   '2948',
   '2568',
   '26464',
   '160565',
   '63826',
   '111663',
   '147426',
   '6959',
   '86835',
   '1254',
   '2720',
   '861',
   '122920',
   '2728',
   '122886',
   '479',
   '4947',
   '4636',
   '544',
   '172',
   '839',
   '2476',
   '26614',
   '1518',
   '126420',
   '4965',
   '4355',
   '7445',
   '110781',
   '114818',
   '4488',
   '8370',
   '62374',
   '2344',
   '8576',
   '26152',
   '106

### ratingsdf

In [11]:
ratingsdf.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [21]:
ratings = {}
users = {}
for row in ratingsdf.iterrows():
    row=row[1]
    userId = str(int(row[0]))
    movieId = str(int(row[1]))
    rating = row[2]
    timestamp = str(int(row[3]))
    
    ratings.setdefault(userId,{})  
    ratings[userId].setdefault(movieId,{})
  
    ratings[userId][movieId] = dict(rating=rating, timestamp=timestamp)
      
    users.setdefault(userId,{})
    users[userId].setdefault('meta',{})
    users[userId]['meta'].setdefault('movies',{})
    users[userId]['meta']['movies'].setdefault('all',[])
    
    users[userId]['meta']['movies']['all'].append(movieId)
    
    for genre in genres.keys():
        users[userId]['meta']['movies'].setdefault(genre, [])
        if movieId in genres[genre]['data']:
            users[userId]['meta']['movies'][genre].append(movieId)
    
for user in users.keys():
    rs=0
    for i,movie in enumerate(users[user]['meta']['movies']['all']):
        rs+=ratings[user][movie]['rating']
    
    users[user]['meta']['average_rating']=rs/i
    users[user]['meta']['number_of_ratings']=len(users[user]['meta']['movies']['all'])

In [13]:
def transpose(prefs):
    tp={}
    meta={}
    for user in prefs:
        for item in prefs[user]:
            
            tp.setdefault(item,{})
            
            tp[item][user]=prefs[user][item]
            
            meta.setdefault(item, {})
            meta[item].setdefault('rated_by', 0)
            meta[item].setdefault('ratings_sum', 0)
            
            meta[item]['rated_by']+=1
            meta[item]['ratings_sum']+=prefs[user][item]['rating']
    
    for movie in tp:
        movies_dict['data'][movie]['meta']['average_rating']=meta[movie]['ratings_sum']/meta[movie]['rated_by']
    
    return tp

In [14]:
trans_ratings = transpose(ratings)

In [29]:
saveJson(users,MOVIELENS+'users.json')
saveJson(ratings,MOVIELENS+'user-ratings.json')
saveJson(trans_ratings,MOVIELENS+'movie-ratings.json')
saveJson(movies_dict, MOVIELENS+'movies-dictionary.json')
saveJson(movies_query_dict, MOVIELENS+'movies-query-dictionary.json')
saveJson(genres, MOVIELENS+'genres-dictionary.json')