In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
def readData(filename, data_dir='dataset'):
    """Parse one of the '::'-delimited ``.dat`` tables into plain Python records.

    Parameters
    ----------
    filename : str
        Table name without the ``.dat`` extension: ``'users'``, ``'movies'``
        or ``'ratings'``.
    data_dir : str, optional
        Directory that holds the ``.dat`` files. Defaults to ``'dataset'``.

    Returns
    -------
    list or tuple
        * ``'users'``   -> list of 3-item lists holding the first three
          fields of each line (the trailing occupation and zipcode fields
          are dropped).
        * ``'movies'``  -> ``(movie, movie_genre)`` where ``movie`` is a list
          of ``(movieId, title)`` tuples and ``movie_genre`` is a list of
          ``(movieId, genre)`` tuples, one per '|'-separated genre.
        * ``'ratings'`` -> list of records with the last field (timestamp)
          dropped.

        Every value is returned as a string; no type conversion is done here.

    Raises
    ------
    ValueError
        If ``filename`` is not one of the three supported table names.
    """
    path = '{}/{}.dat'.format(data_dir, filename)

    # The context manager closes the handle even if reading fails part-way.
    with open(path, 'r') as fs:
        rows = [line.strip() for line in fs]

    # Blank lines (for example a trailing newline at end of file) carry no
    # record and would otherwise produce empty or malformed rows.
    rows = [row for row in rows if row]

    if filename == 'users':
        # maxsplit=3 leaves occupation and zipcode fused in the last item,
        # which the slice then discards.
        return [row.split('::', 3)[:-1] for row in rows]

    if filename == 'movies':
        movie = []
        movie_genre = []
        for row in rows:
            parts = row.split('::')
            movieId, title = parts[0], parts[1]
            movie.append((movieId, title))
            # One (movieId, genre) pair per genre so the genre table can be
            # joined back to movies on the id.
            movie_genre.extend((movieId, genre) for genre in parts[2].split('|'))
        return (movie, movie_genre)

    if filename == 'ratings':
        # Drop the trailing timestamp field.
        return [row.split('::')[:-1] for row in rows]

    raise ValueError(
        "unknown table {!r}; expected 'users', 'movies' or 'ratings'".format(filename)
    )

In [4]:
# Parse the three raw tables. The 'movies' file yields two record lists:
# one row per movie and one row per (movie, genre) pair.
users = readData('users')
movies, genres = readData('movies')
ratings = readData('ratings')

In [5]:
# Wrap the parsed records in DataFrames. Each record list is handed to the
# constructor once, with explicit column names, so the data is not re-walked
# per column and the column order is stated rather than left to dict ordering.
# All values are still strings at this point.
udf = pd.DataFrame(users, columns=['UserID', 'Gender', 'Age'])
mdf = pd.DataFrame(movies, columns=['MovieID', 'Title'])
gdf = pd.DataFrame(genres, columns=['MovieID', 'Genre'])
rdf = pd.DataFrame(ratings, columns=['UserID', 'MovieID', 'Rating'])

In [6]:
# Ratings were read from the file as strings; convert them to integers so
# they can be averaged in the cells below.
rdf['Rating'] = rdf.Rating.astype(int)

In [7]:
# Build one wide table: users -> their ratings -> movie titles -> genres.
# Joining the genre table yields one row per (rating, genre) pair, so a
# rating of a multi-genre movie appears once for each of its genres.
joined = (
    udf
    .merge(rdf, on='UserID')
    .merge(mdf, on='MovieID')
    .merge(gdf, on='MovieID')
)

In [8]:
%matplotlib inline

means = joined['Rating'].groupby(joined['Genre']).aggregate(np.mean)
#means.plot(kind='bar',figsize=(15,12))
means

Genre
Action         3.491185
Adventure      3.477257
Animation      3.684868
Children's     3.422035
Comedy         3.522099
Crime          3.708679
Documentary    3.933123
Drama          3.766332
Fantasy        3.447371
Film-Noir      4.075188
Horror         3.215013
Musical        3.665519
Mystery        3.668102
Romance        3.607465
Sci-Fi         3.466521
Thriller       3.570466
War            3.893327
Western        3.637770
Name: Rating, dtype: float64

In [10]:
# Average rating per genre, split by gender: genres become rows and genders
# become columns after unstack(). The built-in groupby mean replaces
# aggregate(np.mean), which recent pandas versions deprecate.
means = joined.groupby(['Genre', 'Gender'])['Rating'].mean().unstack()
# Uncomment to draw the averages as a grouped bar chart:
#means.plot(kind='bar',figsize=(15,12))
means

Gender,F,M
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1
Action,3.490252,3.491386
Adventure,3.512879,3.468125
Animation,3.744702,3.661335
Children's,3.572548,3.358961
Comedy,3.571938,3.503667
Crime,3.689332,3.71372
Documentary,3.946392,3.928811
Drama,3.765662,3.766589
Fantasy,3.513076,3.426603
Film-Noir,4.018087,4.092254


In [9]:
# Average rating per genre, split by the Age code: genres become rows and
# Age values become columns after unstack(). Age is still a string here, so
# the columns are ordered lexicographically. The built-in groupby mean
# replaces aggregate(np.mean), which recent pandas versions deprecate.
means = joined.groupby(['Genre', 'Age'])['Rating'].mean().unstack()
# Uncomment to draw the averages as a grouped bar chart:
#means.plot(kind='bar',figsize=(15,12))
means

Age,1,18,25,35,45,50,56
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Action,3.506385,3.447097,3.453358,3.538107,3.528543,3.611333,3.610709
Adventure,3.449975,3.408525,3.443163,3.515291,3.528963,3.628163,3.649064
Animation,3.476113,3.624014,3.701228,3.740545,3.734856,3.78002,3.756233
Children's,3.241642,3.294257,3.426873,3.518423,3.527593,3.556555,3.621822
Comedy,3.497491,3.460417,3.490385,3.561984,3.591789,3.646868,3.650949
Crime,3.71017,3.668054,3.680321,3.733736,3.750661,3.810688,3.832549
Documentary,3.730769,3.865865,3.94669,3.953747,3.966521,3.908108,3.961538
Drama,3.794735,3.72193,3.726428,3.782512,3.784356,3.878415,3.933465
Fantasy,3.317647,3.353778,3.452484,3.482301,3.532468,3.58157,3.5327
Film-Noir,4.145455,3.997368,4.058725,4.06491,4.105376,4.175401,4.125932
