In [120]:
import pandas as pd
import matplotlib.pyplot as plt

In [121]:
user = ['user_id','gender','age','occupation','zip']
users = pd.read_csv('ml-1m/users.dat', sep='::', header=None, names=user, engine='python')
users[:5]

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [122]:
rating = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('ml-1m/ratings.dat', sep='::', header=None, names=rating, engine='python')
ratings[:5]

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [123]:
movie = ['movie_id', 'title', 'genres']
movies = pd.read_csv('ml-1m/movies.dat', sep='::', header=None, names=movie, engine='python')
movies[:5]

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [124]:
newMovies = pd.DataFrame(movies.genres.str.split('|').tolist(), index=movies.movie_id).stack()
newMovies = newMovies.reset_index([0, 'movie_id'])
newMovies.columns = ['movie_id','genres']
newSet = pd.merge(newMovies, movies, on='movie_id',how='inner')
newSet = newSet.drop(['genres_y'], axis=1)
newSet.rename(columns={'genres_x':'genres'}, inplace=True)
newSet[:5]

Unnamed: 0,movie_id,genres,title
0,1,Animation,Toy Story (1995)
1,1,Children's,Toy Story (1995)
2,1,Comedy,Toy Story (1995)
3,2,Adventure,Jumanji (1995)
4,2,Children's,Jumanji (1995)


**1. An aggregate of the movie ratings for each particular genre, e.g., Action, Adventure, Drama, Science Fiction** 

In [125]:
data  = pd.merge(pd.merge(ratings,users),newSet)
rating_data = pd.merge(pd.merge(ratings,users),movies)
data[:10]

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,genres,title
0,1,1193,5,978300760,F,1,10,48067,Drama,One Flew Over the Cuckoo's Nest (1975)
1,2,1193,5,978298413,M,56,16,70072,Drama,One Flew Over the Cuckoo's Nest (1975)
2,12,1193,4,978220179,M,25,12,32793,Drama,One Flew Over the Cuckoo's Nest (1975)
3,15,1193,4,978199279,M,25,7,22903,Drama,One Flew Over the Cuckoo's Nest (1975)
4,17,1193,5,978158471,M,50,1,95350,Drama,One Flew Over the Cuckoo's Nest (1975)
5,18,1193,4,978156168,F,18,3,95825,Drama,One Flew Over the Cuckoo's Nest (1975)
6,19,1193,5,982730936,M,1,10,48073,Drama,One Flew Over the Cuckoo's Nest (1975)
7,24,1193,5,978136709,F,25,7,10023,Drama,One Flew Over the Cuckoo's Nest (1975)
8,28,1193,3,978125194,F,25,1,14607,Drama,One Flew Over the Cuckoo's Nest (1975)
9,33,1193,5,978557765,M,45,3,55421,Drama,One Flew Over the Cuckoo's Nest (1975)


In [126]:
mean_ratings = data.pivot_table('rating', index='title', columns='gender', aggfunc='mean')

In [127]:
mean_ratings[:5]

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"$1,000,000 Duck (1971)",3.375,2.761905
'Night Mother (1986),3.388889,3.352941
'Til There Was You (1997),2.675676,2.733333
"'burbs, The (1989)",2.793478,2.962085
...And Justice for All (1979),3.828571,3.689024


In [128]:
most_rated = data.groupby('title').size().sort_values(ascending=False)[:10]
most_rated

title
Star Wars: Episode V - The Empire Strikes Back (1980)    14950
Star Wars: Episode VI - Return of the Jedi (1983)        14415
Star Wars: Episode IV - A New Hope (1977)                11964
Men in Black (1997)                                      10152
Princess Bride, The (1987)                                9272
L.A. Confidential (1997)                                  9152
E.T. the Extra-Terrestrial (1982)                         9076
Star Wars: Episode I - The Phantom Menace (1999)          9000
Alien (1979)                                              8096
Jurassic Park (1993)                                      8016
dtype: int64

In [129]:
import numpy as np

In [130]:
agg_rating_genre = data.groupby('genres').agg({'rating':[np.size,np.average]})
agg_rating_genre

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,average
genres,Unnamed: 1_level_2,Unnamed: 2_level_2
Action,257457,3.491185
Adventure,133953,3.477257
Animation,43293,3.684868
Children's,72186,3.422035
Comedy,356580,3.522099
Crime,79541,3.708679
Documentary,7910,3.933123
Drama,354529,3.766332
Fantasy,36301,3.447371
Film-Noir,18261,4.075188


In [131]:
agg_rating_genre_avg = data.groupby('genres').agg({'rating':np.average})
agg_rating_genre_avg

Unnamed: 0_level_0,rating
genres,Unnamed: 1_level_1
Action,3.491185
Adventure,3.477257
Animation,3.684868
Children's,3.422035
Comedy,3.522099
Crime,3.708679
Documentary,3.933123
Drama,3.766332
Fantasy,3.447371
Film-Noir,4.075188


**2.The top 5 highest ranked genre by women.**

In [132]:
women_data = data[data.gender == 'F']
rank_by_genre_women = women_data.groupby('genres').size().sort_values(ascending=False)[:5]
rank_by_genre_women

genres
Drama       98153
Comedy      96271
Romance     50297
Action      45650
Thriller    40308
dtype: int64

**3.The top 5 highest ranked genre by men.**

In [133]:
men_data = data[data.gender == 'M']
rank_by_genre_men = men_data.groupby('genres').size().sort_values(ascending=False)[:5]
rank_by_genre_men

genres
Comedy      260309
Drama       256376
Action      211807
Thriller    149372
Sci-Fi      129894
dtype: int64

**4.Pick a movie of your choice and provide a breakdown of the movie’s ratings by 3 age ranges (a) under 18 (b) 19 to 45 (c) Above 45.**

**Jumanji**

In [134]:
movie_of_choice = rating_data.loc[rating_data['title'] == 'Jumanji (1995)']
labels = ['Under 18', '19 to 45', 'Above 45']
movie_of_choice['age_range'] = pd.cut(rating_data.age,[0,18,45,81],labels=labels)
movie_of_choice.groupby('age_range').agg({'rating': [np.size, np.mean]})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
age_range,Unnamed: 1_level_2,Unnamed: 2_level_2
Under 18,190,3.068421
19 to 45,456,3.234649
Above 45,55,3.381818


**5.A function that given a user id and movie id , returns a list of user ids for other users that rated the movie identified by the provided movie id with the same score.**

In [135]:
rating_data[(rating_data.user_id == 1) & (rating_data.movie_id == 1)].rating.values[0]
rating_data[(rating_data.movie_id == 1) & (rating_data.rating == 5)]['user_id']

41626       1
41629       9
41630      10
41632      19
41637      34
41638      36
41639      38
41640      44
41643      49
41644      51
41645      56
41647      65
41650      75
41651      76
41659     112
41663     119
41664     121
41667     132
41672     146
41674     148
41679     156
41680     157
41681     162
41684     169
41688     182
41689     184
41690     186
41695     195
41696     198
41699     213
         ... 
43626    5809
43627    5823
43629    5825
43632    5831
43633    5833
43635    5839
43636    5840
43637    5841
43644    5858
43645    5861
43646    5862
43651    5875
43656    5888
43658    5890
43660    5903
43662    5908
43668    5930
43671    5938
43675    5954
43679    5964
43682    5978
43686    5989
43688    5995
43689    5996
43692    6010
43693    6011
43694    6013
43695    6015
43698    6022
43699    6025
Name: user_id, Length: 820, dtype: int64

In [136]:
def get_user_same_rating(user_id, movie_id):
    current_user_rating = rating_data[(rating_data.user_id == user_id) & (rating_data.movie_id == movie_id)].rating.values[0]
    return rating_data[(rating_data.movie_id == movie_id) & (rating_data.rating == current_user_rating)]['user_id']

In [137]:
print(get_user_same_rating(1,1))

41626       1
41629       9
41630      10
41632      19
41637      34
41638      36
41639      38
41640      44
41643      49
41644      51
41645      56
41647      65
41650      75
41651      76
41659     112
41663     119
41664     121
41667     132
41672     146
41674     148
41679     156
41680     157
41681     162
41684     169
41688     182
41689     184
41690     186
41695     195
41696     198
41699     213
         ... 
43626    5809
43627    5823
43629    5825
43632    5831
43633    5833
43635    5839
43636    5840
43637    5841
43644    5858
43645    5861
43646    5862
43651    5875
43656    5888
43658    5890
43660    5903
43662    5908
43668    5930
43671    5938
43675    5954
43679    5964
43682    5978
43686    5989
43688    5995
43689    5996
43692    6010
43693    6011
43694    6013
43695    6015
43698    6022
43699    6025
Name: user_id, Length: 820, dtype: int64


**6.Some other statistic, figure, aggregate, or plot that you created using this dataset, along with a short description of what interesting observations you derived from it.**

In [138]:
horror_movie_genre = data[data.genres == 'Horror']
horror_movie_genre.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,genres,title
265558,2,1690,3,978300051,M,56,16,70072,Horror,Alien: Resurrection (1997)
265561,10,1690,4,978230253,F,35,1,95370,Horror,Alien: Resurrection (1997)
265564,13,1690,3,978202057,M,45,1,93304,Horror,Alien: Resurrection (1997)
265567,18,1690,1,978153649,F,18,3,95825,Horror,Alien: Resurrection (1997)
265570,23,1690,5,978464728,M,35,0,90049,Horror,Alien: Resurrection (1997)


In [139]:
labels = ['Under 20','20-30','Above 30']
horror_movie_genre['age_range'] = pd.cut(horror_movie_genre.age,[0,20,30,81],labels=labels)
horror_movie_genre.groupby('age_range').agg({'rating': [np.size]})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0_level_0,rating
Unnamed: 0_level_1,size
age_range,Unnamed: 1_level_2
Under 20,17395
20-30,31235
Above 30,27756


In [140]:
labels = ['0-9', '10-19', '20-29', '30-39', '40-49', '50-59']
horror_movie_genre['age_group'] = pd.cut(horror_movie_genre.age, range(0, 61, 10), right=False, labels=labels)
horror_data = horror_movie_genre.groupby('age_group').agg({'rating': [np.size]})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [143]:
print(horror_data)
horror_data.plot(kind='barh', figsize=[9, 15], color = 'red')
plt.title('Horror data by Age')
plt.ylabel('age_group')
plt.xlabel('Number of rating')
plt.show()

          rating
            size
age_group       
0-9         2211
10-19      15184
20-29      31235
30-39      15122
40-49       6192
50-59       6442


**I Analysed the age group of the horror movie and the age group which watched the most is in 20-29. Almost half of the rating for the horror movie comes in age group 20-29. **