# 영화 평점 분석

In [1]:
import pandas as pd

In [10]:
users = pd.read_csv(
    'data/pydata-book/movielens/users.dat',
    sep='::', 
    engine='python', # 구분자 관련 경고 처리
    names=['user_id', '성별', '나이', '직업', '지역']
)

In [11]:
users[:10]

Unnamed: 0,user_id,성별,나이,직업,지역
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455
5,6,F,50,9,55117
6,7,M,35,1,6810
7,8,M,25,12,11413
8,9,M,25,17,61614
9,10,F,35,1,95370


In [12]:
ratings = pd.read_csv(
    'data/pydata-book/movielens/ratings.dat',
    sep='::', engine='python',
    names=['user_id', 'movie_id', 'rating', 'timestamp']
)

In [13]:
ratings[:3]

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968


In [16]:
movies = pd.read_csv(
    'data/pydata-book/movielens/movies.dat',
    sep='::', engine='python',
    names=['movie_id', 'title', 'genres'],
    encoding='latin1'
)

In [19]:
movies[:3]

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


세 개의 자료를 하나로 합치기

In [24]:
data = pd.merge(pd.merge(ratings, users), movies)

In [25]:
data.ix[0]

user_id                                           1
movie_id                                       1193
rating                                            5
timestamp                                 978300760
성별                                                F
나이                                                1
직업                                               10
지역                                            48067
title        One Flew Over the Cuckoo's Nest (1975)
genres                                        Drama
Name: 0, dtype: object

성별에 따른 영화 평점

In [32]:
영화별성별 = data.groupby(['title', '성별'])

In [33]:
영화별성별['rating'].mean().unstack()

성별,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"$1,000,000 Duck (1971)",3.375000,2.761905
'Night Mother (1986),3.388889,3.352941
'Til There Was You (1997),2.675676,2.733333
"'burbs, The (1989)",2.793478,2.962085
...And Justice for All (1979),3.828571,3.689024
1-900 (1994),2.000000,3.000000
10 Things I Hate About You (1999),3.646552,3.311966
101 Dalmatians (1961),3.791444,3.500000
101 Dalmatians (1996),3.240000,2.911215
12 Angry Men (1957),4.184397,4.328421


In [35]:
mean_ratings = data.pivot_table(
    'rating', aggfunc='mean', 
    index='title', columns='성별')

In [37]:
mean_ratings[:5]

성별,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"$1,000,000 Duck (1971)",3.375,2.761905
'Night Mother (1986),3.388889,3.352941
'Til There Was You (1997),2.675676,2.733333
"'burbs, The (1989)",2.793478,2.962085
...And Justice for All (1979),3.828571,3.689024


평점 개수가 250건 이상인 영화들의 성별 평점 구하기

In [78]:
제목별 = data.groupby('title')

In [79]:
평점충분 = 제목별.size() >= 250

평점이 "충분"한 영화들만 골라내기

In [47]:
mean_ratings = mean_ratings[평점충분]

여성에게 높은 평점을 받은 순으로 정렬

In [49]:
mean_ratings.sort_values(
    by='F', ascending=False)[:10]

성별,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Close Shave, A (1995)",4.644444,4.473795
"Wrong Trousers, The (1993)",4.588235,4.478261
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),4.57265,4.464589
Wallace & Gromit: The Best of Aardman Animation (1996),4.563107,4.385075
Schindler's List (1993),4.562602,4.491415
"Shawshank Redemption, The (1994)",4.539075,4.560625
"Grand Day Out, A (1992)",4.537879,4.293255
To Kill a Mockingbird (1962),4.536667,4.372611
Creature Comforts (1990),4.513889,4.272277
"Usual Suspects, The (1995)",4.513317,4.518248


남녀 호불호가 갈리는 영화

In [51]:
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [52]:
mean_ratings[:5]

성별,F,M,diff
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"'burbs, The (1989)",2.793478,2.962085,0.168607
10 Things I Hate About You (1999),3.646552,3.311966,-0.334586
101 Dalmatians (1961),3.791444,3.5,-0.291444
101 Dalmatians (1996),3.24,2.911215,-0.328785
12 Angry Men (1957),4.184397,4.328421,0.144024


여성 선호 영화

In [54]:
mean_ratings.sort_values(by='diff')[:3]

성별,F,M,diff
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dirty Dancing (1987),3.790378,2.959596,-0.830782
Jumpin' Jack Flash (1986),3.254717,2.578358,-0.676359
Grease (1978),3.975265,3.367041,-0.608224


남성 선호 영화

In [56]:
mean_ratings.sort_values(by='diff', ascending=False)[:3]
#mean_ratings.sort_values(by='diff')[::-1][:3]

성별,F,M,diff
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Good, The Bad and The Ugly, The (1966)",3.494949,4.2213,0.726351
"Kentucky Fried Movie, The (1977)",2.878788,3.555147,0.676359
Dumb & Dumber (1994),2.697987,3.336595,0.638608


성별 관계없이 호불호가 갈리는 영화

In [73]:
제목별 = data.groupby('title')

In [76]:
제목별_평점편차 = 제목별['rating'].std()

250건 이상 기준으로 걸러내기

In [81]:
제목별_평점편차 = 제목별_평점편차[평점충분]

In [82]:
제목별_평점편차.sort_values(ascending=False)[:10]

title
Dumb & Dumber (1994)                     1.321333
Blair Witch Project, The (1999)          1.316368
Natural Born Killers (1994)              1.307198
Tank Girl (1995)                         1.277695
Rocky Horror Picture Show, The (1975)    1.260177
Eyes Wide Shut (1999)                    1.259624
Evita (1996)                             1.253631
Billy Madison (1995)                     1.249970
Fear and Loathing in Las Vegas (1998)    1.246408
Bicentennial Man (1999)                  1.245533
Name: rating, dtype: float64