# 영화 평점 데이터 


In [1]:
import pandas as pd
import os


In [3]:
# Character encoding handed to pd.read_csv when the .dat files are loaded.
encoding = "latin1"

In [12]:
# Locations of the three data files, relative to the working directory.
# os.path.expanduser only rewrites a leading "~", so it was a no-op on these
# relative paths; they are now built from one shared directory instead.
data_dir = os.path.join('ch02', 'movielens')
upath = os.path.join(data_dir, 'users.dat')    # user table
rpath = os.path.join(data_dir, 'ratings.dat')  # ratings table
mpath = os.path.join(data_dir, 'movies.dat')   # movie table

In [14]:
# Load the three '::'-delimited tables. The files are read without a header
# row (header=None), so the column names are supplied explicitly per table.
dat_format = dict(sep='::', header=None, encoding=encoding, engine='python')

users = pd.read_csv(
    upath,
    names=['user_id', 'gender', 'age', 'occupation', 'zip'],
    **dat_format
)
ratings = pd.read_csv(
    rpath,
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
    **dat_format
)
movies = pd.read_csv(
    mpath,
    names=['movie_id', 'title', 'genres'],
    **dat_format
)

In [16]:
# Preview the first five rows of the user table.
users.head(n=5)

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [18]:
# Preview the first five rows of the ratings table.
ratings.head(n=5)

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [20]:
# Preview the first five rows of the movie table.
movies.head(n=5)

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


## 세 종류의 테이블을 병합 (merge)
  
병합한 데이터로 나이와 성별에 따른 영화별 평균 평점을 계산할 수 있다.

In [22]:
# Attach each rating's user attributes and movie attributes in one table.
# The join keys are spelled out (user_id for users, movie_id for movies)
# rather than inferred from whichever column names happen to overlap, so an
# extra shared column added later cannot silently change the join.
data = ratings.merge(users, on='user_id').merge(movies, on='movie_id')
data.head(5)

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama


## DataFrame.ix[] has been removed.
  
`ix` was deprecated in pandas 0.20 and removed in pandas 1.0; use `iloc` and `loc` instead.  
iloc - integer position based (배열)  
loc - label index based (레이블)  

In [25]:
# First row of the merged table, selected by integer position.
data.iloc[0, :]

user_id                                            1
movie_id                                        1193
rating                                             5
timestamp                                  978300760
gender                                             F
age                                                1
occupation                                        10
zip                                            48067
title         One Flew Over the Cuckoo's Nest (1975)
genres                                         Drama
Name: 0, dtype: object

### 성별에 따른 각 영화의 평균 평점 구하기

pivot_table 을 이용하여 index 를 title 로 두고, 성별에 따라 열을 나누어 (columns='gender') 각 영화의 평균 평점을 구한다.

```markdown
TIP. 성별(gender)에 따른 각 영화 평균 평점을 구할 때, 다음과 같이 구성한다.
1. rating 칼럼에 대해 'mean' 을 구한다.
data.pivot_table('rating', aggfunc='mean')
2. 평균을 구할 때 기준이 되는 레코드(primary key == index) 를 설정한다.
data.pivot_table('rating', index='title', aggfunc='mean')
3. 평균을 성별에 따라 분류하고자 한다.
data.pivot_table('rating', index='title', columns='gender', aggfunc='mean')
```

In [30]:
# Mean rating per title, with one column for each gender value.
mean_ratings = data.pivot_table(
    values='rating',
    index='title',
    columns='gender',
    aggfunc='mean',
)

In [31]:
# Preview the first five titles of the per-gender mean table.
mean_ratings.head(n=5)

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"$1,000,000 Duck (1971)",3.375,2.761905
'Night Mother (1986),3.388889,3.352941
'Til There Was You (1997),2.675676,2.733333
"'burbs, The (1989)",2.793478,2.962085
...And Justice for All (1979),3.828571,3.689024


## 제목별 평점 정보 건수 구하기

groupby 를 이용하여 'title' 을 기준으로 그룹을 묶고, size 메서드로 제목별 건수를 담은 Series 객체를 얻는다.

![image.png](attachment:image.png)


In [38]:
# Number of rows per title; size() returns a Series indexed by title.
ratings_by_title = data.groupby(by='title').size()

In [40]:
# Index of the titles whose row count is at least 250.
has_enough_ratings = ratings_by_title >= 250
active_titles = ratings_by_title.index[has_enough_ratings]

In [44]:
# Keep only the rows whose title label appears in active_titles.
mean_ratings = mean_ratings.loc[active_titles, :]

In [47]:
# Preview the first five rows of the filtered table.
mean_ratings.head(n=5)

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"'burbs, The (1989)",2.793478,2.962085
10 Things I Hate About You (1999),3.646552,3.311966
101 Dalmatians (1961),3.791444,3.5
101 Dalmatians (1996),3.24,2.911215
12 Angry Men (1957),4.184397,4.328421


In [51]:
# Order titles by the 'F' column, highest mean first, and show the top ten.
top_female_ratings = mean_ratings.sort_values('F', ascending=False)
top_female_ratings.head(10)

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Close Shave, A (1995)",4.644444,4.473795
"Wrong Trousers, The (1993)",4.588235,4.478261
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),4.57265,4.464589
Wallace & Gromit: The Best of Aardman Animation (1996),4.563107,4.385075
Schindler's List (1993),4.562602,4.491415
"Shawshank Redemption, The (1994)",4.539075,4.560625
"Grand Day Out, A (1992)",4.537879,4.293255
To Kill a Mockingbird (1962),4.536667,4.372611
Creature Comforts (1990),4.513889,4.272277
"Usual Suspects, The (1995)",4.513317,4.518248


## 평점 차이 구하기
  
    
남녀간의 호불호가 갈리는 영화를 찾아본다.  
  
1. 'diff' 열을 추가하고 이 열을 기준으로 정렬한다.

In [52]:
# Gap between the 'M' and 'F' means; positive means the 'M' mean is higher.
mean_ratings['diff'] = mean_ratings['M'].sub(mean_ratings['F'])
# Ascending sort, so the titles with the most negative gap come first.
sorted_by_diff = mean_ratings.sort_values('diff')

In [54]:
# First five rows of the ascending sort (most negative 'diff').
sorted_by_diff.head(n=5)

gender,F,M,diff
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dirty Dancing (1987),3.790378,2.959596,-0.830782
Jumpin' Jack Flash (1986),3.254717,2.578358,-0.676359
Grease (1978),3.975265,3.367041,-0.608224
Little Women (1994),3.870588,3.321739,-0.548849
Steel Magnolias (1989),3.901734,3.365957,-0.535777


2. 'diff' 열을 기준으로 오름차순 정렬된 결과를 역순으로 뒤집으려면 슬라이스 `[::-1]` 을 사용한다.

In [56]:
# Reverse the ascending order and show the 15 rows with the largest 'diff'.
sorted_by_diff.iloc[::-1].head(15)

gender,F,M,diff
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Good, The Bad and The Ugly, The (1966)",3.494949,4.2213,0.726351
"Kentucky Fried Movie, The (1977)",2.878788,3.555147,0.676359
Dumb & Dumber (1994),2.697987,3.336595,0.638608
"Longest Day, The (1962)",3.411765,4.031447,0.619682
"Cable Guy, The (1996)",2.25,2.863787,0.613787
Evil Dead II (Dead By Dawn) (1987),3.297297,3.909283,0.611985
"Hidden, The (1987)",3.137931,3.745098,0.607167
Rocky III (1982),2.361702,2.943503,0.581801
Caddyshack (1980),3.396135,3.969737,0.573602
For a Few Dollars More (1965),3.409091,3.953795,0.544704


## GroupBy with .std()
아래 그림으로 groupby 의 원리를 이해하고, 그룹으로 분류한 뒤 std() (표준편차) 를 구하는 연산이 추가된다는 점을 기억한다.

![image.png](attachment:image.png)

In [58]:
# Standard deviation of the 'rating' column within each title group.
ratings_grouped_by_title = data.groupby('title')
ratings_std_by_title = ratings_grouped_by_title['rating'].std()

In [61]:
# Preview the first five per-title standard deviations.
ratings_std_by_title.head(n=5)

title
$1,000,000 Duck (1971)           1.092563
'Night Mother (1986)             1.118636
'Til There Was You (1997)        1.020159
'burbs, The (1989)               1.107760
...And Justice for All (1979)    0.878110
Name: rating, dtype: float64

In [62]:
# Keep only the titles listed in active_titles (label-based selection via
# .loc), then show the ten largest standard deviations.
ratings_std_by_title = ratings_std_by_title.loc[active_titles]
ratings_std_by_title.sort_values(ascending=False).head(10)

title
Dumb & Dumber (1994)                     1.321333
Blair Witch Project, The (1999)          1.316368
Natural Born Killers (1994)              1.307198
Tank Girl (1995)                         1.277695
Rocky Horror Picture Show, The (1975)    1.260177
Eyes Wide Shut (1999)                    1.259624
Evita (1996)                             1.253631
Billy Madison (1995)                     1.249970
Fear and Loathing in Las Vegas (1998)    1.246408
Bicentennial Man (1999)                  1.245533
Name: rating, dtype: float64