## 1. Data Load

In [40]:
import pandas as pd 

# u.user 정보
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('./data/u.user', sep='|', names=u_cols, encoding='latin-1')
users = users.set_index('user_id')
users.head()

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [41]:
# u.item 정보
i_cols = ['movie_id', 'title', 'release data',' video release date', 'IMDB URL', 'unknown', 'Action', 'Adventure', 'Animation', 
          'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror' ,'Musical', 'Mystery',
          'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
movies = pd.read_csv('./data/u.item', sep='|', names=i_cols, encoding='latin-1')
movies = movies.set_index('movie_id')
movies.head(2)

Unnamed: 0_level_0,title,release data,video release date,IMDB URL,unknown,Action,Adventure,Animation,Children's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [42]:
# u.data 정보
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('./data/u.data', sep='\t', names=r_cols, encoding='latin-1')
ratings = ratings.set_index('user_id')
ratings.head()

Unnamed: 0_level_0,movie_id,rating,timestamp
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
196,242,3,881250949
186,302,3,891717742
22,377,1,878887116
244,51,2,880606923
166,346,1,886397596


## 2. Best-seller 추천

In [43]:
def recom_movie1(df, n_items):
    movie_sort = df.sort_values(ascending=False)[:n_items]
    recom_movies = movies.loc[movie_sort.index]
    recommendations = recom_movies['title']
    return recommendations

In [44]:
movie_mean_df = ratings.groupby(['movie_id'])['rating'].mean()
movie_mean_df

movie_id
1       3.878319
2       3.206107
3       3.033333
4       3.550239
5       3.302326
          ...   
1678    1.000000
1679    3.000000
1680    2.000000
1681    3.000000
1682    3.000000
Name: rating, Length: 1682, dtype: float64

In [45]:
recom_movie1(movie_mean_df, 5)

movie_id
814                         Great Day in Harlem, A (1994)
1599                        Someone Else's America (1995)
1201           Marlene Dietrich: Shadow and Light (1996) 
1122                       They Made Me a Criminal (1939)
1653    Entertaining Angels: The Dorothy Day Story (1996)
Name: title, dtype: object

In [46]:
def recom_movie2(df, n_items):
    return movies.loc[df.sort_values(ascending=False)[:n_items].index]['title']

In [47]:
recom_movie2(movie_mean_df, 5)

movie_id
814                         Great Day in Harlem, A (1994)
1599                        Someone Else's America (1995)
1201           Marlene Dietrich: Shadow and Light (1996) 
1122                       They Made Me a Criminal (1939)
1653    Entertaining Angels: The Dorothy Day Story (1996)
Name: title, dtype: object

## 3. 정확도 측정

In [48]:
import numpy as np
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred)) ** 2))

In [49]:
len(ratings.index)

100000

In [50]:
# 중복없는 user수
len(set(ratings.index))

943

In [51]:
rmse = []
for user in set(ratings.index):
    y_true = ratings.loc[user]['rating']
    y_pred = movie_mean_df[ratings.loc[user]['movie_id']] # user가 본 movie_id 리스트
    accuracy = RMSE(y_true, y_pred)
    rmse.append(accuracy)

In [52]:
len(rmse)

943

In [53]:
# 196번 사용자
ratings.loc[196].head()

Unnamed: 0_level_0,movie_id,rating,timestamp
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
196,242,3,881250949
196,393,4,881251863
196,381,4,881251728
196,251,3,881251274
196,655,5,881251793


In [54]:
users.to_csv('./data/users.dat')
movies.to_csv('./data/movies.dat')
ratings.to_csv('./data/ratings.dat')

## 4. 사용자 집단별 추천

In [134]:
users = pd.read_csv('./data/users.dat')
movies = pd.read_csv('./data/movies.dat')
ratings = pd.read_csv('./data/ratings.dat')

In [135]:
ratings = ratings.drop('timestamp', axis=1)
movies = movies[['movie_id', 'title']]

In [136]:
# train, test set 분리
from sklearn.model_selection import train_test_split
X = ratings.copy()
y = ratings['user_id']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y)

In [137]:
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred)) ** 2))

def score(model):
    id_pairs = zip(X_test['user_id'], X_test['movie_id'])
    y_pred = np.array([model(user, movie) for (user, movie) in id_pairs])
    y_true = np.array(X_test['rating'])
    return RMSE(y_true, y_pred)

In [138]:
rating_matrix = X_train.pivot(index='user_id', columns='movie_id', values='rating')

In [139]:
rating_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1670,1672,1673,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,,,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


### 4-1. 모델1: trainset의 평점 평균으로 testset 평점 예측

In [140]:
def best_seller(user_id, movie_id):
    try:
        rating = train_mean[movie_id]
    except:
        # train set에 없지만, test set에 있는 영화는 3.0으로 처리함
        rating = 3.0
    return rating

In [141]:
train_mean = X_train.groupby(['movie_id'])['rating'].mean()

In [142]:
train_mean

movie_id
1       3.816327
2       3.276596
3       3.115942
4       3.538961
5       3.290323
          ...   
1678    1.000000
1679    3.000000
1680    2.000000
1681    3.000000
1682    3.000000
Name: rating, Length: 1635, dtype: float64

In [143]:
score(best_seller)

1.028891953479005

### 4-2. 모델2: trainset의 성별 평점 평균으로 testset 평점 예측

In [144]:
merged_ratings = pd.merge(X_train, users)

In [145]:
merged_ratings.head()

Unnamed: 0,user_id,movie_id,rating,age,sex,occupation,zip_code
0,617,17,1,27,F,writer,11201
1,617,648,3,27,F,writer,11201
2,617,174,1,27,F,writer,11201
3,617,145,1,27,F,writer,11201
4,617,496,1,27,F,writer,11201


In [146]:
users = users.set_index('user_id')

In [147]:
users.head()

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [148]:
g_mean = merged_ratings[['movie_id', 'sex', 'rating']].groupby(['movie_id', 'sex'])['rating'].mean()

In [149]:
g_mean[1]

sex
F    3.692308
M    3.861111
Name: rating, dtype: float64

In [150]:
users.head(2)

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043


In [151]:
def cf_gender(user_id, movie_id):
    if movie_id in rating_matrix:
        gender = users.loc[user_id]['sex']
        
        # 평가 인원이 적어서 특정 성별의 평점이 없을 수 있으므로
        if gender in g_mean[movie_id]:
            gender_rating = g_mean[movie_id][gender]
        else:
            gender_rating = 3.0
    else:
        gender_rating = 3.0
    
    return gender_rating

In [152]:
score(cf_gender)

1.0384723741301407

### 4-3. 모델3: trainset의 직업의 평균으로 testset 평점 예측 

In [158]:
set(merged_ratings.occupation)

{'administrator',
 'artist',
 'doctor',
 'educator',
 'engineer',
 'entertainment',
 'executive',
 'healthcare',
 'homemaker',
 'lawyer',
 'librarian',
 'marketing',
 'none',
 'other',
 'programmer',
 'retired',
 'salesman',
 'scientist',
 'student',
 'technician',
 'writer'}

In [160]:
o_mean = merged_ratings[['movie_id', 'occupation', 'rating']].groupby(['movie_id', 'occupation'])['rating'].mean()

In [164]:
o_mean[1]

occupation
administrator    3.840000
artist           4.000000
doctor           3.500000
educator         3.677419
engineer         3.966667
entertainment    3.285714
executive        3.666667
healthcare       3.000000
homemaker        3.000000
lawyer           4.000000
librarian        3.714286
marketing        3.571429
none             4.166667
other            4.051282
programmer       4.172414
retired          3.500000
salesman         4.500000
scientist        4.000000
student          3.714286
technician       4.222222
writer           3.333333
Name: rating, dtype: float64

In [161]:
users.head(2)

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043


In [165]:
def cf_occupation(user_id, movie_id):
    if movie_id in rating_matrix:
        occupation = users.loc[user_id]['occupation']
        
        # 평가 인원이 적어서 특정 성별의 평점이 없을 수 있으므로
        if occupation in o_mean[movie_id]:
            occupation_rating = o_mean[movie_id][occupation]
        else:
            occupation_rating = 3.0
    else:
        occupation_rating = 3.0
    
    return occupation_rating

In [166]:
score(cf_occupation)

1.1262711607758618

### 4-4. 모델4: trainset의 성별, 직업의 평균으로 testset 평점 예측 

In [179]:
merged_ratings.head(2)

Unnamed: 0,user_id,movie_id,rating,age,sex,occupation,zip_code
0,617,17,1,27,F,writer,11201
1,617,648,3,27,F,writer,11201


In [181]:
g_o_mean = merged_ratings[['movie_id', 'sex', 'occupation', 'rating']].groupby(['movie_id', 'sex', 'occupation'])['rating'].mean()

In [201]:
if 'artist' in o_mean[1]:
    print('hi')

hi


In [211]:
g_o_mean[1]['F'].mean()

3.676120448179272

In [207]:
movie_id = 1
gender = 'M'
occupation = 'artist'
if (gender, occupation) in g_o_mean[movie_id]:
    print('hi')
    print(g_o_mean[movie_id][gender, occupation])
else:
    pirnt('no')

hi
3.8333333333333335


In [184]:
gender, occupation = users.loc[1][['sex', 'occupation']]

In [212]:
def cf_gender_occupation(user_id, movie_id):
    if movie_id in rating_matrix:
        gender, occupation = users.loc[user_id][['sex', 'occupation']]
        
        # 평가 인원이 적어서 특정 성별, 직업의 평점이 없을 수 있으므로
        if (gender, occupation) in g_o_mean[movie_id]:
            rating = g_o_mean[movie_id][gender, occupation]
        elif gender in g_o_mean[movie_id]:
            rating = g_o_mean[movie_id][gender].mean()
        elif occupation in g_o_mean[movie_id]:
            rating = g_o_mean[movie_id][occupation].mean()
        else:
            rating = 3.0
    else:
        rating = 3.0
    
    return rating

In [213]:
score(cf_gender_occupation)

1.1402270410641107