In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Load the three MovieLens tables (users, movies, ratings) into DataFrames.
# All three files live in the same Drive folder; change DATA_DIR to relocate them.
DATA_DIR = '/content/drive/MyDrive/recomm_sample'

u_cols = ['user_id','age','sex','occupation','zip_code']
users = pd.read_csv(DATA_DIR + '/u.user.csv',sep='|',names=u_cols,encoding='latin_1')

# NOTE(review): 'children\\s' is probably a typo for "Children's". The backslash is now
# escaped explicitly (a bare '\s' is an invalid escape sequence and warns on recent
# Python versions), but the resulting column label is unchanged so any code that
# refers to it keeps working. TODO confirm and rename.
i_cols = ['movie_id','title','release date','video release date','IMDB URL','unknown','Action','Adventure','Animation','children\\s',
          'comedy','crime','Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance',
          'Sci-Fi','Thriller','war','western']
movies = pd.read_csv(DATA_DIR + '/u.item.csv',sep='|',names=i_cols,
                     encoding='latin_1')

r_cols = ['user_id','movie_id','rating','timestamp']
ratings = pd.read_csv(DATA_DIR + '/u.data.csv',sep='\t',names=r_cols,
                      encoding='latin_1')

# Slimmed-down views used by the rest of the notebook:
# ratings without the timestamp column, and movies reduced to id + title.
ratings_2 = ratings.drop('timestamp',axis=1)
movies_2 = movies[['movie_id','title']]

In [None]:
# Model evaluation (using train_test_split)

def RMSE(y_true,y_pred):
  """Return the root-mean-square error between the true and predicted values.

  Both arguments are converted with np.array before the element-wise difference
  is taken, so lists and arrays are accepted alike.
  """
  errors = np.array(y_true) - np.array(y_pred)
  mean_squared_error = np.mean(np.square(errors))
  return np.sqrt(mean_squared_error)

# Hold out 25% of the ratings for evaluation.
# The split is stratified on user_id, so the labels passed as y are the user ids
# rather than the ratings. A fixed random_state makes the split, and therefore
# every score reported below, reproducible across kernel restarts.
x = ratings_2.copy()
y = ratings_2['user_id']
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.25,stratify=y,random_state=42)

def score(model):
  """Evaluate a rating model on the held-out test split and return its RMSE.

  `model` is called as model(user_id, movie_id) once per row of X_test, in row
  order; the collected predictions are compared with X_test['rating'].
  """
  test_users = X_test['user_id']
  test_movies = X_test['movie_id']
  # One prediction per (user_id, movie_id) pair of the test split.
  y_pred = np.array(list(map(model, test_users, test_movies)))
  y_true = np.array(X_test['rating'])
  return RMSE(y_true,y_pred)

# User x movie matrix of the training ratings:
# one row per user_id, one column per movie_id, cell value = rating.
rating_matrix = X_train.pivot(
  index='user_id',
  columns='movie_id',
  values='rating',
)

In [None]:
# Inspect the user x movie matrix pivoted from the training split.
print(rating_matrix)

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,4.0,3.0,,,,1.0,,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,,,...,,,,,,,,,,
941,5.0,,,,,,,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# Baseline model: predict every rating as the movie's overall mean rating
# (the "best_mean_recomm" model).
def best_seller(user_id,movie_id):
  """Predict a rating as the movie's mean rating in the training split.

  Args:
    user_id: Unused; accepted so the function has the model(user_id, movie_id)
      signature that score() calls.
    movie_id: Movie label to look up in the module-level `train_mean` Series.

  Returns:
    The mean training rating of `movie_id`, or 3.0 when the movie has no entry
    in `train_mean` (it appears in the test split only).
  """
  try:
    # Look up by the movie_id argument. The previous code indexed with the
    # string literal 'movie_id', which always failed and fell back to 3.0.
    return train_mean.loc[movie_id]
  except KeyError:
    # No training ratings for this movie: fall back to a neutral 3.0.
    return 3.0

# Mean training rating per movie, indexed by movie_id; read by best_seller.
train_mean = X_train.groupby('movie_id')['rating'].mean()
# RMSE of the baseline model on the test split.
score(best_seller)

1.2443150726403662

In [None]:
# Recommendation by user group
# 1) By gender

# Attach each rater's profile columns (age, sex, occupation, zip_code) to the training ratings.
merged_ratings = pd.merge(X_train,users,on='user_id')

# Index users by user_id so the models below can look a user up with users.loc[user_id].
# Guarded so that re-running this cell does not raise KeyError once 'user_id'
# has already been moved from the columns into the index.
if users.index.name != 'user_id':
  users = users.set_index('user_id')

print(merged_ratings)

       user_id  movie_id  rating  age sex occupation zip_code
0          142       243       1   13   M      other    48118
1          142       514       5   13   M      other    48118
2          142       322       2   13   M      other    48118
3          142       895       4   13   M      other    48118
4          142       294       3   13   M      other    48118
...        ...       ...     ...  ...  ..        ...      ...
74995      649       291       5   20   M    student    39762
74996      649         1       5   20   M    student    39762
74997      649        15       4   20   M    student    39762
74998      649       181       4   20   M    student    39762
74999      649       282       4   20   M    student    39762

[75000 rows x 7 columns]


In [None]:
# Mean rating of each movie, computed separately per sex.
# The result is a Series indexed by (movie_id, sex).
ratings_by_movie_and_sex = merged_ratings.groupby(['movie_id','sex'])
g_mean = ratings_by_movie_and_sex['rating'].mean()
g_mean

movie_id  sex
1         F      3.831461
          M      3.907407
2         F      3.375000
          M      3.132530
3         F      2.454545
                   ...   
1678      M      1.000000
1679      M      3.000000
1680      M      2.000000
1681      M      3.000000
1682      M      3.000000
Name: rating, Length: 3029, dtype: float64

In [None]:
# Gender-based recommendation
def cf_gender(user_id,movie_id):
  """Predict a rating as the movie's mean training rating among users of the same sex.

  Falls back to 3.0 when the movie is not a column of rating_matrix (it is absent
  from the training split) or when no user of that sex rated it in training.
  """
  # Movie unseen in the training split: nothing to average.
  if movie_id not in rating_matrix:
    return 3.0
  # Sex of the target user ('M' / 'F').
  gender = users.loc[user_id]['sex']
  # Per-sex means for this movie; a sex is missing when nobody of that sex rated it.
  means_by_sex = g_mean[movie_id]
  if gender not in means_by_sex:
    return 3.0
  return means_by_sex[gender]

# RMSE of the gender-based model on the test split.
score(cf_gender)

1.0351937057310259

In [None]:
# Mean rating of each movie, computed separately per occupation.
# The result is a Series indexed by (movie_id, occupation).
ratings_by_movie_and_occupation = merged_ratings.groupby(['movie_id','occupation'])
o_mean = ratings_by_movie_and_occupation['rating'].mean()
o_mean

movie_id  occupation   
1         administrator    4.074074
          artist           4.181818
          doctor           3.500000
          educator         3.718750
          engineer         4.076923
                             ...   
1678      student          1.000000
1679      student          3.000000
1680      student          2.000000
1681      writer           3.000000
1682      engineer         3.000000
Name: rating, Length: 16792, dtype: float64

In [None]:
# Occupation-based recommendation
# The score comes out somewhat worse, probably because many (movie, occupation)
# groups contain only a few ratings.

def cf_occupation(user_id,movie_id):
  """Predict a rating as the movie's mean training rating among users with the same occupation.

  Returns 3.0 when the movie is not a column of rating_matrix or when nobody
  with the user's occupation rated it in the training split.
  """
  prediction = 3.0
  if movie_id in rating_matrix:
    job = users.loc[user_id]['occupation']
    means_by_job = o_mean[movie_id]
    if job in means_by_job:
      prediction = means_by_job[job]
  return prediction
# RMSE of the occupation-based model on the test split.
score(cf_occupation)

1.1289261889674969

In [None]:
# Mean rating of each movie per (sex, occupation) combination.
# The result is a Series indexed by (movie_id, sex, occupation).
ratings_by_movie_sex_occupation = merged_ratings.groupby(['movie_id','sex','occupation'])
g_o_mean = ratings_by_movie_sex_occupation['rating'].mean()
# Show the per-group means of movie 1 as an example.
g_o_mean[1]

sex  occupation   
F    administrator    4.142857
     artist           4.750000
     educator         3.000000
     engineer         4.000000
     entertainment    4.000000
     executive        3.000000
     healthcare       2.600000
     homemaker        5.000000
     librarian        3.833333
     marketing        3.000000
     none             4.000000
     other            3.916667
     programmer       4.000000
     salesman         4.000000
     scientist        3.000000
     student          4.000000
     technician       4.000000
     writer           4.400000
M    administrator    4.000000
     artist           3.857143
     doctor           3.500000
     educator         4.000000
     engineer         4.078947
     entertainment    3.250000
     executive        4.083333
     lawyer           4.200000
     librarian        3.500000
     marketing        3.750000
     none             4.333333
     other            4.068966
     programmer       4.142857
     retired        

In [None]:
# Recommendation that considers gender and occupation together
def cf_g_o(user_id,movie_id):
  """Predict a rating as the movie's mean training rating among users who share
  both the target user's sex and occupation.

  Args:
    user_id: Label of the target user in the `users` index.
    movie_id: Movie to predict a rating for.

  Returns:
    g_o_mean[(movie_id, sex, occupation)] when that group exists; otherwise 3.0
    (movie not a column of rating_matrix, or no training rating from that
    sex/occupation group).
  """
  if movie_id not in rating_matrix:
    return 3.0
  user = users.loc[user_id]
  # g_o_mean is indexed by (movie_id, sex, occupation), so membership must be
  # tested with the full key. Testing the occupation alone against
  # g_o_mean[movie_id] matches only the sex level, which is why the previous
  # code always returned 3.0.
  key = (movie_id, user['sex'], user['occupation'])
  if key in g_o_mean.index:
    return g_o_mean.loc[key]
  return 3.0

# RMSE of the combined gender + occupation model on the test split.
score(cf_g_o)

1.2443150726403662