In [70]:
# 모듈 import
import pandas as pd
import numpy as np
from scipy.sparse.linalg import svds

In [2]:
# Load the three inputs: purchase records, the customer table, and the product table.
# NOTE(review): unpickling can execute arbitrary code — only load purchase.pickle
# if it was produced by a trusted step of this project.
purchase_info = pd.read_pickle("./how_to_make/purchase.pickle")
person_info = pd.read_csv(
    "./dataset/LPOINT_BIG_COMP_01_DEMO.csv",
    low_memory=False,
)
goods_info = pd.read_csv(
    "./dataset/LPOINT_BIG_COMP_04_PD_CLAC.csv",
    low_memory=False,
)

In [3]:
# Give every customer row a sequential integer id starting at 1.
# The upper bound is derived from the table itself instead of a hard-coded row
# count, so the cell keeps working when the input file gains or loses rows.
person_info['user_id'] = range(1, len(person_info) + 1)

In [31]:
# Total quantity each user bought of each product; this is the "rating" signal
# used by the rest of the notebook.
purchase = purchase_info[['cust', 'pd_c', 'buy_ct', 'pd_nm', 'clac_mcls_nm']]
# Attach user_id via the customer key and drop the raw `cust` column.
purchase = pd.merge(left=purchase, right=person_info, how='inner', on='cust')[
    ['pd_c', 'buy_ct', 'pd_nm', 'user_id', 'clac_mcls_nm']
]
# Sum only `buy_ct`. A bare `.sum()` also aggregates the text columns
# (pd_nm, clac_mcls_nm): older pandas silently dropped them, while pandas >= 2.0
# concatenates the strings, which changes the result's columns.
rating_data = purchase.groupby(['user_id', 'pd_c'], as_index=False)['buy_ct'].sum()
rating_data.head(5)

Unnamed: 0,user_id,pd_c,buy_ct
0,1,PD0116,2
1,1,PD0169,1
2,1,PD0178,1
3,1,PD0204,3
4,1,PD0218,2


In [32]:
# Product lookup table: product code and product name only.
goods_columns = ['pd_c', 'pd_nm']
goods_data = goods_info[goods_columns]
goods_data.head(5)

Unnamed: 0,pd_c,pd_nm
0,PD0001,소파
1,PD0002,스툴/리빙의자
2,PD0003,탁자
3,PD0004,장식장/진열장
4,PD0005,기타가구


In [67]:
# User x product matrix of purchase counts; user/product pairs with no purchase become 0.
df_user_goods_ratings = (
    rating_data
    .pivot(index='user_id', columns='pd_c', values='buy_ct')
    .fillna(0)
)

In [68]:
# Centre each user's row on that user's mean purchase count.
values = df_user_goods_ratings.values
# One mean per user (row); kept 1-D because later cells reshape it themselves.
user_ratings_mean = values.mean(axis=1)
# Broadcast the per-user mean across the product columns.
values_user_mean = values - user_ratings_mean[:, np.newaxis]

In [72]:
# Truncated SVD of the centred matrix, keeping 12 latent factors.
# NOTE(review): svds appears to use a random starting vector by default, so the
# factors may differ slightly between runs — confirm and pin it if exact
# reproducibility matters.
U, singular_values, Vt = svds(values_user_mean, k=12)
# svds returns the singular values as a 1-D array; expand to a diagonal matrix
# so it can be used in the matrix products that follow.
sigma = np.diag(singular_values)

In [74]:
# Rebuild the low-rank approximation and add each user's mean back in.
low_rank_centered = (U @ sigma) @ Vt
svd_user_predicted_ratings = low_rank_centered + user_ratings_mean.reshape(-1, 1)

In [75]:
# Wrap the predictions in a DataFrame with the product codes as column labels.
# Rows keep the order of df_user_goods_ratings, but the index is a fresh
# 0..n-1 range rather than the user_id labels.
df_svd_preds = pd.DataFrame(
    data=svd_user_predicted_ratings,
    columns=df_user_goods_ratings.columns,
)

In [79]:
def _user_row_position(df_svd_preds, user_id, rating_data):
    """Return the positional row of ``user_id`` inside ``df_svd_preds``."""
    known_user_ids = pd.Index(rating_data['user_id'].unique()).sort_values()
    if len(known_user_ids) == len(df_svd_preds):
        # One prediction row per user present in rating_data: rows follow the
        # sorted user ids (the order a pivot on user_id produces), so look the
        # position up instead of assuming the ids are contiguous from 1.
        position = known_user_ids.get_indexer([user_id])[0]
        if position == -1:
            raise IndexError(f"user_id {user_id!r} has no rows in rating_data")
        return int(position)
    # Row count does not match rating_data: fall back to the original
    # "ids are 1..N" convention, but refuse to wrap around on ids <= 0.
    position = user_id - 1
    if not 0 <= position < len(df_svd_preds):
        raise IndexError(f"user_id {user_id!r} is out of range for df_svd_preds")
    return position


def recommend_movies(df_svd_preds, user_id, goods_data, rating_data, num_recommendations=5):
    """Recommend products the user has not bought yet, ranked by predicted score.

    Despite the name, this works on whatever items ``goods_data`` lists.

    Parameters
    ----------
    df_svd_preds : pandas.DataFrame
        Predicted scores, one row per user and one column per product code.
        Rows are addressed by position, not by index label.
    user_id : int
        Id of the user to recommend for (matched against ``rating_data['user_id']``).
    goods_data : pandas.DataFrame
        Product table with at least a ``pd_c`` column.
    rating_data : pandas.DataFrame
        Observed purchases with ``user_id``, ``pd_c`` and ``buy_ct`` columns.
    num_recommendations : int, default 5
        Maximum number of recommended rows to return.

    Returns
    -------
    (user_history, recommendations) : tuple of pandas.DataFrame
        ``user_history`` is the user's rows of ``rating_data`` joined with
        ``goods_data``, sorted by ``buy_ct`` descending. ``recommendations`` is
        the rows of ``goods_data`` not in that history, with a ``Predictions``
        column, sorted by it descending and cut to ``num_recommendations`` rows.

    Raises
    ------
    IndexError
        If ``user_id`` cannot be mapped to a row of ``df_svd_preds``.
    """
    user_row_number = _user_row_position(df_svd_preds, user_id, rating_data)

    # Name the scores and the product axis explicitly so the merge below does
    # not depend on how the caller labelled the prediction frame.
    sorted_user_predictions = (
        df_svd_preds.iloc[user_row_number]
        .rename('Predictions')
        .rename_axis('pd_c')
        .sort_values(ascending=False)
    )

    # Everything this user actually bought, with product details attached.
    user_data = rating_data[rating_data['user_id'] == user_id]
    user_history = user_data.merge(goods_data, on='pd_c').sort_values(['buy_ct'], ascending=False)

    # Candidates are the products the user has not bought yet.
    recommendations = goods_data[~goods_data['pd_c'].isin(user_history['pd_c'])]
    # Attach the predicted score to each candidate and keep the top rows.
    recommendations = recommendations.merge(sorted_user_predictions.reset_index(), on='pd_c')
    recommendations = recommendations.sort_values('Predictions', ascending=False).iloc[:num_recommendations, :]

    return user_history, recommendations

In [87]:
# Purchase history and top-10 recommendations for user 3.
already_rated, predictions = recommend_movies(
    df_svd_preds,
    user_id=3,
    goods_data=goods_data,
    rating_data=rating_data,
    num_recommendations=10,
)

In [88]:
# Display the recommendation table returned by recommend_movies.
predictions

Unnamed: 0,pd_c,pd_nm,Predictions
1292,PD1333,일반소주,1.837362
1137,PD1173,생수,1.314255
373,PD0381,국산담배,0.722081
220,PD0223,젤리,0.717599
404,PD0413,국물용기라면,0.623972
395,PD0403,국물봉지라면,0.461754
229,PD0232,일반스낵,0.451239
1146,PD1184,혼합탄산,0.401188
1302,PD1343,막걸리,0.399013
320,PD0328,바아이스크림,0.397817
