In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
plt.rcParams['font.family'] = 'NanumSquare'

In [None]:
!pip install lightfm

In [None]:
import lightfm
from lightfm import LightFM, cross_validation
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score

In [4]:
vod_info = pd.read_pickle('/content/drive/MyDrive/DXSCHOOL/Project/vod_info_1116.pkl')
user_id = pd.read_pickle('/content/drive/MyDrive/DXSCHOOL/Project/user_id_1116.pkl')
rating_data = pd.read_pickle('/content/drive/MyDrive/DXSCHOOL/Project/rating_data_1116.pkl')

In [None]:
!pip install LibRecommender

In [56]:
from libreco.algorithms import NCF

## LightFM

In [None]:
# 준비할 데이터프레임
# 1. User Info
# user feature는 아직 따로 없음 - 추후에 취향, 다양성 등에 대한 컬럼을 생성할 수 있음

# 2. Item meta
# vod feature 로는 일단 ct_cl, genre만 사용
vod_meta = vod_info.drop(['program_name', 'running_time'], axis=1)

# 3. User-Item rating
# interation
rating = rating_data[['user_id', 'subsr', 'vod_id', 'click_cnt', 'use_cnt']]
rating['rating'] = rating['click_cnt'] + rating['use_cnt']
rating = rating[['user_id', 'vod_id', 'rating']]

In [None]:
rating

Unnamed: 0,user_id,vod_id,rating
0,810,0,5.0
1,280,4768,2.0
2,122,4768,21.0
3,380,4768,11.0
4,648,4768,6.0
...,...,...,...
10355,661,706,1.0
10356,661,4706,4.0
10357,661,4707,4.0
10358,420,4708,1.0


In [None]:
# 참고 코드
# https://greeksharifa.github.io/machine_learning/2020/06/01/LightFM/
# https://github.com/recommenders-team/recommenders/blob/main/examples/02_model_hybrid/lightfm_deep_dive.ipynb

### Interaction 만 이용 (train/test 내장 메서드로)

#### LightFM 라이브러리 활용을 위한 데이터셋 준비

In [None]:
dataset = Dataset()
dataset.fit(users = rating['user_id'],
            items = rating['vod_id'])

num_users, num_vods = dataset.interactions_shape()
print(num_users) # Unique User = 924 명
print(num_vods)  # Unique VOD = 4786 개

924
4786


In [None]:
# interaction matrix 생성 -> csr matrix
# weights는 평점 value가 있는 matrix고, interactions는 상호작용이 있으면 1 아니면 0으로 이루어진 matrix임
# 둘 중 뭐로 학습해야 하는지?
(interactions, weights) = dataset.build_interactions(rating.values)

In [None]:
# train/test 분할
# cross_validation.random_train_test_split 메서드가 존재함
# test에 있는 user가 무조건 train에 존재하게끔 나눠지는 방법은 아니기에 cold-start 문제가 test set에서 발생 가능함 유의
train_interactions, test_interactions = cross_validation.random_train_test_split(
    interactions, test_percentage=0.2,random_state=0)

In [None]:
print(train_interactions.shape)
print(test_interactions.shape)

(924, 4786)
(924, 4786)


In [None]:
train_weights, test_weights = cross_validation.random_train_test_split(
    weights, test_percentage=0.2, random_state=0
)

In [None]:
print(train_weights.shape)
print(test_weights.shape)

(924, 4786)
(924, 4786)


#### 모델 학습

In [None]:
# loss에 'bpr' Bayesian Personalized Ranking Loss 사용도 가능
lfm1 = LightFM(loss='bpr', no_components=20, learning_rate=0.1, random_state=0)
lfm1.fit(interactions=train_interactions, sample_weight=train_weights, epochs=20, verbose=1)

Epoch: 100%|██████████| 20/20 [00:00<00:00, 92.49it/s]


<lightfm.lightfm.LightFM at 0x7e7199ba23b0>

#### Test data 생성

#### 모델 Predict

In [None]:
# LightFM 모델의 predict 값은 ranking의 목적일 뿐인 점을 유의해야 함


#### 모델 평가

In [None]:
precision = precision_at_k(lfm1, test_interactions, train_interactions, k=10).mean()
recall = recall_at_k(lfm1, test_interactions, train_interactions, k=10).mean()
print('Precision@K: {:.3f}'.format(precision))
print('Recall@K: {:.3f}'.format(recall))

Precision@K: 0.038
Recall@K: 0.157


In [None]:
# sample_weights 추가했을 때
precision = precision_at_k(lfm1, test_interactions, train_interactions, k=10).mean()
recall = recall_at_k(lfm1, test_interactions, train_interactions, k=10).mean()
print('Precision@K: {:.3f}'.format(precision))
print('Recall@K: {:.3f}'.format(recall))

Precision@K: 0.003
Recall@K: 0.012


In [None]:
precision = precision_at_k(lfm1, test_interactions, train_interactions, k=20).mean()
recall = recall_at_k(lfm1, test_interactions, train_interactions, k=20).mean()
print('Precision@K: {:.3f}'.format(precision))
print('Recall@K: {:.3f}'.format(recall))

Precision@K: 0.031
Recall@K: 0.263


### Interaction 만 이용 (train/test 방식 다르게)

#### LightFM 라이브러리 활용을 위한 데이터셋 준비

In [None]:
# train_interactions의 shape과 test_interactions의 shape은 동일해야 함
df_test = rating.copy()
df_train = rating.copy()

In [None]:
df_test = df_test.groupby('user_id').first().reset_index()

In [None]:
def mask_first(x):
    result = np.ones_like(x)
    result[0] = 0
    return result

mask = rating.groupby(['user_id'])['user_id'].transform(mask_first).astype(bool)
df_train = rating.loc[mask]

In [None]:
# train user, item 리스트 생성
uids = df_train['user_id'].tolist()
iids = df_train['vod_id'].tolist()
values = list(df_train.rating)

In [None]:
train_interactions = pd.DataFrame(np.zeros([num_users, num_vods]).astype(int))
test_interactions = pd.DataFrame(np.zeros([num_users, num_vods]).astype(int))

In [None]:
# train_interactions 생성 - 돌아가는데 시간이 너무 오래걸림...
cnt = 0
for i in range(len(uids)):
    train_interactions.iloc[uids[i], iids[i]] = 1

In [None]:
# test user, item 리스트 생성
uids_test = df_test['user_id'].tolist()
iids_test = df_test['vod_id'].tolist()
values_test = list(df_test.rating)

In [None]:
# test_interactions 생성
cnt = 0
for i in range(len(uids_test)):
    test_interactions.iloc[uids_test[i], iids_test[i]] = 1

In [None]:
# csr matrix
from scipy.sparse import csr_matrix
train_interactions = csr_matrix(train_interactions)
test_interactions = csr_matrix(test_interactions)

In [None]:
print(train_interactions.shape)
print(test_interactions.shape)

(924, 4786)
(924, 4786)


#### 모델 학습

In [None]:
# loss에 'bpr' Bayesian Personalized Ranking Loss 사용도 가능
lfm2 = LightFM(loss='bpr', no_components=5, learning_rate=0.1, random_state=0)
lfm2.fit(interactions=train_interactions, epochs=20, verbose=1)

Epoch: 100%|██████████| 20/20 [00:00<00:00, 56.32it/s]


<lightfm.lightfm.LightFM at 0x7e718ece09a0>

#### 모델 Predict

In [None]:
# LightFM 모델의 predict 값은 ranking의 목적일 뿐인 점을 유의해야 함


#### 모델 평가

In [None]:
precision = precision_at_k(lfm2, test_interactions, train_interactions, k=10).mean()
recall = recall_at_k(lfm2, test_interactions, train_interactions, k=10).mean()
print('Precision@K: {:.3f}'.format(precision))
print('Recall@K: {:.3f}'.format(recall))

Precision@K: 0.029
Recall@K: 0.286


In [None]:
precision = precision_at_k(lfm1, test_interactions, train_interactions, k=20).mean()
recall = recall_at_k(lfm1, test_interactions, train_interactions, k=20).mean()
print('Precision@K: {:.3f}'.format(precision))
print('Recall@K: {:.3f}'.format(recall))

Precision@K: 0.001
Recall@K: 0.015


### Interaction 만 이용 (train/test 내장 메서드로 & interaction 대신 weights로 해보기)

#### LightFM 라이브러리 활용을 위한 데이터셋 준비

In [None]:
dataset = Dataset()
dataset.fit(users = rating['user_id'],
            items = rating['vod_id'])

num_users, num_vods = dataset.interactions_shape()
print(num_users) # Unique User = 924 명
print(num_vods)  # Unique VOD = 4786 개

924
4786


In [None]:
# interaction matrix 생성 -> csr matrix
# weights는 평점 value가 있는 matrix고, interactions는 상호작용이 있으면 1 아니면 0으로 이루어진 matrix임
# 둘 중 뭐로 학습해야 하는지?
(interactions, weights) = dataset.build_interactions(rating.values)

In [None]:
# train/test 분할
# cross_validation.random_train_test_split 메서드가 존재함
# test에 있는 user가 무조건 train에 존재하게끔 나눠지는 방법은 아니기에 cold-start 문제가 test set에서 발생 가능함 유의
train_weights, test_weights = cross_validation.random_train_test_split(
    weights, test_percentage=0.2,random_state=0)

In [None]:
print(train_weights.shape)
print(test_weights.shape)

(924, 4786)
(924, 4786)


#### 모델 학습

In [None]:
# loss에 'bpr' Bayesian Personalized Ranking Loss 사용도 가능
lfm1 = LightFM(loss='bpr', no_components=20, learning_rate=0.1, random_state=0)
lfm1.fit(interactions=train_weights, epochs=20, verbose=1)

Epoch: 100%|██████████| 20/20 [00:00<00:00, 25.28it/s]


<lightfm.lightfm.LightFM at 0x7e7198e72050>

#### Test data 생성

#### 모델 Predict

In [None]:
# LightFM 모델의 predict 값은 ranking의 목적일 뿐인 점을 유의해야 함


#### 모델 평가

In [None]:
precision = precision_at_k(lfm1, test_interactions, train_interactions, k=10).mean()
recall = recall_at_k(lfm1, test_interactions, train_interactions, k=10).mean()
print('Precision@K: {:.3f}'.format(precision))
print('Recall@K: {:.3f}'.format(recall))

Precision@K: 0.001
Recall@K: 0.006


In [None]:
precision = precision_at_k(lfm1, test_interactions, train_interactions, k=20).mean()
recall = recall_at_k(lfm1, test_interactions, train_interactions, k=20).mean()
print('Precision@K: {:.3f}'.format(precision))
print('Recall@K: {:.3f}'.format(recall))

Precision@K: 0.001
Recall@K: 0.015


#### 유사한 Users. 유사한 Items

### Interaction 만 이용 (train/test 방식 다르게 & interaction 대신 weights로 해보기)

#### LightFM 라이브러리 활용을 위한 데이터셋 준비

In [None]:
# train_interactions의 shape과 test_interactions의 shape은 동일해야 함
df_test = rating.copy()
df_train = rating.copy()

In [None]:
df_test = df_test.groupby('user_id').first().reset_index()

In [None]:
def mask_first(x):
    result = np.ones_like(x)
    result[0] = 0
    return result

mask = rating.groupby(['user_id'])['user_id'].transform(mask_first).astype(bool)
df_train = rating.loc[mask]

In [None]:
# train user, item 리스트 생성
uids = df_train['user_id'].tolist()
iids = df_train['vod_id'].tolist()
values = list(df_train.rating)

In [None]:
train_weights = pd.DataFrame(np.zeros([num_users, num_vods]).astype(int))
test_weights = pd.DataFrame(np.zeros([num_users, num_vods]).astype(int))

In [None]:
# train_interactions 생성 - 돌아가는데 시간이 너무 오래걸림...
cnt = 0
for i in range(len(uids)):
    train_weights.iloc[uids[i], iids[i]] = values[i]

In [None]:
# test user, item 리스트 생성
uids_test = df_test['user_id'].tolist()
iids_test = df_test['vod_id'].tolist()
values_test = list(df_test.rating)

In [None]:
# test_interactions 생성
cnt = 0
for i in range(len(uids_test)):
    test_weights.iloc[uids_test[i], iids_test[i]] = values_test[i]

In [None]:
# csr matrix
from scipy.sparse import csr_matrix
train_weights = csr_matrix(train_weights)
test_weights = csr_matrix(test_weights)

In [None]:
print(train_weights.shape)
print(test_weights.shape)

(924, 4786)
(924, 4786)


#### 모델 학습

In [None]:
# loss에 'bpr' Bayesian Personalized Ranking Loss 사용도 가능
lfm2 = LightFM(loss='bpr', no_components=5, learning_rate=0.1, random_state=0)
lfm2.fit(interactions=train_weights, epochs=20, verbose=1)

Epoch: 100%|██████████| 20/20 [00:00<00:00, 175.56it/s]


<lightfm.lightfm.LightFM at 0x7e71998659f0>

#### 모델 Predict

In [None]:
# LightFM 모델의 predict 값은 ranking의 목적일 뿐인 점을 유의해야 함


#### 모델 평가

In [None]:
precision = precision_at_k(lfm2, test_weights, train_weights, k=10).mean()
recall = recall_at_k(lfm2, test_weights, train_weights, k=10).mean()
print('Precision@K: {:.3f}'.format(precision))
print('Recall@K: {:.3f}'.format(recall))

Precision@K: 0.029
Recall@K: 0.286


In [None]:
precision = precision_at_k(lfm1, test_weights, train_weights, k=20).mean()
recall = recall_at_k(lfm1, test_weights, train_weights, k=20).mean()
print('Precision@K: {:.3f}'.format(precision))
print('Recall@K: {:.3f}'.format(recall))

Precision@K: 0.001
Recall@K: 0.015


### Interactions + Features (train/test 내장 메서드로)

In [None]:
# rating에 vod 붙이기
vod_genre = rating.merge(vod_meta, how='left', left_on='vod_id', right_on='vod_id')['ct_cl'].tolist()

In [None]:
# vod meta 데이터 준비
all_vod_genre = vod_meta['ct_cl'].unique().tolist()

In [None]:
# user meta 데이터
# 아직은 쓸 게 없음

In [None]:
dataset2 = Dataset()
dataset2.fit(users = rating['user_id'],
            items = rating['vod_id'],
            item_features=all_vod_genre)

num_users, num_vods = dataset.interactions_shape()
print(num_users) # Unique User = 924 명
print(num_vods)  # Unique VOD = 4786 개

924
4786


In [None]:
# item_features_source: [(item1, [feature, feature, ...]), (item2, [feature, feature, ...])] 형태로 만들기
# item_features_source = [(x, [y]) for x,y in zip(vod_meta['vod_id'], vod_meta['ct_cl])] # 이게 맞는지 밑에가 맞는지 모르겠음

item_features = dataset2.build_item_features((x, [y]) for x,y in zip(rating.vod_id, vod_genre))

In [None]:
(interactions, weights) = dataset2.build_interactions(rating.values)

In [None]:
train_interactions, test_interactions = cross_validation.random_train_test_split(
    interactions, test_percentage=0.2,random_state=0)

In [None]:
# 넣어도 되고, 안넣어도 학습되는 듯 - 이것도 선택지
train_weights = train_interactions.multiply(weights).tocoo()

In [None]:
lfm3 = LightFM(loss='bpr', no_components=20, learning_rate=0.1, random_state=0)
lfm3.fit(interactions=train_interactions,
         item_features=item_features,
         epochs=20, verbose=1)

# regularisation for both user and item features
# ITEM_ALPHA = 1e-6
# USER_ALPHA = 1e-6

# sample_weight 추가했더니 오류 발생함

Epoch: 100%|██████████| 20/20 [00:00<00:00, 66.10it/s]


<lightfm.lightfm.LightFM at 0x7e718eaad2a0>

In [None]:
precision = precision_at_k(lfm3, test_interactions, train_interactions, k=10,
                           item_features=item_features).mean()
recall = recall_at_k(lfm3, test_interactions, train_interactions, k=10,
                      item_features=item_features).mean()
print('Precision@K: {:.3f}'.format(precision))
print('Recall@K: {:.3f}'.format(recall))

Precision@K: 0.044
Recall@K: 0.223


In [None]:
precision = precision_at_k(lfm3, test_interactions, train_interactions, k=20,
                           item_features=item_features).mean()
recall = recall_at_k(lfm3, test_interactions, train_interactions, k=20,
                      item_features=item_features).mean()
print('Precision@K: {:.3f}'.format(precision))
print('Recall@K: {:.3f}'.format(recall))

Precision@K: 0.033
Recall@K: 0.294


#### similar vod 정보

In [None]:
_, item_embeddings = lfm3.get_item_representations(features=item_features)
item_embeddings

array([[ 0.11256789, -0.88921595,  0.14866894, ..., -0.531572  ,
         0.5125935 , -0.00314356],
       [ 0.21481277, -1.6982783 ,  0.27543464, ..., -1.0243443 ,
         0.9599098 , -0.01862909],
       [ 0.17749786, -1.5340897 ,  0.25952747, ..., -0.9181824 ,
         0.84524965,  0.01713548],
       ...,
       [ 0.8151699 ,  0.70727396,  0.07004264, ..., -0.05289529,
        -0.0454418 , -0.0424279 ],
       [ 0.912986  ,  0.7317201 ,  0.03545414, ..., -0.12244309,
         0.07769467, -0.24660546],
       [ 0.8106697 ,  0.7073496 ,  0.06729263, ..., -0.03220104,
        -0.04631758, -0.03425556]], dtype=float32)

In [None]:
def make_best_items_report(item_embeddings, vod_id, num_search_items=10):
    item_id = vod_id

    # Cosine similarity
    scores = item_embeddings.dot(item_embeddings[item_id])
    item_norms = np.linalg.norm(item_embeddings, axis=1)
    item_norms[item_norms == 0] = 1e-10
    scores /= item_norms

    # best: score가 제일 높은 item의 id를 num_search_items 개 만큼 가져온다.
    best = np.argpartition(scores, -num_search_items)[-num_search_items:]
    similar_item_id_and_scores = sorted(zip(best, scores[best] / item_norms[item_id]),
                                        key=lambda x: -x[1])

    # Report를 작성할 pandas dataframe
    best_items = pd.DataFrame(columns=['vod_id', 'ct_cl', 'score'])

    for similar_item_id, score in similar_item_id_and_scores:
        vod_id = similar_item_id
        #title = vod_meta[vod_meta['vod_id'] == vod_id][['']]
        ct_cl = vod_meta[vod_meta['vod_id']==vod_id][0][2]

        row = pd.Series([vod_id, ct_cl, score], index=best_items.columns)
        best_items = best_items.append(row, ignore_index=True)
    return best_items

In [None]:
# 오류 고쳐야 함
make_best_items_report(item_embeddings, 0, 10)

KeyError: ignored

## Neural CF

### 참고 코드로 레이어 연결 구조 이해하기

In [None]:
class NCF(torch.nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.num_users = config["num_users"] # Input Layer shape 구성을 위해
        self.num_items = config["num_items"] # Input Layer shape 구성을 위해
        self.latent_dim_mf = config["latent_dim_mf"]  # GMF 의 임베딩 차원
        self.latent_dim_mlp = config["latent_dim_mlp"]  # MLP 의 임베딩 차원

        # Input - 서로 다른 MLP와 GMF Embedding Layer 사용
        self.embedding_user_mlp = torch.nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.latent_dim_mlp)
        self.embedding_item_mlp = torch.nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.latent_dim_mlp)
        self.embedding_user_mf = torch.nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.latent_dim_mf)
        self.embedding_item_mf = torch.nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.latent_dim_mf)

        # Layer - FC Layer 구성
        self.fc_layers = torch.nn.ModuleList()
        for idx, (in_size, out_size) in enumerate(zip(config["layers"][:-1], config["layers"][1:])):
            self.fc_layers.append(torch.nn.Linear(in_size, out_size))

        # Output
        self.last_layer = torch.nn.Linear(in_features=(config["layers"][-1] + self.latent_dim_mf), out_features=1)
        self.output_layer = torch.nn.Sigmoid() # Sigmoid

    def forward(self, user_indices, item_indices):
        # Input Embedding Layer 준비
        user_embedding_mlp = self.embedding_user_mlp(user_indices)
        item_embedding_mlp = self.embedding_item_mlp(item_indices)
        user_embedding_mf = self.embedding_user_mf(user_indices)
        item_embedding_mf = self.embedding_item_mf(item_indices)

        # GMF Layer에서 앞서 Embedding한 MF Latent Vector를 내적
        gmf_layer = torch.mul(user_embedding_mf, item_embedding_mf)

        # MLP - User, Item Input 벡터 Concatenate
        mlp_concat_layer = torch.cat([user_embedding_mlp, item_embedding_mlp], dim=-1)
        # MLP - FC Layer 통과
        for idx in range(len(self.fc_layers)):
            mlp_concat_layer = self.fc_layers[idx](mlp_concat_layer)
            mlp_concat_layer = torch.nn.ReLU()(mlp_concat_layer)

        # Concatenate - GMF Layer 출력값과 FC Layer 마지막 출력값 Concatenate 해서 최종 Output 값으로 연결
        neu_mf_layer = torch.cat([gmf_layer, mlp_concat_layer], dim=-1)

        # Output
        return self.output_layer(self.last_layer(neu_mf_layer)).view(-1)

### Keras 코드 짜기



In [68]:
# 위 코드를 거쳐서 단순히 구하려면 userID, ItemID, Rating(0 또는 1) 컬럼이 포함된 DataSet 이 준비되어 있어야 함
vod_info = pd.read_pickle('/content/drive/MyDrive/DXSCHOOL/Project/vod_info_1116.pkl')
user_id = pd.read_pickle('/content/drive/MyDrive/DXSCHOOL/Project/user_id_1116.pkl')
rating_data = pd.read_pickle('/content/drive/MyDrive/DXSCHOOL/Project/rating_data_1116.pkl')

In [72]:
df

Unnamed: 0,user_id,vod_id,rating
0,810,0,1
1,280,4768,1
2,122,4768,1
3,380,4768,1
4,648,4768,1
...,...,...,...
10355,661,706,0
10356,661,4706,0
10357,661,4707,0
10358,420,4708,0


In [3]:
# train / test 데이터 준비
# rating_data['rating'] = rating_data['sum_use_tms'] + rating_data['click_cnt']
rating_data['rating'] = rating_data['sum_use_tms']
rating_data['rating'] = rating_data['rating'].apply(lambda x: 1 if x > 5 else 0)
df = rating_data[['user_id', 'vod_id', 'rating']]

In [4]:
# 1. 무작위 분할
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=0)

In [5]:
train_users = train_df['user_id'].values
train_items = train_df['vod_id'].values
train_ratings = train_df['rating'].values

test_users = test_df['user_id'].values
test_items = test_df['vod_id'].values
test_ratings = test_df['rating'].values

In [6]:
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *

In [7]:
def NCF(users_num, items_num, latent_dim_gmf, latent_dim_mlp):
    # User Embedding
    user = Input(shape=(1,), dtype='int32', name='user_input')
    item = Input(shape=(1,), dtype='int32', name='item_input')

    # GMF 쌓기
    user_embedding_gmf = Embedding(users_num, latent_dim_gmf, input_length=user.shape[1])(user)
    item_embedding_gmf = Embedding(items_num, latent_dim_gmf, input_length=item.shape[1])(item)

    user_latent_gmf = Flatten()(user_embedding_gmf)
    item_latent_gmf = Flatten()(item_embedding_gmf)

    # GMF Layer - Embedding한 MF Latent Vector를 내적
    gmf_layer = dot([user_latent_gmf, item_latent_gmf], axes=1)
    # GMF Predict
    gmf_prediction = Dense(1, activation='sigmoid', kernel_initializer='lecun_uniform')(gmf_layer)

    # MLP 쌓기
    user_embedding_mlp = Embedding(users_num, latent_dim_mlp, input_length=user.shape[1])(user)
    item_embedding_mlp = Embedding(items_num, latent_dim_mlp, input_length=item.shape[1])(item)

    user_latent_mlp = Flatten()(user_embedding_mlp)
    item_latent_mlp = Flatten()(item_embedding_mlp)

    # Concatenated
    concat_embedding = Concatenate()([user_latent_mlp, item_latent_mlp])

    # FC Layer - MLP
    layer_1 = Dense(units=64, activation='relu', name='layer1')(concat_embedding)
    layer_2 = Dense(units=32, activation='relu', name='layer2')(layer_1)
    layer_3 = Dense(units=16, activation='relu', name='layer3')(layer_2)
    layer_4 = Dense(units=8, activation='relu', name='layer4')(layer_3)

    # MLP Predict
    mlp_prediction = Dense(1, activation='sigmoid', kernel_initializer='lecun_uniform')(layer_4)

    # GMF + MLP
    predict_vector = Concatenate()([gmf_prediction, mlp_prediction])

    # output layer
    output_layer = Dense(1, activation='sigmoid', kernel_initializer='lecun_uniform')(predict_vector)

    # Model
    model = Model([user, item], output_layer)
    model.compile(optimizer='adam', loss='binary_crossentropy')

    return model

In [8]:
users_num = len(user_id)
items_num = len(vod_info)

In [9]:
ncf = NCF(users_num, items_num, 20, 20)
ncf.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 user_input (InputLayer)     [(None, 1)]                  0         []                            
                                                                                                  
 item_input (InputLayer)     [(None, 1)]                  0         []                            
                                                                                                  
 embedding_2 (Embedding)     (None, 1, 20)                18480     ['user_input[0][0]']          
                                                                                                  
 embedding_3 (Embedding)     (None, 1, 20)                95720     ['item_input[0][0]']          
                                                                                              

In [10]:
ncf.fit([train_users, train_items], train_ratings, epochs=5,
        validation_data=([test_users, test_items], test_ratings),
        verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7b6f21399780>

In [11]:
# Precision@k, Recall@k 구하기


# 1. Predictions
predictions = ncf.predict([test_users, test_items]).flatten()
result = pd.DataFrame([test_users, test_items, predictions]).T
result.columns = ['user_id', 'vod_id', 'prediction']
result



Unnamed: 0,user_id,vod_id,prediction
0,810.0,3570.0,0.345313
1,788.0,4747.0,0.140314
2,661.0,1270.0,0.136676
3,369.0,32.0,0.133515
4,771.0,4747.0,0.126686
...,...,...,...
2067,101.0,552.0,0.320213
2068,460.0,985.0,0.146444
2069,661.0,3835.0,0.154166
2070,661.0,2275.0,0.136185


In [None]:
# 2. 모든 user, 모든 Item Pair에 대해 Predictions 값 뽑기
# 일단 실제 본 것도 포함해서 Pred 데이터프레임 생성
users = user_id['user_id'].tolist()
users_test_list = np.array([],dtype='int32')
items_test_list = np.array([], dtype='int32')
for user_id in users:
  users_test_list = np.concatenate([users_test_list, np.full(items_num, user_id, dtype='int32')])
  items_test_list = np.concatenate([items_test_list, np.array(vod_info['vod_id'].tolist())])

In [33]:
preds = ncf.predict([users_test_list, items_test_list])



In [38]:
all_predictions = pd.DataFrame(data={'user_id':users_test_list.tolist(), 'vod_id':items_test_list.tolist(), 'predictions':preds.flatten()})

In [40]:
all_predictions.to_pickle('/content/drive/MyDrive/DXSCHOOL/Project/NCF_all_pred.pkl')

In [41]:
# 3. Precision@K 계산 (Average Precision K)
# 추천한 K개 중 사용자가 관심 있는 아이템 수 / 모델이 추천해 준 아이템 K개
all_predictions

Unnamed: 0,user_id,vod_id,predictions
0,0,0,0.340714
1,0,1,0.138318
2,0,2,0.338451
3,0,3,0.149709
4,0,4,0.343482
...,...,...,...
4422259,923,4781,0.343895
4422260,923,4782,0.189853
4422261,923,4783,0.337088
4422262,923,4784,0.331361


In [55]:
pred_k = all_predictions[all_predictions['user_id']==0].sort_values('predictions',ascending=False)[:10]
pred_k = pred_k.merge(df, how='left', on=['user_id','vod_id']).fillna(0)

In [56]:
pred_k

Unnamed: 0,user_id,vod_id,predictions,rating
0,0,42,0.38428,0.0
1,0,341,0.381675,0.0
2,0,39,0.378726,0.0
3,0,398,0.377288,0.0
4,0,169,0.376444,1.0
5,0,76,0.372973,0.0
6,0,95,0.372481,0.0
7,0,79,0.371335,0.0
8,0,43,0.370773,0.0
9,0,37,0.369969,0.0


In [64]:
precisions = []
recalls = []
for user in users:
  pred_k = all_predictions[all_predictions['user_id']==user].sort_values('predictions', ascending=False)[:10]
  pred_k = pred_k.merge(df, how='left', on=['user_id','vod_id']).fillna(0)
  precision = pred_k['rating'].sum() / 10
  recall = pred_k['rating'].sum() / df[df['user_id']==user]['rating'].sum()
  precisions.append(precision)
  recalls.append(recall)

In [73]:
# precision@K
np.mean(precisions)
#np.mean(recall) # 0이 나오는데, 실제 본 게 있는 사람 대상으로만 계산하도록 해야 함 - 수정 필요

0.10032467532467533

In [63]:
# recall@K 계산 - 추천한 K개 중 사용자가 관심 있는 아이템 수 / 사용자가 관심 있는 모든 아이템 수
# 실제 사용자가 관심 있는 아이템 수가 적은 문제가 있음
df[df['user_id']==0]

Unnamed: 0,user_id,vod_id,rating
358,0,167,1
427,0,169,1
3418,0,4754,0
3457,0,152,0
3479,0,1549,0
3638,0,1608,0
3749,0,139,0


In [None]:
# 궁금한 것
# Precision@k 측정할때 시간이 이렇게 오래걸려도 되는 것인강..ㅇ.