# CDR Model - Ablation Study_4
* 본 모델의 Parameter 변경
* 1) Dropout : 0.1, 0.2, 0.3, 0.4  --> Default : 0.2
* 2) MLP : 128, 256, 512, 1024  --> Default : 256

In [1]:
import pandas as pd
import numpy as np
import os, random, re
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error, mean_absolute_error
from gensim.models.doc2vec import Doc2Vec
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Embedding, Input, Flatten, Multiply, Dense, Activation, Concatenate, Dropout, BatchNormalization, Conv1D, GlobalAveragePooling1D, MaxPooling1D
pd.options.display.max_colwidth=20
pd.options.display.max_columns=999

In [2]:
# Load the per-domain Yelp review tables (restaurant, cafe, bar).
# The 'prepro' suffix suggests they were preprocessed elsewhere -- not verifiable from this file.
rest = pd.read_csv('/content/drive/MyDrive/seminar/cdr_v2/yelp_rest_prepro.csv')
cafe = pd.read_csv('/content/drive/MyDrive/seminar/cdr_v2/yelp_cafe_prepro.csv')
bar = pd.read_csv('/content/drive/MyDrive/seminar/cdr_v2/yelp_bar_prepro.csv')

# Load one saved Doc2Vec model per domain.
# NOTE(review): the '_32' suffix presumably is the vector size (cdr_configs uses vector_length=32) -- confirm.
rest_doc_model_32 = Doc2Vec.load('/content/drive/MyDrive/seminar/cdr_v2/rest_doc_model_32')
cafe_doc_model_32 = Doc2Vec.load('/content/drive/MyDrive/seminar/cdr_v2/cafe_doc_model_32')
bar_doc_model_32 = Doc2Vec.load('/content/drive/MyDrive/seminar/cdr_v2/bar_doc_model_32')

# Sanity check: (rows, columns) of each table.
print(rest.shape)
print(cafe.shape)
print(bar.shape)

(1070842, 24)
(28117, 26)
(17735, 42)


In [3]:
def seed_everything(seed: int=42):
  '''
  Seed every random number generator used in this notebook.

  Seeds Python's `random` module, NumPy's global generator and TensorFlow's
  global generator, and exports PYTHONHASHSEED.

  inputs:
        1) seed : integer seed applied to all generators
  '''
  random.seed(seed)
  np.random.seed(seed)
  # The key used to be misspelled ('PYTHONASHSEED'), so the variable Python
  # actually reads was never set. Hash randomization is decided at interpreter
  # start-up, so this affects processes started afterwards, not the running one.
  os.environ['PYTHONHASHSEED'] = str(seed)
  tf.random.set_seed(seed)

seed_everything(42)

In [4]:
def friends(x):
  '''
  num_friends - number of entries in a comma-separated friends string.

  One more than the number of commas, so an empty string (or any string
  without a comma) counts as 1.
  '''
  return x.count(',') + 1


# Per target domain: (columns to keep, columns to label-encode).
# business_id and user_id are deliberately not encoded; they are still needed
# as keys for the knowledge transfer step.
_SIDE_INFO_COLUMNS = {
  'bar': (
    ['business_id', 'user_id', 'stars_y', 'categories', 'Alcohol', 'BestNights',
     'BestNights_Mon', 'BestNights_Tue', 'BestNights_Fri', 'BestNights_Wed',
     'BestNights_Thu', 'BestNights_Sun', 'BestNights_Sat', 'Music', 'video', 'dj',
     'background_music', 'jukebox', 'no_music', 'live', 'karaoke', 'text',
     'num_friends', 'fans', 'user_votes'],
    ['num_friends', 'fans', 'user_votes', 'categories', 'Alcohol', 'BestNights', 'Music'],
  ),
  'cafe': (
    ['business_id', 'user_id', 'stars_y', 'categories', 'OutdoorSeating', 'DriveThru',
     'text', 'num_friends', 'fans', 'user_votes'],
    ['num_friends', 'fans', 'user_votes', 'categories', 'OutdoorSeating', 'DriveThru'],
  ),
}


def add_side_info(df=None, domain='None'):
  '''
  Rating + Review + User Profile + Context

  Select the columns used for the given target domain and replace the
  categorical side-information columns by their integer category codes.

  inputs:
        1) df : review dataframe of the domain (left unmodified)
        2) domain : 'bar' or 'cafe'

  outputs:
        new dataframe holding the selected columns, with the encoded columns
        converted to category codes

  raises:
        ValueError if domain is neither 'bar' nor 'cafe'
        (previously this surfaced as an UnboundLocalError on return)
  '''
  try:
    keep_cols, encode_cols = _SIDE_INFO_COLUMNS[domain]
  except (KeyError, TypeError):
    raise ValueError(f"Unknown domain {domain!r}; expected 'bar' or 'cafe'.") from None

  # Explicit copy: assigning into a bare column selection is what triggered
  # pandas' SettingWithCopyWarning.
  new_df = df[keep_cols].copy()

  for col in encode_cols:
    new_df[col] = new_df[col].astype('category').cat.codes.values

  return new_df

def normalize(x):
  '''
  Min-max scale a rating for the sigmoid-output models (MLP / GMF / NCF).

  Maps the rating range [1, 5] linearly onto [0, 1].
  '''
  rating_min, rating_max = 1, 5
  shifted = x - rating_min
  return shifted / (rating_max - rating_min)

def de_normalize(x):
  '''
  Map a value from the scaled range [0, 1] back to the rating range [1, 5].
  '''
  scaled_min, scaled_max = 0, 1
  rating_min, rating_max = 1, 5
  fraction = (x - scaled_min) / (scaled_max - scaled_min)
  return (rating_max - rating_min) * fraction + rating_min

def rmse(y_true, y_pred):
  '''
  Root mean squared error between y_true (cast to float32) and y_pred,
  reduced over all elements to a single tensor value.
  '''
  residual = tf.cast(y_true, tf.float32) - y_pred
  mean_sq = tf.reduce_mean(tf.square(residual))
  return tf.sqrt(mean_sq)

# Number of friends per row, derived from the comma-separated 'friends' string.
rest['num_friends'] = rest['friends'].apply(friends)
bar['num_friends'] = bar['friends'].apply(friends)
cafe['num_friends'] = cafe['friends'].apply(friends)

# The restaurant table keeps only the columns shared by every domain;
# bar and cafe additionally keep their domain-specific side information.
rest = rest[['business_id', 'user_id', 'stars_y', 'text', 'num_friends', 'fans', 'user_votes']]
bar = add_side_info(bar, 'bar')
cafe = add_side_info(cafe, 'cafe')

# Scaled rating used as the training target. normalize() is plain arithmetic, so it is
# applied to the whole column at once instead of once per row. assign() returns a new
# frame, which avoids writing into a column selection (SettingWithCopyWarning).
rest = rest.assign(stars_y_scaled=normalize(rest['stars_y']))
bar = bar.assign(stars_y_scaled=normalize(bar['stars_y']))
cafe = cafe.assign(stars_y_scaled=normalize(cafe['stars_y']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df[col] = new_df[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df[col] = new_df[col].cat.codes.values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df[col] = new_df[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try u

In [5]:
def get_embeddings(model, df=None):
  '''
  Attach the trained Doc2Vec vectors to every row of df.

  Adds two columns to df in place: 'user_embeddings' (vector looked up by
  user_id) and 'business_embeddings' (vector looked up by business_id).

  inputs:
        1) model : trained Doc2Vec model whose document tags include the
                   user_id and business_id values of df
        2) df : dataframe with 'user_id' and 'business_id' columns

  outputs:
        the same df object, with the two embedding columns added
  '''
  # gensim >= 4 exposes the document vectors as `dv`; `docvecs` is the older,
  # deprecated name. Prefer the new one and fall back for older gensim.
  doc_vectors = model.dv if hasattr(model, 'dv') else model.docvecs

  for id_col, emb_col in (('user_id', 'user_embeddings'),
                          ('business_id', 'business_embeddings')):
    # One lookup per distinct id, then broadcast to all rows of that id.
    lookup = {key: doc_vectors[key] for key in df[id_col].unique()}
    df[emb_col] = df[id_col].apply(lambda key, table=lookup: table[key])

  return df

# Attach Doc2Vec user/business embeddings to each domain table.
# get_embeddings adds the columns to the frame it is given and returns that same
# frame, so e.g. rest_doc_32 and rest refer to the same object afterwards.
rest_doc_32 = get_embeddings(rest_doc_model_32, rest)
cafe_doc_32 = get_embeddings(cafe_doc_model_32, cafe)
bar_doc_32 = get_embeddings(bar_doc_model_32, bar)

  user_embedding = model.docvecs[user_id]
  bus_embedding = model.docvecs[bus_id]
  user_embedding = model.docvecs[user_id]
  bus_embedding = model.docvecs[bus_id]
  user_embedding = model.docvecs[user_id]
  bus_embedding = model.docvecs[bus_id]


In [6]:
# user_id -> user embedding, one dict per domain.
# zip() walks every row; rows of the same user simply overwrite the key with
# the vector get_embeddings assigned to that user.
rest_user_emb_dict = dict(zip(rest_doc_32['user_id'], rest_doc_32['user_embeddings']))
cafe_user_emb_dict = dict(zip(cafe_doc_32['user_id'], cafe_doc_32['user_embeddings']))
bar_user_emb_dict = dict(zip(bar_doc_32['user_id'], bar_doc_32['user_embeddings']))

def user_emb_mean_single(target_df, source_dict=None, target_dict=None, method=None):
  '''
  Embedding aggregation for common users (single cross-domain).

  A user that also appears in the source domain gets the element-wise
  aggregation of the source and target embeddings (as a list); every other
  user keeps the embedding stored in target_dict.

  inputs:
        1) target_df : target-domain dataframe; its 'user_embeddings' column
                       is overwritten in place
        2) source_dict : user_id -> user embedding of the source domain
        3) target_dict : user_id -> user embedding of the target domain;
                         must contain every user_id of target_df
        4) method : aggregation method, one of 'SUM', 'MAX', 'MEAN'

  outputs:
        the same target_df object with the updated 'user_embeddings' column

  raises:
        RuntimeError if method is not one of the supported names
  '''
  combiners = {
    'SUM': lambda x, y: x + y,
    'MAX': max,
    'MEAN': lambda x, y: (x + y) / 2,
  }
  if not isinstance(method, str) or method not in combiners:
    raise RuntimeError('Select another method.')
  combine = combiners[method]

  def aggregate(user_id):
    target_emb = target_dict[user_id]
    if user_id not in source_dict:
      return target_emb
    return [combine(x, y) for x, y in zip(source_dict[user_id], target_emb)]

  # The result depends only on user_id, so it is computed once per distinct
  # user instead of once per row; rows of the same user share that object.
  per_user = {user_id: aggregate(user_id) for user_id in target_df['user_id'].unique()}
  # Whole-column assignment (instead of row-wise .at) also works when the
  # index of target_df is not unique.
  target_df['user_embeddings'] = target_df['user_id'].apply(lambda user_id: per_user[user_id])

  return target_df
  
# Transfer restaurant (source) user embeddings into the bar and cafe (target) tables
# using element-wise MAX for users present in both domains.
# user_emb_mean_single returns the frame it was given, so rest_to_bar is bar_doc_32
# and rest_to_cafe is cafe_doc_32.
rest_to_bar = user_emb_mean_single(bar_doc_32, rest_user_emb_dict, bar_user_emb_dict, method='MAX')
rest_to_cafe = user_emb_mean_single(cafe_doc_32, rest_user_emb_dict, cafe_user_emb_dict, method='MAX')

In [7]:
# dataset making

def embeddings_making(df=None):
  '''
  Collect the per-row embeddings of df into two arrays.

  inputs :
          df with 'user_embeddings' and 'business_embeddings' columns
          (after the cross-domain transfer)

  outputs :
          (user_embeddings, business_embeddings) as np.array, rows in the
          same order as df
  '''
  # tolist() walks the rows positionally. The former df[col][idx] lookup with
  # idx in range(len(df)) was label-based and failed (or misordered rows) for
  # any frame whose index is not 0..n-1, e.g. after filtering or splitting.
  user_embeddings = np.array(df['user_embeddings'].tolist())
  business_embeddings = np.array(df['business_embeddings'].tolist())

  return user_embeddings, business_embeddings

def data_split(df=None, user_embeddings=None, business_embeddings=None, test_size=None):
  '''
  Split the dataframe and the np.array user/item embeddings into a train
  part and a test part with one shared, seeded shuffle (random_state=42).

  Calling it twice (test_size=0.2, then test_size=0.25 on the train part)
  gives train : valid : test = 0.6 : 0.2 : 0.2.

  inputs:
        1) df : dataframe holding the ratings
        2) user_embeddings : array with one row per row of df
        3) business_embeddings : array with one row per row of df
        4) test_size : share of rows placed in the test part

  outputs:
        train, test, train_user, test_user, train_bus, test_bus
  '''
  # A single call splits all three inputs with the same indices and rejects
  # inputs of different length; three separate calls were only aligned
  # because they happened to share the seed and the number of rows.
  (train, test,
   train_user, test_user,
   train_bus, test_bus) = train_test_split(df, user_embeddings, business_embeddings,
                                           test_size=test_size, random_state=42)

  return train, test, train_user, test_user, train_bus, test_bus

# Restaurant -> Bar: stack embeddings, hold out 20% for test, then 25% of the
# remainder for validation (0.6 / 0.2 / 0.2 overall).
rb_user_emb, rb_bus_emb = embeddings_making(rest_to_bar)  
rb_train, rb_test, rb_train_user, rb_test_user, rb_train_bus, rb_test_bus = data_split(rest_to_bar, rb_user_emb, rb_bus_emb, 0.2)  
rb_train, rb_valid, rb_train_user, rb_valid_user, rb_train_bus, rb_valid_bus = data_split(rb_train, rb_train_user, rb_train_bus, 0.25)

# Restaurant -> Cafe: same procedure.
rc_user_emb, rc_bus_emb = embeddings_making(rest_to_cafe)
rc_train, rc_test, rc_train_user, rc_test_user, rc_train_bus, rc_test_bus = data_split(rest_to_cafe, rc_user_emb, rc_bus_emb, 0.2) 
rc_train, rc_valid, rc_train_user, rc_valid_user, rc_train_bus, rc_valid_bus = data_split(rc_train, rc_train_user, rc_train_bus, 0.25)

In [274]:
# Proposed Model
# Hyper-parameters read by CDR.__init__: input vector size, width of the user/item
# projection layers, dropout rate, widths of the four MLP layers and the output size.
# NOTE(review): 'dense_1' is 2056 while the other widths are powers of two;
# if 2048 was intended this is a typo -- confirm before changing, it alters the model.
cdr_configs = {'vector_length' : 32, 'user_item_embed' : 32, 'drop_rate' : 0.2, 
           'dense_1' : 2056, 'dense_2' : 1024, 'dense_3' : 256, 'dense_4' : 64, 'output' : 1} 

class CDR(tf.keras.Model):
  '''
  Builder for the proposed cross-domain rating model.

  __init__ wires a functional Keras model that takes a user vector and a
  business vector, projects each through a Dense layer, concatenates them and
  feeds the result through a four-layer MLP with a sigmoid output.
  The built model is stored on self.model and returned by get_model().

  Expected keyword arguments (see cdr_configs):
        vector_length, user_item_embed, drop_rate,
        dense_1, dense_2, dense_3, dense_4, output
  '''

  def __init__(self, **cdr_configs):
    super(CDR, self).__init__(name='CDR')

    # Both inputs take a 1-tuple shape. business_input used to pass a bare int
    # (missing trailing comma), which was inconsistent with user_input.
    user_input = Input(shape=(cdr_configs['vector_length'], ), name='user_input')
    business_input = Input(shape=(cdr_configs['vector_length'], ), name='business_input')

    # Separate projection of the user and the business vector.
    user_emb = Dense(cdr_configs['user_item_embed'], activation='relu', name='user_emb')(user_input)
    business_emb = Dense(cdr_configs['user_item_embed'], activation='relu', name='business_emb')(business_input)
    user_emb = Flatten(name='user_flat')(user_emb)
    business_emb = Flatten(name='business_flat')(business_emb)
    concat = Concatenate(name='concat')([user_emb, business_emb])
    drop_1 = Dropout(rate=cdr_configs['drop_rate'], name='drop_1')(concat)

    # MLP tower: the first two layers are each followed by dropout and batch normalization.
    mlp_1 = Dense(cdr_configs['dense_1'], activation='relu', name='mlp_1')(drop_1)
    drop_2 = Dropout(rate=cdr_configs['drop_rate'], name='drop_2')(mlp_1)
    bn_1 = BatchNormalization(name='bn_1')(drop_2)

    mlp_2 = Dense(cdr_configs['dense_2'], activation='relu', name='mlp_2')(bn_1)
    drop_3 = Dropout(rate=cdr_configs['drop_rate'], name='drop_3')(mlp_2)
    bn_2 = BatchNormalization(name='bn_2')(drop_3)

    mlp_3 = Dense(cdr_configs['dense_3'], activation='relu', name='mlp_3')(bn_2)
    mlp_4 = Dense(cdr_configs['dense_4'], activation='relu', name='mlp_4')(mlp_3)
    # Sigmoid output in [0, 1]; the fitting cells train it on 'stars_y_scaled'.
    output = Dense(cdr_configs['output'], activation='sigmoid', name='output')(mlp_4)

    self.model = Model([user_input, business_input], output, name='CDR')

  def get_model(self):
    '''Return the functional Keras model built in __init__.'''
    model = self.model
    return model

# Training callbacks, both watching the validation loss:
# es stops after 5 epochs without an improvement of at least 0.001 and restores the best weights;
# rp reduces the learning rate after 2 such epochs.
# The same two callback objects are passed to every fit() call in this file.
es = EarlyStopping(monitor='val_loss', mode = 'min', patience=5, min_delta=0.001, restore_best_weights = True)
rp = ReduceLROnPlateau(monitor='val_loss', mode='min', patience=2, min_delta=0.001)

In [275]:
# Restaurant -> Bar Fitting
# Build a fresh model, then train it on the scaled ratings ('stars_y_scaled')
# with binary cross-entropy against the sigmoid output.
rest_to_bar_cdr = CDR(**cdr_configs).get_model()
print(rest_to_bar_cdr.summary())
rest_to_bar_cdr.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy')
# Up to 100 epochs; the es / rp callbacks stop early or lower the learning rate on val_loss.
result = rest_to_bar_cdr.fit(x=[rb_train_user, rb_train_bus],
                        y=rb_train['stars_y_scaled'], validation_data=([rb_valid_user, rb_valid_bus], rb_valid['stars_y_scaled']), epochs=100, batch_size=512, callbacks=[es, rp])

Model: "CDR"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_input (InputLayer)        [(None, 32)]         0           []                               
                                                                                                  
 business_input (InputLayer)    [(None, 32)]         0           []                               
                                                                                                  
 user_emb (Dense)               (None, 32)           1056        ['user_input[0][0]']             
                                                                                                  
 business_emb (Dense)           (None, 32)           1056        ['business_input[0][0]']         
                                                                                                

In [276]:
# Restaurant -> Bar Prediction
pred_mlp = rest_to_bar_cdr.predict([rb_test_user, rb_test_bus])
# Stretch the predictions so that their own min / max land on 1 / 5.
# NOTE(review): this is not the inverse of normalize() -- de_normalize() (4 * x + 1) is.
# Each rescaled prediction depends on the other test predictions; switching would
# change the reported scores, so it is left as is -- confirm which one is intended.
pred_mlp = 4 *((pred_mlp - pred_mlp.min())/(pred_mlp.max() - pred_mlp.min())) + 1  # min-max scaler
# Take the square root explicitly: the `squared=False` argument is deprecated
# in newer scikit-learn releases and later removed.
rmse_temp = np.sqrt(mean_squared_error(rb_test['stars_y'], pred_mlp))
mae_temp = mean_absolute_error(rb_test['stars_y'], pred_mlp)
print(f'rmse : {rmse_temp}')
print(f'mae : {mae_temp}')

rmse : 0.9014893337610965
mae : 0.6988828795575276


In [277]:
# Scores collected by hand from earlier executions of the prediction cell
# (the last entry is the run printed directly above), averaged for reporting.
# Named *_scores so the lists do not overwrite the `rmse` metric function
# defined earlier in this file.
rmse_scores = [0.9235511613883062, 0.9212706912722717, 1.0168208402505992, 0.9978447013169012, 0.9014893337610965]
mae_scores = [0.729755012021792, 0.7243064429546902, 0.820273285588312, 0.7980321854445375, 0.6988828795575276]
print(np.mean(rmse_scores))
print(np.mean(mae_scores))

0.952195345597835
0.7542499611133718


In [260]:
# Restaurant -> Cafe Fitting
# (a stray "01" in front of this comment made the cell a SyntaxError)
# Build a fresh model, then train it on the scaled ratings ('stars_y_scaled')
# with binary cross-entropy against the sigmoid output.
rest_to_cafe_cdr = CDR(**cdr_configs).get_model()
print(rest_to_cafe_cdr.summary())
rest_to_cafe_cdr.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy')
# Up to 100 epochs; the es / rp callbacks stop early or lower the learning rate on val_loss.
result = rest_to_cafe_cdr.fit(x=[rc_train_user, rc_train_bus],
                        y=rc_train['stars_y_scaled'], validation_data=([rc_valid_user, rc_valid_bus], rc_valid['stars_y_scaled']), epochs=100, batch_size=512, callbacks=[es, rp])

Model: "CDR"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_input (InputLayer)        [(None, 32)]         0           []                               
                                                                                                  
 business_input (InputLayer)    [(None, 32)]         0           []                               
                                                                                                  
 user_emb (Dense)               (None, 32)           1056        ['user_input[0][0]']             
                                                                                                  
 business_emb (Dense)           (None, 32)           1056        ['business_input[0][0]']         
                                                                                                

In [261]:
# Restaurant -> Cafe Prediction
pred_mlp = rest_to_cafe_cdr.predict([rc_test_user, rc_test_bus])
# Stretch the predictions so that their own min / max land on 1 / 5.
# NOTE(review): this is not the inverse of normalize() -- de_normalize() (4 * x + 1) is.
# Each rescaled prediction depends on the other test predictions; switching would
# change the reported scores, so it is left as is -- confirm which one is intended.
pred_mlp = 4 *((pred_mlp - pred_mlp.min())/(pred_mlp.max() - pred_mlp.min())) + 1  # min-max scaler
# Take the square root explicitly: the `squared=False` argument is deprecated
# in newer scikit-learn releases and later removed.
rmse_temp = np.sqrt(mean_squared_error(rc_test['stars_y'], pred_mlp))
mae_temp = mean_absolute_error(rc_test['stars_y'], pred_mlp)
print(f'rmse : {rmse_temp}')
print(f'mae : {mae_temp}')

rmse : 0.9408245506579527
mae : 0.7399253677338319
