In [14]:
import pandas as pd
import operator
import statistics
import random
import numpy as np
import datetime
import pickle
from datetime import datetime

from math import sqrt
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate,Dropout
from keras.optimizers import Adam
from kerastuner.tuners import RandomSearch
from kerastuner import HyperParameters
from tensorflow.keras.layers import Embedding
from kerastuner.engine import hyperparameters as hp

# Source CSVs.
# NOTE(review): judging by the file names these hold dummy user profiles,
# de-duplicated playlists, and playlist/track ratings -- confirm.
file_path1 = '/home/ubuntu/project/4.5HZ/Data/완성/플레이리스트/dummy_user(12월26일).csv'
file_path2 = '/home/ubuntu/project/4.5HZ/Data/완성/플레이리스트/playlist_중복제거(12월26일).csv'
file_path3 = '/home/ubuntu/project/4.5HZ/Data/완성/플레이리스트/플레이리스트_협업필터링(12월26일).csv'

# Year used to turn a birth year into an age; kept fixed so that re-running the
# notebook later produces the same age groups.
REFERENCE_YEAR = 2023

df1 = pd.read_csv(file_path1)
df2 = pd.read_csv(file_path2)
df3 = pd.read_csv(file_path3)

# Join playlists with their ratings, then attach the user profile columns.
df = pd.merge(df2, df3, left_on='playlist_id', right_on='PLAYLIST_ID')
df = pd.merge(df, df1, left_on='USER_ID', right_on='id')

# Keep only the columns the model needs. The explicit copy makes the column
# assignments below operate on an independent frame instead of a slice.
df = df[['USER_ID', 'TRACK_ID', 'rating', 'birth_date', 'gender']].copy()

# Reduce 'birth_date' to a numeric birth year. The first 4-digit run is taken
# so that both plain years (2010) and full dates ('2010-03-15') work; a bare
# pd.to_numeric(..., errors='coerce') would turn every date string into NaN.
birth_year = pd.to_numeric(
    df['birth_date'].astype(str).str.extract(r'(\d{4})', expand=False),
    errors='coerce',
)
df['birth_date'] = birth_year

# Age decade index: 1 -> teens, 2 -> twenties, and so on.
df['age'] = (REFERENCE_YEAR - birth_year) // 10

# Rows with a missing rating, age or gender cannot be used: a NaN target breaks
# the loss, and a NaN category is encoded as -1 by `.cat.codes`, which is not a
# valid embedding index.
df = df.dropna(subset=['rating', 'age', 'gender']).reset_index(drop=True)

# Map raw user / track IDs to contiguous integer indices (and back).
user_ids = df['USER_ID'].unique()
track_ids = df['TRACK_ID'].unique()

num_users = len(user_ids)
num_tracks = len(track_ids)

user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = dict(enumerate(user_ids))
track2track_encoded = {x: i for i, x in enumerate(track_ids)}
track_encoded2track = dict(enumerate(track_ids))

df['USER'] = df['USER_ID'].map(user2user_encoded)
df['TRACK'] = df['TRACK_ID'].map(track2track_encoded)

# Encode age group and gender as integer category codes (0 .. n_categories - 1;
# no -1 can appear because missing values were dropped above).
df['age'] = df['age'].astype('category').cat.codes
df['gender'] = df['gender'].astype('category').cat.codes

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Model inputs, in the order [user, track, age, gender], and rating targets.
train_inputs = [train.USER.values, train.TRACK.values, train.age.values, train.gender.values]
train_targets = train.rating.values

test_inputs = [test.USER.values, test.TRACK.values, test.age.values, test.gender.values]
test_targets = test.rating.values

# --- Baseline model -------------------------------------------------------
# Each categorical feature gets its own input -> embedding -> flatten branch.

# User branch (10-dimensional embedding).
user_input = Input(shape=(1,), dtype='int64', name='user_input')
user_embedding = Embedding(
    input_dim=num_users, output_dim=10, name='user_embedding'
)(user_input)
user_vec = Flatten(name='flatten_users')(user_embedding)

# Track branch (10-dimensional embedding).
track_input = Input(shape=(1,), dtype='int64', name='track_input')
track_embedding = Embedding(
    input_dim=num_tracks, output_dim=10, name='track_embedding'
)(track_input)
track_vec = Flatten(name='flatten_tracks')(track_embedding)

# Age-group branch (5-dimensional embedding).
age_input = Input(shape=(1,), dtype='int64', name='age_input')
age_embedding = Embedding(
    input_dim=df.age.nunique(), output_dim=5, name='age_embedding'
)(age_input)
age_vec = Flatten(name='flatten_ages')(age_embedding)

# Gender branch (5-dimensional embedding).
gender_input = Input(shape=(1,), dtype='int64', name='gender_input')
gender_embedding = Embedding(
    input_dim=df.gender.nunique(), output_dim=5, name='gender_embedding'
)(gender_input)
gender_vec = Flatten(name='flatten_genders')(gender_embedding)

# Head: concatenate the four vectors, apply dropout, then two ReLU layers and
# a single linear unit that predicts the rating.
concat = Concatenate()([user_vec, track_vec, age_vec, gender_vec])
dropout = Dropout(rate=0.5)(concat)
dense = Dense(units=128, activation='relu')(dropout)
dense = Dense(units=64, activation='relu')(dense)
output = Dense(units=1)(dense)

model = Model(
    inputs=[user_input, track_input, age_input, gender_input],
    outputs=output,
)

# --- Training -------------------------------------------------------------
# MSE loss with Adam; 20% of the training rows are held out for validation.
model.compile(loss='mean_squared_error', optimizer=Adam(0.001))
history = model.fit(
    x=train_inputs,
    y=train_targets,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
    verbose=1,
)

# --- Evaluation on the held-out test split --------------------------------
model.evaluate(x=test_inputs, y=test_targets)

preds = model.predict(test_inputs)

# Root mean squared error.
rmse = sqrt(mean_squared_error(y_true=test_targets, y_pred=preds))
print(f'Test RMSE: {rmse}')

# Mean absolute error.
mae = mean_absolute_error(y_true=test_targets, y_pred=preds)
print(f'Test MAE: {mae}')

from kerastuner.engine import hyperparameters as hp

def build_model(hp):
    """Build and compile the rating model for one hyperparameter configuration.

    The network has one input -> embedding -> flatten branch per feature
    (user, track, age, gender); the flattened vectors are concatenated and fed
    through dropout, two ReLU dense layers and a single linear output unit.

    Reads the module-level ``num_users``, ``num_tracks`` and ``df`` to size the
    embedding tables.

    Args:
        hp: Hyperparameter container exposing ``Int`` and ``Float``; supplies
            the embedding dimensions, dropout rate, dense layer widths and
            learning rate.

    Returns:
        The model, compiled with Adam and mean squared error loss.
    """

    def _embedded_feature(feature, plural, vocab_size, dim_min, dim_max, dim_step):
        # One branch: integer input -> tunable-size embedding -> flat vector.
        feature_input = Input(shape=(1,), name=f'{feature}_input', dtype='int64')
        embedded = Embedding(
            vocab_size,
            hp.Int(f'{feature}_embedding_dim', dim_min, dim_max, step=dim_step),
            name=f'{feature}_embedding',
        )(feature_input)
        return feature_input, Flatten(name=f'flatten_{plural}')(embedded)

    # (feature, plural suffix, vocabulary size, min dim, max dim, step)
    branch_specs = (
        ('user', 'users', num_users, 5, 15, 5),
        ('track', 'tracks', num_tracks, 5, 15, 5),
        ('age', 'ages', df.age.nunique(), 2, 10, 2),
        ('gender', 'genders', df.gender.nunique(), 2, 10, 2),
    )

    branch_inputs = []
    branch_vectors = []
    for spec in branch_specs:
        branch_input, branch_vector = _embedded_feature(*spec)
        branch_inputs.append(branch_input)
        branch_vectors.append(branch_vector)

    # Shared head on top of the concatenated feature vectors.
    merged = Concatenate()(branch_vectors)
    regularised = Dropout(hp.Float('dropout_rate', 0.0, 0.5, step=0.1))(merged)
    hidden = Dense(
        hp.Int('dense_1_units', 64, 256, step=64), activation='relu'
    )(regularised)
    hidden = Dense(
        hp.Int('dense_2_units', 32, 128, step=32), activation='relu'
    )(hidden)
    prediction = Dense(1)(hidden)

    tuned_model = Model(branch_inputs, prediction)
    learning_rate = hp.Float('learning_rate', 1e-4, 1e-2, sampling='log')
    tuned_model.compile(optimizer=Adam(learning_rate), loss='mean_squared_error')

    return tuned_model

# Batch size shared by every tuner trial and by the final fit, so the selected
# hyperparameters are trained under the same conditions they were scored with.
# It has to be a plain integer: the keyword arguments of `tuner.search` are
# forwarded to `model.fit`, so a detached `hp.Choice(...)` object passed there
# is never sampled by the tuner.
BATCH_SIZE = 32

# Random search over the space declared in `build_model`, scored on
# validation loss.
# NOTE(review): the trailing "# 10" / "# 3" look like the intended full-run
# values, reduced here for a quick run -- confirm.
# NOTE(review): by default the tuner reloads an existing project found under
# my_dir/helloworld instead of searching again -- confirm that is intended.
tuner = RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=1,  # 10
    executions_per_trial=1,  # 3
    directory='my_dir',
    project_name='helloworld')

tuner.search(train_inputs, train_targets,
             epochs=5,
             validation_split=0.2,
             batch_size=BATCH_SIZE)

# Best hyperparameter set found by the search.
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Rebuild a fresh model from the best hyperparameters. `build_model` already
# compiles it with Adam at the tuned learning rate, so no second compile is
# needed.
model = build_model(best_hps)

# Train the final model.
history = model.fit(train_inputs, train_targets,
                    epochs=5,
                    validation_split=0.2,
                    batch_size=BATCH_SIZE,
                    verbose=1)

# Evaluate on the held-out test split (returns the MSE loss).
test_loss = model.evaluate(test_inputs, test_targets, verbose=0)

# Report the result.
print('테스트 손실:', test_loss)

# Persist the trained model.
model.save('/home/ubuntu/project/4.5HZ/추천시스템/협업필터링/Collaborative_Filtering_model_2.h5')

사용자 10의 닉네임: pwmns
사용자 10의 사용자명: 이상현
사용자 10의 생년월일: 2010-03-15
사용자 10의 나이: 10대
사용자 10의 성별: 여성

트랙 10를 위한 추천 음악 ID와 예측 평점:
트랙 4L8pKwvlbzvw8HKwgQM4Mz: 예측 평점 - 2.539658784866333
트랙 62D6Bjz4IgTKDJRLPD3gJY: 예측 평점 - 2.5343410968780518
트랙 1ODqKQQkoP8KyXSpWASI60: 예측 평점 - 2.5073540210723877
트랙 1zKEyeeSWHOyd8csMC1WTO: 예측 평점 - 2.5073540210723877
트랙 7LJZyNjN5U0LdYSBQ9hlLz: 예측 평점 - 2.5073540210723877
