In [1]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [2]:
from datetime import datetime as dt

import tensorflow as tf
from tensorflow import keras

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

2023-01-12 15:46:55.685188: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# 1. Data

In [3]:
# 1. Dataset read
data_path = "datas/movielens_small/{}"

ratings_df = pd.read_csv(data_path.format("ratings.csv"), encoding='utf-8')
ratings_df.drop("timestamp", inplace=True, axis=1)

print("평점 데이터 shape : {}".format(ratings_df.shape))
ratings_df.head()

평점 데이터 shape : (100004, 3)


Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [4]:
movies_df = pd.read_csv(data_path.format("movies.csv"), encoding='utf-8')
movies_df.set_index("movieId", inplace=True)

print("영화 데이터 shape : {}".format(movies_df.shape))
movies_df.head()

영화 데이터 shape : (9125, 2)


Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [5]:
# 2. dummy genre setting
# - 영화별 장르 포함 정보를 가진 dummy data
dummy_genre_df = movies_df['genres'].str.get_dummies(sep="|")

print("* (영화 수, 장르 수)")
print("장르 포함 정보 데이터 shape : {}".format(dummy_genre_df.shape))
dummy_genre_df.head()

* (영화 수, 장르 수)
장르 포함 정보 데이터 shape : (9125, 20)


Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
# 3. year parsing
# - 영화별 년도 정보 추출

# 3-1. (4자리) 타입의 문자열 추출
years = movies_df['title'].str.extract("(\(\d\d\d\d\))")[0].values.astype("str")

# 3-2. 괄호 제거 (nan은 0으로 대치)
years = np.array([0 if y == 'nan' else int(y.replace("(","").replace(")","")) for y in years])

# 3-3. movie column 삽입
movies_df['year'] = years

# 3-4. 년도 정보가 없었던 데이터는 삭제
movies_df.drop(movies_df[movies_df['year'] == 0].index, axis=0, inplace=True)

# 3-5. 제목 칼럼 삭제
movies_df.drop("title", axis=1, inplace=True)

print("영화 데이터 shape : {}".format(movies_df.shape))
movies_df.head()

영화 데이터 shape : (9119, 2)


Unnamed: 0_level_0,genres,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Adventure|Animation|Children|Comedy|Fantasy,1995
2,Adventure|Children|Fantasy,1995
3,Comedy|Romance,1995
4,Comedy|Drama|Romance,1995
5,Comedy,1995


In [7]:
# 4. year-level generate
bins = np.arange(1900, 2023, 20)
labels = [x for x in range(len(bins) - 1)]

# 4-1. 구간 나누기
movies_df['year_level'] = pd.cut(movies_df['year'], bins, right=False, labels=labels)

# 4-2. year 칼럼 삭제
movies_df.drop('year', axis=1, inplace=True)

print("영화 데이터 shape : {}".format(movies_df.shape))
movies_df.head()

영화 데이터 shape : (9119, 2)


Unnamed: 0_level_0,genres,year_level
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Adventure|Animation|Children|Comedy|Fantasy,4
2,Adventure|Children|Fantasy,4
3,Comedy|Romance,4
4,Comedy|Drama|Romance,4
5,Comedy,4


In [8]:
# 5. threshold data
# - 평점의 갯수가 10개 이상인 영화 데이터 기록

# 5-1. threshold setting each movie
threshold = 10
over_threshold = ratings_df.groupby('movieId').size() >= threshold

# 5-2. rating in
# - 평점의 갯수가 10개 이상인 영화인가? 에 대한 기록
ratings_df['over_threshold'] = ratings_df['movieId'].map(lambda x: over_threshold[x])

# 5-3. filtering
ratings_df = ratings_df[ratings_df['over_threshold']]
ratings_df.drop('over_threshold', axis=1, inplace=True)

print("평점 데이터 shape : {}".format(ratings_df.shape))
ratings_df.head()

평점 데이터 shape : (81915, 3)


Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [9]:
# 6. 최종 학습 데이터 셋 생성
shuffled_df = ratings_df.sample(frac=1)

# rating num user matrix
# 몇 번째 평점의 어떤 유저의 정보인가 matrix
ru_mat = pd.get_dummies(shuffled_df['userId'], prefix="user")

# rating and movie matrix
# 몇 번째 평점의 어떤 영화의 정보인가 matrix
rm_mat = pd.get_dummies(shuffled_df['movieId'], prefix="movie")

# rating and genre matrix
# 몇 번째 평점의 어떤 장르의 정보인가 matrix
rg_mat = shuffled_df['movieId'].apply(lambda x: dummy_genre_df.loc[x])

# ratings and year_level
# 몇 번째 평점의 어떤 년도의 정보인가 vector
ry_vec = shuffled_df['movieId'].apply(lambda x: movies_df.loc[x]['year_level']).rename('year_level')

# concat
concat_df = pd.concat([ru_mat, rm_mat, rg_mat, ry_vec], axis=1)

# 정말 그냥 평점이라는 고유 데이터에
# 사용자 번호 행렬, 영화 번호 행렬, 장르 행렬, 년도 레벨 합친 데이터 이다.
print("최종 학습 데이터셋 shape : {}".format(concat_df.shape))
concat_df.head()

최종 학습 데이터셋 shape : (81915, 2937)


Unnamed: 0,user_1,user_2,user_3,user_4,user_5,user_6,user_7,user_8,user_9,user_10,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year_level
59091,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
71285,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,5
30691,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
75727,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
45715,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,1,0,0,5


In [10]:
# 7. 훈련, 시험용 데이터 분리
from sklearn.model_selection import train_test_split

# target_df
## 지금까지와 다른 특징은 평점 데이터를 사용하지 않고
## 4점이 넘으면 보는 것, 넘지 않으면 안 보는 것으로 간주 -> 추천의 관점에서
target_df = ratings_df.loc[concat_df.index]['rating']
target_df = target_df.apply(lambda x: 1 if x >= 4 else 0)

X_train, X_test, y_train, y_test = train_test_split(concat_df, target_df, test_size=0.1)

print("훈련 데이터 shape : {}, 훈련 라벨 갯수 : {}".format(X_train.shape, y_train.size))
print("테스트 데이터 shape : {}, 테스트 라벨 갯수 : {}".format(X_test.shape, y_test.size))

훈련 데이터 shape : (73723, 2937), 훈련 라벨 갯수 : 73723
테스트 데이터 shape : (8192, 2937), 테스트 라벨 갯수 : 8192


# 2. Factorization Machine

## 1. FM Model Process

In [11]:
# 1. Setting Value 
num_factor = 8
n, p = X_train.shape

In [19]:
# 2. tensor variable ref.Factorization Machine Model Operation (1)

# 2-1. Global Bias
w_0 = tf.Variable([0.0], name="Global Bias")

# 2-2. i 번째 개별 **특성**에 대한 가중치
w = tf.Variable(tf.zeros(shape=[p]), name="Weights")

# 2-3. v_i, v_factor 는 f개의 latent factor로 표현된 2-way interaction을 계산하는 내적을 의미
v = tf.Variable(tf.random.normal(shape=(p, num_factor)), name="Latent Factor")

trainable_variables = [w_0, w, v]
trainable_variables

[<tf.Variable 'Global Bias:0' shape=(1,) dtype=float32, numpy=array([0.], dtype=float32)>,
 <tf.Variable 'Weights:0' shape=(2937,) dtype=float32, numpy=array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)>,
 <tf.Variable 'Latent Factor:0' shape=(2937, 8) dtype=float32, numpy=
 array([[-0.31313184,  0.22039066,  0.16138363, ...,  0.14336602,
          0.3747506 , -1.5969039 ],
        [-2.2915716 , -2.9860423 , -0.19873302, ...,  2.070536  ,
         -1.5299114 ,  0.37945673],
        [ 0.687828  , -0.34823158,  0.5965239 , ..., -0.14492488,
         -0.22520304,  0.31036416],
        ...,
        [ 0.457936  ,  0.37095883, -0.00765966, ...,  0.23827663,
         -1.7979732 ,  1.5304476 ],
        [ 2.0993567 , -1.2577075 ,  0.5381595 , ..., -0.02137662,
          0.5637898 ,  0.5120409 ],
        [ 0.53468686,  1.3936852 , -1.613502  , ...,  0.08413359,
         -0.832082  ,  0.65849686]], dtype=float32)>]

In [20]:
# 3. predictref.Factorization Machine Model Operation (1)

# 3-1. Setting
batch_size = 32
X_batch = tf.constant(X_train.to_numpy()[:batch_size], dtype=float)

In [21]:
# 3-2. predict function, keras.Model.call function
def predict():
    # 3-1. 시그마(W_i, x_i)
    degree_1 = tf.reduce_sum(tf.multiply(w, X_batch), axis=1)

    # 3-2. 오른쪽에 어려운 식 계산
    degree_2 = 0.5 * tf.reduce_sum(
            tf.math.pow(tf.matmul(X_batch, v), 2)
            - tf.matmul(tf.math.pow(X_batch, 2), tf.math.pow(v, 2)),
            1, keepdims=False
        )

    # 3-3. 나머지 계산 (Sigmoid)
    _predict = tf.math.sigmoid(w_0 + degree_1, degree_2)
    
    return _predict

print("예측 값",predict().numpy())

예측 값 [0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5]


In [22]:
# 4. loss function
loss_fn = keras.losses.binary_crossentropy
with tf.GradientTape() as tape:    
    y_batch = tf.constant(y_train.to_numpy()[:batch_size], dtype=float)
    _predict = predict()
    loss_value = loss_fn(y_batch, _predict)
    print("loss : {}".format(loss_value))
    
# 5. Variable, Weight, Bias Train
learning_rate = 0.01

# 5-1. Backpropagation
gradients = tape.gradient(loss_value, 
              trainable_variables,
              unconnected_gradients=tf.UnconnectedGradients.ZERO)

# 5-2. Optimizer
optimizer = keras.optimizers.Adam(learning_rate=learning_rate)

# 5-3. variable train
optimizer.apply_gradients(zip(gradients, trainable_variables))

trainable_variables

loss : 0.6931469440460205


[<tf.Variable 'Global Bias:0' shape=(1,) dtype=float32, numpy=array([0.00999966], dtype=float32)>,
 <tf.Variable 'Weights:0' shape=(2937,) dtype=float32, numpy=
 array([0.        , 0.        , 0.        , ..., 0.00999798, 0.        ,
        0.0099999 ], dtype=float32)>,
 <tf.Variable 'Latent Factor:0' shape=(2937, 8) dtype=float32, numpy=
 array([[-0.31313184,  0.22039066,  0.16138363, ...,  0.14336602,
          0.3747506 , -1.5969039 ],
        [-2.2915716 , -2.9860423 , -0.19873302, ...,  2.070536  ,
         -1.5299114 ,  0.37945673],
        [ 0.687828  , -0.34823158,  0.5965239 , ..., -0.14492488,
         -0.22520304,  0.31036416],
        ...,
        [ 0.457936  ,  0.37095883, -0.00765966, ...,  0.23827663,
         -1.7979732 ,  1.5304476 ],
        [ 2.0993567 , -1.2577075 ,  0.5381595 , ..., -0.02137662,
          0.5637898 ,  0.5120409 ],
        [ 0.53468686,  1.3936852 , -1.613502  , ...,  0.08413359,
         -0.832082  ,  0.65849686]], dtype=float32)>]

## 2. Use

In [54]:
# 1. model setting
from model import FactorizationMachine

model = FactorizationMachine(
                train_set=(X_train, y_train),
                test_set=(X_test, y_test)
                )

In [55]:
# 2. Train (epoch value default = 10)
model.fit()

epoch : 1 / 10
73696/73696  mean: 0.5865 - binary_accuracy: 0.6901
검증 정확도: 0.717041015625
epoch : 2 / 10
73696/73696  mean: 0.5461 - binary_accuracy: 0.7226
검증 정확도: 0.7119140625
epoch : 3 / 10
73696/73696  mean: 0.5406 - binary_accuracy: 0.7275
검증 정확도: 0.7181396484375
epoch : 4 / 10
73696/73696  mean: 0.5394 - binary_accuracy: 0.7277
검증 정확도: 0.716552734375
epoch : 5 / 10
73696/73696  mean: 0.5392 - binary_accuracy: 0.7285
검증 정확도: 0.7122802734375
epoch : 6 / 10
73696/73696  mean: 0.5389 - binary_accuracy: 0.7294
검증 정확도: 0.7149658203125
epoch : 7 / 10
73696/73696  mean: 0.5388 - binary_accuracy: 0.7296
검증 정확도: 0.713134765625
epoch : 8 / 10
73696/73696  mean: 0.5387 - binary_accuracy: 0.7298
검증 정확도: 0.71435546875
epoch : 9 / 10
73696/73696  mean: 0.5389 - binary_accuracy: 0.7286
검증 정확도: 0.716064453125
epoch : 10 / 10
73696/73696  mean: 0.5392 - binary_accuracy: 0.7302
검증 정확도: 0.7147216796875
