In [139]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [140]:
from datetime import datetime as dt

import tensorflow as tf
from tensorflow import keras

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# 1. Data

In [141]:
# 1. Dataset read
data_path = "datas/movielens_small/{}"

ratings_df = pd.read_csv(data_path.format("ratings.csv"), encoding='utf-8')
ratings_df.drop("timestamp", inplace=True, axis=1)

print("평점 데이터 shape : {}".format(ratings_df.shape))
ratings_df.head()

평점 데이터 shape : (100004, 3)


Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [142]:
movies_df = pd.read_csv(data_path.format("movies.csv"), encoding='utf-8')
movies_df.set_index("movieId", inplace=True)

print("영화 데이터 shape : {}".format(movies_df.shape))
movies_df.head()

영화 데이터 shape : (9125, 2)


Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [143]:
# 2. dummy genre setting
# - 영화별 장르 포함 정보를 가진 dummy data
dummy_genre_df = movies_df['genres'].str.get_dummies(sep="|")

print("* (영화 수, 장르 수)")
print("장르 포함 정보 데이터 shape : {}".format(dummy_genre_df.shape))
dummy_genre_df.head()

* (영화 수, 장르 수)
장르 포함 정보 데이터 shape : (9125, 20)


Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [144]:
# 3. year parsing
# - 영화별 년도 정보 추출

# 3-1. (4자리) 타입의 문자열 추출
years = movies_df['title'].str.extract("(\(\d\d\d\d\))")[0].values.astype("str")

# 3-2. 괄호 제거 (nan은 0으로 대치)
years = np.array([0 if y == 'nan' else int(y.replace("(","").replace(")","")) for y in years])

# 3-3. movie column 삽입
movies_df['year'] = years

# 3-4. 년도 정보가 없었던 데이터는 삭제
movies_df.drop(movies_df[movies_df['year'] == 0].index, axis=0, inplace=True)

# 3-5. 제목 칼럼 삭제
movies_df.drop("title", axis=1, inplace=True)

print("영화 데이터 shape : {}".format(movies_df.shape))
movies_df.head()

영화 데이터 shape : (9119, 2)


Unnamed: 0_level_0,genres,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Adventure|Animation|Children|Comedy|Fantasy,1995
2,Adventure|Children|Fantasy,1995
3,Comedy|Romance,1995
4,Comedy|Drama|Romance,1995
5,Comedy,1995


In [145]:
# 4. year-level generate
bins = np.arange(1900, 2023, 20)
labels = [x for x in range(len(bins) - 1)]

# 4-1. 구간 나누기
movies_df['year_level'] = pd.cut(movies_df['year'], bins, right=False, labels=labels)

# 4-2. year 칼럼 삭제
movies_df.drop('year', axis=1, inplace=True)

print("영화 데이터 shape : {}".format(movies_df.shape))
movies_df.head()

영화 데이터 shape : (9119, 2)


Unnamed: 0_level_0,genres,year_level
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Adventure|Animation|Children|Comedy|Fantasy,4
2,Adventure|Children|Fantasy,4
3,Comedy|Romance,4
4,Comedy|Drama|Romance,4
5,Comedy,4


In [8]:
# 5. threshold data
# - 평점의 갯수가 10개 이상인 영화 데이터 기록

# 5-1. threshold setting each movie
threshold = 10
over_threshold = ratings_df.groupby('movieId').size() >= threshold

# 5-2. rating in
# - 평점의 갯수가 10개 이상인 영화인가? 에 대한 기록
ratings_df['over_threshold'] = ratings_df['movieId'].map(lambda x: over_threshold[x])

# 5-3. filtering
ratings_df = ratings_df[ratings_df['over_threshold']]
ratings_df.drop('over_threshold', axis=1, inplace=True)

print("평점 데이터 shape : {}".format(ratings_df.shape))
ratings_df.head()

평점 데이터 shape : (81915, 3)


Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [9]:
# 6. 최종 학습 데이터 셋 생성
shuffled_df = ratings_df.sample(frac=1)

# rating num user matrix
# 몇 번째 평점의 어떤 유저의 정보인가 matrix
ru_mat = pd.get_dummies(shuffled_df['userId'], prefix="user")

# rating and movie matrix
# 몇 번째 평점의 어떤 영화의 정보인가 matrix
rm_mat = pd.get_dummies(shuffled_df['movieId'], prefix="movie")

# rating and genre matrix
# 몇 번째 평점의 어떤 장르의 정보인가 matrix
rg_mat = shuffled_df['movieId'].apply(lambda x: dummy_genre_df.loc[x])

# ratings and year_level
# 몇 번째 평점의 어떤 년도의 정보인가 vector
ry_vec = shuffled_df['movieId'].apply(lambda x: movies_df.loc[x]['year_level']).rename('year_level')

# concat
concat_df = pd.concat([ru_mat, rm_mat, rg_mat, ry_vec], axis=1)

# 정말 그냥 평점이라는 고유 데이터에
# 사용자 번호 행렬, 영화 번호 행렬, 장르 행렬, 년도 레벨 합친 데이터 이다.
print("최종 학습 데이터셋 shape : {}".format(concat_df.shape))
concat_df.head()

최종 학습 데이터셋 shape : (81915, 2937)


Unnamed: 0,user_1,user_2,user_3,user_4,user_5,user_6,user_7,user_8,user_9,user_10,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year_level
19015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,4
40362,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,4
74827,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,5
95292,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,1,0,0,5
78510,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,4


In [159]:
ru_mat.head()

Unnamed: 0,user_1,user_2,user_3,user_4,user_5,user_6,user_7,user_8,user_9,user_10,...,user_662,user_663,user_664,user_665,user_666,user_667,user_668,user_669,user_670,user_671
19015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40362,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
74827,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
95292,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
78510,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [161]:
# 7. 훈련, 시험용 데이터 분리
from sklearn.model_selection import train_test_split

# target_df
## 지금까지와 다른 특징은 평점 데이터를 사용하지 않고
## 4점이 넘으면 보는 것, 넘지 않으면 안 보는 것으로 간주 -> 추천의 관점에서
target_df = ratings_df.loc[concat_df.index]['rating']
target_df = target_df.apply(lambda x: 1 if x >= 4 else 0)

X_train, X_test, y_train, y_test = train_test_split(concat_df, target_df, test_size=0.1)

print("훈련 데이터 shape : {}, 훈련 라벨 갯수 : {}".format(X_train.shape, y_train.size))
print("테스트 데이터 shape : {}, 테스트 라벨 갯수 : {}".format(X_test.shape, y_test.size))

훈련 데이터 shape : (73723, 2937), 훈련 라벨 갯수 : 73723
테스트 데이터 shape : (8192, 2937), 테스트 라벨 갯수 : 8192


In [162]:
target_df

19015    0
40362    0
74827    1
95292    1
78510    0
        ..
38873    1
55199    0
61855    0
82006    1
28705    0
Name: rating, Length: 81915, dtype: int64

# 2. Factorization Machine

## 1. FM Model Process

In [165]:
# 1. Setting Value 
num_factor = 8

# n = item count
# p = latent factor size
n, p = X_train.shape

In [166]:
# 2. tensor variable ref.Factorization Machine Model Operation (1)

# 2-1. Global Bias
w_0 = tf.Variable([0.0], name="Global Bias")

# 2-2. i 번째 개별 **특성**에 대한 가중치
w = tf.Variable(tf.zeros(shape=[p]), name="Weights")

# 2-3. v_i, v_factor 는 f개의 latent factor로 표현된 2-way interaction을 계산하는 내적을 의미
v = tf.Variable(tf.random.normal(shape=(p, num_factor)), name="Latent Factor")

trainable_variables = [w_0, w, v]
trainable_variables

[<tf.Variable 'Global Bias:0' shape=(1,) dtype=float32, numpy=array([0.], dtype=float32)>,
 <tf.Variable 'Weights:0' shape=(2937,) dtype=float32, numpy=array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)>,
 <tf.Variable 'Latent Factor:0' shape=(2937, 8) dtype=float32, numpy=
 array([[ 0.02257266,  0.25723988,  0.67653745, ..., -1.1757308 ,
         -0.4377415 , -0.28318274],
        [-1.1416212 , -0.17436378, -0.07349749, ..., -0.37865558,
          0.8716561 ,  1.2952055 ],
        [-0.668155  ,  0.7099338 , -1.0267736 , ..., -0.95287144,
         -0.3488749 ,  1.4413193 ],
        ...,
        [-0.26190338,  0.28695047,  0.02734262, ...,  1.512424  ,
          0.8758463 ,  0.36281273],
        [-0.58675414,  0.7801984 , -0.86314684, ..., -0.06473006,
         -1.0305654 ,  1.491081  ],
        [ 0.4357749 ,  0.7023662 , -0.74529886, ..., -0.47243702,
          2.122399  , -1.540363  ]], dtype=float32)>]

In [19]:
# 3. predictref.Factorization Machine Model Operation (1)

# 3-1. Setting
batch_size = 32
X_batch = tf.constant(X_train.to_numpy()[:batch_size], dtype=float)

In [20]:
# 3-2. predict function, keras.Model.call function
def predict():
    # 3-1. 시그마(W_i, x_i)
    degree_1 = tf.reduce_sum(tf.multiply(w, X_batch), axis=1)

    # 3-2. 오른쪽에 어려운 식 계산
    degree_2 = 0.5 * tf.reduce_sum(
            tf.math.pow(tf.matmul(X_batch, v), 2)
            - tf.matmul(tf.math.pow(X_batch, 2), tf.math.pow(v, 2)),
            1, keepdims=False
        )

    # 3-3. 나머지 계산 (Sigmoid)
    _predict = tf.math.sigmoid(w_0 + degree_1, degree_2)
    
    return _predict

print("예측 값",predict().numpy())

예측 값 [0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5]


In [21]:
# 4. loss function
loss_fn = keras.losses.binary_crossentropy
with tf.GradientTape() as tape:    
    y_batch = tf.constant(y_train.to_numpy()[:batch_size], dtype=float)
    _predict = predict()
    loss_value = loss_fn(y_batch, _predict)
    print("loss : {}".format(loss_value))
    
# 5. Variable, Weight, Bias Train
learning_rate = 0.01

# 5-1. Backpropagation
gradients = tape.gradient(loss_value, 
              trainable_variables,
              unconnected_gradients=tf.UnconnectedGradients.ZERO)

# 5-2. Optimizer
optimizer = keras.optimizers.Adam(learning_rate=learning_rate)

# 5-3. variable train
optimizer.apply_gradients(zip(gradients, trainable_variables))

trainable_variables

loss : 0.6931469440460205


[<tf.Variable 'Global Bias:0' shape=(1,) dtype=float32, numpy=array([0.0099996], dtype=float32)>,
 <tf.Variable 'Weights:0' shape=(2937,) dtype=float32, numpy=
 array([0.        , 0.        , 0.        , ..., 0.00999791, 0.        ,
        0.00999982], dtype=float32)>,
 <tf.Variable 'Latent Factor:0' shape=(2937, 8) dtype=float32, numpy=
 array([[-1.4227068 ,  1.9679726 , -0.11169749, ..., -1.3351027 ,
          0.37389368,  0.4606609 ],
        [ 1.8239148 ,  3.209506  , -0.06265442, ...,  0.74963534,
          1.2086935 ,  1.4433596 ],
        [-1.507574  ,  1.9431064 ,  1.7771577 , ...,  0.0645679 ,
          0.42695114,  1.1674573 ],
        ...,
        [ 1.3261193 , -0.7505512 ,  2.3174555 , ..., -1.0801251 ,
         -0.7227388 ,  0.57095623],
        [-0.11480276,  0.46526626,  1.1899788 , ..., -0.29380172,
         -2.251638  ,  1.4839966 ],
        [ 0.3463958 , -0.28402182,  0.14997464, ..., -0.7698518 ,
          0.23088053, -0.68296224]], dtype=float32)>]

## 2. Use

In [120]:
# 1. model setting
from model import FactorizationMachine

model = FactorizationMachine(
                train_set=(X_train, y_train),
                test_set=(X_test, y_test)
                )

In [121]:
# 2. Train (epoch value default = 10)
model.fit()

epoch : 1 / 10
73696/73696  mean: 0.5866 - binary_accuracy: 0.6913
검증 정확도: 0.7037353515625
epoch : 2 / 10
73696/73696  mean: 0.5457 - binary_accuracy: 0.7239
검증 정확도: 0.714599609375
epoch : 3 / 10
73696/73696  mean: 0.5403 - binary_accuracy: 0.7277
검증 정확도: 0.711181640625
epoch : 4 / 10
73696/73696  mean: 0.5389 - binary_accuracy: 0.7286
검증 정확도: 0.715576171875
epoch : 5 / 10
73696/73696  mean: 0.5387 - binary_accuracy: 0.7286
검증 정확도: 0.7147216796875
epoch : 6 / 10
73696/73696  mean: 0.5384 - binary_accuracy: 0.7292
검증 정확도: 0.7132568359375
epoch : 7 / 10
73696/73696  mean: 0.5385 - binary_accuracy: 0.7297
검증 정확도: 0.714599609375
epoch : 8 / 10
73696/73696  mean: 0.5384 - binary_accuracy: 0.7299
검증 정확도: 0.7078857421875
epoch : 9 / 10
73696/73696  mean: 0.5386 - binary_accuracy: 0.7298
검증 정확도: 0.7135009765625
epoch : 10 / 10
73696/73696  mean: 0.5388 - binary_accuracy: 0.7294
검증 정확도: 0.71435546875


In [136]:
# Precision Recall
y = [_[1].numpy() for _ in iter(model.test_data)]
y_preds = [_.numpy() for _ in model.predicts]

y = np.array(y)
y_preds = np.array([_ for _ in y_preds]).round()

In [138]:
from sklearn.metrics import recall_score, precision_score

precision = precision_score(y, y_preds, average='macro')
recall = recall_score(y, y_preds, average='macro')

print("precision : {} - recall : {}".format(precision, recall))

precision : 0.5498335642082735 - recall : 0.5395407874420683
