In [1]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [43]:
from datetime import datetime as dt

import tensorflow as tf
from tensorflow import keras

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# 1. Data

In [73]:
# 1. Dataset read
data_path = "datas/movielens_small/{}"

ratings_df = pd.read_csv(data_path.format("ratings.csv"), encoding='utf-8')
ratings_df.drop("timestamp", inplace=True, axis=1)

print("평점 데이터 shape : {}".format(ratings_df.shape))
ratings_df.head()

평점 데이터 shape : (100004, 3)


Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [74]:
movies_df = pd.read_csv(data_path.format("movies.csv"), encoding='utf-8')
movies_df.set_index("movieId", inplace=True)

print("영화 데이터 shape : {}".format(movies_df.shape))
movies_df.head()

영화 데이터 shape : (9125, 2)


Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [75]:
# 2. dummy genre setting
# - 영화별 장르 포함 정보를 가진 dummy data
dummy_genre_df = movies_df['genres'].str.get_dummies(sep="|")

print("* (영화 수, 장르 수)")
print("장르 포함 정보 데이터 shape : {}".format(dummy_genre_df.shape))
dummy_genre_df.head()

* (영화 수, 장르 수)
장르 포함 정보 데이터 shape : (9125, 20)


Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [76]:
# 3. year parsing
# - 영화별 년도 정보 추출

# 3-1. (4자리) 타입의 문자열 추출
years = movies_df['title'].str.extract("(\(\d\d\d\d\))")[0].values.astype("str")

# 3-2. 괄호 제거 (nan은 0으로 대치)
years = np.array([0 if y == 'nan' else int(y.replace("(","").replace(")","")) for y in years])

# 3-3. movie column 삽입
movies_df['year'] = years

# 3-4. 년도 정보가 없었던 데이터는 삭제
movies_df.drop(movies_df[movies_df['year'] == 0].index, axis=0, inplace=True)

# 3-5. 제목 칼럼 삭제
movies_df.drop("title", axis=1, inplace=True)

print("영화 데이터 shape : {}".format(movies_df.shape))
movies_df.head()

영화 데이터 shape : (9119, 2)


Unnamed: 0_level_0,genres,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Adventure|Animation|Children|Comedy|Fantasy,1995
2,Adventure|Children|Fantasy,1995
3,Comedy|Romance,1995
4,Comedy|Drama|Romance,1995
5,Comedy,1995


In [77]:
# 4. year-level generate
bins = np.arange(1900, 2023, 20)
labels = [x for x in range(len(bins) - 1)]

# 4-1. 구간 나누기
movies_df['year_level'] = pd.cut(movies_df['year'], bins, right=False, labels=labels)

# 4-2. year 칼럼 삭제
movies_df.drop('year', axis=1, inplace=True)

print("영화 데이터 shape : {}".format(movies_df.shape))
movies_df.head()

영화 데이터 shape : (9119, 2)


Unnamed: 0_level_0,genres,year_level
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Adventure|Animation|Children|Comedy|Fantasy,4
2,Adventure|Children|Fantasy,4
3,Comedy|Romance,4
4,Comedy|Drama|Romance,4
5,Comedy,4


In [78]:
# 5. threshold data
# - 평점의 갯수가 10개 이상인 영화 데이터 기록

# 5-1. threshold setting each movie
threshold = 10
over_threshold = ratings_df.groupby('movieId').size() >= threshold

# 5-2. rating in
# - 평점의 갯수가 10개 이상인 영화인가? 에 대한 기록
ratings_df['over_threshold'] = ratings_df['movieId'].map(lambda x: over_threshold[x])

# 5-3. filtering
ratings_df = ratings_df[ratings_df['over_threshold']]
ratings_df.drop('over_threshold', axis=1, inplace=True)

print("평점 데이터 shape : {}".format(ratings_df.shape))
ratings_df.head()

평점 데이터 shape : (81915, 3)


Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [104]:
# 6. 최종 학습 데이터 셋 생성
shuffled_df = ratings_df.sample(frac=1)

# rating num user matrix
# 몇 번째 평점의 어떤 유저의 정보인가 matrix
ru_mat = pd.get_dummies(shuffled_df['userId'], prefix="user")

# rating and movie matrix
# 몇 번째 평점의 어떤 영화의 정보인가 matrix
rm_mat = pd.get_dummies(shuffled_df['movieId'], prefix="movie")

# rating and genre matrix
# 몇 번째 평점의 어떤 장르의 정보인가 matrix
rg_mat = shuffled_df['movieId'].apply(lambda x: dummy_genre_df.loc[x])

# ratings and year_level
# 몇 번째 평점의 어떤 년도의 정보인가 vector
ry_vec = shuffled_df['movieId'].apply(lambda x: movies_df.loc[x]['year_level']).rename('year_level')

# concat
concat_df = pd.concat([ru_mat, rm_mat, rg_mat, ry_vec], axis=1)

# 정말 그냥 평점이라는 고유 데이터에
# 사용자 번호 행렬, 영화 번호 행렬, 장르 행렬, 년도 레벨 합친 데이터 이다.
print("최종 학습 데이터셋 shape : {}".format(concat_df.shape))
concat_df.head()

최종 학습 데이터셋 shape : (81915, 2937)


Unnamed: 0,user_1,user_2,user_3,user_4,user_5,user_6,user_7,user_8,user_9,user_10,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year_level
22508,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,5
68373,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
34265,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,5
37254,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,4
13470,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4


In [153]:
# 7. 훈련, 시험용 데이터 분리
from sklearn.model_selection import train_test_split

# target_df
## 지금까지와 다른 특징은 평점 데이터를 사용하지 않고
## 4점이 넘으면 보는 것, 넘지 않으면 안 보는 것으로 간주 -> 추천의 관점에서
target_df = ratings_df.loc[concat_df.index]['rating']
target_df = target_df.apply(lambda x: 1 if x >= 4 else 0)

X_train, X_test, y_train, y_test = train_test_split(concat_df, target_df, test_size=0.1)

print("훈련 데이터 shape : {}, 훈련 라벨 갯수 : {}".format(X_train.shape, y_train.size))
print("테스트 데이터 shape : {}, 테스트 라벨 갯수 : {}".format(X_test.shape, y_test.size))

훈련 데이터 shape : (73723, 2937), 훈련 라벨 갯수 : 73723
테스트 데이터 shape : (8192, 2937), 테스트 라벨 갯수 : 8192


In [154]:
class FM(keras.Model):
    def __init__(self, p, n_factor=8, **kwargs):
        super().__init__(**kwargs)
        
        self.w_0 = tf.Variable([0.0])
        self.w = tf.Variable(tf.zeros(shape=[p]))
        self.v = tf.Variable(tf.random.normal(shape=(p, n_factor)))
        
    def call(self, inputs):
        degree_1 = tf.reduce_sum(tf.multiply(self.w, inputs), axis=1)
        
        degree_2 = 0.5 * tf.reduce_sum(
                    tf.math.pow(tf.matmul(inputs, self.v), 2)
                    - tf.matmul(tf.math.pow(inputs, 2), tf.math.pow(self.v, 2)),
                    1, keepdims=False
                )
        
        predict = tf.math.sigmoid(self.w_0 + degree_1, degree_2)
        
        return predict

In [155]:
learner = "adam"
learning_rate = 0.01

if learner == "adagrad":
    optimizer = keras.optimizers.Adagrad(learning_rate=learning_rate)
elif learner == "rmsprop":
    optimizer = keras.optimizers.RMSprop(learning_rate=learning_rate)
elif learner == "adam":
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
else:
    optimizer = keras.optimizers.SGD(learning_rate=learning_rate)

loss_fn = keras.losses.binary_crossentropy
mean_loss = keras.metrics.Mean()
metrics = [keras.metrics.BinaryAccuracy()]
test_acc = keras.metrics.BinaryAccuracy()

n, p = X_train.shape

model = FM(p=p)
# model.compile(optimizer=optimizer, loss=loss_fn)

In [156]:
batch_size = 32

train_data = tf.data.Dataset.from_tensor_slices(
    (tf.cast(X_train, tf.float32), tf.cast(y_train, tf.float32))).shuffle(500).batch(batch_size)
test_data = tf.data.Dataset.from_tensor_slices(
    (tf.cast(X_test, tf.float32), tf.cast(y_test, tf.float32))).shuffle(200).batch(batch_size)

In [160]:
epochs = 10

n_steps = len(X_train) // batch_size

def print_status_bar(iteration, total, loss, metrics = None):
    metrics = " - ".join([f"{m.name}: {m.result():.4f}"
                          for m in [loss] + (metrics or [])])
    end = "" if iteration < total else "\n"
    print(f"\r{iteration}/{total}  " + metrics ,
          end = end)

for epoch in range(epochs):
    print(f"에포크 : {epoch}/{epochs}")

    for step, (X_batch, y_batch) in enumerate(train_data):
        # train, test data
        with tf.GradientTape() as tape:
            predict = model(X_batch)
            loss = loss_fn(y_batch, predict)
        gradients = tape.gradient(loss, model.trainable_variables,unconnected_gradients=tf.UnconnectedGradients.ZERO)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        mean_loss(loss)

        for metric in metrics:
            metric(y_batch, predict)

        print_status_bar(step * batch_size, len(y_train), mean_loss, metrics=metrics)

    for x_test, y_test in test_data:
        prediction = model(x_test)
        test_acc.update_state(y_test, prediction)

    print_status_bar(n_steps * batch_size, n_steps * batch_size, mean_loss, metrics=metrics)
    print("검증 정확도: ", test_acc.result().numpy())
    for metric in [mean_loss] + [test_acc] +metrics:
        metric.reset_states()

에포크 : 0/10
73696/73696  mean: 0.5530 - binary_accuracy: 0.7180
검증 정확도:  0.7114868
에포크 : 1/10
73696/73696  mean: 0.5397 - binary_accuracy: 0.7269
검증 정확도:  0.71154785
에포크 : 2/10
73696/73696  mean: 0.5385 - binary_accuracy: 0.7285
검증 정확도:  0.7126465
에포크 : 3/10
73696/73696  mean: 0.5382 - binary_accuracy: 0.7289
검증 정확도:  0.711792
에포크 : 4/10
73696/73696  mean: 0.5384 - binary_accuracy: 0.7292
검증 정확도:  0.7116699
에포크 : 5/10
73696/73696  mean: 0.5383 - binary_accuracy: 0.7290
검증 정확도:  0.71276855
에포크 : 6/10
73696/73696  mean: 0.5384 - binary_accuracy: 0.7293
검증 정확도:  0.71118164
에포크 : 7/10
73696/73696  mean: 0.5384 - binary_accuracy: 0.7293
검증 정확도:  0.71032715
에포크 : 8/10
73696/73696  mean: 0.5385 - binary_accuracy: 0.7303
검증 정확도:  0.710083
에포크 : 9/10
73696/73696  mean: 0.5383 - binary_accuracy: 0.7295
검증 정확도:  0.7116699
