<a href="https://colab.research.google.com/github/codesongs/codestates_TP2/blob/main/knnbase_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive so that files stored there are reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Data directory; it is passed to the load_* helpers further down the notebook.
path = '/content/drive/MyDrive/tp2'

Mounted at /content/drive


In [None]:
import joblib
from torch.utils.data import DataLoader
import sys
from pathlib import Path
import os
import argparse

#콘텐츠 기반 필터링용 패키지
import pandas as pd
from gensim.models import Word2Vec
from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim.downloader as api
from sklearn.neighbors import NearestNeighbors
import numpy as np
import joblib

def load_ratings(path):
    """Read ``ratings.dat`` from the directory ``path`` into a DataFrame.

    The file is parsed as header-less text whose fields are separated by
    ``::``; the resulting columns are named userId, movieId, rating and
    timestamp.
    """
    ratings_file = os.path.join(path, "ratings.dat")
    return pd.read_csv(
        ratings_file,
        sep='::',
        header=None,
        engine='python',
        names=['userId', 'movieId', 'rating', 'timestamp'],
    )

def load_movies(path):
    """Read ``movies.dat`` from the directory ``path`` into a DataFrame.

    The file is decoded as ISO-8859-1 and parsed as header-less text whose
    fields are separated by ``::``; the resulting columns are named movieId,
    title and genres.
    """
    movies_file = os.path.join(path, "movies.dat")
    return pd.read_csv(
        movies_file,
        sep='::',
        header=None,
        engine='python',
        names=['movieId', 'title', 'genres'],
        encoding='ISO-8859-1',
    )

def load_users(path):
    """Read ``users.dat`` from the directory ``path`` into a DataFrame.

    The file is parsed as header-less text whose fields are separated by
    ``::``; the resulting columns are named userId, gender, age, Occupation
    and zip_code.
    """
    users_file = os.path.join(path, "users.dat")
    return pd.read_csv(
        users_file,
        sep='::',
        header=None,
        engine='python',
        names=['userId', 'gender', 'age', 'Occupation', 'zip_code'],
    )

# Load the ratings, movies and users tables from the data directory.
df_ratings, df_movies, df_users = (
    load_ratings(path),
    load_movies(path),
    load_users(path),
)

In [None]:
# Attach movie metadata (joined on 'movieId') and user metadata (joined on
# 'userId') to every rating row; left joins keep all ratings.
df_total = (
    df_ratings
    .merge(df_movies, on='movieId', how='left')
    .merge(df_users, on='userId', how='left')
)

# Display the merged frame.
df_total

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,gender,age,Occupation,zip_code
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,F,1,10,48067
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,F,1,10,48067
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,F,1,10,48067
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,F,1,10,48067
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,F,1,10,48067
...,...,...,...,...,...,...,...,...,...,...
1000204,6040,1091,1,956716541,Weekend at Bernie's (1989),Comedy,M,25,6,11106
1000205,6040,1094,5,956704887,"Crying Game, The (1992)",Drama|Romance|War,M,25,6,11106
1000206,6040,562,5,956704746,Welcome to the Dollhouse (1995),Comedy|Drama,M,25,6,11106
1000207,6040,1096,4,956715648,Sophie's Choice (1982),Drama,M,25,6,11106


In [None]:
import easydict

# Fixed options used inside the notebook: the user to recommend for and the
# number of recommendations to return.
args = easydict.EasyDict(dict(user=56, num=20))


def parse_opt():
    """Return the notebook-level ``args`` options object."""
    return args


In [None]:
class KNN():
    """Content-based recommender backed by a pre-fitted nearest-neighbour model.

    The movie feature matrix and the per-user movie lists are loaded from disk
    once, on the first call to :meth:`predict`, and reused afterwards instead
    of being re-read for every prediction.
    """

    def __init__(self, path, cbf_data_path='/content/cbf_data.joblib', ratings_dir=None):
        """Load the fitted model.

        Args:
            path: File passed to ``joblib.load``; the loaded object must
                provide ``kneighbors``.
            cbf_data_path: joblib file holding the movie feature matrix,
                indexed by movieId.
            ratings_dir: Directory containing ``ratings.dat``. When ``None``
                the notebook-level ``path`` variable is used.
        """
        # NOTE: joblib.load unpickles arbitrary objects; only load trusted files.
        self.model = joblib.load(path)
        self.cbf_data_path = cbf_data_path
        self.ratings_dir = ratings_dir
        self._cbf_data = None
        self._movies_by_user = None

    def _load_data(self):
        """Populate the feature matrix and the user -> movieId-list map once."""
        if self._cbf_data is None:
            self._cbf_data = joblib.load(self.cbf_data_path)
        if self._movies_by_user is None:
            # `path` here is the notebook-level data directory (a global),
            # not the model file given to __init__.
            ratings_dir = path if self.ratings_dir is None else self.ratings_dir
            ratings = load_ratings(ratings_dir)
            self._movies_by_user = {
                uid: movie_ids.tolist()
                for uid, movie_ids in ratings.groupby('userId')['movieId']
            }

    def predict(self, userid, n):
        """Return the indices of the ``n`` nearest neighbours for a user.

        The query vector is the mean of the feature vectors of every movie
        the user has rated (previously the vectors were summed, which did not
        match the documented intent).

        Args:
            userid: User to recommend for.
            n: Number of neighbours to request from the model.

        Returns:
            The first row of the index array returned by ``kneighbors``, or an
            empty integer array when the user has no ratings.
        """
        self._load_data()

        # Movies this user has rated.
        movie_list = self._movies_by_user.get(userid, [])
        if not movie_list:
            # No history means no query vector can be built.
            return np.empty(0, dtype=int)

        # Query vector: average of the user's movie vectors.
        m_vector = np.asarray(self._cbf_data)[movie_list].mean(axis=0)

        # kneighbors returns (distances, indices); keep the indices of the
        # single query row.
        return self.model.kneighbors(m_vector.reshape((1, -1)), n_neighbors=n)[1][0]

# Request recommendations for a single example user.
user_id, num_recommendations = 56, 20
knn_model = KNN('/content/knn.joblib')
recommendations = knn_model.predict(userid=user_id, n=num_recommendations)

In [None]:
def train(df_movies=None, vector_size=100, pretrained = 'glove-twitter-100'):
    """Build movie feature vectors and fit a nearest-neighbour model on them.

    Steps: tokenize the titles, train a Word2Vec model on the tokens, turn
    each token list into a title vector, sum pretrained word vectors over each
    movie's genres, average the title and genre vectors, store them in a
    matrix indexed by movieId and fit ``NearestNeighbors`` on that matrix.

    Files written under ``./models``: ``word2vec.model``, ``cbf_data.joblib``
    and ``knn.joblib``.

    NOTE(review): ``tokenizer`` and ``vectorizer`` are not defined in this
    function; they must exist in the enclosing scope - confirm.

    Args:
        df_movies: Frame with 'movieId', 'title' and 'genres' columns.
            When ``None`` it is loaded with ``load_movies('/content')``.
        vector_size: Word2Vec vector size and width of the feature matrix.
            The title and genre vectors are added together, so this has to
            match the dimensionality of ``pretrained``.
        pretrained: Name handed to ``gensim.downloader.load``.
    """
    if df_movies is None:
        df_movies = load_movies('/content')

    # Every artefact below is written into ./models, so make sure it exists.
    os.makedirs("./models", exist_ok=True)

    print("---Tokenizing...---")
    tokens = df_movies['title'].apply(tokenizer)
    print("Tokenizing Complete.")

    print("---w2v Training...---")
    w2v = Word2Vec(sentences=tokens, vector_size = vector_size, window = 2, min_count = 1, workers = 4, sg= 0)
    w2v.save("./models/word2vec.model")
    print(w2v.wv.vectors.shape)
    print("w2v Training Complete.")

    vectors = tokens.apply(vectorizer)

    print("---pre-trained w2v loading...---")
    # Load the pretrained word vectors.
    wv2 = api.load(pretrained)
    print("loading Complete.")

    # Genre labels that are looked up under a different key in the
    # pretrained vectors.
    genre_aliases = {"children's": "children", "film-noir": "noir"}

    def gen2vec(sentence):
        """Sum the pretrained vectors of the '|'-separated genres."""
        vector = 0
        for genre in sentence.split('|'):
            key = genre.lower()
            vector += wv2[genre_aliases.get(key, key)]
        return vector

    g_vector = df_movies['genres'].apply(gen2vec)

    # Training data: average of title vector and genre vector per movie,
    # stored at the row whose index equals the movieId.
    cbf_vectors = ((vectors.to_numpy() + g_vector.to_numpy()) / 2).tolist()
    cbf_data = np.zeros((df_movies['movieId'].max()+1, vector_size))

    for idx, vec in zip(df_movies['movieId'], cbf_vectors):
        cbf_data[idx] = vec

    joblib.dump(cbf_data, "./models/cbf_data.joblib")

    print("---knn Training...---")
    knn = NearestNeighbors()
    knn.fit(cbf_data)
    joblib.dump(knn, "./models/knn.joblib")
    print("---knn Done.---")

# if __name__ == '__main__':
#     opt = parse_opt()
#     knn = KNN('/content/knn.joblib')
#     print(knn.predict(opt.user, opt.num))

if __name__ == '__main__':
    opt = parse_opt()
    knn = KNN('/content/knn.joblib')

    # Collect one (user_id, recommendations) pair per user. The IDs come from
    # the ratings table itself rather than a hard-coded 1..6040 range, so the
    # loop follows whatever dataset was loaded and never asks for a user that
    # has no ratings.
    all_user_recommendations = []

    for user_id in sorted(int(uid) for uid in df_ratings['userId'].unique()):
        recommendations = knn.predict(user_id, opt.num)
        all_user_recommendations.append((user_id, recommendations))

    # Print the recommendation lists of the first five users.
    print(all_user_recommendations[:5])


OSError: ignored

In [None]:
# Show the last (user_id, recommendations) pair that was collected.
all_user_recommendations[-1]

(1766,
 array([1205, 1264, 1215,   70, 1196,  258, 1210, 2322, 2054,  610, 2080,
        2081,  673,  558, 1566, 2872, 3257, 3584,  247,  168]))

In [None]:
user_id = 56      # user to inspect
min_rating = 4    # lowest rating kept
max_rating = 5    # highest rating kept

# movieIds this user rated within [min_rating, max_rating], both ends inclusive.
user_56_high_rated_movies = df_ratings.loc[
    (df_ratings['userId'] == user_id)
    & df_ratings['rating'].between(min_rating, max_rating),
    'movieId',
].tolist()

print(f"User {user_id}가 평점 {min_rating} 이상 {max_rating} 이하로 준 영화 목록:")
print(user_56_high_rated_movies)

User 56가 평점 4 이상 5 이하로 준 영화 목록:
[2997, 588, 1, 6, 3948, 1408, 3019, 2302, 1500, 3114, 2321, 2329, 1617, 2427, 2355, 2359, 2289, 1635, 1641, 110, 2396, 2542, 2706, 150, 151, 161, 1923, 2580, 318, 246, 25, 2762, 34, 36, 39, 364, 296, 3728, 2858, 440, 50, 527, 457, 608, 1213, 474, 2890, 2028, 1089, 628]


In [None]:
# # 예시 데이터
# actual_preference = user_56_high_rated_movies # 실제 선호 영화 리스트
# recommended_movies = knn.predict(opt.user, opt.num)  # 예측된 추천 영화 리스트

def calculate_dcg(recommended, actual):
    """Discounted cumulative gain of ``recommended`` with binary relevance.

    An item at 1-based rank ``r`` contributes ``1 / log2(r + 1)`` when it is
    contained in ``actual`` and nothing otherwise.

    Args:
        recommended: Ranked sequence of recommended movie IDs.
        actual: Collection of movie IDs the user actually prefers.

    Returns:
        The DCG value; ``0.0`` when no recommended item is relevant.
    """
    relevant = set(actual)
    return sum(
        (
            1.0 / np.log2(position + 2)
            for position, movie_id in enumerate(recommended)
            if movie_id in relevant
        ),
        0.0,
    )


def calculate_ndcg(recommended, actual, k=None):
    """Normalised DCG of ``recommended`` against ``actual``.

    The ideal DCG places relevant items in every one of the first
    ``min(k, len(actual))`` positions, so a perfect top-k list scores 1.0
    even when the user has more than ``k`` relevant items. With ``k=None``
    the whole list is scored and the ideal covers all of ``actual``.

    Args:
        recommended: Ranked sequence of recommended movie IDs.
        actual: Collection of movie IDs the user actually prefers.
        k: Optional cut-off; only the first ``k`` recommendations count.

    Returns:
        DCG divided by ideal DCG, or ``0.0`` when the ideal DCG is zero.
    """
    if k is not None:
        recommended = recommended[:k]  # only the top-k recommendations count

    # Number of positions a perfect ranking could fill with relevant items.
    n_ideal = len(actual) if k is None else min(k, len(actual))
    idcg = sum((1.0 / np.log2(position + 2) for position in range(n_ideal)), 0.0)
    if idcg == 0:
        return 0.0  # nothing relevant to find, avoid dividing by zero

    return calculate_dcg(recommended, actual) / idcg

def calculate_recall(recommended, actual, k=None):
    """Fraction of ``actual`` that shows up in the recommendations.

    Args:
        recommended: Ranked sequence of recommended movie IDs.
        actual: Collection of movie IDs the user actually prefers.
        k: Optional cut-off; only the first ``k`` recommendations count.

    Returns:
        Number of distinct recommended items found in ``actual`` divided by
        ``len(actual)``; ``0.0`` when ``actual`` is empty.
    """
    top_items = recommended if k is None else recommended[:k]
    hits = set(top_items).intersection(actual)
    n_relevant = len(actual)
    # Guard against dividing by zero when nothing is relevant.
    return len(hits) / n_relevant if n_relevant else 0.0

# Compute NDCG@k and Recall@k for the single user inspected above.
k = 20  # number of top recommendations to score

# Both inputs were previously only present as commented-out code, which left
# the names undefined on a fresh kernel.
actual_preference = user_56_high_rated_movies   # movies the user rated highly
recommended_movies = knn.predict(user_id, k)    # recommendations for the same user

ndcg = calculate_ndcg(recommended_movies, actual_preference, k)
recall = calculate_recall(recommended_movies, actual_preference, k)

print("NDCG@{}: {:.4f}".format(k, ndcg))
print("Recall@{}: {:.4f}".format(k, recall))

NDCG@20: 0.0000
Recall@20: 0.0000


In [None]:
import numpy as np
import pandas as pd

def compute_ndcg_k(predictions, actual, k=20):
    """NDCG@k with binary relevance.

    Args:
        predictions: Ranked sequence of predicted item IDs.
        actual: Collection of relevant item IDs.
        k: Cut-off rank.

    Returns:
        DCG of the first ``k`` predictions divided by the ideal DCG over
        ``min(k, len(actual))`` positions; ``0.0`` when there are no
        predictions or the ideal DCG is zero.
    """
    if len(predictions) == 0:
        return 0.0

    # Gain of every top-k prediction that is relevant, discounted by rank.
    dcg = sum(
        (
            1.0 / np.log2(rank + 2)
            for rank, item in enumerate(predictions[:k])
            if item in actual
        ),
        0.0,
    )
    # Best achievable DCG: relevant items fill the leading positions.
    idcg = sum((1.0 / np.log2(rank + 2) for rank in range(min(k, len(actual)))), 0.0)

    return dcg / idcg if idcg != 0.0 else 0.0

def compute_recall_k(predictions, actual, k=20):
    """Recall@k: share of the relevant items found in the top ``k`` predictions.

    The denominator is the number of distinct relevant items. Previously it
    was a hard-coded 20, which ignored both ``k`` and the size of ``actual``.

    Args:
        predictions: Ranked sequence of predicted item IDs.
        actual: Collection of relevant item IDs.
        k: Cut-off rank.

    Returns:
        A value in [0, 1]; ``0.0`` when ``actual`` is empty.
    """
    relevant = set(actual)
    if not relevant:
        return 0.0  # nothing to recall, avoid dividing by zero
    return len(set(predictions[:k]) & relevant) / float(len(relevant))

def evaluate(df_rating, all_user_recommendations, k=20):
    """Average NDCG@k and Recall@k over all users' recommendations.

    The ratings are grouped by user once up front instead of scanning the
    whole frame again for every user.

    Args:
        df_rating: Frame with 'userId' and 'movieId' columns; every movie a
            user has rated counts as relevant for that user.
        all_user_recommendations: Iterable of ``(user, recommendations)``
            pairs.
        k: Cut-off rank forwarded to the metric helpers.

    Returns:
        Dict with keys 'average_ndcg' and 'average_recall'.
    """
    movies_by_user = {
        user: movie_ids.tolist()
        for user, movie_ids in df_rating.groupby('userId')['movieId']
    }

    ndcgs = []
    recalls = []

    for user, recommendations in all_user_recommendations:
        # Users without any rating get an empty relevance list.
        actual_movies = movies_by_user.get(user, [])
        ndcgs.append(compute_ndcg_k(recommendations, actual_movies, k))
        recalls.append(compute_recall_k(recommendations, actual_movies, k))

    return {
        'average_ndcg': np.mean(ndcgs),
        'average_recall': np.mean(recalls)
    }


# Score every collected recommendation list against the movies recorded for
# that user in df_ratings, then print the averaged metrics.
results = evaluate(df_ratings, all_user_recommendations)
print(results)


{'average_ndcg': 0.16306108705228783, 'average_recall': 0.16962061155152885}
