In [None]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

### 1.MovieLens 데이터 셋 불러오기

In [None]:
# Directory that holds the MovieLens CSV files. Defaults to the original Colab
# Drive folder; set the MOVIELENS_DIR environment variable to run elsewhere.
path = os.environ.get("MOVIELENS_DIR", "/content/drive/MyDrive/Colab Notebooks/movielens/")

In [None]:
# Resolve the three CSV locations inside the dataset directory first ...
ratings_csv = os.path.join(path, 'ratings.csv')
movies_csv = os.path.join(path, 'movies.csv')
tags_csv = os.path.join(path, 'tags.csv')

# ... then load them. Only movies.csv is indexed by its movieId column;
# the other two frames get pandas' default index.
ratings_df = pd.read_csv(ratings_csv, encoding='utf-8')
movies_df = pd.read_csv(movies_csv, encoding='utf-8', index_col='movieId')
tags_df = pd.read_csv(tags_csv, encoding='utf-8')

In [None]:
# Quick structural check of the ratings table.
print(ratings_df.shape)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
ratings_df.info()
# Preview the first rows (the full size is already printed above).
ratings_df.head()

In [None]:
# Distinct user and movie ids that appear in the ratings table.
# NOTE: despite the num_ prefix these variables hold the arrays of ids, not the
# counts; the names are kept as-is in case other cells refer to them.
num_users = ratings_df['userId'].unique()
num_movies = ratings_df['movieId'].unique()

# Number of distinct users and number of distinct movies.
print(len(num_users), len(num_movies))

In [None]:
# Reshape the long ratings table into a wide grid:
# one row per movieId, one column per userId, cell value = rating.
# (movie, user) pairs without a rating come out of the pivot as NaN and are
# replaced by 0.
user_movie_matrix = (
    ratings_df
    .pivot(index='movieId', columns='userId', values='rating')
    .fillna(0)
)
user_movie_matrix

In [None]:
# Number of movies each user has rated: count the non-zero cells in every user
# column (0 is the fill value for "no rating" in user_movie_matrix).
# Counting non-zero cells directly does not depend on how often 0 occurs in a
# column, and it is a single vectorized pass instead of a Python loop.
user_info_df = (
    user_movie_matrix
    .ne(0)
    .sum(axis=0)
    .to_frame(name='movies_rated')
)
user_info_df

In [None]:
# Number of users who rated each movie: count the non-zero cells in every movie
# row (0 is the fill value for "no rating" in user_movie_matrix).
# Counting non-zero cells directly does not depend on how often 0 occurs in a
# row, and it is a single vectorized pass instead of a Python loop.
movie_info_df = (
    user_movie_matrix
    .ne(0)
    .sum(axis=1)
    .to_frame(name='users_rated')
)
movie_info_df

In [None]:
# Show the ratings table once more before it is split into train and test sets.
ratings_df

In [None]:

# Hold out 20% of the rating rows as the test set; random_state pins the shuffle
# so the same split is produced on every run.
split_parts = train_test_split(ratings_df, test_size=0.2, random_state=1234)
train_df, test_df = split_parts

In [None]:
# (rows, columns) of the train split followed by the test split.
train_shape, test_shape = train_df.shape, test_df.shape
print(train_shape, test_shape)

### test set에는 존재하지만, train set에는 없는 영화 또는 사용자 수

In [None]:
# How many users appear in the test split but never in the train split.
test_user_ids = set(test_df['userId'])
train_user_ids = set(train_df['userId'])
len(test_user_ids - train_user_ids)

In [None]:
# Ids that occur in the test split but never in the train split.
# Each id set is built once and reused for all three lines.
train_users, test_users = set(train_df['userId']), set(test_df['userId'])
train_movies, test_movies = set(train_df['movieId']), set(test_df['movieId'])

print(f"사용자 : {len(test_users - train_users)}")
# This count is taken over movieId, so it is labelled as movies, not users.
print(f"영화 : {len(test_movies - train_movies)}")
print(f"테스트셋의 전체 영화 수: {len(test_movies)}")

In [None]:
# Movies that occur only in the train split (never in the test split).
# The label names what is actually counted: movie ids, train minus test.
train_only_movies = set(train_df['movieId']) - set(test_df['movieId'])
print(f"train set에만 있는 영화 : {len(train_only_movies)}")

## 2.간단한 추천 알고리즘 만들기
##### 1. 랜덤으로 평점예측  

In [None]:
 ratings_range= np.arange(0.5, 5.5, step=0.5)

In [None]:
import random

# Seed the generator so the random baseline produces the same predictions on
# every Restart & Run All (same seed value as the train/test split).
random.seed(1234)

# One uniformly drawn rating value per test row.
pred_random = [random.choice(ratings_range) for _ in range(len(test_df))]
# Preview only: the full list has one entry per test row.
pred_random[:10]

In [None]:
# Attach the random predictions as a new column. assign() returns a new frame,
# so nothing is written into the object that came out of train_test_split
# (plain column assignment on it can raise pandas' SettingWithCopyWarning).
test_df = test_df.assign(pred_ratings_random=pred_random)
test_df

In [None]:
# Score the random baseline: mean squared error against the true test ratings,
# and its square root (RMSE).
actual_ratings = test_df['rating'].values
random_ratings = test_df['pred_ratings_random'].values

mse = mean_squared_error(actual_ratings, random_ratings)
rmse = np.sqrt(mse)

print(mse, rmse)

##### 2. 사용자 평균 평점을 기반으로 예측

In [None]:
# Mean rating per user, computed on the train split only.
# Only 'rating' is averaged: the mean of the remaining columns (e.g. movieId)
# carries no meaning. The double brackets keep the result a DataFrame with a
# 'rating' column, which is what avg_rating_prediction looks up.
train_user_df = train_df.groupby('userId')[['rating']].mean()

print(train_user_df.shape)
train_user_df.head()

In [None]:
def avg_rating_prediction(train_set, x):
    """Return the predicted rating for id ``x`` from a table of mean ratings.

    Args:
        train_set: DataFrame indexed by id (user or movie) that has a
            'rating' column.
        x: The id to look up in ``train_set.index``.

    Returns:
        The 'rating' value stored for ``x`` when the id is present in the
        index; otherwise a value drawn at random from the module-level
        ``ratings_range``.
    """
    # Id not present in the table: fall back to a random rating.
    if x not in train_set.index:
        return random.choice(ratings_range)
    return train_set.loc[x, 'rating']

In [None]:
# Predict every test rating as the train-set mean rating of that row's user
# (avg_rating_prediction falls back to a random rating for unseen users).
# assign() builds a new frame instead of writing a column into the frame that
# came out of train_test_split, which can raise SettingWithCopyWarning.
test_df = test_df.assign(
    pred_rating_user=test_df['userId'].apply(
        lambda user_id: avg_rating_prediction(train_user_df, user_id)
    )
)

In [None]:
# Inspect the test frame, which now also carries the pred_rating_user column.
test_df

In [None]:
## 3. 영화 평균 평점을 기반으로 예측

In [None]:
# Mean rating per movie, computed on the train split only.
# Only 'rating' is averaged (the mean of the other columns carries no meaning);
# the double brackets keep a DataFrame with the 'rating' column that
# avg_rating_prediction reads.
train_movie_df = train_df.groupby('movieId')[['rating']].mean()

# Predict every test rating as the train-set mean rating of that row's movie
# (random fallback for movies that never occur in the train split).
# assign() returns a new frame rather than writing into the split result.
test_df = test_df.assign(
    pred_rating_movie=test_df['movieId'].apply(
        lambda movie_id: avg_rating_prediction(train_movie_df, movie_id)
    )
)
test_df

In [None]:
# Evaluate the two mean-based baselines against the true test ratings.
actual_ratings = test_df['rating'].values

# Per-user mean baseline: MSE, then RMSE as its square root.
user_mean_preds = test_df['pred_rating_user'].values
mse_user = mean_squared_error(actual_ratings, user_mean_preds)
rmse_user = np.sqrt(mse_user)
print(mse_user, rmse_user)

# Per-movie mean baseline, scored the same way.
movie_mean_preds = test_df['pred_rating_movie'].values
mse_movie = mean_squared_error(actual_ratings, movie_mean_preds)
rmse_movie = np.sqrt(mse_movie)
print(mse_movie, rmse_movie)