In [None]:
# 필요한 패키지 설치 및 패키지 불러오기
!pip install surprise
!pip install tmdbv3api

import pandas as pd
import numpy as np
import math
import re
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD, NormalPredictor, KNNBasic
from surprise import KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise import BaselineOnly, SVDpp,NMF, SlopeOne, CoClustering
from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import cross_validate, KFold, train_test_split
import json
import mpmath
from scipy.special import logsumexp

In [None]:
# Data sources
# Kaggle
# Netflix Prize Data

# movie_titles.csv
# combined_data_1.txt
# combined_data_2.txt
# combined_data_3.txt
# combined_data_4.txt

# The Movies Dataset

# movies_metadata
# ratings
# ratings_small

In [None]:
# Preprocessing - ratings_small: keep the three rating columns and coerce each to a number

ratings = pd.read_csv('ratings_small.csv', low_memory=False)[['userId', 'movieId', 'rating']]
for column in ('movieId', 'userId', 'rating'):
    # Unparseable values become NaN instead of raising.
    ratings[column] = pd.to_numeric(ratings[column], errors='coerce')

# `df` is the working name used by every later cell.
df = ratings
print(df)

In [None]:
# Count ratings per score, distinct movies, distinct users and rating rows

p = df.groupby('rating')['rating'].agg(['count'])   # number of ratings per score value
movie_count = df['movieId'].nunique()               # distinct movies
cust_count = df['userId'].nunique()                 # distinct users
rating_count = df['userId'].count()                 # rows with a user id

# The counts above are taken before rows without a rating are removed.
df = df[df['rating'].notna()]

In [None]:
# Choose the cut-off for movies: drop movies with fewer ratings than the 70th percentile

f = ['count', 'mean']   # aggregations, reused for the per-user summary in the next cell
df_movie_summary = df.groupby('movieId')['rating'].agg(f)
# Index.map applies int() to every label, casting the whole index in one call.
df_movie_summary.index = df_movie_summary.index.map(int)

movie_benchmark = round(df_movie_summary['count'].quantile(0.7), 0)   # 70% quantile of rating counts
rarely_rated = df_movie_summary['count'] < movie_benchmark
drop_movie_list = df_movie_summary[rarely_rated].index
print(f'Movie minimum times of review: {movie_benchmark}')

In [None]:


# Same cut-off for users: drop users with fewer ratings than the 70th percentile.
# Relies on `f` (the aggregation list) defined in the previous cell.
df_cust_summary = df.groupby('userId')['rating'].agg(f)
df_cust_summary.index = df_cust_summary.index.map(int)

cust_benchmark = round(df_cust_summary['count'].quantile(0.7), 0)
rarely_rating = df_cust_summary['count'] < cust_benchmark
drop_cust_list = df_cust_summary[rarely_rating].index
print(f'Customer minimum times of review: {cust_benchmark}')

In [None]:
# Keep only ratings whose movie AND user both passed their review-count cut-offs.
df = df[~(df['movieId'].isin(drop_movie_list) | df['userId'].isin(drop_cust_list))]

In [None]:
# Build the user x movie rating matrix (unrated cells are NaN)

df_p = df.pivot_table(values='rating', index='userId', columns='movieId')
print(df_p)

In [None]:
# Load movies_metadata, keeping only the columns the weighting functions need

meta = (
    pd.read_csv('movies_metadata.csv', low_memory=False)
    .loc[:, ['id', 'original_title', 'genres', 'release_date', 'popularity', 'original_language']]
    .rename(columns={'id': 'movieId'})
    .assign(
        # Unparseable ids / popularity values become NaN instead of raising.
        movieId=lambda frame: pd.to_numeric(frame['movieId'], errors='coerce'),
        popularity=lambda frame: pd.to_numeric(frame['popularity'], errors='coerce'),
    )
)

In [None]:
# Release-date weighting
def user_release_ratio(df, usernumber, metadata=None):
    """Return the share of a user's rated movies that falls into each release-year bucket.

    Parameters
    ----------
    df : DataFrame with at least ``userId`` and ``movieId`` columns.
    usernumber : user id whose ratings are analysed.
    metadata : optional DataFrame with ``id``, ``original_title``, ``release_date``
        and ``genres`` columns. When omitted, ``movies_metadata.csv`` is read from
        the working directory (the original behaviour).

    Returns
    -------
    dict mapping the bucket upper bound (``'1900'`` ... ``'2020'``) to the fraction
    of the user's matched movies in that bucket, rounded to 3 decimals. Empty
    buckets stay ``0``. Movies released after 2020 are not counted in any bucket.
    """
    user_ratings = df[df['userId'] == usernumber]
    if metadata is None:
        metadata = pd.read_csv('movies_metadata.csv', low_memory=False)

    movie_meta = (
        metadata[['id', 'original_title', 'release_date', 'genres']]
        .rename(columns={'id': 'movieId'})
        .assign(movieId=lambda frame: pd.to_numeric(frame['movieId'], errors='coerce'))
        .dropna(axis=0)
    )
    # Left merge + dropna keeps only the user's ratings that have complete metadata.
    rated = pd.merge(user_ratings, movie_meta, on='movieId', how='left').dropna(axis=0)

    year_bounds = (1900, 1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020)
    release_date_list = {str(bound): 0 for bound in year_bounds}
    for release_date in rated['release_date']:
        year = int(release_date[0:4])   # release_date is a 'YYYY-MM-DD' style string
        for bound in year_bounds:
            if year <= bound:
                release_date_list[str(bound)] += 1
                break

    total = sum(release_date_list.values())
    for bucket, count in release_date_list.items():
        if count:   # total > 0 whenever any count is non-zero
            release_date_list[bucket] = round(count / total, 3)
    return release_date_list

In [None]:
# Release-date weight + predicted score
def Estimate_Score_sum1(user_df, user_release_ratio_list):
    """Add the user's release-year weight to every movie's ``Estimate_Score``.

    Rows containing any NaN are dropped first. Each remaining movie gets the
    weight of the first year bucket whose upper bound is >= its release year;
    movies released after 2020 are left unchanged.

    Parameters
    ----------
    user_df : DataFrame with ``release_date`` (string starting with the year)
        and ``Estimate_Score`` columns. It is not modified.
    user_release_ratio_list : dict keyed by ``'1900'`` ... ``'2020'`` as
        returned by ``user_release_ratio``.

    Returns
    -------
    A new DataFrame with the adjusted ``Estimate_Score`` column.
    """
    year_bounds = (1900, 1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020)

    def release_bonus(release_date):
        year = int(release_date[0:4])
        for bound in year_bounds:
            if year <= bound:
                return user_release_ratio_list[str(bound)]
        return 0

    # One column-level assignment replaces the per-row chained
    # `frame['col'].loc[label] = value`, which is a no-op under Copy-on-Write.
    scored = user_df.dropna(axis=0).copy()
    bonus = scored['release_date'].map(release_bonus).astype(float)
    scored['Estimate_Score'] = scored['Estimate_Score'] + bonus
    return scored

In [None]:
# Popularity weighting
def user_pop_ratio(df, usernumber, metadata=None):
    """Return the share of a user's rated movies that falls into each popularity bucket.

    Parameters
    ----------
    df : DataFrame with at least ``userId`` and ``movieId`` columns.
    usernumber : user id whose ratings are analysed.
    metadata : optional DataFrame with ``id``, ``original_title``, ``popularity``
        and ``genres`` columns. When omitted, ``movies_metadata.csv`` is read from
        the working directory (the original behaviour).

    Returns
    -------
    dict mapping the bucket upper bound (``'2'`` ... ``'547'``) to the fraction of
    the user's matched movies in that bucket, rounded to 3 decimals. Empty buckets
    stay ``0``. Popularity is truncated with ``int()`` before bucketing, and
    values above 547 are not counted.
    """
    user_ratings = df[df['userId'] == usernumber]
    if metadata is None:
        metadata = pd.read_csv('movies_metadata.csv', low_memory=False)

    movie_meta = (
        metadata[['id', 'original_title', 'popularity', 'genres']]
        .rename(columns={'id': 'movieId'})
        .assign(
            movieId=lambda frame: pd.to_numeric(frame['movieId'], errors='coerce'),
            popularity=lambda frame: pd.to_numeric(frame['popularity'], errors='coerce'),
        )
        .dropna(axis=0)
    )
    # Left merge + dropna keeps only the user's ratings that have complete metadata.
    rated = pd.merge(user_ratings, movie_meta, on='movieId', how='left').dropna(axis=0)

    popularity_bounds = (2, 6, 9, 13, 18, 31, 64, 185, 288, 547)
    popularity_list = {str(bound): 0 for bound in popularity_bounds}
    for popularity in rated['popularity']:
        level = int(popularity)   # truncation, e.g. 2.9 -> 2
        for bound in popularity_bounds:
            if level <= bound:
                popularity_list[str(bound)] += 1
                break

    total = sum(popularity_list.values())
    for bucket, count in popularity_list.items():
        if count:   # total > 0 whenever any count is non-zero
            popularity_list[bucket] = round(count / total, 3)
    return popularity_list

In [None]:
# Popularity weight + predicted score
def Estimate_Score_sum2(user_df, user_pop_ratio_list):
    """Add the user's popularity weight to every movie's ``Estimate_Score``.

    Rows containing any NaN are dropped first. Popularity is truncated with
    ``int()`` and each movie gets the weight of the first bucket whose upper
    bound is >= that value; movies above 547 are left unchanged.

    Parameters
    ----------
    user_df : DataFrame with numeric ``popularity`` and ``Estimate_Score``
        columns. It is not modified.
    user_pop_ratio_list : dict keyed by ``'2'`` ... ``'547'`` as returned by
        ``user_pop_ratio``.

    Returns
    -------
    A new DataFrame with the adjusted ``Estimate_Score`` column.
    """
    popularity_bounds = (2, 6, 9, 13, 18, 31, 64, 185, 288, 547)

    def popularity_bonus(popularity):
        level = int(popularity)
        for bound in popularity_bounds:
            if level <= bound:
                return user_pop_ratio_list[str(bound)]
        return 0

    # One column-level assignment replaces the per-row chained
    # `frame['col'].loc[label] = value`, which is a no-op under Copy-on-Write.
    scored = user_df.dropna(axis=0).copy()
    bonus = scored['popularity'].map(popularity_bonus).astype(float)
    scored['Estimate_Score'] = scored['Estimate_Score'] + bonus
    return scored

In [None]:
# Original-language weighting
def user_language_ratio(df, usernumber, metadata=None):
    """Return the share of a user's rated movies per tracked original language.

    Parameters
    ----------
    df : DataFrame with at least ``userId`` and ``movieId`` columns.
    usernumber : user id whose ratings are analysed.
    metadata : optional DataFrame with ``id``, ``original_title``,
        ``original_language`` and ``genres`` columns. When omitted,
        ``movies_metadata.csv`` is read from the working directory (the
        original behaviour).

    Returns
    -------
    dict with keys ``'en'``, ``'fr'``, ``'it'``, ``'ja'``, ``'de'`` mapping to the
    fraction of the user's matched movies in that language, rounded to 3 decimals.
    Movies in any other language are ignored (they are also excluded from the
    denominator). Languages without movies stay ``0``.
    """
    user_ratings = df[df['userId'] == usernumber]
    if metadata is None:
        metadata = pd.read_csv('movies_metadata.csv', low_memory=False)

    movie_meta = (
        metadata[['id', 'original_title', 'original_language', 'genres']]
        .rename(columns={'id': 'movieId'})
        .assign(movieId=lambda frame: pd.to_numeric(frame['movieId'], errors='coerce'))
        .dropna(axis=0)
    )
    # Left merge + dropna keeps only the user's ratings that have complete metadata.
    rated = pd.merge(user_ratings, movie_meta, on='movieId', how='left').dropna(axis=0)

    original_language_list = {'en': 0, 'fr': 0, 'it': 0, 'ja': 0, 'de': 0}
    for language in rated['original_language']:
        if language in original_language_list:
            original_language_list[language] += 1

    total = sum(original_language_list.values())
    for language, count in original_language_list.items():
        if count:   # total > 0 whenever any count is non-zero
            original_language_list[language] = round(count / total, 3)
    return original_language_list

In [None]:
# Original-language weight + predicted score
def Estimate_Score_sum3(user_df, user_language_ratio_list):
    """Add the user's language weight to every movie's ``Estimate_Score``.

    Rows containing any NaN are dropped first. Only movies whose
    ``original_language`` is one of ``en``, ``fr``, ``it``, ``ja``, ``de``
    receive a bonus; every other language is left unchanged.

    Parameters
    ----------
    user_df : DataFrame with ``original_language`` and ``Estimate_Score``
        columns. It is not modified.
    user_language_ratio_list : dict keyed by language code as returned by
        ``user_language_ratio``.

    Returns
    -------
    A new DataFrame with the adjusted ``Estimate_Score`` column.
    """
    tracked_languages = ('en', 'fr', 'it', 'ja', 'de')

    def language_bonus(language):
        if language in tracked_languages:
            return user_language_ratio_list[language]
        return 0

    # One column-level assignment replaces the per-row chained
    # `frame['col'].loc[label] = value`, which is a no-op under Copy-on-Write.
    scored = user_df.dropna(axis=0).copy()
    bonus = scored['original_language'].map(language_bonus).astype(float)
    scored['Estimate_Score'] = scored['Estimate_Score'] + bonus
    return scored

In [None]:
# Vote-average + vote-count weighting
def user_vote_ratio():
    """Build vote-based features for every movie and join them onto the ratings.

    Reads ``movies_metadata.csv`` from the working directory and uses two names
    from the notebook namespace: the ``ratings`` DataFrame and ``parse_genres``.

    Derived columns
    ---------------
    vote_round : ``vote_average`` rounded to the nearest integer, except values
        whose printed form ends in ``0`` or ``5`` (e.g. 7.0, 7.5), which are kept.
    Adj : ``vote_average * (digits + leading_digit / 10) / 250`` where ``digits``
        is the number of digits of ``vote_count`` and ``leading_digit`` its first
        digit. One-digit counts get the constant ``1.1 / 250``. Counts of 20000 or
        more have no rule and get NaN (the original left an empty string there).

    Returns
    -------
    (data, meta) : ``data`` is the inner merge of ``ratings`` with ``meta`` on
    ``movieId`` with compact dtypes; ``meta`` is the per-movie feature table.
    """
    meta = pd.read_csv('movies_metadata.csv', low_memory=False)
    meta = meta[['id', 'original_title', 'genres', 'poster_path', 'vote_average', 'vote_count']]
    meta = meta.rename(columns={'id': 'movieId'})
    for column in ('movieId', 'vote_average', 'vote_count'):
        meta[column] = pd.to_numeric(meta[column], errors='coerce')
    meta = (
        meta[['movieId', 'original_title', 'poster_path', 'vote_average', 'vote_count', 'genres']]
        .dropna()
        .reset_index(drop=True)
        .astype({'movieId': int, 'vote_average': float, 'vote_count': int})
    )

    average = meta['vote_average']

    # Round unless the printed value ends in 0 or 5 (same string test as before,
    # evaluated once per row instead of through chained per-row assignment).
    needs_rounding = average.map(lambda value: str(value)[-1] in '12346789').astype(bool)
    meta['vote_round'] = average.where(~needs_rounding, average.round(0))

    # The original if-ladder multiplied by 2.1 ... 5.1, i.e. <digits>.<leading digit>.
    count_text = meta['vote_count'].astype(str)
    n_digits = count_text.str.len()
    leading = pd.to_numeric(count_text.str[0], errors='coerce')
    multiplier = (10 * n_digits + leading) / 10
    adj = (average * multiplier) / 250
    has_rule = n_digits.between(2, 4) | ((n_digits == 5) & (leading == 1))
    adj = adj.where(has_rule)                   # NaN where the ladder had no branch
    adj = adj.mask(n_digits == 1, 1.1 / 250)    # one-digit counts: constant, no vote_average
    meta['Adj'] = adj

    print(meta.head())
    meta.info()

    meta['genres'] = meta['genres'].apply(parse_genres)

    data = pd.merge(ratings, meta, on='movieId', how='inner')
    data.info()
    data['userId'] = data['userId'].astype('int32')
    data['movieId'] = data['movieId'].astype('int32')
    data['rating'] = data['rating'].astype('float32')
    data['vote_average'] = data['vote_average'].astype('float32')
    data['vote_count'] = data['vote_count'].astype('int32')
    data['vote_round'] = data['vote_round'].astype('float32')
    data['Adj'] = pd.to_numeric(data['Adj'], errors='coerce').astype('float32')
    return data, meta

In [None]:
# Genre clean-up
def parse_genres(genres_str):
    """Turn a genres cell such as ``"[{'id': 16, 'name': 'Animation'}]"`` into a list of names.

    Parameters
    ----------
    genres_str : string holding a list of dicts with a ``name`` key, written
        either as a Python literal (single quotes) or as JSON. A value that is
        already a list is returned unchanged, so applying the function twice to
        the same column is harmless.

    Returns
    -------
    list of genre names in their original order.
    """
    import ast

    if isinstance(genres_str, list):
        return genres_str

    try:
        # Parses the Python-literal form directly, so names that contain an
        # apostrophe survive (blind quote replacement would corrupt them).
        genres = ast.literal_eval(genres_str)
    except (ValueError, SyntaxError):
        # Fall back to the original JSON route (e.g. input containing null/true/false).
        genres = json.loads(genres_str.replace('\'', '"'))

    return [genre['name'] for genre in genres]

In [None]:
# Vote weight + predicted score
def Estimate_Score_sum4(df_user, user_df):
    """Boost candidate scores by how often the user liked movies with the same ``vote_round``.

    For every ``vote_round`` value among the user's liked movies the bonus is
    ``count / 100``. Each candidate whose ``vote_round`` equals such a value has
    the bonus added to ``Adj``; afterwards ``Adj`` is added to ``Estimate_Score``.

    The original compared candidates against ``dfsort.index`` after
    ``reset_index``, i.e. against row positions 0, 1, 2, ... instead of the
    ``vote_round`` values, so the bonus went to the wrong movies.

    Parameters
    ----------
    df_user : DataFrame of the user's liked movies with a ``vote_round`` column.
    user_df : candidate movies with ``vote_round``, ``Adj`` and
        ``Estimate_Score`` columns. Modified IN PLACE (callers ignore the
        return value); non-numeric ``Adj`` entries become NaN.

    Returns
    -------
    None
    """
    bonus_by_round = df_user.groupby('vote_round')['vote_round'].count() / 100
    # Compare as float64 on both sides so float32/object columns still match.
    bonus_by_round.index = bonus_by_round.index.astype(float)
    candidate_round = pd.to_numeric(user_df['vote_round'], errors='coerce').astype(float)
    bonus = candidate_round.map(bonus_by_round).fillna(0)

    # Whole-column assignment on the caller's frame; the chained per-row
    # `frame.col[idx] += x` form does not write back under Copy-on-Write.
    user_df['Adj'] = pd.to_numeric(user_df['Adj'], errors='coerce') + bonus
    user_df['Estimate_Score'] = user_df['Estimate_Score'] + user_df['Adj']

In [None]:
# ALS (alternating least squares) implementation
def ALS(df_p, r_lambda=40, nf=200, alpha=40, n_iterations=1):
    """Confidence-weighted alternating least squares on a user x item rating matrix.

    Parameters
    ----------
    df_p : 2-D array-like (e.g. the pivot table); NaN means "not rated".
    r_lambda : regularisation strength (default 40, the former hard-coded value).
    nf : number of latent factors (default 200).
    alpha : confidence scale; the confidence matrix is ``1 + alpha * R`` (default 40).
    n_iterations : number of alternating update sweeps. The default of 1 matches
        the original loop, which performed exactly one update sweep.

    Returns
    -------
    ndarray of shape ``(n_users, n_items)`` with the predicted preferences
    ``X @ Y.T``. The result is also printed, as before.

    Notes
    -----
    The factors are initialised with ``np.random.rand``; call ``np.random.seed``
    beforehand for reproducible output.
    """
    R = np.nan_to_num(np.array(df_p))
    nu, ni = R.shape

    # Initialise X (users) and Y (items) with very small values.
    X = np.random.rand(nu, nf) * 0.01
    Y = np.random.rand(ni, nf) * 0.01

    # P: binary preference (1 where a positive rating exists); C: confidence.
    P = np.copy(R)
    P[P > 0] = 1
    C = 1 + alpha * R

    regulariser = r_lambda * np.identity(nf)   # loop-invariant, built once

    for _ in range(n_iterations):
        yT = np.transpose(Y)
        for u in range(nu):
            # yT * C[u] scales the columns of yT, which equals yT @ diag(C[u])
            # without materialising an (n_items x n_items) diagonal matrix.
            yT_Cu = yT * C[u]
            X[u] = np.linalg.solve(np.matmul(yT_Cu, Y) + regulariser, np.matmul(yT_Cu, P[u]))

        xT = np.transpose(X)
        for item in range(ni):
            xT_Ci = xT * C[:, item]
            Y[item] = np.linalg.solve(np.matmul(xT_Ci, X) + regulariser, np.matmul(xT_Ci, P[:, item]))

    predict = np.matmul(X, np.transpose(Y))
    print('final predict')
    print([predict])

    return predict

In [None]:
# Reader() defaults to rating_scale=(1, 5), which would clip predictions for
# ratings outside that range (e.g. half-star ratings below 1). Take the scale
# from the data so it is right whatever the range is; for 1-5 data this is
# identical to the default.
reader = Reader(rating_scale=(df['rating'].min(), df['rating'].max()))

In [None]:
# Wrap the filtered ratings for Surprise and create the four algorithms to compare.
data = Dataset.load_from_df(df.loc[:, ['userId', 'movieId', 'rating']], reader)

svd, slope, nmf = SVD(), SlopeOne(), NMF()

# Baseline estimates fitted with ALS.
bsl_options = dict(method='als', n_epochs=5, reg_u=12, reg_i=5)
als = BaselineOnly(bsl_options=bsl_options)

# 5-fold cross-validation reporting RMSE and MAE for every algorithm.
# No random seed is set, so the folds (and scores) differ between runs.
als_result = cross_validate(als, data, measures=['RMSE', 'MAE'], cv=5, verbose=False)
slope_result = cross_validate(slope, data, measures=['RMSE', 'MAE'], cv=5, verbose=False)
svd_result = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5)   # used instead of evaluate
nmf_result = cross_validate(nmf, data, measures=['RMSE', 'MAE'], cv=5, verbose=False)

In [None]:
# Compare RMSE and MAE across the algorithms (Slope One, SVD, NMF, ALS baseline)
for cv_result in (slope_result, svd_result, nmf_result, als_result):
    print(cv_result)

In [None]:
# Parse the genres strings into lists of names. Cells that are already lists are
# kept as they are, so re-running this cell does not crash on parsed data.
meta['genres'] = meta['genres'].apply(
    lambda genres: genres if isinstance(genres, list) else parse_genres(genres)
)

In [None]:
# Personal movie recommendations for one user
def user_difference(data,usernumber,rating,moviedata,dropdata,reader,svd):
    """Print the titles a user gave ``rating`` to and return all candidates ranked by predicted score.

    Parameters
    ----------
    data : ratings DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
    usernumber : user to recommend for.
    rating : rating value used to pick the "liked" movies that are printed.
    moviedata : movie metadata with ``movieId`` and ``original_title`` columns.
    dropdata : movie ids to exclude from the candidates.
    reader : Surprise ``Reader`` used to wrap ``data``.
    svd : Surprise algorithm; it is (re)fitted on all of ``data``.

    Returns
    -------
    Copy of the candidate movies with an ``Estimate_Score`` column, sorted by
    that score in descending order.
    """
    liked = data[(data['userId'] == usernumber) & (data['rating'] == rating)].set_index('movieId')

    # Look titles up BY MOVIE ID. Joining against `moviedata` directly would
    # match movieId with the metadata's positional row index (wrong titles).
    titles = (
        moviedata.dropna(subset=['movieId'])
        .drop_duplicates(subset='movieId')
        .set_index('movieId')['original_title']
    )
    df_user = liked.join(titles)['original_title']
    print(df_user)

    # Filter first, then copy, so adding a column does not write into a slice.
    user_df = moviedata[~moviedata['movieId'].isin(dropdata)].copy()
    trainset = Dataset.load_from_df(data[['userId', 'movieId', 'rating']], reader).build_full_trainset()
    svd.fit(trainset)
    user_df['Estimate_Score'] = user_df['movieId'].apply(lambda movie_id: svd.predict(usernumber, movie_id).est)
    user_df = user_df.sort_values('Estimate_Score', ascending=False)
    print(user_df.head(10))

    return user_df


In [None]:
# Recommendations for two neighbouring users, based on their 5-star ratings.
user_df665 = user_difference(df, 665, 5, meta, drop_movie_list, reader, svd)
user_df664 = user_difference(df, 664, 5, meta, drop_movie_list, reader, svd)

# Top 12 rows for user 665; columns are selected by position (0, 1 and 6).
print(user_df665.iloc[:12, [0, 1, 6]])

In [None]:
# Movie recommendations as a user's ratings change
# NOTE: this redefines user_difference, which is already defined in an earlier
# cell; the later definition is the one used by the cells below.
def user_difference(data,usernumber,rating,moviedata,dropdata,reader,svd):
    """Print the titles a user gave ``rating`` to and return all candidates ranked by predicted score.

    Parameters
    ----------
    data : ratings DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
    usernumber : user to recommend for.
    rating : rating value used to pick the "liked" movies that are printed.
    moviedata : movie metadata with ``movieId`` and ``original_title`` columns.
    dropdata : movie ids to exclude from the candidates.
    reader : Surprise ``Reader`` used to wrap ``data``.
    svd : Surprise algorithm; it is (re)fitted on all of ``data``.

    Returns
    -------
    Copy of the candidate movies with an ``Estimate_Score`` column, sorted by
    that score in descending order.
    """
    liked = data[(data['userId'] == usernumber) & (data['rating'] == rating)].set_index('movieId')

    # Look titles up BY MOVIE ID. Joining against `moviedata` directly would
    # match movieId with the metadata's positional row index (wrong titles).
    titles = (
        moviedata.dropna(subset=['movieId'])
        .drop_duplicates(subset='movieId')
        .set_index('movieId')['original_title']
    )
    df_user = liked.join(titles)['original_title']
    print(df_user)

    # Filter first, then copy, so adding a column does not write into a slice.
    user_df = moviedata[~moviedata['movieId'].isin(dropdata)].copy()
    trainset = Dataset.load_from_df(data[['userId', 'movieId', 'rating']], reader).build_full_trainset()
    svd.fit(trainset)
    user_df['Estimate_Score'] = user_df['movieId'].apply(lambda movie_id: svd.predict(usernumber, movie_id).est)
    user_df = user_df.sort_values('Estimate_Score', ascending=False)
    print(user_df.head(10))

    return user_df


In [None]:
# Recommendations for user 1 before and after adding two new 5-star ratings.
user_1 = user_difference(df,1,5,meta,drop_movie_list,reader,svd)

# Append the new ratings to a separate frame. `df2 = df` would only alias `df`,
# so writing to labels 1 and 2 would overwrite any rows carrying those labels
# and leak the change into every later cell that uses `df`.
new_ratings = pd.DataFrame({'userId': [1, 1], 'movieId': [5502, 5991], 'rating': [5.0, 5.0]})
df2 = pd.concat([df, new_ratings], ignore_index=True)
add_user_1 = user_difference(df2,1,5,meta,drop_movie_list,reader,svd)

In [None]:
# Movie recommendations per variable weight
def variable_weight(data,usernumber,rating,moviedata,dropdata,reader,algo):
    """Rank candidate movies three times: with release-date, popularity and language weights.

    Parameters
    ----------
    data : ratings DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
    usernumber : user to recommend for.
    rating : kept for interface compatibility; it is not used by the ranking.
    moviedata : movie metadata with ``movieId``, ``release_date``, ``popularity``
        and ``original_language`` columns.
    dropdata : movie ids to exclude from the candidates.
    reader : Surprise ``Reader`` used to wrap ``data``.
    algo : Surprise algorithm; it is (re)fitted on all of ``data``.

    Returns
    -------
    (release_ranked, popularity_ranked, language_ranked) : three DataFrames,
    each sorted by its own adjusted ``Estimate_Score`` in descending order.
    The top 10 of each is printed.
    """
    # Per-user preference profiles (each helper reads movies_metadata.csv).
    user_release_ratio_list = user_release_ratio(data, usernumber)
    user_pop_ratio_list = user_pop_ratio(data, usernumber)
    user_language_ratio_list = user_language_ratio(data, usernumber)

    # Filter first, then copy, so adding a column does not write into a slice.
    user_df = moviedata[~moviedata['movieId'].isin(dropdata)].copy()
    trainset = Dataset.load_from_df(data[['userId', 'movieId', 'rating']], reader).build_full_trainset()
    algo.fit(trainset)
    user_df['Estimate_Score'] = user_df['movieId'].apply(lambda movie_id: algo.predict(usernumber, movie_id).est)
    user_df = user_df.sort_values('Estimate_Score', ascending=False)

    # Each weighting starts from the same unweighted predictions.
    user_df_sum_relase = Estimate_Score_sum1(user_df, user_release_ratio_list).sort_values('Estimate_Score', ascending=False)
    print("개봉일 별 가중치")
    print(user_df_sum_relase.head(10))

    user_df_sum_pop = Estimate_Score_sum2(user_df, user_pop_ratio_list).sort_values('Estimate_Score', ascending=False)
    print("인기 별 가중치")
    print(user_df_sum_pop.head(10))

    user_df_sum_lang = Estimate_Score_sum3(user_df, user_language_ratio_list).sort_values('Estimate_Score', ascending=False)
    print("언어 별 가중치")
    print(user_df_sum_lang.head(10))

    return user_df_sum_relase, user_df_sum_pop, user_df_sum_lang
user_df_sum_relase,user_df_sum_pop, user_df_sum_lang = variable_weight(df,665,5,meta,drop_movie_list,reader,svd)

In [None]:
# Build the vote-feature tables: `vote_data` (ratings merged with the movie
# features) and `vote_meta` (per-movie features). Reads movies_metadata.csv.
vote_data, vote_meta = user_vote_ratio()

In [None]:
# Movie recommendations with the vote-based weight
def variable_weight2(data,usernumber,rating,moviedata):
    """Rank candidate movies by SVD prediction plus the vote-based adjustment.

    Uses ``drop_movie_list``, ``reader`` and ``svd`` from the notebook namespace;
    ``svd`` is refitted on ``data``.

    Parameters
    ----------
    data : ratings merged with vote features (``userId``, ``movieId``,
        ``rating``, ``vote_round``, ...).
    usernumber : user to recommend for.
    rating : rating value that marks a movie as "liked" by the user.
    moviedata : per-movie feature table with ``movieId``, ``vote_round`` and ``Adj``.

    Returns
    -------
    Candidates sorted by adjusted ``Estimate_Score`` (descending), restricted to
    the columns at positions 0, 1, 3, 4, 5 and 8.
    """
    ratings_copy = data.copy()
    liked_mask = (ratings_copy['userId'] == usernumber) & (ratings_copy['rating'] == rating)
    # The metadata already carries the title, so no join is needed here.
    liked = ratings_copy[liked_mask].set_index('movieId')

    candidates = moviedata.copy()
    candidates = candidates[~candidates['movieId'].isin(drop_movie_list)]
    trainset = Dataset.load_from_df(ratings_copy[['userId', 'movieId', 'rating']], reader).build_full_trainset()
    svd.fit(trainset)
    candidates['Estimate_Score'] = candidates['movieId'].apply(
        lambda movie_id: svd.predict(usernumber, movie_id).est
    )
    candidates = candidates.reset_index(drop=True)

    # Adjusts `Adj` and `Estimate_Score` of `candidates` in place.
    Estimate_Score_sum4(liked, candidates)

    ranked = candidates.sort_values('Estimate_Score', ascending=False)
    return ranked.iloc[:, [0, 1, 3, 4, 5, 8]]
user_df_sum_vote = variable_weight2(vote_data,665,5,vote_meta)

In [None]:
# Movie recommendations per algorithm
def userRec3(data,usernumber,rating,moviedata,dropdata,reader,algo):
    """Return all candidate movies ranked by ``algo``'s predicted rating for one user.

    The original cell did not parse (a stray ``.`` after the
    ``user_language_ratio`` call) and computed three ratio profiles that were
    never used; both are removed.

    Parameters
    ----------
    data : ratings DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
    usernumber : user to recommend for.
    rating : kept for interface compatibility; it is not used by the ranking.
    moviedata : movie metadata with a ``movieId`` column.
    dropdata : movie ids to exclude from the candidates.
    reader : Surprise ``Reader`` used to wrap ``data``.
    algo : Surprise algorithm; it is (re)fitted on all of ``data``.

    Returns
    -------
    Copy of the candidate movies with an ``Estimate_Score`` column, sorted by
    that score in descending order.
    """
    # Filter first, then copy, so adding a column does not write into a slice.
    user_df = moviedata[~moviedata['movieId'].isin(dropdata)].copy()
    trainset = Dataset.load_from_df(data[['userId', 'movieId', 'rating']], reader).build_full_trainset()
    algo.fit(trainset)
    user_df['Estimate_Score'] = user_df['movieId'].apply(lambda movie_id: algo.predict(usernumber, movie_id).est)
    user_df = user_df.sort_values('Estimate_Score', ascending=False)

    return user_df
als_recomend = userRec3(df,665,5,meta,drop_movie_list,reader,als)
svd_recomend = userRec3(df,665,5,meta,drop_movie_list,reader,svd)
slope_recomend = userRec3(df,665,5,meta,drop_movie_list,reader,slope)
nmf_recomend = userRec3(df,665,5,meta,drop_movie_list,reader,nmf)

In [None]:
# Recommendations from each algorithm: ALS baseline, SVD, Slope One, NMF
for recommendations in (als_recomend, svd_recomend, slope_recomend, nmf_recomend):
    print(recommendations)