## 사용된 데이터 세트
- **The movies Dataset (https://www.kaggle.com/rounakbanik/the-movies-dataset)**

In [1]:
import pandas as pd 
import numpy as np

# 데이터 읽어오기 
movies = pd.read_csv('./dataset/net_movies.csv')
ratings = pd.read_csv('./dataset/net_ratings.csv')

In [2]:
# 아이템 기반 협업 필터링 
 
data = pd.merge(ratings,movies,on="movieId")
column = ['userId','movieId','rating','title','genres']
data = data[column]
print(data)
 
moviedata = data.pivot_table(index="movieId",  columns='userId')['rating']
print(moviedata)

         userId  movieId  rating  \
0             1        2     3.5   
1             5        2     3.0   
2            13        2     3.0   
3            29        2     3.0   
4            34        2     3.0   
...         ...      ...     ...   
1048570    7066    88572     1.5   
1048571    7066   112412     4.5   
1048572    7077    32013     3.5   
1048573    7086   102596     4.5   
1048574    7110    65651     2.0   

                                                     title  \
0                                           Jumanji (1995)   
1                                           Jumanji (1995)   
2                                           Jumanji (1995)   
3                                           Jumanji (1995)   
4                                           Jumanji (1995)   
...                                                    ...   
1048570                             Fred: The Movie (2010)   
1048571                             Perfect Sisters (2014)   
1048572  

In [3]:
# NaN값을 -1로 변경 ( 평점을 계산할 때 양수값만 처리하면 됌)

moviedata.fillna(-1, inplace=True)
moviedata

userId,1,2,3,4,5,6,7,8,9,10,...,7111,7112,7113,7114,7115,7116,7117,7118,7119,7120
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-1.0,-1.0,4.0,-1.0,-1.0,5.0,-1.0,4.0,-1.0,4.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,4.0,4.0,-1.0,5.0,4.5
2,3.5,-1.0,-1.0,-1.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,4.0
3,-1.0,4.0,-1.0,-1.0,-1.0,3.0,3.0,5.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,4.0,-1.0,-1.0,-1.0
4,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
5,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,4.0,3.5,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130073,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
130219,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
130462,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
130490,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [4]:
#피어슨 상관계수
from math import sqrt

def sim_pearson(data, n1, n2): 
    #구현
    sumX = 0
    sumY = 0
    sumSqX = 0 # x 제곱합 
    sumSqY = 0 # y 제곱합 
    sumXY = 0 # XY 합
    cnt = 0 # 영화 갯수 
    
    for i in data.loc[n1,data.loc[n1,:] >= 0].index:
        if  data.loc[n2,i] >= 0:
            sumX += data.loc[n1,i]
            sumY += data.loc[n2,i]
            sumSqX += pow(data.loc[n1,i],2)
            sumSqY += pow(data.loc[n2,i],2)
            sumXY += (data.loc[n1,i])*(data.loc[n2,i])
            cnt += 1
            global num # 전역변수 선언
            global den # 전역변수 선언
            num = sumXY - ((sumX * sumY) / cnt)
            den = (sumSqX - (pow(sumX, 2) / cnt)) * (sumSqY - (pow(sumY, 2) / cnt))
    return num / sqrt(den + 0.00001) # 분모=0방지

In [5]:
# 나와 유사도가 높은 user와 매칭 함수

def top_match(data, name, rank = 5, simf = sim_pearson):
    simList = []
    for i in data.index:
        if name != i:
            if simf(data, name, i) is not None:
                simList.append((simf(data, name, i), i))
    simList.sort()
    simList.reverse()
    return simList[:rank]

In [6]:
#추천 시스템 함수

def recommendation(data, person, simf = sim_pearson):
    res = top_match(data, person, len(data))
    score_dic = {}
    sim_dic = {}
    myList = []
    for sim, name in res:
        if sim < 0:
            continue
        for movie in data.loc[person, data.loc[person, :] < 0].index:
            simSum = 0
            if data.loc[name, movie] >= 0:
                simSum += sim * data.loc[name, movie]
                
                score_dic.setdefault(movie, 0)
                score_dic[movie] += simSum
                
                sim_dic.setdefault(movie, 0)
                sim_dic[movie] += sim                
    for key in score_dic:
        myList.append((score_dic[key] / sim_dic[key], key))
    myList.sort()
    myList.reverse()
    
    return myList

In [7]:
# 25번 user가 안본 영화중에서
#추천 점수가 가장 높은 순으로 예상평점과 영화제목을 추천 (10개까지)
# 단점 : 시간이 엄청나게 걸림.

movieList = []
for rate, m_id in recommendation(moviedata, 1):
    movieList.append((rate, movies.loc[movies['movieId'] == m_id, 'title'].values[0]))
    if len(movieList)==10: # 10개 되면 멈추기 
        break
        
movieList[:10]

[(5.0, 'Siam Sunset (1999)'),
 (5.0, 'Faust (1926)'),
 (4.999999999999998, 'Mission to Mars (2000)'),
 (4.937573538543576, 'Dinosaur (2000)'),
 (4.936738122276483, '1969 (1988)'),
 (4.889092433581694, 'Austin Powers: International Man of Mystery (1997)'),
 (4.885140668693917, 'My Fair Lady (1964)'),
 (4.86901409423573,
  'Zombie Holocaust (a.k.a. Doctor Butcher M.D.) (Zombi Holocaust) (1980)'),
 (4.842156691080684, 'Torso (1973)'),
 (4.817939530203945, 'Torn Curtain (1966)')]