# 분류 - 영화추천
## 1. 패키지 참조

In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import numpy as np
import seaborn as sb
from matplotlib import pyplot as plt
from pandas import read_csv, DataFrame, pivot_table, merge


from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

## 2. 데이터 가져오기
### 1) jupyter가 참조하고 있는 현재 디렉토리 확인

In [2]:
print(os.getcwd())

/Users/dayoonz/Desktop/data_analysis/G_데이터마이닝/2_Sklearn


### 2) 영화 데이터 가져오기
- 실 분석용은 아니다. 분석 후 결과값을 맵핑시키기 위한 데이터이다

In [3]:
origin_mv = read_csv('netflix//Netflix_Dataset_Movie.csv', encoding='utf-8')
origin_mv

Unnamed: 0,Movie_ID,Year,Name
0,1,2003,Dinosaur Planet
1,2,2004,Isle of Man TT 2004 Review
2,3,1997,Character
3,4,1994,Paula Abdul's Get Up & Dance
4,5,2004,The Rise and Fall of ECW
...,...,...,...
17765,17766,2002,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004,Fidel Castro: American Experience
17767,17768,2000,Epoch
17768,17769,2003,The Company


### 3) 별점 데이터 가져오기

In [4]:
origin_rating = read_csv('netflix//Netflix_Dataset_Rating.csv', encoding='utf-8')
origin_rating

Unnamed: 0,User_ID,Rating,Movie_ID
0,712664,5,3
1,1331154,4,3
2,2632461,3,3
3,44937,5,3
4,656399,4,3
...,...,...,...
17337453,520675,3,4496
17337454,1055714,5,4496
17337455,2643029,4,4496
17337456,1559566,3,4496


## 3. 데이터 전처리
### 1) 별점 데이터 재구조화
- 각 사용자 번호를 컬럼으로, 영화를 인덱스로 하는 `pivot-table`을 구성한다
> 다소 시간이 오래 걸림

In [5]:
movie_users = pivot_table(origin_rating, 
                          index='Movie_ID', 
                          columns='User_ID',
                          values = 'Rating')
movie_users

User_ID,6,7,79,97,134,169,183,188,195,199,...,2649308,2649328,2649331,2649335,2649336,2649370,2649378,2649388,2649426,2649429
Movie_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,,,,,,,,,,,...,,,,,,,,,,
8,,5.0,,,,,,,,,...,,,,,4.0,,,,,
16,,,,,,,,,,,...,,,,,,,,,,
17,,,,,,,,,,,...,,,,,,,,,4.0,
18,,,,,,,,,,,...,4.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4488,,,,,,,,,,,...,,,,,5.0,,,3.0,,
4490,,,4.0,,,,3.0,3.0,,,...,,4.0,,,,,,,,
4492,,,,,,,,3.0,,,...,,,,,,,,,,
4493,,,,,,,,,,,...,,,,,,,,,,


### 2) 데이터 정제
- `결측치`는 해당 영화를 안 본 것으로 간주하고 `0으로 대체`

In [6]:
movie_users.fillna(0, inplace=True)
movie_users

User_ID,6,7,79,97,134,169,183,188,195,199,...,2649308,2649328,2649331,2649335,2649336,2649370,2649378,2649388,2649426,2649429
Movie_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,5.0,0.0,0.0,3.0,0.0,0.0
4490,0.0,0.0,4.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4492,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4493,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 3) 독립변수, 종속변수 분리
- $x$ 는 데이터프레임 자체
- $y$ 는 데이터프레임의 인덱스 (Movie_ID)

In [7]:
x = movie_users.copy()
y = movie_users.index
x.shape, y.shape

((1350, 143458), (1350,))

## 4. 분류 모델 구축
- `최적의 k 값`을 찾아야 된다
### 1) 단일 수행

In [8]:
k = 6
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(x, y)
y_pred = knn.predict(x)
score = accuracy_score(y, y_pred)
print('분류 정확도: {:.2f}%'.format(score))

분류 정확도: 0.15%


### 2) 최적의 k 찾기

In [None]:
k_range = range(1, len(x.columns))
k_scores = []

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    # score = cross_val_score(knn, x, y, cv = 100).mean()

### 2) 교차검증 적용

In [14]:
k_range = range(1, len(x.columns))
k_scores = []

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    # score = cross_val_score(knn, x, y, cv=100).mean()
    knn.fit(x, y)
    y_pred = knn.predict(x)
    score = accuracy_score(y, y_pred)

    if np.isnan(score):
        break
    k_scores.append(score)

k_scores

KeyboardInterrupt: 