## Item based CF구현

1. item(music) 유사도 matrix구현
2. 개인 관심도 구현 
3. 개인 관심도로 유사 음악 찾아서 추천 

In [2]:
#def dataLoad():

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pyplot as mpl
from collections import defaultdict
%matplotlib inline

In [3]:
## string to int with ignore none disit
def toint(s,default=0):
    converted = default
    try :
        converted = int(s)
    except:
        #print('string convert exception "{0}" (default={1})"'.format(s , default))
        pass
    return converted

class music_recsys:
    model = None
    home_dir = '/Users/goodvc/Data/fastcampus/week5/resource/music-ds/'
    w2v_env = { 'min_count':5, 'size':100, 'window':5 }
    nn_func = None
    
    ## initialize
    def __init__(self):
        #self.data_load()
        #self.data_processing();
        print('init')
    
    ## dataset load
    def data_load(self):
        ## music streaming log data
        self.streamings = pd.read_csv(self.home_dir+'streaming_ds.csv', 
                                      names = ['MEM_UNO', 'SONG_ID', 'ARTIST_ID', 'GENRE_CODE', 'REG_DATE'], index_col = 1)
        self.streamings = self.streamings.convert_objects(convert_numeric=True).dropna()
        self.genre = pd.read_csv(self.home_dir+'genre.csv', 
                                      names = ['GENRE_CODE', 'GENRE_NAME'], index_col = 0)
        self.artist = pd.read_csv(self.home_dir+'artist.csv', 
                                      names = ['ARTIST_ID', 'ARTIST_NAME'], index_col = 0)
    def data_processing(self):
        #music_user_dict
        self.music_user_dict = defaultdict(set)
        for idx, row in self.streamings[:].iterrows():
            self.music_user_dict[idx].add(int(row.MEM_UNO))
        
    ## data filtering 
    ## 1. 시청이 2건이하 음원
    def filtering(self):
        self.streamings = self.streamings.groupby(['SONG_ID']).filter( lambda x: len(x) > 2)
        
    def nearest_neighbors(self, song_id, topn=10, threshold=3 ):
        item1 = self.music_user_dict[song_id]
        if len(item1) < threshold:
            return []
        nn = []
        for song_id2, item2 in self.music_user_dict.items():
            intersections = len(item1.intersection(item2))
            if intersections == 0 or song_id2 == song_id:
                continue
            unions = len(item1.union(item2))+1
            similarity = intersections / unions 
            nn.append([song_id2,similarity])
        return sorted(nn,key=lambda x: x[1], reverse=True)[:topn]
    
    def user_interests(self, uno):
        return self.streamings[self.streamings.MEM_UNO==uno].index.tolist()
    
    def cal_score(self, sim, pre_score):
        return  sim if pre_score < sim else pre_score
    
    def recommend_music(self, uno, topn=10):
        musics = defaultdict(float)
        interests = self.user_interests(uno)
        for song in interests:
            neighbors = self.nearest_neighbors(song)
            for (song2, sim) in neighbors:
                #if song2 in interests:
                #    continue
                pre = musics.get(song2, 0.0)
                musics[song2] = self.cal_score(sim, pre)

        return {'recommended' : sorted(musics.items(), key=lambda x: x[1], reverse=True)[:topn]
                ,'seens':interests}

    def print_artist(self, song_list ):
        for song in song_list:
            try:
                if type(song) in { tuple, list} :
                    song = song[0]
                song = int(song)
                artist = self.streamings.loc[song][:1].ARTIST_ID.tolist()[0]
                n = self.artist.loc[str(artist)][:1].ARTIST_NAME
                print(song, n)
            except:
                print(song, 'no-artist')


In [4]:
rs = music_recsys()
rs.data_load()
rs.data_processing()

init


  data = self._reader.read(nrows)


In [9]:
## 84699045 원더걸스 (Wonder Girls) 유사한 음원은?
ret = rs.nearest_neighbors(84699045, topn=20)
rs.print_artist([84699045]+ret)

84699045 원더걸스 (Wonder Girls)
84686902 BIGBANG
84581811 no-artist
84568778 Apink (에이핑크)
84201067 BIGBANG
84847396 현아 (4Minute)
84369985 마마무
84168515 혁오 (hyukoh)
84372538 씨스타 (Sistar)
84372192 AOA
84686903 BIGBANG
84059724 Zion.T
84760929 Simon Dominic
84064039 백아연
82821208 Zion.T
84699325 no-artist
84801754 소녀시대
84822793 SG워너비
84672668 유승우
84822791 SG워너비
82812257 혁오 (hyukoh)


In [7]:
ret = rs.recommend_music(301863475, topn=20)
print('*** 추천 음악 ***')
rs.print_artist(ret['recommended'])
print('*** 시청한 음악 ***')
rs.print_artist(ret['seens'])

*** 추천 음악 ***
80696497 씨야 (SeeYa)
16098763 뱅크
75195354 플라이 투 더 스카이 (Fly To The Sky)
16128654 이기찬
48197709 엠씨더맥스 (M.C the MAX)
81028747 김연우
16131081 윤미래 (T)
52802787 프리스타일 (Free Style)
16174214 유엔 (UN)
16178434 SG워너비
42299486 Boyz II Men
80594467 Boyz II Men
16174405 no-artist
49619408 no-artist
49619412 no-artist
49619413 no-artist
49619440 no-artist
49619442 no-artist
49619443 no-artist
49619449 no-artist
*** 시청한 음악 ***
15965255 no-artist
16186222 테이 (Tei)
17465599 버즈 (Buzz)
40207122 하동균
49619416 Boyz II Men
82872166 비스트 (Beast)
83360622 제아 (브라운아이드걸스)
