# **협업필터링**
플레이리스트 쌍마다 공통 곡의 수를 가중치로 하여 곡과 태그를 추천

In [None]:
# arena_util.py
# -*- coding: utf-8 -*-

import io
import os
import json
import distutils.dir_util
from collections import Counter

import numpy as np

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Write a JSON file
def write_json(data, fname, base_dir="C:/Users/이다은/Desktop/BOAZ 14기/추천시스템/"):
  """Serialize ``data`` as UTF-8 JSON and write it to ``fname``.

  Args:
    data: Any JSON-serializable object; NumPy integer scalars are converted
      to plain ``int`` on the fly.
    fname: Target file path. A relative path is resolved against
      ``base_dir``; an absolute path is used as-is.
    base_dir: Directory that relative ``fname`` values are resolved against.
      Defaults to the directory the original notebook wrote to.

  Raises:
    TypeError: If ``data`` contains an object that is neither JSON
      serializable nor a NumPy integer.
  """
  def _conv(o):
    # json cannot serialize NumPy integer scalars (int8 ... int64); make them plain ints.
    if isinstance(o, np.integer):
      return int(o)
    raise TypeError(f"Object of type {type(o).__name__} is not JSON serializable")

  # os.path.join discards base_dir when fname is absolute, so absolute paths
  # (e.g. the Colab Drive paths used elsewhere in this notebook) are no longer
  # glued onto the base directory.
  path = os.path.join(base_dir, fname)

  # Create the parent directory if needed (replaces distutils, removed in Python 3.12).
  parent = os.path.dirname(path)
  if parent:
    os.makedirs(parent, exist_ok=True)

  with io.open(path, "w", encoding="utf8") as f:
    # ensure_ascii=False keeps Korean text readable in the output file.
    f.write(json.dumps(data, ensure_ascii=False, default=_conv))

# Load a JSON file
def load_json(fname):
  """Read the UTF-8 encoded JSON file ``fname`` and return the decoded object."""
  with open(fname, encoding='utf8') as handle:
    return json.load(handle)

# Pretty-print a JSON-serializable object for debugging
def debug_json(r):
  """Print ``r`` as 4-space-indented JSON, leaving non-ASCII characters unescaped."""
  pretty = json.dumps(r, ensure_ascii=False, indent=4)
  print(pretty)

## class로 nDCG 스코어 구현


In [None]:
# evaluate.py
# -*- coding: utf-8 -*-
# import fire
import numpy as np

# from arena_util import load_json

class CustomEvaluator:
  """nDCG-based evaluator for playlist song/tag recommendations.

  Songs are scored on the first 100 recommendations and tags on the first 10;
  the final score is ``0.85 * song nDCG + 0.15 * tag nDCG``.
  """

  def _idcg(self, l):
    """Return the ideal DCG when ``l`` relevant items occupy the top ``l`` ranks.

    The discount uses the natural logarithm, ``1 / ln(rank + 1)``.
    """
    return sum(1.0 / np.log(i + 2) for i in range(l))

  def __init__(self):
    # Cache the ideal DCG for 0..100 relevant items so _ndcg can look it up.
    self._idcgs = [self._idcg(i) for i in range(101)]

  def _ndcg(self, gt, rec):
    """Return the nDCG of the ranked list ``rec`` against ground truth ``gt``.

    Args:
      gt: Collection of relevant (hashable) items.
      rec: Recommended items, best first.

    Returns:
      DCG of ``rec`` divided by the ideal DCG for ``len(gt)`` items, or 0.0
      when ``gt`` is empty (nothing can be hit, and the ideal DCG is 0).
    """
    n_gt = len(gt)
    if n_gt == 0:
      return 0.0

    relevant = set(gt)  # O(1) membership instead of scanning the list per item
    dcg = 0.0
    for i, r in enumerate(rec):
      if r in relevant:
        dcg += 1.0 / np.log(i + 2)

    # Use the cached table (the original subscripted the *method* _idcg, which
    # raised TypeError); compute on demand beyond the cached range.
    if n_gt < len(self._idcgs):
      idcg = self._idcgs[n_gt]
    else:
      idcg = self._idcg(n_gt)
    return dcg / idcg

  def _eval(self, gt_fname, rec_fname):
    """Score the recommendation file against the ground-truth file.

    Args:
      gt_fname: Path to a JSON list of playlists with ``id``, ``songs``, ``tags``.
      rec_fname: Path to a JSON list of recommendations with the same keys.

    Returns:
      Tuple ``(music_ndcg, tag_ndcg, score)`` averaged over the recommended playlists.
    """
    gt_playlists = load_json(gt_fname)
    gt_dict = {g['id']: g for g in gt_playlists}

    rec_playlists = load_json(rec_fname)

    music_ndcg = 0.0
    tag_ndcg = 0.0

    for rec in rec_playlists:
      gt = gt_dict[rec['id']]
      music_ndcg += self._ndcg(gt['songs'], rec['songs'][:100])  # top-100 songs
      tag_ndcg += self._ndcg(gt['tags'], rec['tags'][:10])  # top-10 tags

    music_ndcg = music_ndcg / len(rec_playlists)
    tag_ndcg = tag_ndcg / len(rec_playlists)
    score = music_ndcg * 0.85 + tag_ndcg * 0.15  # weighted final score

    return music_ndcg, tag_ndcg, score

  def evaluate(self, gt_fname, rec_fname):
    """Print song nDCG, tag nDCG and the combined score; print the error on failure."""
    try:
      music_ndcg, tag_ndcg, score = self._eval(gt_fname, rec_fname)
      print(f"Music nDCG: {music_ndcg:.6}")
      print(f"Tag nDCG: {tag_ndcg:.6}")
      print(f"Score: {score:.6}")

    except Exception as e:  # top-level boundary: report instead of crashing the notebook
      print(e)

# if __name__ == "__main__":
#     fire.Fire(ArenaEvaluator)  

In [None]:
from collections import Counter # 컨테이너에 동일한 자료가 몇개인지 파악 : 출력값은 딕셔너리
import numpy as np
import pandas as pd

import scipy.sparse as spr # Sparse matrices 출력
import pickle

In [None]:
# Mount Google Drive at /content/gdrive/ (requires the google.colab package).
from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [None]:
# Load the dataset JSON files from the mounted Drive folder.
# val / song_meta / train / test are read as DataFrames, genre_gn_all as a Series.
val = pd.read_json('/content/gdrive/My Drive/Colab Notebooks/머신/val.json', typ = 'frame',encoding='UTF-8')
song_meta = pd.read_json('/content/gdrive/My Drive/Colab Notebooks/머신/song_meta.json', typ = 'frame',encoding='UTF-8')
train = pd.read_json('/content/gdrive/My Drive/Colab Notebooks/머신/train.json', typ = 'frame', encoding='utf-8')
test = pd.read_json('/content/gdrive/My Drive/Colab Notebooks/머신/test.json', typ = 'frame', encoding='utf-8')
genre_gn_all = pd.read_json('/content/gdrive/My Drive/Colab Notebooks/머신/genre_gn_all.json', typ = 'series',encoding='utf-8')

### playlist, song, tag의 id 재생성

> 새로 생성할 id를 matrix의 row, column index로 사용할 것이기 때문

In [None]:
### Rebuild playlist ids

# Flag where each row came from before the two frames are merged.
train['istrain'] = 1
val['istrain'] = 0

n_train, n_val = len(train), len(val)

# plylst = train followed by val; ignore_index=True discards the original row
# labels and renumbers the rows 0..n-1.
plylst = pd.concat([train, val], ignore_index = True)

# nid: consecutive integer id, later used as a matrix row index.
plylst['nid'] = range(n_train + n_val)

# Lookup tables between the original playlist id and nid (both directions).
plylst_id_nid = {pid: nid for pid, nid in zip(plylst['id'], plylst['nid'])}
plylst_nid_id = {nid: pid for nid, pid in zip(plylst['nid'], plylst['id'])}

song, tag 인덱스 재생성

In [None]:
# --- tags: count occurrences, then number the distinct tags 0..n_tags-1 ---
plylst_tag = plylst['tags']
tag_counter = Counter(tg for tgs in plylst_tag for tg in tgs)
tag_dict = dict(tag_counter)  # plain dict copy of the Counter (tag -> frequency)

# tid is the position of the tag in first-seen order.
tag_tid_id = dict(enumerate(tag_dict))
tag_id_tid = {tag: tid for tid, tag in tag_tid_id.items()}

n_tags = len(tag_dict)  # number of distinct tags

# --- songs: same procedure, producing sid 0..n_songs-1 ---
plylst_song = plylst['songs']
song_counter = Counter(sg for sgs in plylst_song for sg in sgs)
song_dict = dict(song_counter)

song_sid_id = dict(enumerate(song_dict))
song_id_sid = {song: sid for sid, song in song_sid_id.items()}

n_songs = len(song_dict)  # number of distinct songs


In [None]:
print(n_songs, n_tags, song_meta.shape)
 # 중복된 곡은 같은 인덱스로 나옴. 중복 포함해서 쓰인 곡은 63만개밖에 안됨. 
 # song_meta 중 7만개는 아예 안쓰임. 

638336 30197 (707989, 9)


In [None]:
# Translate each playlist's songs and tags to the new ids and store them as new columns.
# A song/tag that occurs in several playlists always maps to the id it was first given;
# anything missing from the lookup table is dropped.
plylst['songs_id'] = plylst['songs'].map(
    lambda songs: [song_id_sid[s] for s in songs if s in song_id_sid]
)
plylst['tags_id'] = plylst['tags'].map(
    lambda tags: [tag_id_tid[t] for t in tags if t in tag_id_tid]
)

In [None]:
plylst.head()

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date,istrain,nid,songs_id,tags_id
0,[락],61281,여행같은 음악,"[525514, 129701, 383374, 562083, 297861, 13954...",71,2013-12-19 18:36:19.000,1,0,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",[0]
1,"[추억, 회상]",10532,요즘 너 말야,"[432406, 675945, 497066, 120377, 389529, 24427...",1,2014-12-02 16:19:42.000,1,1,"[19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 3...","[1, 2]"
2,"[까페, 잔잔한]",76951,"편하게, 잔잔하게 들을 수 있는 곡.-","[83116, 276692, 166267, 186301, 354465, 256598...",17,2017-08-28 07:09:34.000,1,2,"[61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 7...","[3, 4]"
3,"[연말, 눈오는날, 캐럴, 분위기, 따듯한, 크리스마스캐럴, 겨울노래, 크리스마스,...",147456,크리스마스 분위기에 흠뻑 취하고 싶을때,"[394031, 195524, 540149, 287984, 440773, 10033...",33,2019-12-05 15:15:18.000,1,3,"[89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 1...","[5, 6, 7, 8, 9, 10, 11, 12, 13, 14]"
4,[댄스],27616,추억의 노래 ㅋ,"[159327, 553610, 5130, 645103, 294435, 100657,...",9,2011-10-25 13:54:56.000,1,4,"[127, 128, 129, 130, 131, 132, 133, 134, 135, ...",[15]


In [None]:
val.head()

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date,istrain
0,[],118598,,"[373313, 151080, 275346, 696876, 165237, 52593...",1675,2019-05-27 14:14:33.000,0
1,[],131447,앨리스테이블,[],1,2014-07-16 15:24:24.000,0
2,[],51464,,"[529437, 516103, 360067, 705713, 226062, 37089...",62,2008-06-21 23:26:22.000,0
3,[],45144,,"[589668, 21711, 570151, 320043, 13930, 599327,...",20,2017-10-30 18:15:43.000,0
4,[],79929,,"[672718, 121924, 102694, 683657, 201558, 38511...",20,2017-02-07 11:40:42.000,0


In [None]:
plylst.shape # train + val 합쳐버림

(138086, 10)

In [None]:
# Keep only the columns used downstream: train flag, nid, update date, song ids, tag ids.
plylst_use = plylst[['istrain', 'nid', 'updt_date', 'songs_id', 'tags_id']]
plylst_use.head(2)

Unnamed: 0,istrain,nid,updt_date,songs_id,tags_id
0,1,0,2013-12-19 18:36:19.000,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",[0]
1,1,1,2014-12-02 16:19:42.000,"[19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 3...","[1, 2]"


In [None]:
# Add per-playlist song/tag counts and index the frame by nid.
# assign() returns a new frame, so nothing is written into a frame derived from
# `plylst` (the earlier .loc assignment raised SettingWithCopyWarning, which the
# global warnings filter was hiding).
plylst_use = plylst_use.assign(
    num_songs=plylst_use['songs_id'].map(len),
    num_tags=plylst_use['tags_id'].map(len),
).set_index('nid')

In [None]:
plylst_use.head(2)

Unnamed: 0_level_0,istrain,updt_date,songs_id,tags_id,num_songs,num_tags
nid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,2013-12-19 18:36:19.000,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",[0],19,1
1,1,2014-12-02 16:19:42.000,"[19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 3...","[1, 2]",42,2


In [None]:
# Split back into train / val by position: the first n_train rows are train, the rest val.
plylst_train, plylst_val = plylst_use.iloc[:n_train], plylst_use.iloc[n_train:]

In [None]:
plylst_train.head(2)

Unnamed: 0_level_0,istrain,updt_date,songs_id,tags_id,num_songs,num_tags
nid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,2013-12-19 18:36:19.000,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",[0],19,1
1,1,2014-12-02 16:19:42.000,"[19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 3...","[1, 2]",42,2


In [None]:
plylst_val.shape

(23015, 6)

In [None]:
plylst_val.head()

Unnamed: 0_level_0,istrain,updt_date,songs_id,tags_id,num_songs,num_tags
nid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
115071,0,2019-05-27 14:14:33.000,"[82770, 73350, 49850, 283466, 38811, 14654, 83...",[],27,0
115072,0,2014-07-16 15:24:24.000,[],[],0,0
115073,0,2008-06-21 23:26:22.000,"[42084, 86991, 615142, 615143, 66432, 191918, ...",[],14,0
115074,0,2017-10-30 18:15:43.000,"[19289, 156274, 92524, 5729, 9179, 4694, 3233,...",[],17,0
115075,0,2017-02-07 11:40:42.000,"[72186, 47442, 47461, 24939, 209259, 81164, 24...",[],8,0


row : **plylst(nid)**  
column : **item(sid or tid)** 인 sparse matrix A

In [None]:
range(n_train)

range(0, 115071)

In [None]:
np.repeat(range(n_train), plylst_train['num_songs']).shape

(5285871,)

In [None]:
a = np.array([song for songs in plylst_train['songs_id'] for song in songs])
a

array([     0,      1,      2, ...,  16938, 296045, 283396])

In [None]:
plylst_train['num_songs'].sum() # 이게 5285871개임. 

5285871

In [None]:
plylst_train['songs_id']

nid
0         [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
1         [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 3...
2         [61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 7...
3         [89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 1...
4         [127, 128, 129, 130, 131, 132, 133, 134, 135, ...
                                ...                        
115066    [156225, 119784, 144788, 28312, 615129, 321240...
115067    [148864, 143636, 311523, 266259, 8989, 389684,...
115068    [70287, 13559, 7387, 140158, 168013, 168422, 1...
115069    [123250, 92055, 170131, 86695, 54428, 120047, ...
115070    [16335, 16267, 26291, 22614, 30836, 30404, 229...
Name: songs_id, Length: 115071, dtype: object

In [None]:
# Playlist x song matrix
row = np.repeat(range(n_train), plylst_train['num_songs']) # playlist index repeated once per song it contains (5285871 entries)
col = [song for songs in plylst_train['songs_id'] for song in songs] # sid of every (playlist, song) occurrence, same length as row
dat = np.repeat(1, plylst_train['num_songs'].sum()) # value 1 for every (row, col) pair

# Build the sparse (n_train x n_songs) = 115071 x 638336 matrix in CSR format from the coordinate triplets above.
# NOTE(review): SciPy sums duplicate (row, col) pairs, so a song listed twice in one playlist gets value 2 - confirm that is intended.

train_songs_A = spr.csr_matrix( (dat, (row, col) ), shape=(n_train, n_songs) ) # rows: train playlists, columns: songs
train_songs_A

<115071x638336 sparse matrix of type '<class 'numpy.longlong'>'
	with 5285871 stored elements in Compressed Sparse Row format>

In [None]:
# train_songs_A.toarray()

In [None]:
# Playlist x tag matrix
row = np.repeat(range(n_train), plylst_train['num_tags']) # playlist index repeated once per tag it contains (476331 entries)
col = [tag for tags in plylst_train['tags_id'] for tag in tags] # tid of every (playlist, tag) occurrence, same length as row
# Each (playlist, tag) occurrence contributes the value 1
dat = np.repeat(1, plylst_train['num_tags'].sum()) # a single scalar input, repeated
# np.repeat(input, number of repetitions)
train_tags_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_tags))
train_tags_A

<115071x30197 sparse matrix of type '<class 'numpy.longlong'>'
	with 476331 stored elements in Compressed Sparse Row format>

In [None]:
## Transposes: item x playlist matrices
train_songs_A_T = train_songs_A.T.tocsr() # tocsr(): convert the transpose to Compressed Sparse Row format
train_tags_A_T = train_tags_A.T.tocsr()

In [None]:
train_songs_A_T

<638336x115071 sparse matrix of type '<class 'numpy.longlong'>'
	with 5285871 stored elements in Compressed Sparse Row format>

In [None]:
# Tag-to-tag cosine similarity over the rows of train_tags_A_T (one row per tag).
# The displayed result is a dense (30197, 30197) array.
# NOTE(review): that is roughly 7 GB if float64 - consider dense_output=False; tag_sim
# is only referenced by commented-out code in this notebook.
from sklearn.metrics.pairwise import cosine_similarity
tag_sim = cosine_similarity(train_tags_A_T, train_tags_A_T)
tag_sim

array([[1.        , 0.01115171, 0.00256802, ..., 0.        , 0.        ,
        0.        ],
       [0.01115171, 1.        , 0.79319467, ..., 0.        , 0.        ,
        0.        ],
       [0.00256802, 0.79319467, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [None]:
tag_sim.shape #(30197*30197)

(30197, 30197)

In [None]:
# train_tags_A 에 similarity 열 내적 추가(안되네;;;;;;;)
# train_tags_A = train_tags_A.dot(tag_sim)
# train_tags_A

In [None]:
# %%time
# song_sim = cosine_similarity(train_songs_A_T, train_songs_A_T)
# song_sim #터짐;;;;;;;;;

## CF

In [None]:
plylst_val.shape

(23015, 6)

In [None]:
# sample test
# Fix the NumPy RNG seed; n_sample is only used by the commented-out sampling line below.
np.random.seed(33)
n_sample = 23015

# val = plylst_val.iloc[np.random.choice(range(n_val), n_sample, replace=False), :] # sampling without replacement
# val.shape

plylst_val.shape

(23015, 6)

In [None]:
val.head(3)

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date,istrain
0,[],118598,,"[373313, 151080, 275346, 696876, 165237, 52593...",1675,2019-05-27 14:14:33.000,0
1,[],131447,앨리스테이블,[],1,2014-07-16 15:24:24.000,0
2,[],51464,,"[529437, 516103, 360067, 705713, 226062, 37089...",62,2008-06-21 23:26:22.000,0


In [None]:
plylst_val.head(3)

Unnamed: 0_level_0,istrain,updt_date,songs_id,tags_id,num_songs,num_tags
nid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
115071,0,2019-05-27 14:14:33.000,"[82770, 73350, 49850, 283466, 38811, 14654, 83...",[],27,0
115072,0,2014-07-16 15:24:24.000,[],[],0,0
115073,0,2008-06-21 23:26:22.000,"[42084, 86991, 615142, 615143, 66432, 191918, ...",[],14,0


In [None]:
from tqdm import tqdm

def rec(pids):
  """Recommend up to 100 songs and 10 tags for each validation playlist.

  For every playlist the score of a candidate item is the sum, over all train
  playlists, of (overlap between the query's songs and that train playlist)
  x (the train playlist's matrix entry for the item). Items the playlist
  already contains are removed from the ranking.

  Relies on the notebook globals ``plylst_val``, ``n_songs``, ``train_songs_A``,
  ``train_songs_A_T``, ``train_tags_A_T``, ``song_sid_id``, ``tag_tid_id`` and
  ``plylst_nid_id``.

  Args:
    pids: Iterable of playlist nids that index ``plylst_val``.

  Returns:
    List of dicts ``{"id": original playlist id, "songs": [...], "tags": [...]}``
    holding original song ids and tag strings, best candidates first.

  NOTE(review): for a playlist without songs every score is 0, so the returned
  ranking is arbitrary - a popularity fallback may be wanted.
  """
  res = []

  for n_done, pid in enumerate(pids, 1):
    # Items the playlist already has (new ids); these must not be recommended.
    songs_already = plylst_val.loc[pid, "songs_id"]
    tags_already = plylst_val.loc[pid, "tags_id"]

    # Indicator column vector over all songs: 1 where the playlist has the song.
    p = np.zeros((n_songs, 1))
    p[songs_already] = 1

    # valid[i] = overlap between this playlist and train playlist i
    # (the similarity weight); reshape(-1) flattens (n_train, 1) to (n_train,).
    valid = train_songs_A.dot(p).reshape(-1)

    # Song scores: weighted vote of the train playlists. Take the 1000
    # highest-scoring sids (argsort is ascending, so slice the tail and
    # reverse), drop the songs already present and keep the best 100.
    cand_song = train_songs_A_T.dot(valid)
    cand_song_idx = cand_song.reshape(-1).argsort()[-1000:][::-1]
    cand_song_idx = cand_song_idx[~np.isin(cand_song_idx, songs_already)][:100]
    rec_song_idx = [song_sid_id[i] for i in cand_song_idx]  # back to original song ids

    # Tag scores: same procedure on the top 100 tids, keeping the best 10.
    cand_tag = train_tags_A_T.dot(valid)
    cand_tag_idx = cand_tag.reshape(-1).argsort()[-100:][::-1]
    cand_tag_idx = cand_tag_idx[~np.isin(cand_tag_idx, tags_already)][:10]
    rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx]  # back to tag strings

    res.append({
                "id": plylst_nid_id[pid],
                "songs": rec_song_idx,
                "tags": rec_tag_idx
            })

    # Progress report every 1000 playlists (the counter used to be reset to 1
    # on every iteration, so nothing was ever printed).
    if n_done % 1000 == 0:
      print(n_done)

  return res

In [None]:
# # 학습! 일해라 일해(37분 걸림)
# %%time
# answers = rec(plylst_val.index)

In [None]:
# answers # 와우 어메이징;;;;;;;

In [None]:
# import io
# import os

# def write_json(data, fname):
#     def _conv(o):
#         if isinstance(o, (np.int64, np.int32)):
#             return int(o)
#         raise TypeError

#     parent = os.path.dirname(fname)
#     with io.open(fname, "w", encoding="utf-8") as f:
#         json_str = json.dumps(data, ensure_ascii=False, default=_conv)
#         f.write(json_str)

# write_json(answers,'/content/gdrive/My Drive/Colab Notebooks/머신/results.json')

In [None]:
# results = pd.read_json('/content/gdrive/My Drive/Colab Notebooks/머신/results.json', typ = 'frame',encoding='UTF-8')

### 곡 발매일자가 플레이리스트 최종 수정일보다 늦은 경우는 추천에서 제외해보자

# Word2Vec


In [None]:
import os
import json
import pandas as pd

from tqdm import tqdm
from gensim.models import Word2Vec
from gensim.models.keyedvectors import WordEmbeddingsKeyedVectors

In [None]:
# (Author's note: "why make a class at all?")
# Module-level copies of the Word2Vec settings; the PlaylistEmbedding class in this
# notebook sets its own copies in __init__.

min_count = 3
size = 100
window = 210
# NOTE(review): gensim documents sg as 1 = skip-gram, otherwise CBOW - confirm what 5 is meant to select.
sg = 5
p2v_model = WordEmbeddingsKeyedVectors(size)

In [None]:
# Reload the raw dataset files (this rebinds train / val to fresh frames) and load
# results.json from the same folder.
val = pd.read_json('/content/gdrive/My Drive/Colab Notebooks/머신/val.json', typ = 'frame',encoding='UTF-8')
song_meta = pd.read_json('/content/gdrive/My Drive/Colab Notebooks/머신/song_meta.json', typ = 'frame',encoding='UTF-8')
train = pd.read_json('/content/gdrive/My Drive/Colab Notebooks/머신/train.json', typ = 'frame', encoding='utf-8')
test = pd.read_json('/content/gdrive/My Drive/Colab Notebooks/머신/test.json', typ = 'frame', encoding='utf-8')
genre_gn_all = pd.read_json('/content/gdrive/My Drive/Colab Notebooks/머신/genre_gn_all.json', typ = 'series',encoding='utf-8')
result = pd.read_json('/content/gdrive/My Drive/Colab Notebooks/머신/results.json', typ = 'frame', encoding='UTF-8')

In [None]:
# Stack train and val into one frame with a fresh 0..n-1 row index, then preview it.
data = pd.concat([train, val], ignore_index = True) 
data.head(2)

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,[락],61281,여행같은 음악,"[525514, 129701, 383374, 562083, 297861, 13954...",71,2013-12-19 18:36:19.000
1,"[추억, 회상]",10532,요즘 너 말야,"[432406, 675945, 497066, 120377, 389529, 24427...",1,2014-12-02 16:19:42.000


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138086 entries, 0 to 138085
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   tags          138086 non-null  object
 1   id            138086 non-null  int64 
 2   plylst_title  138086 non-null  object
 3   songs         138086 non-null  object
 4   like_cnt      138086 non-null  int64 
 5   updt_date     138086 non-null  object
dtypes: int64(2), object(4)
memory usage: 6.3+ MB


In [None]:
class PlaylistEmbedding:
    """Playlist-to-vector recommender built on a Word2Vec model of songs and tags.

    Each playlist is treated as a "sentence" of song ids and tags. A playlist
    vector is the sum of its token vectors; recommendations are the most
    frequent songs/tags among the 200 nearest playlists. Playlists without a
    vector, and answers that come up short, are filled from ``results.json``.
    """

    def __init__(self, FILE_PATH):
        """Load ``train.json``, ``val.json`` and ``results.json`` from ``FILE_PATH``."""
        self.FILE_PATH = FILE_PATH
        self.min_count = 3
        self.size = 100
        self.window = 210
        # NOTE(review): gensim documents sg as 1 = skip-gram, otherwise CBOW -
        # confirm what 5 is meant to select.
        self.sg = 5
        self.p2v_model = WordEmbeddingsKeyedVectors(self.size)

        with open(os.path.join(FILE_PATH, 'train.json'), encoding="utf-8") as f:
            self.train = json.load(f)
        with open(os.path.join(FILE_PATH, 'val.json'), encoding="utf-8") as f:
            self.val = json.load(f)
        with open(os.path.join(FILE_PATH, 'results.json'), encoding="utf-8") as f:
            self.most_results = json.load(f)

    @staticmethod
    def _remove_seen(seen, candidates):
        """Return ``candidates`` in order, without the items contained in ``seen``."""
        seen = set(seen)
        return [x for x in candidates if x not in seen]

    def get_dic(self, train, val):
        """Build id -> songs / id -> tags lookups and the Word2Vec training corpus.

        Sets ``self.song_dic``, ``self.tag_dic`` (keyed by ``str(id)``) and
        ``self.total`` (token lists with more than one token).
        """
        data = train + val
        song_dic = {}
        tag_dic = {}
        for q in tqdm(data):
            key = str(q['id'])
            song_dic[key] = q['songs']
            tag_dic[key] = q['tags']
        self.song_dic = song_dic
        self.tag_dic = tag_dic

        # One "sentence" per playlist: stringified song ids followed by tags.
        total = [list(map(str, q['songs'])) + list(q['tags']) for q in data]
        self.total = [tokens for tokens in total if len(tokens) > 1]

    def get_w2v(self, total, min_count, size, window, sg):
        """Train Word2Vec on ``total`` and store the model in ``self.w2v_model``."""
        self.w2v_model = Word2Vec(total, min_count = min_count, size = size, window = window, sg = sg)

    def update_p2v(self, train, val, w2v_model):
        """Add one vector per playlist (sum of its token vectors) to ``self.p2v_model``.

        Playlists without songs, or whose tokens are all missing from the
        Word2Vec vocabulary, get no vector.
        """
        ID = []
        vec = []
        for q in tqdm(train + val):
            if not q['songs']:
                continue
            tmp_vec = None
            for token in q['songs'] + q['tags']:
                try:
                    token_vec = w2v_model.wv.get_vector(str(token))
                except KeyError:
                    continue  # token was dropped by min_count
                if tmp_vec is None:
                    tmp_vec = token_vec.copy()  # never mutate the model's own array
                else:
                    tmp_vec += token_vec
            if tmp_vec is not None:
                ID.append(str(q['id']))
                vec.append(tmp_vec)
        self.p2v_model.add(ID, vec)

    def get_result(self, p2v_model, song_dic, tag_dic, most_results, val):
        """Build the final answers for ``val`` and store them in ``self.answers``.

        Args:
            p2v_model: Keyed vectors holding one vector per playlist id (as str).
            song_dic: ``str(id) -> songs`` lookup.
            tag_dic: ``str(id) -> tags`` lookup.
            most_results: Fallback recommendations (list of dicts with ``id``,
                ``songs``, ``tags``).
            val: Question playlists.
        """
        # Match fallback entries by playlist id; fall back to the position
        # only when the id is absent from most_results.
        fallback_by_id = {r["id"]: r for r in most_results}

        def fallback(n, pid):
            hit = fallback_by_id.get(pid)
            return most_results[n] if hit is None else hit

        answers = []
        for n, q in tqdm(enumerate(val), total = len(val)):
            try:
                most_id = [x[0] for x in p2v_model.most_similar(str(q['id']), topn=200)]
            except KeyError:
                # No vector for this playlist: use the fallback recommendation.
                # The lists are copied so the fill-up step below cannot mutate
                # most_results through an alias.
                base = fallback(n, q["id"])
                answers.append({
                    "id": q["id"],
                    "songs": list(base["songs"]),
                    "tags": list(base["tags"]),
                })
                continue

            get_song = []
            get_tag = []
            for ID in most_id:
                get_song += song_dic[ID]
                get_tag += tag_dic[ID]
            # Most frequent songs / tags among the neighbours.
            get_song = list(pd.value_counts(get_song)[:200].index)
            get_tag = list(pd.value_counts(get_tag)[:20].index)
            answers.append({
                "id": q["id"],
                "songs": self._remove_seen(q["songs"], get_song)[:100],
                "tags": self._remove_seen(q["tags"], get_tag)[:10],
            })

        # Top up short answers from the fallback, skipping items already chosen.
        for n, q in enumerate(answers):
            base = fallback(n, q["id"])
            missing_songs = 100 - len(q['songs'])
            if missing_songs > 0:
                q['songs'] += self._remove_seen(q['songs'], base['songs'])[:missing_songs]
            missing_tags = 10 - len(q['tags'])
            if missing_tags > 0:
                q['tags'] += self._remove_seen(q['tags'], base['tags'])[:missing_tags]
        self.answers = answers

    def run(self):
        """Run the full pipeline and write the answers to ``results1.json``."""
        self.get_dic(self.train, self.val)
        self.get_w2v(self.total, self.min_count, self.size, self.window, self.sg)
        self.update_p2v(self.train, self.val, self.w2v_model)
        self.get_result(self.p2v_model, self.song_dic, self.tag_dic, self.most_results, self.val)
        write_json(self.answers, '/content/gdrive/My Drive/Colab Notebooks/머신/results1.json')

In [None]:
%%time

# FILE_PATH = '/content/gdrive/My Drive/Colab Notebooks/머신/'
# U_space = PlaylistEmbedding(FILE_PATH)
# U_space.run()

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.72 µs


In [None]:
# baseline

class GenreMostPopular:
    """Baseline recommender: most popular songs of the playlist's dominant genre.

    Tags are always the globally most popular ones. When the playlist has no
    songs (hence no genre), the globally most popular songs are used instead.
    """

    @staticmethod
    def _remove_seen(seen, candidates):
        """Return ``candidates`` in order, without the items contained in ``seen``."""
        seen = set(seen)
        return [x for x in candidates if x not in seen]

    @staticmethod
    def _most_popular(playlists, col, topk_count):
        """Count the items of ``col`` over ``playlists``.

        Returns:
            Tuple ``(counter, top)``: the full frequency Counter and the list
            of the ``topk_count`` most frequent items.
        """
        counter = Counter()
        for doc in playlists:
            counter.update(doc[col])
        return counter, [k for k, _ in counter.most_common(topk_count)]

    def _song_mp_per_genre(self, song_meta, global_mp):
        """Return ``genre -> up to 200 song ids``, most popular first.

        Args:
            song_meta: ``song id -> metadata dict`` with a ``song_gn_gnr_basket`` list.
            global_mp: Mapping ``song id -> play count``; missing songs count as 0.
        """
        sids_by_genre = {}
        for sid, song in song_meta.items():
            for genre in song['song_gn_gnr_basket']:
                sids_by_genre.setdefault(genre, []).append(sid)

        # Build a fresh dict instead of reassigning values while iterating.
        res = {}
        for genre, sids in sids_by_genre.items():
            popularity = Counter({k: global_mp.get(int(k), 0) for k in sids})
            res[genre] = [k for k, _ in popularity.most_common(200)]

        return res

    def _generate_answers(self, song_meta_json, train, questions):
        """Return one ``{"id", "songs", "tags"}`` answer per question playlist."""
        song_meta = {int(song["id"]): song for song in song_meta_json}
        song_mp_counter, song_mp = self._most_popular(train, "songs", 200)
        tag_mp_counter, tag_mp = self._most_popular(train, "tags", 100)
        song_mp_per_genre = self._song_mp_per_genre(song_meta, song_mp_counter)

        answers = []
        for q in tqdm(questions):
            # Count the genres of the songs already in the playlist.
            genre_counter = Counter()
            for sid in q["songs"]:
                genre_counter.update(song_meta[sid]["song_gn_gnr_basket"])

            top_genre = genre_counter.most_common(1)

            if top_genre:
                cur_songs = song_mp_per_genre[top_genre[0][0]]
            else:
                cur_songs = song_mp  # no songs / no genre: global popularity

            answers.append({
                "id": q["id"],
                "songs": self._remove_seen(q["songs"], cur_songs)[:100],
                "tags": self._remove_seen(q["tags"], tag_mp)[:10]
            })

        return answers

    def run(self, song_meta_fname, train_fname, question_fname):
        """Load the three JSON files, generate answers and write ``results_genre.json``."""
        print("Loading song meta...")
        song_meta_json = load_json(song_meta_fname)

        print("Loading train file...")
        train_data = load_json(train_fname)

        print("Loading question file...")
        questions = load_json(question_fname)

        print("Writing answers...")
        answers = self._generate_answers(song_meta_json, train_data, questions)
        write_json(answers, '/content/gdrive/My Drive/Colab Notebooks/머신/results_genre.json')


In [None]:
# song_meta = pd.read_json('/content/gdrive/My Drive/Colab Notebooks/머신/song_meta.json', typ = 'frame',encoding='UTF-8')
# global_mp = pd.read_json('/content/gdrive/My Drive/Colab Notebooks/머신/genre_gn_all.json', typ = 'series',encoding='utf-8')

# baseline = GenreMostPopular(song_meta, global_mp)
# baseline.run()