In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install the Nanum font package via apt.
!sudo apt-get install -y fonts-nanum
# Rebuild the fontconfig cache (-f: force, -v: verbose).
!sudo fc-cache -fv
# Delete matplotlib's cache directory — presumably so its font list is rebuilt with the new fonts; TODO confirm.
!rm ~/.cache/matplotlib -rf

Reading package lists... Done
Building dependency tree       
Reading state information... Done
fonts-nanum is already the newest version (20170925-1).
0 upgraded, 0 newly installed, 0 to remove and 37 not upgraded.
/usr/share/fonts: caching, new cache contents: 0 fonts, 1 dirs
/usr/share/fonts/truetype: caching, new cache contents: 0 fonts, 3 dirs
/usr/share/fonts/truetype/humor-sans: caching, new cache contents: 1 fonts, 0 dirs
/usr/share/fonts/truetype/liberation: caching, new cache contents: 16 fonts, 0 dirs
/usr/share/fonts/truetype/nanum: caching, new cache contents: 10 fonts, 0 dirs
/usr/local/share/fonts: caching, new cache contents: 0 fonts, 0 dirs
/root/.local/share/fonts: skipping, no such directory
/root/.fonts: skipping, no such directory
/var/cache/fontconfig: cleaning cache directory
/root/.cache/fontconfig: not cleaning non-existent cache directory
/root/.fontconfig: not cleaning non-existent cache directory
fc-cache: succeeded


In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from tqdm import tqdm_notebook
import platform

In [None]:
# Register tqdm's `progress_apply` helpers on pandas objects.
tqdm.pandas()
import matplotlib.pyplot as plt

# Render figure text with the NanumBarunGothic font family.
plt.rcParams['font.family'] = 'NanumBarunGothic'

In [None]:
# Base directory prefix for the dataset JSON files; it is concatenated directly with file names.
PATH = '/content/drive/MyDrive/dataset/melon/'

#### 데이터 로드

In [None]:
# Read the four JSON files under PATH into DataFrames, in the same order as the target names.
genre_data, song_meta, train, test = (
    pd.read_json(PATH + file_name)
    for file_name in ('genre_all.json', 'song_meta.json', 'train.json', 'test.json')
)

In [None]:
# Preview the first three rows of the genre table.
genre_data.head(3)

Unnamed: 0,genre_code,genre
0,GN0100,발라드
1,GN0101,세부장르전체
2,GN0102,'80


In [None]:
# Preview the first three rows of the song metadata.
song_meta.head(3)

Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id
0,[GN0901],20140512,불후의 명곡 - 7080 추억의 얄개시대 팝송베스트,2255639,[2727],Feelings,[GN0900],[Various Artists],0
1,"[GN1601, GN1606]",20080421,"Bach : Partitas Nos. 2, 3 & 4",376431,[29966],"Bach : Partita No. 4 In D Major, BWV 828 - II....",[GN1600],[Murray Perahia],1
2,[GN0901],20180518,Hit,4698747,[3361],Solsbury Hill (Remastered 2002),[GN0900],[Peter Gabriel],2


In [None]:
# Preview the first three rows of the training playlists.
train.head(3)

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,[락],61281,여행같은 음악,"[525514, 129701, 383374, 562083, 297861, 13954...",71,2013-12-19 18:36:19.000
1,"[추억, 회상]",10532,요즘 너 말야,"[432406, 675945, 497066, 120377, 389529, 24427...",1,2014-12-02 16:19:42.000
2,"[까페, 잔잔한]",76951,"편하게, 잔잔하게 들을 수 있는 곡.-","[83116, 276692, 166267, 186301, 354465, 256598...",17,2017-08-28 07:09:34.000


#### 장르 데이터 코드 라벨링

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
def _genre_code_to_label(code):
    """Remove every 'GN' from a genre code and parse what is left as an integer."""
    return int(code.replace('GN',''))


# Numeric label for each genre code (e.g. 'GN0100' -> 100).
genre_data['label'] = genre_data['genre_code'].progress_apply(_genre_code_to_label)

100%|██████████| 254/254 [00:00<00:00, 62594.20it/s]


In [None]:
# Load the genre JSON as a pandas Series (typ='series') rather than a DataFrame.
genre = pd.read_json(PATH+'genre_gn_all.json', typ='series')

In [None]:
# Turn the genre Series into a two-column frame: the old index becomes `gnr_code`.
genre_gn_all = (
    pd.DataFrame(genre, columns=['gnr_name'])
    .reset_index()
    .rename(columns={'index':'gnr_code'})
)

# Codes ending in '00' are treated as main genres; every other code is a detail genre.
is_main_code = genre_gn_all['gnr_code'].str[-2:] == '00'
gnr_code = genre_gn_all[is_main_code]

dtl_gnr_code = (
    genre_gn_all[~is_main_code]
    .reset_index(drop=True)
    .rename(columns={'gnr_code':'dtl_gnr_code', 'gnr_name':'dtl_gnr_name'})
)

In [None]:
# Extract the shared four-character prefix (e.g. 'GN01') that links a main genre to its detail genres.
gnr_code = gnr_code.assign(join_code=gnr_code['gnr_code'].str[:4])
dtl_gnr_code = dtl_gnr_code.assign(join_code=dtl_gnr_code['dtl_gnr_code'].str[:4])

# Left-join so every main genre row is paired with each detail genre sharing its prefix.
gnr_code_tree = gnr_code.merge(dtl_gnr_code, how='left', on='join_code')

# The original cell also evaluated a four-column selection of this frame and discarded the
# result; that dead expression is removed. The displayed output is unchanged.
gnr_code_tree

Unnamed: 0,gnr_code,gnr_name,join_code,dtl_gnr_code,dtl_gnr_name
0,GN0100,발라드,GN01,GN0101,세부장르전체
1,GN0100,발라드,GN01,GN0102,'80
2,GN0100,발라드,GN01,GN0103,'90
3,GN0100,발라드,GN01,GN0104,'00
4,GN0100,발라드,GN01,GN0105,'10-
...,...,...,...,...,...
220,GN2800,뮤직테라피,GN28,GN2806,반려동물
221,GN2900,뮤지컬,GN29,GN2901,세부장르전체
222,GN2900,뮤지컬,GN29,GN2902,국내뮤지컬
223,GN2900,뮤지컬,GN29,GN2903,국외뮤지컬


#### 플레이리스트별 길이 컬럼화

In [None]:
# Number of songs in each playlist.
train['len_list'] = train['songs'].map(len)

In [None]:
# Keep the row with index label 0 as a single-playlist sample.
sample = train.loc[0]

In [None]:
from collections import Counter

def song_genre_list(x):
    """Collect the detail-genre codes of every song in one playlist row.

    Parameters
    ----------
    x : mapping-like row with a 'songs' entry
        'songs' is an iterable of song identifiers; each one is looked up as an
        index label of the module-level `song_meta` frame.

    Returns
    -------
    list
        The concatenation, in playlist order, of each song's
        'song_gn_dtl_gnr_basket' list (duplicates are kept).

    Raises
    ------
    KeyError
        If a song identifier is not an index label of `song_meta`.
    """
    genre_codes = []

    for song in x['songs']:
        # Scalar `.at` access avoids materialising a whole row Series per song,
        # which is what made the original `.loc[song][col]` lookup slow.
        genre_codes.extend(song_meta.at[song, 'song_gn_dtl_gnr_basket'])

    return genre_codes

In [None]:
# Row-wise: attach to each playlist the flat list of detail-genre codes of its songs.
train['genre_list'] = train.progress_apply(song_genre_list, axis=1)

100%|██████████| 115071/115071 [14:41<00:00, 130.47it/s]


#### 곡별 장르 라벨의 평균치 계산

In [None]:
from scipy.stats import mode
import math

In [None]:
# Lookup table indexed by genre code; `set_index` returns a new frame, so `genre_data` is untouched.
g = genre_data.set_index('genre_code')

In [None]:
# Compute each playlist's mean genre label once; both derived columns reuse it, so the slow
# row-wise lookup is no longer executed twice. Playlists without any genre code get 0.
label_mean = train['genre_list'].progress_apply(
    lambda codes: g.loc[codes, 'label'].mean() if len(codes) > 0 else 0.0
)

# Integer part of the mean label.
train['genre_mean'] = label_mean.astype(int)
# Mean label floored to a multiple of 100.
train['genre_mean_trunc'] = (label_mean // 100 * 100).astype(int)

100%|██████████| 115071/115071 [01:40<00:00, 1146.91it/s]
100%|██████████| 115071/115071 [01:39<00:00, 1155.57it/s]


In [None]:
def _most_frequent_code(codes):
    """Return the most frequent item of `codes`, or 0 when `codes` is empty.

    Ties are broken by taking the smallest item, which matches what
    `scipy.stats.mode` returned in the original expression. Counting with
    `Counter` avoids calling `mode` twice per row and does not depend on
    `mode` accepting string data or on the shape of its result.
    """
    if len(codes) == 0:
        return 0

    occurrences = Counter(codes)
    highest = max(occurrences.values())
    return min(code for code, count in occurrences.items() if count == highest)


# Most frequent detail-genre code per playlist (0 for playlists without genres).
train['genre_mode'] = train['genre_list'].progress_apply(_most_frequent_code)

100%|██████████| 115071/115071 [00:47<00:00, 2400.97it/s]


In [None]:
def get_main_gnr(x):
    """Map a numeric genre label to the genre name stored in the lookup frame `g`.

    Parameters
    ----------
    x : int-like
        Numeric label whose string form has three or four characters
        (e.g. 900 or 1000).

    Returns
    -------
    The 'genre' value of the row of `g` labelled 'GN' + the label zero-padded
    to four characters, or NaN when the string form of `x` is not three or
    four characters long (e.g. the 0 used for playlists without genres).

    Raises
    ------
    KeyError
        If the constructed code is not an index label of `g`.
    """
    digits = str(x)

    if len(digits) not in (3, 4):
        # `np.nan` instead of `np.NaN`: the latter alias was removed in NumPy 2.0.
        return np.nan

    # Zero-pad three-character labels so 900 becomes 'GN0900'.
    code = 'GN' + digits.zfill(4)
    return g.loc[code, 'genre']

In [None]:
# Translate each playlist's truncated mean label into a main-genre name.
train['main_genre'] = train['genre_mean_trunc'].progress_apply(get_main_gnr)

100%|██████████| 115071/115071 [00:18<00:00, 6255.26it/s]


In [None]:
# Empty placeholder frame. NOTE(review): it appears to be overwritten later once the tags are collected.
tag_per_genre = pd.DataFrame()
# Column-oriented accumulator: parallel lists of main-genre values and their tag lists.
inputs = {"main_genre":[], "tags":[]}

In [None]:
# Rebuild the accumulator here so re-running this cell does not append duplicate entries.
inputs = {"main_genre": [], "tags": []}

genres = train['main_genre'].unique()

# `tqdm_notebook` is deprecated and emits a warning; the plain `tqdm` already imported is used instead.
# The loop variable is not called `genre`, so it no longer overwrites the `genre` Series.
for genre_name in tqdm(genres):
  # NaN never compares equal to anything, so a NaN entry in `genres` selects no rows
  # and ends up with an empty tag list.
  playlist_tags = train.loc[train['main_genre'] == genre_name, 'tags']

  inputs['main_genre'].append(genre_name)
  # Flatten the per-playlist tag lists into one list, keeping duplicates.
  inputs['tags'].append([tag for tag_list in playlist_tags for tag in tag_list])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


  0%|          | 0/25 [00:00<?, ?it/s]

In [None]:
# Build a frame with one row per main genre from the accumulated lists, then display it.
tag_per_genre = pd.DataFrame(inputs)
tag_per_genre

Unnamed: 0,main_genre,tags
0,록/메탈,"[락, 짝사랑, 취향저격, 슬픔, 고백, 사랑, 이별, 새해, 여행, 프로필음악, ..."
1,POP,"[추억, 회상, 댄스, 일렉트로니카, 포크, 메탈, 락, 댄스, 인디, 슬픔, 밤,..."
2,인디음악,"[까페, 잔잔한, 잔잔한, 추억, 회상, 록, Metal, 이일우, M에센셜, 메탈..."
3,포크/블루스,"[연말, 눈오는날, 캐럴, 분위기, 따듯한, 크리스마스캐럴, 겨울노래, 크리스마스,..."
4,랩/힙합,"[운동, 드라이브, Pop, 트로피컬하우스, 힐링, 기분전환, 2017, 팝, 트렌..."
5,OST,"[kpop, 댄스, 걸그룹댄스, 스트레스해소, 거슈윈, 존윌리엄스, 랩소디인블루, ..."
6,재즈,"[가을, 재즈, 공기정화, 피톤치드, 스트레스, 연주곡, 사무실, 뉴에이지, 뉴에이..."
7,일렉트로니카,"[락, 헬스, 스포츠, 피트니스, 운동, 다이어트, 런닝, 레깅스, 필라테스, 산책..."
8,성인가요,"[봄, 설렘, 사랑, 휴식, 힐링, 설렘, 사랑, 여행, 드라이브, 산책, 잔잔한,..."
9,클래식,"[스밍, 목록, 폐막식, 올림픽, 엑소, 카페, 재즈, 잔잔한, 아침, 빅밴드, 상..."


In [None]:
from wordcloud import WordCloud

In [None]:
# Check whether the `main_genre` value at index label 23 is NaN.
np.isnan(tag_per_genre['main_genre'].loc[23])


True

In [None]:
# NOTE(review): `s` is not defined in any cell visible in this notebook, so this line
# presumably relies on leftover kernel state — TODO confirm what `s` should be.
counts = Counter(s)

In [None]:
font = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
img_path = '/content/drive/MyDrive/dataset/melon/images/'

# Write one word-cloud image per main genre, sized by how often each tag occurs in that genre.
for _, row in tag_per_genre.iterrows():
  genre_name = row['main_genre']

  # Skip the row whose main genre is NaN (previously a hard-coded `idx == 23`).
  if pd.isna(genre_name):
    continue

  tag_counts = Counter(row['tags'])
  # A word cloud cannot be generated from zero words.
  if not tag_counts:
    continue

  # '/' would be read as a directory separator in the output file name.
  file_stem = genre_name.replace('/','_')
  wc = WordCloud(font_path = font, max_font_size=60)
  # Use this genre's own tag counts; the original passed the unrelated global `counts`,
  # which produced the same cloud for every genre.
  cloud = wc.generate_from_frequencies(tag_counts)
  cloud.to_file(img_path+'black_'+file_stem+'.jpg')

In [None]:
# Display the training frame, including the derived genre columns.
train

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date,len_list,genre_list,genre_mean,genre_mean_trunc,genre_mode,main_genre
0,[락],61281,여행같은 음악,"[525514, 129701, 383374, 562083, 297861, 13954...",71,2013-12-19 18:36:19.000,19,"[GN1402, GN1401, GN0901, GN0902, GN1001, GN101...",1098,1000,GN0901,록/메탈
1,"[추억, 회상]",10532,요즘 너 말야,"[432406, 675945, 497066, 120377, 389529, 24427...",1,2014-12-02 16:19:42.000,42,"[GN0101, GN0103, GN0601, GN0605, GN0104, GN010...",922,900,GN0101,POP
2,"[까페, 잔잔한]",76951,"편하게, 잔잔하게 들을 수 있는 곡.-","[83116, 276692, 166267, 186301, 354465, 256598...",17,2017-08-28 07:09:34.000,28,"[GN0401, GN0403, GN0401, GN0403, GN0501, GN060...",553,500,GN0401,인디음악
3,"[연말, 눈오는날, 캐럴, 분위기, 따듯한, 크리스마스캐럴, 겨울노래, 크리스마스,...",147456,크리스마스 분위기에 흠뻑 취하고 싶을때,"[394031, 195524, 540149, 287984, 440773, 10033...",33,2019-12-05 15:15:18.000,38,"[GN0908, GN1509, GN0901, GN2207, GN1501, GN150...",862,800,GN0101,포크/블루스
4,[댄스],27616,추억의 노래 ㅋ,"[159327, 553610, 5130, 645103, 294435, 100657,...",9,2011-10-25 13:54:56.000,53,"[GN0101, GN0101, GN0103, GN2502, GN2506, GN250...",924,900,GN0201,POP
...,...,...,...,...,...,...,...,...,...,...,...,...
115066,"[록메탈, 밴드사운드, 록, 락메탈, 메탈, 락, extreme]",120325,METAL E'SM #2,"[429629, 441511, 612106, 516359, 691768, 38714...",3,2020-04-17 04:31:11.000,12,"[GN1006, GN1013, GN1001, GN1007, GN1013, GN100...",1009,1000,GN1001,록/메탈
115067,[일렉],106976,빠른 리스너를 위한 따끈따끈한 최신 인기 EDM 모음!,"[321330, 216057, 534472, 240306, 331098, 23288...",13,2015-12-24 17:23:19.000,11,"[GN1104, GN1101, GN1104, GN1102, GN1101, GN110...",1102,1100,GN1101,일렉트로니카
115068,"[담시, 가족, 눈물, 그리움, 주인공, 나의_이야기, 사랑, 친구]",11343,#1. 눈물이 앞을 가리는 나의_이야기,"[50512, 249024, 250608, 371171, 229942, 694943...",4,2019-08-16 20:59:22.000,11,"[GN0105, GN0101, GN0105, GN0101, GN2502, GN060...",510,500,GN0101,인디음악
115069,"[잔잔한, 버스, 퇴근버스, Pop, 풍경, 퇴근길]",131982,퇴근 버스에서 편히 들으면서 하루를 마무리하기에 좋은 POP,"[533534, 608114, 343608, 417140, 609009, 30217...",4,2019-10-25 23:40:42.000,55,"[GN1107, GN1102, GN1101, GN1013, GN1008, GN100...",1135,1100,GN1001,일렉트로니카
