In [1]:
import pandas as pd
from collections import defaultdict
from tqdm import tqdm
import os
import numpy as np

### Knowledge Graph のデータの読み込み

In [2]:
kg_df = pd.read_csv('./data/last-fm/kg_final.txt', sep=' ', header=None, names=('e_h','r','e_t'))

In [3]:
kg_df.head()

Unnamed: 0,e_h,r,e_t
0,12700,0,48123
1,18104,0,48123
2,25838,1,48124
3,41691,2,48125
4,9746,1,48126


### 人気アーティストのアイテムのデータを読み込み

In [4]:
file = 'top_250_artist_item.csv'
top_250_artist_item_df = pd.read_csv(file)
top_250_artist_item_df = top_250_artist_item_df.drop('Unnamed: 0', axis=1)
top_250_artist_item_df.head()

Unnamed: 0,item_id,track_name,artist_name,interaction
0,72,Needles and Pins,Deftones,42.0
1,73,Minerva,Deftones,99.0
2,74,Good Morning Beautiful,Deftones,42.0
3,75,Deathblow,Deftones,31.0
4,76,When Girls Telephone Boys,Deftones,37.0


### アーティストの人気順のデータを読み込み

In [5]:
file = 'artists_population.csv'
artist_df = pd.read_csv(file)
artist_df = artist_df.drop('Unnamed: 0', axis=1)
artist_df.head()

Unnamed: 0,artist_name,interaction
0,Arctic Monkeys,4754
1,Linkin Park,4357
2,The Beatles,4271
3,Queen,4241
4,Daft Punk,3727


In [6]:
N = 5
artist =list(artist_df['artist_name'])

In [7]:
top_n_artist = artist[:N]
top_n_artist

['Arctic Monkeys', 'Linkin Park', 'The Beatles', 'Queen', 'Daft Punk']

### TOP N のアーティストのアイテムを抽出

In [8]:
top_N_artist_item_df = top_250_artist_item_df.query('artist_name in {}'.format(top_n_artist))
top_N_artist_item_df = top_N_artist_item_df.reset_index(drop=True)
top_N_artist_item_df.head()

Unnamed: 0,item_id,track_name,artist_name,interaction
0,2556,Easier to Run,Linkin Park,132.0
1,5344,Leave Out All the Rest,Linkin Park,186.0
2,17252,Veridis Quo,Daft Punk,104.0
3,734,Hit the Floor,Linkin Park,119.0
4,2556,Easier to Run,Linkin Park,132.0


In [9]:
# TODO: e_hのカウントで最も多いものをアーティストのidとするべき
# one_artist_item_df = top_N_artist_item_df[top_N_artist_item_df['artist_name'] == 'Daft Punk']
# one_artist_item_df.head()

In [10]:
target_item_list = list(top_N_artist_item_df['item_id'])

### TOP N のアーティストのアイテムを含む relation を抽出

In [11]:
item_list = list(top_N_artist_item_df['item_id'])

In [12]:
target_kg_df = kg_df.query('e_h in {}'.format(target_item_list))
# target_kg_df = kg_df.query('e_h in {}'.format(one_artist_item_df))
target_kg_df = target_kg_df.query('r == 2')

In [13]:
target_kg_df['e_t'].value_counts()

51034    95
48237    93
55627    83
48452    43
50096    27
55994     2
53412     2
62558     2
57301     1
77769     1
48948     1
59690     1
92906     1
83127     1
61738     1
Name: e_t, dtype: int64

In [14]:
artist_df = target_kg_df.drop_duplicates(subset='e_t')
artist_df = artist_df.reset_index(drop=True)
artist_df = artist_df.rename(columns={'e_h':'item_id','e_t':'artist_entity_id'})
artist_df = artist_df.drop('r', axis=1)
artist_df.head()

Unnamed: 0,item_id,artist_entity_id
0,28714,48237
1,887,48452
2,18090,50096
3,17961,51034
4,39392,53412


### アーティストの entity_id を特定する

In [15]:
item_meta_data_df = pd.read_csv('item_meta_data.csv', index_col=0)
item_meta_data_df.head()

Unnamed: 0,item_id,track_name,artist_name
0,72,Needles and Pins,Deftones
1,73,Minerva,Deftones
2,74,Good Morning Beautiful,Deftones
3,75,Deathblow,Deftones
4,76,When Girls Telephone Boys,Deftones


In [16]:
artist_entity_id_df = pd.merge(
    left = artist_df, 
    right = item_meta_data_df,
    on = 'item_id',
    how = 'left'
)
artist_entity_id_df = artist_entity_id_df.drop_duplicates(subset='artist_entity_id')
artist_entity_id_df

Unnamed: 0,item_id,artist_entity_id,track_name,artist_name
0,28714,48237,Keep Your Hands Off My Baby,The Beatles
3,887,48452,Valentine's Day,Linkin Park
59,18090,50096,Red Light Indicates Doors Are Secured,Arctic Monkeys
171,17961,51034,Let Me Live,Queen
203,39392,53412,Get Lucky - Radio Edit,Daft Punk
210,12983,55627,Superheroes,Daft Punk
280,18347,59690,Instant Crush,Daft Punk
290,4807,62558,Krwlng,Linkin Park
317,7554,77769,1Stp Klosr,Linkin Park
333,4802,83127,Frgt/10,Linkin Park


In [17]:
artist_entity_id_df = artist_entity_id_df.drop_duplicates(subset='artist_entity_id')
artist_entity_id_df = artist_entity_id_df.reset_index(drop=True)
artist_entity_id_df = artist_entity_id_df.drop(['item_id','track_name'],axis=1)
artist_entity_id_df

Unnamed: 0,artist_entity_id,artist_name
0,48237,The Beatles
1,48452,Linkin Park
2,50096,Arctic Monkeys
3,51034,Queen
4,53412,Daft Punk
5,55627,Daft Punk
6,59690,Daft Punk
7,62558,Linkin Park
8,77769,Linkin Park
9,83127,Linkin Park


In [18]:
target_artist_entity_list = list(artist_entity_id_df['artist_entity_id'])

In [19]:
target_artist_entity_list

[48237,
 48452,
 50096,
 51034,
 53412,
 55627,
 59690,
 62558,
 77769,
 83127,
 57301,
 55994,
 48948,
 92906,
 61738]

### TOP N のアーティストの曲を聴いているユーザを抽出

In [20]:
data_path = './data/last-fm/'

#　train.txt を読み込む
train_data = defaultdict(list)
f = open( data_path + 'train.txt')
line = f.readline()

while line :

    data = line.strip()
    data_list = data.split()

    user = data_list[0]
    items = data_list[1:]
    train_data[user] = items
    
    line = f.readline()
    
f.close()

In [21]:
user_list = []
item_list = []

for user, items in train_data.items():
    
    for item in items:
        user_list.append(user)
        item_list.append(item)

In [22]:
train_data_df = pd.DataFrame(
    data = {
        'user_id' : user_list,
        'item_id' : item_list
        
    },
    columns=['user_id', 'item_id']
)

In [23]:
target_item_list = list(artist_df['item_id'])
target_item_list = [str(i) for i in target_item_list]

In [24]:
target_train_data_df = train_data_df.query('item_id in {}'.format(target_item_list))
target_train_data_df.head()

Unnamed: 0,user_id,item_id
1728,6,887
3204,11,1568
6753,17,887
12918,25,4800
12920,25,4802


In [25]:
# 指定したアーティストの曲を聴いたユーザ
target_user_list = list(target_train_data_df['user_id'].unique())

###  ターゲットとなるデータの圧縮表現をロード
  - target_user_list
  - target_item_list
  - target_artist_entity_list

In [26]:
saved_path = './emb_2_dim/'
user_emd_df = pd.read_csv(saved_path + 'user_emb_2dim.csv', index_col=0)
entity_emb_df = pd.read_csv(saved_path + 'entity_emb_2dim.csv', index_col=0)
entity_emb_df = entity_emb_df.reset_index(drop=True)

In [27]:
user_emd_df.head()

Unnamed: 0,id,type,x,y
0,0,user,-23.38933,16.841595
1,1,user,27.926821,19.098354
2,2,user,-21.620014,21.031384
3,3,user,8.001935,28.562372
4,4,user,39.398445,-10.006981


In [28]:
entity_emb_df.head()

Unnamed: 0,id,type,x,y
0,0,entity,5.5196,5.664539
1,1,entity,-8.202256,30.739899
2,2,entity,-23.490604,16.579607
3,3,entity,0.374067,23.597845
4,4,entity,-23.443825,16.330036


#### user

In [29]:
target_user_emd_df = user_emd_df.query('id in {}'.format(target_user_list))
target_user_emd_df = target_user_emd_df.reset_index(drop=True)
target_user_emd_df.head()

Unnamed: 0,id,type,x,y
0,6,user,-17.046577,24.395184
1,11,user,9.798297,-3.423298
2,17,user,-28.116472,26.788073
3,25,user,-13.227847,34.64653
4,29,user,-9.395927,21.148972


In [30]:
def add_user_name(uid):
    return 'u' + str(uid)

_target_user_emd_df = target_user_emd_df.copy()
_target_user_emd_df['name'] = target_user_emd_df['id'].map(add_user_name)

target_user_emd_df = _target_user_emd_df
target_user_emd_df.head()

Unnamed: 0,id,type,x,y,name
0,6,user,-17.046577,24.395184,u6
1,11,user,9.798297,-3.423298,u11
2,17,user,-28.116472,26.788073,u17
3,25,user,-13.227847,34.64653,u25
4,29,user,-9.395927,21.148972,u29


#### item

In [31]:
target_item_list = list(top_N_artist_item_df['item_id'].unique())
len(set(target_item_list))

341

In [32]:
target_item_emb_df = entity_emb_df.query('id in {}'.format(target_item_list))
target_item_emb_df = target_item_emb_df.reset_index(drop=True)

In [33]:
def convert_type_to_item(type):
    return 'item'

In [34]:
target_item_emb_df['type'] = target_item_emb_df['type'].map(convert_type_to_item)
target_item_emb_df.head()

Unnamed: 0,id,type,x,y
0,425,item,34.556053,2.196449
1,426,item,34.571899,2.245995
2,427,item,34.48098,2.101987
3,436,item,21.446608,-12.320498
4,441,item,21.209848,-12.24884


In [35]:
target_item_emb_df = pd.merge(
    target_item_emb_df, item_meta_data_df,
    left_on = 'id',
    right_on = 'item_id',
    how = 'left'
)
target_item_emb_df = target_item_emb_df.drop_duplicates(subset='id')

In [36]:
target_item_emb_df = target_item_emb_df.drop('item_id',axis=1)
target_item_emb_df.rename(columns={'track_name':'name'}, inplace=True)
print(len(target_item_emb_df))
target_item_emb_df.head()

341


Unnamed: 0,id,type,x,y,name,artist_name
0,425,item,34.556053,2.196449,Killer Queen,Queen
185,426,item,34.571899,2.245995,Crazy Little Thing Called Love,Queen
318,427,item,34.48098,2.101987,We Will Rock You,Queen
677,436,item,21.446608,-12.320498,Riot Van,Arctic Monkeys
851,441,item,21.209848,-12.24884,My Propeller,Arctic Monkeys


In [37]:
# Validation
if len(set(target_item_list)) != len(target_item_emb_df):
    print('ERROR...!! アイテムのデータを確認してください。')
else:
    print('アイテムのembeddinのデータ取得に成功！！')

アイテムのembeddinのデータ取得に成功！！


In [38]:
# アイテムのインタラクションのデータとマージする
item_interaction_df = pd.read_csv('./item_interaction.csv', index_col=0)
item_interaction_df.head()

Unnamed: 0,item_id,interaction
0,72,42
1,73,99
2,74,42
3,75,31
4,76,37


In [39]:
target_item_emb_df = pd.merge(
    target_item_emb_df, item_interaction_df,
    left_on = 'id',
    right_on = 'item_id',
    how = 'left'
)
target_item_emb_df = target_item_emb_df.drop('item_id',axis=1)
target_item_emb_df.head()

Unnamed: 0,id,type,x,y,name,artist_name,interaction
0,425,item,34.556053,2.196449,Killer Queen,Queen,185
1,426,item,34.571899,2.245995,Crazy Little Thing Called Love,Queen,133
2,427,item,34.48098,2.101987,We Will Rock You,Queen,359
3,436,item,21.446608,-12.320498,Riot Van,Arctic Monkeys,174
4,441,item,21.209848,-12.24884,My Propeller,Arctic Monkeys,122


#### artist

In [40]:
target_artist_emb_df = entity_emb_df.query('id in {}'.format(target_artist_entity_list))
target_artist_emb_df = target_artist_emb_df.reset_index(drop=True)

def convert_type_to_artist(type):
    return 'artist'

target_artist_emb_df['type'] = target_artist_emb_df['type'].map(convert_type_to_artist)
target_artist_emb_df.head()

Unnamed: 0,id,type,x,y
0,48237,artist,-27.836536,1.785171
1,48452,artist,0.705296,-1.092529
2,48948,artist,-26.15942,5.985563
3,50096,artist,-9.305778,-7.913741
4,51034,artist,-12.6888,-11.555313


In [41]:
target_artist_emb_df = pd.merge(
    target_artist_emb_df, artist_entity_id_df,
    left_on = 'id',
    right_on = 'artist_entity_id',
    how = 'left'
)
target_artist_emb_df = target_artist_emb_df.drop('artist_entity_id', axis=1)
target_artist_emb_df.rename(columns={'artist_name':'name'}, inplace=True)
target_artist_emb_df.head()

Unnamed: 0,id,type,x,y,name
0,48237,artist,-27.836536,1.785171,The Beatles
1,48452,artist,0.705296,-1.092529,Linkin Park
2,48948,artist,-26.15942,5.985563,Queen
3,50096,artist,-9.305778,-7.913741,Arctic Monkeys
4,51034,artist,-12.6888,-11.555313,Queen


### ターゲットとなるユーザ・アーティスト・アイテムのデータを出力

In [42]:
target_all_data_df = pd.concat([target_user_emd_df, target_item_emb_df,target_artist_emb_df],sort=False)
target_all_data_df.head()

Unnamed: 0,id,type,x,y,name,artist_name,interaction
0,6,user,-17.046577,24.395184,u6,,
1,11,user,9.798297,-3.423298,u11,,
2,17,user,-28.116472,26.788073,u17,,
3,25,user,-13.227847,34.64653,u25,,
4,29,user,-9.395927,21.148972,u29,,


In [43]:
save_path = './data_related_to_specific_artist/'
file_name = 'top_5_artist.csv'

if not os.path.exists(save_path):
    os.makedirs(save_path)

target_all_data_df.to_csv(save_path + file_name)