In [1]:
import pandas as pd
from collections import defaultdict
from tqdm import tqdm
import os
import numpy as np

### Knowledge Graph のデータの読み込み

In [2]:
# Load the knowledge-graph triples: space-separated, no header row.
# Columns are named e_h / r / e_t (presumably head entity, relation, tail entity).
kg_columns = ['e_h', 'r', 'e_t']
kg_df = pd.read_csv(
    './data/last-fm/kg_final.txt',
    sep=' ',
    header=None,
    names=kg_columns,
)

In [3]:
# Preview the first five knowledge-graph rows.
kg_df.head(n=5)

Unnamed: 0,e_h,r,e_t
0,12700,0,48123
1,18104,0,48123
2,25838,1,48124
3,41691,2,48125
4,9746,1,48126


### アーティストのデータの読み込み

In [4]:
# Artist under analysis; the name is used to build the input CSV path and the output directory.
artist = "Taylor Swift"

In [5]:
# Read the per-artist item table and discard the 'Unnamed: 0' column
# (presumably an index that was saved along with the CSV).
artist_csv_path = './artist_item_data/{}.csv'.format(artist)
artist_df = pd.read_csv(artist_csv_path).drop(columns=['Unnamed: 0'])

# Report how many items the artist has.
print('item_num:{}'.format(artist_df.shape[0]))

item_num:23


In [6]:
# Preview the first five items of the artist.
artist_df.head(n=5)

Unnamed: 0,item_id,track_name,artist_name,interaction
0,5255,Hey Stephen,Taylor Swift,22.0
1,5392,Mary's Song (Oh My My My),Taylor Swift,13.0
2,5361,Tied Together With a Smile,Taylor Swift,8.0
3,20463,Fifteen,Taylor Swift,16.0
4,25060,Dear John,Taylor Swift,25.0


### アーティストのアイテムを含む relation を抽出

In [7]:
# Item ids of the artist as a plain Python list.
item_list = artist_df['item_id'].tolist()

In [8]:
# Keep only the triples that touch one of the artist's items (as e_h or e_t)
# and whose relation id is 2.
# Boolean masks with isin() replace the previous approach of formatting the
# Python list into a query() string: that string had to be re-parsed by pandas
# and depended on the repr of every list element being valid query syntax.
touches_artist_item = kg_df['e_h'].isin(item_list) | kg_df['e_t'].isin(item_list)
target_kg_df = kg_df[touches_artist_item & (kg_df['r'] == 2)]

In [9]:
# Preview the first ten extracted triples.
target_kg_df.head(n=10)

Unnamed: 0,e_h,r,e_t
19315,5410,2,64848
19882,5395,2,64848
44360,5277,2,52699
55840,5357,2,64848
80218,5313,2,64848
86776,5337,2,64848
87481,5363,2,64848
114167,5361,2,64848
122887,20461,2,64848
170058,5370,2,64848


In [11]:
# Number of extracted triples.
target_kg_df.shape[0]

25

In [12]:
# Rows of the extracted triples whose relation id is 0.
target_kg_df.loc[target_kg_df['r'].eq(0)]

Unnamed: 0,e_h,r,e_t


In [17]:
# All triples of the full knowledge graph whose relation id is 0.
is_relation_zero = kg_df['r'] == 0
notable = kg_df[is_relation_zero]
notable

Unnamed: 0,e_h,r,e_t
0,12700,0,48123
1,18104,0,48123
20,23068,0,48123
23,43072,0,48123
25,19433,0,48123
30,16160,0,48123
42,48167,0,48123
57,41318,0,48123
58,41276,0,48123
68,48198,0,48123


In [20]:
# How often each e_t value occurs among the relation-0 triples, most frequent first.
tail_entity_counts = notable['e_t'].value_counts()
tail_entity_counts

48123    64386
48294     2147
53818      109
71490       42
85722       24
Name: e_t, dtype: int64

### アーティストの曲を聴いているユーザを抽出

In [10]:
data_path = './data/last-fm/'

# Read train.txt into a mapping user -> list of items.
# Each non-empty line is whitespace-separated: the first token is the user,
# the remaining tokens are the items. Ids are kept as strings, exactly as
# they appear in the file.
train_data = defaultdict(list)

# The context manager closes the file even if parsing raises part-way through.
with open(data_path + 'train.txt') as f:
    for line in f:
        data_list = line.split()

        if not data_list:
            # Blank line: there is no user token, so skip it instead of
            # failing with IndexError on data_list[0].
            continue

        user = data_list[0]
        items = data_list[1:]
        # A user that appears on several lines keeps only the last line's items.
        train_data[user] = items

In [11]:
# Flatten the user -> items mapping into two parallel lists with one entry
# per (user, item) pair, preserving the mapping's iteration order.
user_item_pairs = [
    (user_id, item_id)
    for user_id, user_items in train_data.items()
    for item_id in user_items
]

user_list = [user_id for user_id, _ in user_item_pairs]
item_list = [item_id for _, item_id in user_item_pairs]

In [12]:
# Long-format interaction table: one row per (user_id, item_id) pair.
train_columns = ['user_id', 'item_id']
train_data_df = pd.DataFrame(
    dict(zip(train_columns, (user_list, item_list))),
    columns=train_columns,
)

In [13]:
# The artist's item ids converted to strings, so that they can be compared
# with the string ids in train_data_df.
target_item_list = list(map(str, artist_df['item_id']))

In [14]:
# Sanity check: does item '5410' occur in the flattened training item list?
has_item_5410 = '5410' in item_list
print(has_item_5410)

True


In [15]:
# Training interactions whose item belongs to the target artist.
# isin() is used instead of formatting the Python list into a query() string,
# which had to be re-parsed by pandas and relied on the list repr being valid
# query syntax.
is_target_item = train_data_df['item_id'].isin(target_item_list)
target_train_data_df = train_data_df[is_target_item]
target_train_data_df.head()

Unnamed: 0,user_id,item_id
17360,30,5255
17366,30,5255
17394,31,5269
17403,31,5277
17440,31,5277


In [16]:
# Users who listened to at least one track of the specified artist,
# in order of first appearance.
target_user = target_train_data_df['user_id'].drop_duplicates().tolist()

### 指定したアーティストの曲を聴いたユーザについて<br>特定のアーティストの曲が占める割合を算出

In [17]:
# Every training interaction of the users who listened to the target artist.
# isin() is used instead of formatting the (potentially long) user list into a
# query() string, which pandas would have to parse back into an expression.
is_target_user = train_data_df['user_id'].isin(target_user)
target_user_train_data_df = train_data_df[is_target_user]
target_user_train_data_df.head()

Unnamed: 0,user_id,item_id
17312,30,5230
17313,30,5231
17314,30,5232
17315,30,5233
17316,30,5234


In [18]:
# Rebuild a user -> list of items mapping, restricted to the target users.
target_train_data = defaultdict(list)

user_list = target_user_train_data_df['user_id']
item_list = target_user_train_data_df['item_id']

# Walk both columns in lockstep, one row at a time, and group items by user.
user_item_rows = target_user_train_data_df[['user_id', 'item_id']].itertuples(index=False, name=None)
for row_user, row_item in user_item_rows:
    target_train_data[row_user].append(row_item)

In [19]:
# For every target user, compute the fraction of their interacted items
# that belong to the target artist.
unique_target_list = []
ratio_list = []

for user, items in target_train_data.items():
    # Count the user's items that are among the artist's items.
    artist_hits = sum(1 for item in items if item in target_item_list)

    unique_target_list.append(user)
    ratio_list.append(artist_hits / len(items))
    

In [20]:
# One row per target user with the share of their items that belong to the artist.
ratio_columns = ['user_id', 'ratio']
df = pd.DataFrame(
    dict(zip(ratio_columns, (unique_target_list, ratio_list))),
    columns=ratio_columns,
)

In [21]:
# Output directory for this artist's analysis files.
# NOTE(review): the name 'save_patn' looks like a typo for 'save_path'; it is
# kept because a later cell reads this variable.
save_patn = './user_analysis/{}/'.format(artist)

# exist_ok=True creates the directory tree if needed and is a no-op when it
# already exists, avoiding the check-then-create race of exists() + makedirs().
os.makedirs(save_patn, exist_ok=True)

# Write the per-user ratio table.
df.to_csv(save_patn + 'user_id-ratio.csv')

### 指定したアーティストの曲を聴いたユーザについて<br>全てのinteractionしたアイテムを出力

In [24]:
# Load the master interaction table and discard the 'Unnamed: 0' column
# (presumably an index that was saved along with the CSV).
interaction_file_path = './master_table/all_interaction_data.csv'
df = pd.read_csv(interaction_file_path).drop(columns=['Unnamed: 0'])
df.head(n=5)

Unnamed: 0,user_id,item_id,org_id,freebase_id,track_name,artist_id,artist_name
0,0,72,16782,m.0qh8tx,Needles and Pins,362.0,Deftones
1,0,73,16781,m.0qh8v3,Minerva,362.0,Deftones
2,0,74,16780,m.0qh8vb,Good Morning Beautiful,362.0,Deftones
3,0,75,16779,m.0qh8vk,Deathblow,362.0,Deftones
4,0,76,16778,m.0w2yfq,When Girls Telephone Boys,362.0,Deftones


In [25]:
# Keep only the interactions of users who listened to the target artist.
# target_user holds ids as strings (they were parsed from train.txt), while
# user_id in this table comes from read_csv and is presumably an integer
# column -- TODO confirm. Comparing string to string avoids depending on
# pandas implicitly coercing the strings to integers, which is
# version-dependent and can silently yield an empty result.
target_user_keys = {str(user_id) for user_id in target_user}
df = df[df['user_id'].astype(str).isin(target_user_keys)]

In [26]:
# Preview the first five interactions of the target users.
df.head(n=5)

Unnamed: 0,user_id,item_id,org_id,freebase_id,track_name,artist_id,artist_name
4346,30,5254,288441,m.0fs1q26,Kilojoules,19954.0,Freelance Whales
4347,30,5708,746008,m.0fqdyt4,The Great Estates,19954.0,Freelance Whales
4348,30,5248,746013,m.0fyhs9l,Location,19954.0,Freelance Whales
4349,30,24476,252966,m.0g0yxk0,Broken Horse,19954.0,Freelance Whales
4350,30,5238,347396,m.0fvqyk7,"Oh, Maker",15910.0,Janelle Monáe


In [28]:
# Write the target users' interactions; the 'utf_8_sig' codec emits UTF-8
# preceded by a byte-order mark.
target_user_interaction_path = save_patn + 'target_user_interaction.csv'
df.to_csv(target_user_interaction_path, encoding='utf_8_sig')