In [1]:
import csv
import os
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# Which split of the interaction data to process; swap with the commented
# line below to build the table for the 'train' split instead.
target = 'test'
# target = 'train'

# Directory containing '<target>.txt' and 'item_list.txt'.
data_path = './data/last-fm/'
# Directory containing the LFM-1b track and artist tables.
row_data_path = './row_data/LFM-1b/'

#### *.txt の読み込み

In [3]:
# Parse the interaction file into {user: [item, ...]}.
# Each non-empty line is whitespace-separated: the first token is the user
# id and the remaining tokens are that user's item ids. All ids are kept as
# strings at this stage.
all_data = defaultdict(list)

# The context manager closes the file even if parsing raises part-way.
with open(data_path + '{}.txt'.format(target)) as f:
    for line in f:
        data_list = line.split()

        # Blank / whitespace-only lines have no user token; indexing
        # data_list[0] on them would raise IndexError, so skip them.
        if not data_list:
            continue

        user = data_list[0]
        items = data_list[1:]
        # Plain assignment: a later line for the same user replaces the
        # earlier item list rather than extending it.
        all_data[user] = items

#### txt data をDataFrameに整形

In [4]:
# Flatten {user: [items]} into two parallel lists with one entry per
# (user, item) interaction, preserving the dict's iteration order.
interaction_pairs = [
    (uid, iid)
    for uid, user_items in all_data.items()
    for iid in user_items
]

user_list = [uid for uid, _ in interaction_pairs]
item_list = [iid for _, iid in interaction_pairs]

In [5]:
# Build the long-format interaction table: one row per (user_id, item_id).
interaction_columns = ['user_id', 'item_id']
data_df = pd.DataFrame(
    {'user_id': user_list, 'item_id': item_list},
    columns=interaction_columns,
)

In [6]:
# Preview the first rows of the (user_id, item_id) interaction table.
data_df.head()

Unnamed: 0,user_id,item_id
0,0,72
1,0,73
2,0,74
3,0,75
4,0,76


#### item_list.txt の読み込み

In [7]:
# Load the item id mapping table (space-delimited; the first row supplies
# the column names) and preview it.
item_list_file = data_path + 'item_list.txt'
item_df = pd.read_csv(item_list_file, sep=' ')
item_df.head(n=5)

Unnamed: 0,org_id,remap_id,freebase_id
0,152949,0,m.0f7jpbw
1,154518,1,m.0v1_6dj
2,64981,2,m.0mbpxv
3,11325966,3,m.0lh896
4,186771,4,m.0dykj_8


#### item_id と org_id (last-fm) を結合

In [8]:
# item_id was parsed from text, so cast it to int64 before it is used as a
# join key.
data_df = data_df.astype({'item_id': np.int64})

In [9]:
# Attach the mapping columns from item_df to every interaction by matching
# item_id against remap_id; the redundant join key is dropped afterwards.
interactions_with_ids = data_df.merge(
    item_df,
    how='left',
    left_on='item_id',
    right_on='remap_id',
)
df = interactions_with_ids.drop(columns='remap_id')

In [10]:
# Preview the interactions after the item mapping columns were attached.
df.head(10)

Unnamed: 0,user_id,item_id,org_id,freebase_id
0,0,72,16782,m.0qh8tx
1,0,73,16781,m.0qh8v3
2,0,74,16780,m.0qh8vb
3,0,75,16779,m.0qh8vk
4,0,76,16778,m.0w2yfq
5,0,77,16777,m.0qh8v_
6,0,9725,16776,m.0qh8w6
7,0,9726,16775,m.0dvhv4c
8,0,9727,16774,m.0qh8wn
9,0,9762,16773,m.0qh8ww


#### last fm data の読み込み

In [11]:
# Load the LFM-1b track table: tab-separated, no header row, so the column
# names are supplied explicitly.
last_fm_stack_df = pd.read_csv(
    row_data_path + 'LFM-1b_tracks.txt',
    sep='\t',
    header=None,
    names=('org_id', 'track_name', 'artist_id'),
    # Treat '"' as an ordinary character. With the default quoting, a track
    # name containing an unbalanced double quote makes the parser read the
    # following lines as part of one quoted field, silently dropping those
    # tracks (they then show up as missing after the later left join).
    # NOTE(review): assumes the file is plain TSV with no quoted fields - confirm.
    quoting=csv.QUOTE_NONE,
)

In [12]:
# Preview the raw track table.
last_fm_stack_df.head(5)

Unnamed: 0,org_id,track_name,artist_id
0,1,A Matter of Time,3
1,2,Hangar 18,1
2,3,Up the Downstair,4
3,4,Mr. Carter (Featuring Jay-Z),2
4,5,Mixtaped,5


In [13]:
# Load the LFM-1b artist table: tab-separated, no header row, so the column
# names are supplied explicitly.
last_fm_artists_df = pd.read_csv(
    row_data_path + 'LFM-1b_artists.txt',
    sep='\t',
    header=None,
    names=('artist_id', 'artist_name'),
    # Treat '"' as an ordinary character so artist names containing double
    # quotes are kept verbatim and an unbalanced quote cannot swallow the
    # rows that follow it.
    # NOTE(review): assumes the file is plain TSV with no quoted fields - confirm.
    quoting=csv.QUOTE_NONE,
)

In [14]:
# Preview the raw artist table.
last_fm_artists_df.head(5)

Unnamed: 0,artist_id,artist_name
0,1,Megadeth
1,2,Lil Wayne
2,3,Foo Fighters
3,4,Porcupine Tree
4,5,No-Man


In [15]:
# Add artist_name to every track via its artist_id; tracks without a
# matching artist are kept (left join).
last_fm_df = last_fm_stack_df.merge(
    last_fm_artists_df,
    how='left',
    on='artist_id',
)

In [16]:
# Preview the track table with artist names attached.
last_fm_df.head(5)

Unnamed: 0,org_id,track_name,artist_id,artist_name
0,1,A Matter of Time,3,Foo Fighters
1,2,Hangar 18,1,Megadeth
2,3,Up the Downstair,4,Porcupine Tree
3,4,Mr. Carter (Featuring Jay-Z),2,Lil Wayne
4,5,Mixtaped,5,No-Man


#### モデルの学習用データとLast-FMのデータを結合

In [17]:
# Enrich every interaction with track and artist columns by joining on
# org_id; interactions without a matching track are kept (left join).
all_data_df = df.merge(
    last_fm_df,
    how='left',
    on='org_id',
)

In [18]:
# Remove rows that are exact duplicates across all columns, then preview.
all_data_df = all_data_df.drop_duplicates()
all_data_df.head(10)

Unnamed: 0,user_id,item_id,org_id,freebase_id,track_name,artist_id,artist_name
0,0,72,16782,m.0qh8tx,Needles and Pins,362.0,Deftones
1,0,73,16781,m.0qh8v3,Minerva,362.0,Deftones
2,0,74,16780,m.0qh8vb,Good Morning Beautiful,362.0,Deftones
3,0,75,16779,m.0qh8vk,Deathblow,362.0,Deftones
4,0,76,16778,m.0w2yfq,When Girls Telephone Boys,362.0,Deftones
5,0,77,16777,m.0qh8v_,Battle-Axe,362.0,Deftones
6,0,9725,16776,m.0qh8w6,Lucky You,362.0,Deftones
7,0,9726,16775,m.0dvhv4c,Bloody Cape,362.0,Deftones
8,0,9727,16774,m.0qh8wn,Anniversary of an Uninteresting Event,362.0,Deftones
9,0,9762,16773,m.0qh8ww,Moana,362.0,Deftones


In [19]:
# Spot-check the rows of a single user. user_id values are strings (they
# were parsed from the text file and never cast), hence the quoted literal.
all_data_df[all_data_df['user_id']=='23553']

Unnamed: 0,user_id,item_id,org_id,freebase_id,track_name,artist_id,artist_name
616264,23553,7345,181605,m.0lgbq3,Rain When I Die,93.0,Alice in Chains
616265,23553,2814,129341,m.0y55z_,Down in a Hole,93.0,Alice in Chains
616266,23553,8349,181604,m.0lgbqb,Sickman,93.0,Alice in Chains
616267,23553,8347,181600,m.0lgbqs,Junkhead,93.0,Alice in Chains
616268,23553,13325,18156,m.015vjsc,Into the Rainbow Vein,2267.0,Boards of Canada
616269,23553,13326,18155,m.015vjsl,Chromakey Dreamcoat,2267.0,Boards of Canada
616270,23553,13327,18154,m.015vjst,Satellite Anthem Icarus,2267.0,Boards of Canada
616271,23553,13328,18153,m.0152yhm,Peacock Tail,2267.0,Boards of Canada
616272,23553,13329,18152,m.0152yhv,Dayvan Cowboy,2267.0,Boards of Canada
616273,23553,13330,18151,m.015vjtg,A Moment of Clarity,2267.0,Boards of Canada


In [20]:
# Check for missing values: list every row with a null in at least one column.
all_data_df[all_data_df.isnull().any(axis=1)]

Unnamed: 0,user_id,item_id,org_id,freebase_id,track_name,artist_id,artist_name
23978,211,9107,1087251,m.0113cxd,,,
26755,240,22624,25143736,m.010y5t16,,,
27101,246,19115,472926,m.0ncwk7,,,
29961,282,17554,1087655,m.0sxk4q,,,
36628,387,17554,1087655,m.0sxk4q,,,
41737,453,12058,1831823,m.0w0ybh,,,
42021,460,1383,420269,m.0md727,,,
44559,496,29863,1087343,m.0zl_f4,,,
44563,496,46196,2199211,m.0160qst,,,
46652,532,32460,923499,m.0vwc72,,,


#### data を csvで保存

In [21]:
# Output directory for the generated master tables.
master_table_path = './master_table/'

# exist_ok=True creates the directory only when needed, so the cell is safe
# to re-run and has no gap between an existence check and the creation.
os.makedirs(master_table_path, exist_ok=True)

In [22]:
# Save the enriched interaction table as '<target>_interaction_data.csv'
# inside the master table directory.
output_csv = master_table_path + '{}_interaction_data.csv'.format(target)
all_data_df.to_csv(output_csv)