In [1]:
import pandas as pd
from collections import defaultdict
from tqdm import tqdm
import os
import numpy as np

### interaction のデータを読み込み

In [2]:
interaction_file_path = './master_table/all_interaction_data.csv'

In [3]:
df = pd.read_csv(interaction_file_path)
df.head(10)

Unnamed: 0.1,Unnamed: 0,user_id,item_id,org_id,freebase_id,track_name,artist_id,artist_name
0,0,0,72,16782,m.0qh8tx,Needles and Pins,362.0,Deftones
1,1,0,73,16781,m.0qh8v3,Minerva,362.0,Deftones
2,2,0,74,16780,m.0qh8vb,Good Morning Beautiful,362.0,Deftones
3,3,0,75,16779,m.0qh8vk,Deathblow,362.0,Deftones
4,4,0,76,16778,m.0w2yfq,When Girls Telephone Boys,362.0,Deftones
5,5,0,77,16777,m.0qh8v_,Battle-Axe,362.0,Deftones
6,6,0,9725,16776,m.0qh8w6,Lucky You,362.0,Deftones
7,7,0,9726,16775,m.0dvhv4c,Bloody Cape,362.0,Deftones
8,8,0,9727,16774,m.0qh8wn,Anniversary of an Uninteresting Event,362.0,Deftones
9,9,0,9762,16773,m.0qh8ww,Moana,362.0,Deftones


In [4]:
artists = list(df['artist_name'].unique())

In [5]:
artist_list = list(df['artist_name'])
artist_np = np.array(artist_list)

interaction_count = []

for artist in tqdm(artists):
    count = np.count_nonzero(artist_np == artist)
    interaction_count.append(count)

  import sys
100%|██████████████████████████████████████████████████████████████████████████████| 3383/3383 [00:56<00:00, 57.77it/s]


In [8]:
artists_population_df = pd.DataFrame(
    data = {
        'artist_name' : artists,
        'interaction' : interaction_count
    },
    columns = ['artist_name', 'interaction']
)

In [9]:
artists_population_df.head(10)

Unnamed: 0,artist_name,interaction
0,Deftones,2511
1,Marilyn Manson,2390
2,Rob Zombie,805
3,Apocalyptica,882
4,Soundgarden,799
5,The Doors,325
6,Tim Buckley,101
7,Wolf Parade,111
8,Buzzcocks,98
9,Black Lips,137


In [10]:
artists_population_df = artists_population_df.sort_values('interaction', ascending=False)
artists_population_df = artists_population_df.reset_index(drop=True)
artists_population_df.head(10)

Unnamed: 0,artist_name,interaction
0,Arctic Monkeys,4754
1,Linkin Park,4357
2,The Beatles,4271
3,Queen,4241
4,Daft Punk,3727
5,Muse,3712
6,Sia,3587
7,The White Stripes,3520
8,Queens of the Stone Age,3327
9,Red Hot Chili Peppers,3066


In [11]:
artists_population_df.to_csv('artists_population.csv', encoding='utf_8_sig')

### 特定のアーティストの順位を検索

In [6]:
artists_population_df = pd.read_csv('artists_population.csv')

In [8]:
artist = 'The Chemical Brothers'
artists_population_df[artists_population_df['artist_name'] == artist]

Unnamed: 0.1,Unnamed: 0,artist_name,interaction
178,178,The Chemical Brothers,792


### 人気のアーティストを抽出

In [12]:
top_artists_df = artists_population_df[:250]

In [13]:
top_artists_df.tail(5)

Unnamed: 0,artist_name,interaction
245,The Sisters of Mercy,636
246,Cradle of Filth,636
247,Cage the Elephant,634
248,DJ Shadow,631
249,Weezer,627


In [14]:
top_artist_list = top_artists_df['artist_name']
top_artist_intaraction_df = df[df['artist_name'].isin(top_artist_list)]

In [15]:
top_artist_intaraction_df.columns

Index(['Unnamed: 0', 'user_id', 'item_id', 'org_id', 'freebase_id',
       'track_name', 'artist_id', 'artist_name'],
      dtype='object')

In [16]:
top_artist_intaraction_df = top_artist_intaraction_df.drop("Unnamed: 0", axis=1)
top_artist_intaraction_df.head()

Unnamed: 0,user_id,item_id,org_id,freebase_id,track_name,artist_id,artist_name
0,0,72,16782,m.0qh8tx,Needles and Pins,362.0,Deftones
1,0,73,16781,m.0qh8v3,Minerva,362.0,Deftones
2,0,74,16780,m.0qh8vb,Good Morning Beautiful,362.0,Deftones
3,0,75,16779,m.0qh8vk,Deathblow,362.0,Deftones
4,0,76,16778,m.0w2yfq,When Girls Telephone Boys,362.0,Deftones


### 人気のアーティストのトラックの出現回数をカウント

In [17]:
items = list(top_artist_intaraction_df['item_id'].unique())
item_list = list(top_artist_intaraction_df['item_id'])

In [55]:
items_np = np.array(item_list)

interaction_count = []

for item in tqdm(items):
    count = np.count_nonzero(items_np == item)
    interaction_count.append(count)

100%|██████████████████████████████████████████████████████████████████████████| 12672/12672 [00:02<00:00, 4356.01it/s]


In [105]:
item_interaction_df = pd.DataFrame(
    data = {
        'item_id' : items,
        'interaction' : interaction_count
    },
    columns=['item_id', 'interaction']
)

In [106]:
item_interaction_df['item_id'] = item_interaction_df['item_id']
item_interaction_df.head(5)
item_interaction_df.to_csv('item_interaction.csv')

In [100]:
len(item_interaction_df)

12672

In [107]:
item_meta_data_df = df.loc[:, ['item_id', 'track_name', 'artist_name']]
item_meta_data_df['item_id'] = item_meta_data_df['item_id']
item_meta_data_df.to_csv('item_meta_data.csv')
item_meta_data_df.head()

Unnamed: 0,item_id,track_name,artist_name
0,72,Needles and Pins,Deftones
1,73,Minerva,Deftones
2,74,Good Morning Beautiful,Deftones
3,75,Deathblow,Deftones
4,76,When Girls Telephone Boys,Deftones


### 各アイテムの出現回数とメタ情報を含んだDataFrameを作成

In [155]:
item_interaction_df = pd.read_csv('item_interaction.csv')
item_interaction_df.head()

Unnamed: 0.1,Unnamed: 0,item_id,interaction
0,0,72,42
1,1,73,99
2,2,74,42
3,3,75,31
4,4,76,37


In [156]:
df_1 = item_interaction_df.drop('Unnamed: 0', axis=1)
df_1.head()

Unnamed: 0,item_id,interaction
0,72,42
1,73,99
2,74,42
3,75,31
4,76,37


In [158]:
item_meta_data_df = pd.read_csv('item_meta_data.csv')
df_2 = item_meta_data_df.drop('Unnamed: 0', axis=1)
df_2.head(100)

Unnamed: 0,item_id,track_name,artist_name
0,72,Needles and Pins,Deftones
1,73,Minerva,Deftones
2,74,Good Morning Beautiful,Deftones
3,75,Deathblow,Deftones
4,76,When Girls Telephone Boys,Deftones
5,77,Battle-Axe,Deftones
6,9725,Lucky You,Deftones
7,9726,Bloody Cape,Deftones
8,9727,Anniversary of an Uninteresting Event,Deftones
9,9762,Moana,Deftones


In [159]:
item_df = pd.merge(
    right=df_1,
    left=df_2, 
    on='item_id',
    how='left'
)
item_df.head(100)

Unnamed: 0,item_id,track_name,artist_name,interaction
0,72,Needles and Pins,Deftones,42.0
1,73,Minerva,Deftones,99.0
2,74,Good Morning Beautiful,Deftones,42.0
3,75,Deathblow,Deftones,31.0
4,76,When Girls Telephone Boys,Deftones,37.0
5,77,Battle-Axe,Deftones,41.0
6,9725,Lucky You,Deftones,51.0
7,9726,Bloody Cape,Deftones,45.0
8,9727,Anniversary of an Uninteresting Event,Deftones,29.0
9,9762,Moana,Deftones,26.0


In [174]:
item_df.drop_duplicates(subset='item_id')

Unnamed: 0,item_id,track_name,artist_name,interaction
0,72,Needles and Pins,Deftones,42.0
1,73,Minerva,Deftones,99.0
2,74,Good Morning Beautiful,Deftones,42.0
3,75,Deathblow,Deftones,31.0
4,76,When Girls Telephone Boys,Deftones,37.0
5,77,Battle-Axe,Deftones,41.0
6,9725,Lucky You,Deftones,51.0
7,9726,Bloody Cape,Deftones,45.0
8,9727,Anniversary of an Uninteresting Event,Deftones,29.0
9,9762,Moana,Deftones,26.0


In [175]:
item_df.to_csv('top_250_artist_item.csv')

### 特定のアーティストのアイテムを抽出
 - [人気アーティスト](https://docs.google.com/spreadsheets/d/1R11hDZONuB0rBP-FNAUt_TiArvYpUec9uSZLgCLqAhQ/edit#gid=808250101)

In [194]:
target_artist = 'Lady Gaga'

In [195]:
_target_df = item_df[item_df['artist_name']==target_artist]

In [196]:
_target_df = _target_df.drop_duplicates(subset='item_id')

In [197]:
_target_df

Unnamed: 0,item_id,track_name,artist_name,interaction
7968,9133,Dance in the Dark,Lady Gaga,60.0
9247,6251,Government Hooker,Lady Gaga,63.0
9248,6290,Judas,Lady Gaga,152.0
9249,8926,Americano,Lady Gaga,71.0
9250,6237,Bloody Mary,Lady Gaga,72.0
9251,6232,Bad Kids,Lady Gaga,58.0
9252,6288,Highway Unicorn (Road to Love),Lady Gaga,59.0
12924,8900,Paparazzi,Lady Gaga,127.0
16304,9135,Teeth,Lady Gaga,55.0
16305,6265,Just Dance,Lady Gaga,113.0


In [198]:
path = './artist_item_data/' 

if not os.path.exists(path):
    os.makedirs(path)
    
_target_df.to_csv(path + '{}.csv'.format(target_artist))