In [1]:
import pandas as pd

# Create Data File

The data source is https://console.cloud.google.com/marketplace/details/metabrainz/listenbrainz 

We pick the top 300 most popular artists by number of listens. The data is available thanks to BigQuery's 1 TB/month of free-tier processing.

The first step is to query the data set:


In [2]:
# Load the listen counts from a local Feather file; per the preview that
# follows, each row is one (user_name, artist_name) pair with its count `cnt`.
play_counts_df = pd.read_feather("play_counts_df.feather")

In [3]:
# Preview the first ten rows of the long-format listen counts.
play_counts_df.head(10)

Unnamed: 0,user_name,artist_name,cnt
0,munhoz,Daft Punk,663
1,enkadeze,U2,321
2,Kilu,U2,32
3,ioncewasacat,Air,551
4,toastpaint,Air,342
5,Boris_Neo,Air,190
6,jaffry,Air,1041
7,ars_mvsica,Air,931
8,I_am_walking,Air,158
9,Svarthjelm,Air,32


## Group by user by pivoting the table

The value is the number of listens for each user per artist:


In [4]:
# Reshape from long to wide: one row per user_name, one column per
# artist_name, each cell holding that pair's `cnt` value.
play_counts_pivot_df = play_counts_df.pivot(
    index='user_name',
    columns='artist_name',
    values='cnt',
)
play_counts_pivot_df.head()

artist_name,30 Seconds to Mars,65daysofstatic,A Perfect Circle,A Tribe Called Quest,ABBA,AC/DC,Adele,Aerosmith,Air,Alanis Morissette,...,ZZ Top,alt-J,blink-182,deadmau5,久石譲,川井憲次,梶浦由記,植松伸夫,菅野よう子,近藤浩治
user_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-andor-,,46.0,,43.0,7.0,13.0,,4.0,71.0,,...,12.0,,,29.0,,1.0,,,,
-dAtA-TRoN-,,,,,,,,,,,...,,,,,,,,,,
-nils-,,2.0,49.0,,,,,,,,...,8.0,,,,,12.0,,,,
00dani,,1.0,,,,,,,,,...,,3.0,,,,,,8.0,,10.0
00void,3.0,425.0,1.0,,,1.0,,12.0,17.0,,...,6.0,,11.0,,,,,,,


This is a quick way to flatten the pivoted table:

In [5]:
import pandas as pd
# Flatten the pivot into a plain table: reset_index() turns the user_name
# index into an ordinary first column (leaving a default integer index), and
# rename_axis(columns=None) drops the leftover "artist_name" label from the
# column axis. This gives the same flat frame as round-tripping through
# to_records(), without the intermediate NumPy record array.
df_out = play_counts_pivot_df.reset_index().rename_axis(columns=None)
df_out.head(4)

Unnamed: 0,user_name,30 Seconds to Mars,65daysofstatic,A Perfect Circle,A Tribe Called Quest,ABBA,AC/DC,Adele,Aerosmith,Air,...,ZZ Top,alt-J,blink-182,deadmau5,久石譲,川井憲次,梶浦由記,植松伸夫,菅野よう子,近藤浩治
0,-andor-,,46.0,,43.0,7.0,13.0,,4.0,71.0,...,12.0,,,29.0,,1.0,,,,
1,-dAtA-TRoN-,,,,,,,,,,...,,,,,,,,,,
2,-nils-,,2.0,49.0,,,,,,,...,8.0,,,,,12.0,,,,
3,00dani,,1.0,,,,,,,,...,,3.0,,,,,,8.0,,10.0


In [6]:
# Spot-check one cell of the flattened table: the 'Muse' value on the first
# row whose user_name is "-nils-".
df_out[df_out.user_name == "-nils-"].iloc[0]['Muse']

20

## Enrich Data

We will enrich the data from [this notebook: Enrich_top_300.ipynb](Enrich_top_300.ipynb).

The idea here is to create a tag counter: for each of the 300 artists a user has played, we create or increment a column for each of that artist's tags. If three of the artists played by the user carry the "rock" tag, then the value of that user's tag_rock column is 3; if 4 artists have an "80s" tag, then tag_80s is 4, and so on...



In [7]:
# Load the artist/tag table: one row per artist_name with a True/False flag in
# each tag_* column (see the preview below). The CSV also carries a leftover
# 'Unnamed: 0' column, which shows up in the column listing further down.
band_tags = pd.read_csv("bands_tags.csv")
band_tags.head()

Unnamed: 0.1,Unnamed: 0,artist_name,tag_alternative,tag_alternative rock,tag_rock,tag_indie,tag_electronic,tag_classic rock,tag_british,tag_60s,...,tag_jay z,tag_shoegazer,tag_hair metal,tag_rapcore,tag_underground hip hop,tag_symphonic black metal,tag_darkwave,tag_world,tag_latin,tag_spanish
0,0,Radiohead,True,True,True,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1,The Beatles,False,False,True,False,False,True,True,True,...,False,False,False,False,False,False,False,False,False,False
2,2,Pink Floyd,False,False,True,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,3,Daft Punk,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,4,Muse,True,True,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [8]:
# Show the first fifteen column names of the tag table.
band_tags.columns[:15].tolist()

['Unnamed: 0',
 'artist_name',
 'tag_alternative',
 'tag_alternative rock',
 'tag_rock',
 'tag_indie',
 'tag_electronic',
 'tag_classic rock',
 'tag_british',
 'tag_60s',
 'tag_pop',
 'tag_progressive rock',
 'tag_psychedelic rock',
 'tag_psychedelic',
 'tag_house']

The following may take a while to run. It goes over every row in the user list (df_out) and checks every band against the corresponding column; if it finds that the person listened to that band, it increments each of that artist's tags, showing a preference for those genres.

In [9]:
from collections import defaultdict

# Build, once per artist, the list of lower-cased tag column names whose flag
# is set. This depends only on the band_tags row, so it is hoisted out of the
# per-user loop instead of being rebuilt for every (user, artist) pair.
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
# A list of pairs (not a dict) is used so that every band_tags row is still
# visited, in order, exactly as in a row-by-row scan.
artist_tags = [
    (
        band_row['artist_name'],
        [name.lower() for name, val in band_row.items() if "tag_" in name and val == True],
    )
    for _, band_row in band_tags.iterrows()
]

# tag_for maps each df_out row label to a counter {tag column name: number of
# artists carrying that tag that the user has a positive listen count for}.
tag_for = {}
for xi, xrow in df_out.iterrows():
    user_tag_counts = defaultdict(int)
    for artist_name, tags in artist_tags:
        listens = xrow[artist_name]
        # NaN means the user has no entry for this artist; skip those.
        if pd.notnull(listens) and listens > 0:
            for tag in tags:
                user_tag_counts[tag] += 1
    tag_for[xi] = user_tag_counts

 

Add the tags with their counts to the DataFrame as additional features

In [10]:
# Copy every per-user tag count into df_out, one cell at a time. Assigning to
# a tag column that does not exist yet creates it.
cell_updates = (
    (row_label, tag_column, n_artists)
    for row_label, user_tag_counts in tag_for.items()
    for tag_column, n_artists in user_tag_counts.items()
)
for row_label, tag_column, n_artists in cell_updates:
    df_out.at[row_label, tag_column] = n_artists

# Write the five-row preview to disk, then display it.
df_out.head().to_csv("test.csv", index=False)
df_out.head()

  self.obj[key] = infer_fill_value(value)


Unnamed: 0,user_name,30 Seconds to Mars,65daysofstatic,A Perfect Circle,A Tribe Called Quest,ABBA,AC/DC,Adele,Aerosmith,Air,...,tag_melodic metal,tag_halo,tag_game soundtracks,tag_symphonic black metal,tag_underground hip hop,tag_grindcore,tag_noisecore,tag_noisegrind,tag_comedy,tag_monkey island
0,-andor-,,46.0,,43.0,7.0,13.0,,4.0,71.0,...,,,,,,,,,,
1,-dAtA-TRoN-,,,,,,,,,,...,,,,,,,,,,
2,-nils-,,2.0,49.0,,,,,,,...,,,,,,,,,,
3,00dani,,1.0,,,,,,,,...,,,,,,,,,,
4,00void,3.0,425.0,1.0,,,1.0,,12.0,17.0,...,,,,,,,,,,


Set a target column to turn this into a classification problem: if a user has played The Beatles (even once), the label is True.

In [11]:
import numpy as np
# Binary target: True wherever the user's 'The Beatles' cell is not null.
df_out['Like The Beatles'] = df_out['The Beatles'].notna()

  


We need to clean up the column names, as Cloud AutoML is very picky and will fail if you do things like have Unicode characters in the column names. I wish Google would fix this. I'll complain ;)

In [12]:
import csv

# Ordered (old, new) substitutions applied to every column name. The order
# matters: the "post-roc" / "trip-hop" rules must run before the generic
# "-" -> "_" and " " -> "_" rules.
# NOTE(review): those two rules presumably keep the hyphenated tag names from
# colliding with their space-separated twins after clean-up -- confirm.
# NOTE(review): "é" is removed rather than transliterated, so "Beyoncé" ends
# up as "Beyonc" in the exported header; kept as-is so the output column names
# do not change.
COLUMN_NAME_REPLACEMENTS = [
    ("post-roc", "post roc1"),
    ("trip-hop", "trip hop1"),
    (" ", "_"),
    ("/", ""),
    ("é", ""),
    ("ö", "o"),
    ("+", "_and_"),
    ("&", "and"),
    ("!", ""),
    ("-", "_"),
    (".", ""),
    ("ó", "o"),
    ("'", ""),
    # Japanese artists
    ("久石譲", "Joe_Hisaishi"),
    ("川井憲次", "Kenji_Kawai"),
    ("梶浦由記", "Yuki_Kajiura"),
    ("植松伸夫", "Nobuo_Uematsu"),
    ("菅野よう子", "Yoko_Kanno"),
    ("近藤浩治", "Koji_Kondo"),
]

# regex=False makes every pattern a literal string. Patterns such as "." and
# "+" are regex metacharacters, and the default value of `regex` differs
# between pandas versions; being explicit gives the same literal replacement
# everywhere and avoids the deprecation warning older versions emit.
cleaned_columns = df_out.columns
for old, new in COLUMN_NAME_REPLACEMENTS:
    cleaned_columns = cleaned_columns.str.replace(old, new, regex=False)
df_out.columns = cleaned_columns

# Export the full feature table without the integer row index.
df_out.to_csv("file_out_2495_tags.csv", index=False)

  if __name__ == "__main__":
  del sys.path[0]


In [13]:
# Peek at the first lines of the exported CSV to check the cleaned header.
!head file_out_2495_tags.csv

user_name,30_Seconds_to_Mars,65daysofstatic,A_Perfect_Circle,A_Tribe_Called_Quest,ABBA,ACDC,Adele,Aerosmith,Air,Alanis_Morissette,Alice_in_Chains,Amon_Amarth,Amon_Tobin,Amorphis,Anal_Cunt,Anathema,Animal_Collective,Aphex_Twin,Apocalyptica,Arcade_Fire,Arctic_Monkeys,Audioslave,Autechre,Avenged_Sevenfold,Avril_Lavigne,Ayreon,Bad_Religion,Beach_House,Beastie_Boys,Beck,Belle_and_Sebastian,Beyonc,Bjork,Black_Sabbath,Blind_Guardian,Bloc_Party,Blur,Boards_of_Canada,Bob_Dylan,Bob_Marley_and_The_Wailers,Bon_Iver,Bon_Jovi,Bonobo,Brand_New,Breaking_Benjamin,Brian_Eno,Britney_Spears,Broken_Social_Scene,Bruce_Springsteen,Burial,Bush,CHVRCHES,Cake,Calvin_Harris,Carly_Rae_Jepsen,Childish_Gambino,Children_of_Bodom,Cocteau_Twins,Coheed_and_Cambria,Coldplay,Cradle_of_Filth,Creedence_Clearwater_Revival,Crystal_Castles,DJ_Shadow,Daft_Punk,Dark_Tranquillity,Dave_Matthews_Band,David_Bowie,Dead_Can_Dance,Death_Cab_for_Cutie,Death_Grips,Deep_Purple,Deftones,Depeche_Mode,Dire_Straits,Disturbed,Drake,Dream_Thea