In [1]:
import requests
import time
import pickle
from bs4 import BeautifulSoup
import re
import json
import pandas as pd
import os
from tqdm import tqdm
import ast

In [2]:
# open data files
with open('data/data_spotify_genres.csv', encoding='utf-8') as file_spotify:
    spotify_df = pd.read_csv(file_spotify, index_col=0)
    
with open('data/data_wikidata_top_genres.csv', encoding='utf-8') as file_wikidata:
    wikidata_df = pd.read_csv(file_wikidata, index_col=0)
    
wikidata_df

Unnamed: 0,genre_id,genre_name,genre_name_edited,top_genre_id,top_genre_name,unique_top_genre,everynoise_id,everynoise_name,discogs_genre_name,discogs_style_name
0,Q209498,2 tone,2 tone,Q54365,ska,0.0,,,,
1,Q1751409,2-step garage,2 step garage,Q316930,dance music,0.0,,,,
2,Q1751409,2-step garage,2 step garage,Q9778,electronic music,0.0,,,,
3,Q1338153,20th-century classical music,20th century classical,Q1583807,art music,0.0,,,,
4,Q4631020,21st-century classical music,21st century classical,Q1583807,art music,0.0,21stcenturyclassical,21st century classical,,
...,...,...,...,...,...,...,...,...,...,...
6839,Q2748579,čoček,čoček,Q2748579,čoček,1.0,,,,
6840,Q17272400,şarkı,şarkı,Q205049,world music,0.0,,,,
6841,Q17272400,şarkı,şarkı,Q1583807,art music,0.0,,,,
6842,Q1103669,țara călatei folk music and dance,țara călatei folk music and dance,Q205049,world music,0.0,,,,


In [3]:
# print sample of result dict

def print_sample_dict(dict_to_print):
    print('dict length:', len(dict_to_print))
    print('-----------------')
    counter = 0
    for key, value in dict_to_print.items():
        print(key)
        print(value)
        counter += 1
        if counter > 10:
            break

In [17]:
# create dict from spotify_df

spotify_dict = dict()

for idx, row in spotify_df.iterrows():
    spotify_dict[idx] = row['occurrences']
    
print_sample_dict(spotify_dict)

dict length: 3684
-----------------
21st century classical
6
5th wave emo
6
a cappella
13
aarhus indie
5
aberdeen indie
1
abstract
4
abstract beats
8
abstract hip hop
43
abstract idm
1
abstractro
5
accordeon
2


In [18]:
# create dict from wikidata_df
wikidata_dict = dict()

for idx, row in wikidata_df.iterrows():
    genre_id = row['genre_id']
    genre_name = row['genre_name']
    genre_name_edited = row['genre_name_edited']
    top_genre_id = row['top_genre_id']
    top_genre_name = row['top_genre_name']
    unique_top_genre = int(row['unique_top_genre'])
    everynoise_name = row['everynoise_name']
    discogs_style_name = row['discogs_style_name']
    discogs_genre_name = row['discogs_genre_name']
    
    if not pd.notna(everynoise_name):
        everynoise_name = ''
    if not pd.notna(discogs_style_name):
        discogs_style_name = ''
    
    try:
        wikidata_dict[genre_name]['top_genre'][top_genre_name] = {'top_genre_id': top_genre_id, 'unique_top_genre': unique_top_genre}
    except KeyError:    
        wikidata_dict[genre_name] = {
            'genre_id': genre_id,
            'genre_name_edited': genre_name_edited,
            'everynoise_name': everynoise_name,
            'discogs_style_name': discogs_style_name,
            'discogs_genre_name': discogs_genre_name,
            'top_genre': {
                top_genre_name: {
                    'top_genre_id': top_genre_id, 
                    'unique_top_genre': unique_top_genre,
                }}}   

print_sample_dict(wikidata_dict)

dict length: 4574
-----------------
2 tone
{'genre_id': 'Q209498', 'genre_name_edited': '2 tone', 'everynoise_name': '', 'discogs_style_name': '', 'discogs_genre_name': nan, 'top_genre': {'ska': {'top_genre_id': 'Q54365', 'unique_top_genre': 0}}}
2-step garage
{'genre_id': 'Q1751409', 'genre_name_edited': '2 step garage', 'everynoise_name': '', 'discogs_style_name': '', 'discogs_genre_name': nan, 'top_genre': {'dance music': {'top_genre_id': 'Q316930', 'unique_top_genre': 0}, 'electronic music': {'top_genre_id': 'Q9778', 'unique_top_genre': 0}}}
20th-century classical music
{'genre_id': 'Q1338153', 'genre_name_edited': '20th century classical', 'everynoise_name': '', 'discogs_style_name': '', 'discogs_genre_name': nan, 'top_genre': {'art music': {'top_genre_id': 'Q1583807', 'unique_top_genre': 0}}}
21st-century classical music
{'genre_id': 'Q4631020', 'genre_name_edited': '21st century classical', 'everynoise_name': '21st century classical', 'discogs_style_name': '', 'discogs_genre_nam

In [19]:
# match spotify genre with wikidata genre to find its top genre

match_counter = 0
spotify_genres_match_dict = dict()

for spotify_genre, occurrences in tqdm(spotify_dict.items()):
    spotify_genres_match_dict[spotify_genre] = {'occurrences': occurrences}
    matched = False
    
    for wiki_genre_name, wikidata_genre_data in wikidata_dict.items():
        wiki_genre_id = wikidata_genre_data['genre_id']
        wiki_genre_name_edited = wikidata_genre_data['genre_name_edited']
        wiki_everynoise_name = wikidata_genre_data['everynoise_name']
        wiki_discogs_style_name = wikidata_genre_data['discogs_style_name']
        wiki_discogs_genre_name = wikidata_genre_data['discogs_genre_name']
        wiki_top_genre_list = list(wikidata_genre_data['top_genre'].keys())
        
        # find if a wiki genre name exists within spotify genre name
        pattern_matched = False
        for name in [wiki_genre_name, wiki_genre_name_edited, wiki_everynoise_name, wiki_discogs_genre_name, wiki_discogs_style_name]:
            if pd.notna(name) and len(name) > 0:
                pattern = r'\b' + name + r'\b'
                result = re.search(pattern, spotify_genre)
                if result:
                    pattern_matched = True
        
        if spotify_genre in [wiki_genre_name, wiki_genre_name_edited, wiki_everynoise_name, wiki_discogs_genre_name, wiki_discogs_style_name] :
            #print(spotify_genre, '======', genre_name)
            spotify_genres_match_dict[spotify_genre] = {'occurrences': occurrences, wiki_genre_name: wiki_top_genre_list}
            matched = True
            break
        elif pattern_matched:
            #print(spotify_genre, '-', genre_name_edited)
            spotify_genres_match_dict[spotify_genre][wiki_genre_name] = wiki_top_genre_list
            matched = True
            
    if matched:
        match_counter += 1

print(match_counter)
print_sample_dict(spotify_genres_match_dict)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3684/3684 [16:24<00:00,  3.74it/s]

2972
dict length: 3684
-----------------
21st century classical
{'occurrences': 6, '21st-century classical music': ['art music']}
5th wave emo
{'occurrences': 6, 'emo': ['experimental music', 'rock music'], 'wave': ['hip hop music', 'world music', 'electronic music']}
a cappella
{'occurrences': 13, 'a cappella': ['vocal music']}
aarhus indie
{'occurrences': 5}
aberdeen indie
{'occurrences': 1}
abstract
{'occurrences': 4, 'absolute music': ['absolute music']}
abstract beats
{'occurrences': 8, 'absolute music': ['absolute music'], 'experimental electronic': ['electronic music']}
abstract hip hop
{'occurrences': 43, 'abstract hip hop': ['hip hop music', 'world music']}
abstract idm
{'occurrences': 1, 'absolute music': ['absolute music'], 'experimental electronic': ['electronic music'], 'intelligent dance music': ['electronic music']}
abstractro
{'occurrences': 5}
accordeon
{'occurrences': 2}





In [20]:
# get top genres for each spotify genre

def get_spotify_top_genres(spotify_genres_match_dict):
    spotify_top_genres_dict = dict()

    for spotify_genre, wiki_genre_data in spotify_genres_match_dict.items():
        spotify_top_genres_dict[spotify_genre] = set()
        for wiki_genre_name in wiki_genre_data.keys():
            if wiki_genre_name == 'occurrences':
                continue
            else:
                [spotify_top_genres_dict[spotify_genre].add(top_genre) for top_genre in wiki_genre_data[wiki_genre_name]]
    
    return spotify_top_genres_dict

In [21]:
# get list of spotify genres without a match

def get_no_match(spotify_genres_match_dict):
    no_match_dict = dict()

    for spotify_genre, genre_data in spotify_genres_match_dict.items():
        if len(genre_data) < 2:
            occurrences = genre_data['occurrences']
            no_match_dict[spotify_genre] = occurrences
    
    return no_match_dict

In [22]:
# for unmatched genre, get the top genres of related genres (genres associated with artists linked to unmatched genre)

def get_top_genre_with_related_genres(spotify_genres_match_dict):

    spotify_top_genres_dict = get_spotify_top_genres(spotify_genres_match_dict)
    no_match_dict = get_no_match(spotify_genres_match_dict)
    remaining_no_match_genres_dict = dict()

    # for each unmatched genre
    for no_match_genre, no_match_occurrences in tqdm(no_match_dict.items()):
        #print(no_match_genre)

        # get rows of spotify artists with current unmatched genre
        no_match_genre_df = genres_df[(genres_df == no_match_genre).any(axis=1)]

        # get all genres related to current unmatched genre (= genres associated to same artists linked to current unmatched genre)
        related_genres_dict = no_match_genre_df.stack().value_counts().to_dict()

        # remove current unmatched genre from dict of related genres
        related_genres_dict.pop(no_match_genre)

        # if there is at least one genre related to current unmatched genre
        if related_genres_dict:
            top_genres_dict = dict()

            # get the top genre for current unmatched genre (= most frequent top genre in related genres)
            for genre_name, occurrences in related_genres_dict.items():
                # get the list of top genres for current related genre
                top_genres_set = spotify_top_genres_dict[genre_name]

                # if there is at least one top genre for current related genre
                if top_genres_set:
                    # get the occurrences of that top genre
                    for top_genre in top_genres_set:
                        try:
                            top_genre_occurrences = top_genres_dict[top_genre]
                            top_genres_dict[top_genre] = top_genre_occurrences + occurrences
                        except KeyError:
                            top_genres_dict[top_genre] = occurrences

            # if at least one related genre has a top genre (= it is not in no_match list)
            if top_genres_dict:
                # sort dict of top genres by most occurrences (= most frequent top genres among related genres)
                top_genres_list = sorted(top_genres_dict.items(), key = lambda kv:(kv[1], kv[0]), reverse=True)

                # add most frequent top genre to a list
                most_frequent_top_genres_set = {top_genres_list[0][0]}

                # if the most frequent top genre is 'world music', add second top genre, if there is one ('world music' is very frequent and broad)
                if top_genres_list[0][0] == 'world music' and len(top_genres_list) > 1:
                    most_frequent_top_genres_set.add(top_genres_list[1][0])

                # if the most frequent top genre has same occurrence as others, add all top genres with same occurrence
                for top_genre in top_genres_list:
                    if top_genre[1] == top_genres_list[0][1]:
                        most_frequent_top_genres_set.add(top_genre[0])

                for most_frequent_genre in most_frequent_top_genres_set:
                    spotify_genres_match_dict[no_match_genre][most_frequent_genre] = [most_frequent_genre]

            # if no related genre has a top genre (= related genre is in no_match list), add current genre to unmatched genres list
            else:
                remaining_no_match_genres_dict[no_match_genre] = no_match_occurrences
        
        # if current genre is not related to any genre, add current genre to unmatched genres list
        else:
            remaining_no_match_genres_dict[no_match_genre] = no_match_occurrences

    print(len(remaining_no_match_genres_dict))
    print(len(get_spotify_top_genres(spotify_genres_match_dict)))
    
    return spotify_genres_match_dict, remaining_no_match_genres_dict

In [23]:
# open spotify artists data file and get all of their genres data

with open('data/data_spotify_artists.csv', encoding='utf-8') as artists_file:
    spotify_artists_df = pd.read_csv(artists_file, index_col=0)

# get dataframe of all genres for all spotify artists
columns_list = ['spotify_genre_' + str(x) for x in range(1, 20)]
genres_df = spotify_artists_df[spotify_artists_df.columns.intersection(columns_list)]

In [25]:
# for unmatched genre, get the top genres of related genres (genres associated with artists linked to unmatched genre)

spotify_genres_match_dict, remaining_no_match_genres_dict = get_top_genre_with_related_genres(spotify_genres_match_dict)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 105/105 [00:02<00:00, 46.07it/s]

93
3684





In [26]:
# open wikidata data files 

with open('data/data_wikidata_bigcities.csv', encoding='utf-8') as file_city:
    city_df = pd.read_csv(file_city)
    
with open('data/data_wikidata_demonyms.csv', encoding='utf-8') as file_demonym:
    demonym_df = pd.read_csv(file_demonym)

with open('data/data_wikidata_usstates.csv', encoding='utf-8') as file_us_state:
    us_state_df = pd.read_csv(file_us_state)
    
us_state_df

Unnamed: 0,state_id,state,abbrev
0,Q173,Alabama,AL
1,Q173,Alabama,Ala.
2,Q797,Alaska,AK
3,Q816,Arizona,AZ
4,Q816,Arizona,Ariz.
...,...,...,...
91,Q1371,West Virginia,W.Va.
92,Q1537,Wisconsin,WI
93,Q1537,Wisconsin,Wis.
94,Q1214,Wyoming,WY


In [27]:
# create a list of all demonyms

"""
SPARQL query for Wikidata items with demonym property:

SELECT (SAMPLE(?item) AS ?item) ?demonym WHERE {
  ?item wdt:P1549 ?demonym.
  FILTER((LANG(?demonym)) = "en")
}
GROUP BY ?demonym

"""

demonym_list = sorted(set(demonym_df['demonym']))
demonym_list = [demonym.lower() for demonym in demonym_list]
print(len(demonym_list))
demonym_list[:10]

796


['/b/tards',
 '/pol/acks',
 '/v/ermin',
 '/v/irgins',
 'abidjanese',
 'abkhaz',
 'abkhazian',
 'abu dhabian',
 'accran',
 'achaemenian']

In [28]:
# create a list of all big cities

"""
SPARQL query for Wikidata items with big city instance:

SELECT DISTINCT ?item ?itemLabel WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". }
  {
    SELECT DISTINCT ?item WHERE {
      ?item p:P31 ?statement0.
      ?statement0 (ps:P31/(wdt:P279*)) wd:Q1549591.
    }
  }
}
"""
city_list = sorted(set(city_df['city']))
city_list = [city.lower() for city in city_list]
print(len(city_list))
city_list[:10]

3548


["'s-hertogenbosch",
 '6th of october city',
 'a coruña',
 'aachen',
 'aalborg',
 'aarhus',
 'aba',
 'abadan',
 'abaetetuba',
 'abakan']

In [29]:
# create a list of all US States and their abbreviation

"""
SPARQL query for Wikidata items with US State instance:

SELECT DISTINCT ?item ?itemLabel WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". }
  {
    SELECT DISTINCT ?item WHERE {
      ?item p:P31 ?statement0.
      ?statement0 (ps:P31/(wdt:P279*)) wd:Q30.
    }
  }
}
"""
us_state_set = set(us_state_df['abbrev'])
us_state_set.update(us_state_df['state'])
us_state_list = sorted(us_state_set)
us_state_list = [state.lower() for state in us_state_list]
print(len(us_state_list))
us_state_list[:10]

145


['ak',
 'al',
 'ar',
 'az',
 'ala.',
 'alabama',
 'alaska',
 'ariz.',
 'arizona',
 'ark.']

In [31]:
# create dict with all words of the genres that are unmatched (example: 'bulgarian indie' -> 'bulgarian' and 'indie')

words_no_match_dict = dict()
city_set = set()
demonym_set = set()
state_set = set()
words_set = set()

for genre, occurences in remaining_no_match_genres_dict.items():
    # divide genre name in words
    words = re.findall(r'\b[\w&-]+\b', genre)
    
    for word in words:
        words_set.add(word)
        
        # if word is a city, reject it (doesn't help to determine the genre)
        if word in city_list :
            city_set.add(word)
            continue
         #if word is a demonym, reject it (doesn't help to determine the genre
        elif word in demonym_list:
            demonym_set.add(word)
            continue
        # if word is a US state, reject it (doesn't help to determine the genre
        elif word in us_state_list:
            state_set.add(word)
            continue
        else:
            # create dict with words unmatched to a top genre
            # counter: nbr of times the word appears in the list of genres
            # occurrences: nbr of times the word appears in all genres of every artist
            try:
                counter = words_no_match_dict[word]['counter']
                word_occurrences = words_no_match_dict[word]['occurrences']
                
                words_no_match_dict[word]['counter'] = counter + 1
                words_no_match_dict[word]['occurrences'] = word_occurrences + occurrences
                words_no_match_dict[word]['genres'].append(genre)
            except KeyError:
                words_no_match_dict[word] = {'counter': 1, 'occurrences': occurences}
                words_no_match_dict[word] = {'counter': 1, 'occurrences': occurences, 'genres': [genre]}
            
print('nbr words no match', len(words_no_match_dict))
print('nbr city words', len(city_set))
print('nbr demonym words', len(demonym_set))
print('nbr state words', len(state_set))
print('nbr total words:', len(words_set))
print_sample_dict(words_no_match_dict)

nbr words no match 114
nbr city words 5
nbr demonym words 9
nbr state words 3
nbr total words: 131
dict length: 114
-----------------
afrikaans
{'counter': 1, 'occurrences': 1, 'genres': ['afrikaans']}
backing
{'counter': 1, 'occurrences': 1, 'genres': ['backing track']}
track
{'counter': 1, 'occurrences': 1, 'genres': ['backing track']}
barnmusik
{'counter': 1, 'occurrences': 2, 'genres': ['barnmusik']}
belo
{'counter': 1, 'occurrences': 1, 'genres': ['belo horizonte indie']}
horizonte
{'counter': 1, 'occurrences': 1, 'genres': ['belo horizonte indie']}
indie
{'counter': 28, 'occurrences': 1135, 'genres': ['belo horizonte indie', 'bulgarian indie', 'cornwall indie', 'delaware indie', 'euskal indie', 'galway indie', 'hong kong indie', 'hungarian indie', 'indie boliviano', 'indie liguria', 'indie tico', 'indie triste', 'kansai indie', 'limerick indie', 'lincoln ne indie', 'milwaukee indie', 'minneapolis indie', 'normal indie', 'north dakota indie', 'pakistani indie', 'perth indie', 'por

In [32]:
# save words unmatched to a csv datafile for manual retrieval of top genre

words_no_match_df = pd.DataFrame.from_dict(words_no_match_dict, orient='index')
words_no_match_df.sort_values(by=['occurrences'], ascending=False, inplace=True)
words_no_match_df.to_csv('data/data_spotify_genres_no_match.csv', encoding='utf-8')
words_no_match_df.index.name = 'word'
words_no_match_df.reset_index(inplace=True)
words_no_match_df

Unnamed: 0,word,counter,occurrences,genres
0,indie,28,1135,"[belo horizonte indie, bulgarian indie, cornwa..."
1,musica,4,127,"[musica mato-grossense, musica para ninos, mus..."
2,viola,2,46,"[viola, viola da gamba]"
3,west,2,43,"[old west, west yorkshire indie]"
4,worship,2,43,"[chinese worship, swiss worship]"
...,...,...,...,...
109,galway,1,1,[galway indie]
110,quebecois,1,1,[folklore quebecois]
111,folklore,1,1,[folklore quebecois]
112,portugues,1,1,[folclore portugues]


In [33]:
# open data file with manual retrieval of top genre

with open('data/data_spotify_genres_no_match_manual.csv', encoding='utf-8') as file_nomatch:
    no_match_manual_df = pd.read_csv(file_nomatch)
    no_match_manual_df = no_match_manual_df.rename(columns={'Unnamed: 0': 'word'})
    
no_match_manual_df

Unnamed: 0,word,counter,occurrences,manual,genres
0,indie,28,28,independent music,"['belo horizonte indie', 'bulgarian indie', 'c..."
1,fake,1,5,functional music,['fake']
2,viola,2,5,instrumental music,"['viola', 'viola da gamba']"
3,tribute,1,4,functional music,['tribute']
4,musica,4,4,world music,"['musica mato-grossense', 'musica para ninos',..."
...,...,...,...,...,...
109,galway,1,1,,['galway indie']
110,quebecois,1,1,folk music,['folklore quebecois']
111,folklore,1,1,,['folklore quebecois']
112,portugues,1,1,,['folclore portugues']


In [34]:
# add the manual top genres to the main results dict

manual_genres_dict = dict()

for idx, row in no_match_manual_df.iterrows():
    top_genre = row['manual']
    genres_list = ast.literal_eval(row['genres'])
    
    if pd.notna(top_genre):
        for genre_name in genres_list:
            spotify_genres_match_dict[genre_name][top_genre] = [top_genre]

print('Nbr spotify genres:', len(spotify_genres_match_dict))
print('Nbr spotify genres unmatched:', len(get_no_match(spotify_genres_match_dict)))
#spotify_genres_match_dict

Nbr spotify genres: 3684
Nbr spotify genres unmatched: 0


In [35]:
# save results data to csv file

spotify_top_genres_dict = get_spotify_top_genres(spotify_genres_match_dict)
spotify_top_genres_df = pd.DataFrame.from_dict(spotify_top_genres_dict, orient='index')
#spotify_top_genres_df.to_csv('data/data_spotify_top_genres.csv', encoding='utf-8')
spotify_top_genres_df

Unnamed: 0,0,1,2,3,4,5
21st century classical,art music,,,,,
5th wave emo,hip hop music,experimental music,electronic music,world music,rock music,
a cappella,vocal music,,,,,
aarhus indie,rock music,,,,,
aberdeen indie,rock music,,,,,
...,...,...,...,...,...,...
zolo,rock music,,,,,
zouglou,zouglou,,,,,
zouk,world music,,,,,
zouk riddim,instrumental music,dance music,electronic music,world music,,


---
## Find manual top genre for unique top genre
---

In [82]:
with open('data/data_spotify_top_genres.csv', encoding='utf-8') as file_top:
    spotify_top_genres_df = pd.read_csv(file_top, index_col=0).fillna('')
    
spotify_top_genres_df

Unnamed: 0,0,1,2,3,4,5
21st century classical,art music,,,,,
5th wave emo,hip hop music,experimental music,electronic music,world music,rock music,
a cappella,vocal music,,,,,
aarhus indie,rock music,,,,,
aberdeen indie,rock music,,,,,
...,...,...,...,...,...,...
zolo,rock music,,,,,
zouglou,zouglou,,,,,
zouk,world music,,,,,
zouk riddim,instrumental music,dance music,electronic music,world music,,


In [83]:
unique_top_genre_df = wikidata_df.loc[wikidata_df['unique_top_genre'] == 1]
unique_top_genre_list = list(unique_top_genre_df['genre_name'])
unique_top_genre_list[:10]

['60s pop/rock',
 'abidat rma',
 'abozao',
 'absolute music',
 'abwe',
 'achewiq',
 "actor's song",
 'adhunik geet',
 'adult hits',
 'adult-oriented pop music']

In [84]:
spotify_unique_top_genre_set = set()
for genre, top_genres in spotify_top_genres_df.to_dict(orient='index').items():
    top_genres_set = set(top_genres.values())
    top_genres_set.discard('')
    
    for top_genre in top_genres_set:
        if top_genre in unique_top_genre_list:
            spotify_unique_top_genre_set.add(top_genre)
            
len(spotify_unique_top_genre_set)

128

In [85]:
spotify_unique_top_genre_df = pd.DataFrame(spotify_unique_top_genre_set)
spotify_unique_top_genre_df = spotify_unique_top_genre_df.rename(columns={0: 'unique_top_genre'}).sort_values(by=['unique_top_genre']).reset_index(drop=True)
spotify_unique_top_genre_df.to_csv('data/data_wikidata_unique_genres.csv', encoding='utf-8')
spotify_unique_top_genre_df

Unnamed: 0,unique_top_genre
0,absolute music
1,afro
2,afro-peruvian music
3,album-oriented rock
4,argentine punk
...,...
123,world fusion music
124,xhosa music
125,yacht rock
126,youth music


In [86]:
with open('data/data_wikidata_unique_manual.csv', encoding='utf-8') as file_manual:
    unique_manual_df = pd.read_csv(file_manual, index_col=1).drop(columns=['Unnamed: 0'])
    
unique_manual_df

Unnamed: 0_level_0,manual_top_genre
unique_top_genre,Unnamed: 1_level_1
absolute music,art music
afro,world music
afro-peruvian music,world music
album-oriented rock,rock music
argentine punk,rock music
...,...
world fusion music,world music
xhosa music,folk music
yacht rock,rock music
youth music,pop music


In [87]:
for idx, row in spotify_top_genres_df.iloc[:].iterrows():
    column_idx = 0
    for genre in row:
        if genre in list(unique_manual_df.index):
            manual_top_genre = unique_manual_df.loc[genre, 'manual_top_genre']
            if manual_top_genre not in row:
                spotify_top_genres_df.loc[idx, str(column_idx)] = manual_top_genre
            
        column_idx += 1
        
spotify_top_genres_df

Unnamed: 0,0,1,2,3,4,5
21st century classical,art music,,,,,
5th wave emo,hip hop music,experimental music,electronic music,world music,rock music,
a cappella,vocal music,,,,,
aarhus indie,rock music,,,,,
aberdeen indie,rock music,,,,,
...,...,...,...,...,...,...
zolo,rock music,,,,,
zouglou,dance music,,,,,
zouk,world music,,,,,
zouk riddim,instrumental music,dance music,electronic music,world music,,


In [88]:
spotify_top_genres_df.to_csv('data/data_spotify_top_genres_v2.csv', encoding='utf-8')