In [1]:
import pandas as pd
import re
from tqdm import tqdm

In [26]:
with open('data/data_wikidata_genres.csv', encoding='utf-8') as file_wikidata:
    wikidata_df = pd.read_csv(file_wikidata)
    
with open('data/data_everynoise_genres.csv', encoding='utf-8') as file_everynoise:
    everynoise_df = pd.read_csv(file_everynoise, index_col=0)
    
wikidata_df

Unnamed: 0,genre_id,genre_name,parent_genre_id,parent_genre_name,everynoise_id,discogs_genre_name,discogs_style_name
0,Q209498,2 tone,Q54365,ska,,,
1,Q1751409,2-step garage,Q1165777,UK garage,,,
2,Q1338153,20th-century classical music,Q1583807,art music,,,
3,Q4631020,21st-century classical music,Q1583807,art music,21stcenturyclassical,,
4,Q4637208,4-beat,Q663519,breakbeat,,,
...,...,...,...,...,...,...,...
6300,Q227045,zouk,Q107025062,French Caribbean music,zouk,,zouk
6301,Q11903204,Zouk chouv,,,,,
6302,Q8074636,zouk-love,Q227045,zouk,,,
6303,Q17111801,Zulu music,Q4794612,music of South Africa,,,


In [126]:
# get the parent genre of a given genre
# this is a recursive function, ultimately return the top (most parent) genre

def get_parent_genre(data_df, row, top_genre_dict):
    genre_name = row['genre_name']
    parent_genre_id = row['parent_genre_id']
    parent_genre_name = row['parent_genre_name']
    
    # if current genre has "popular music" as a parent genre
    # popular music is too broad (jazz, pop, rock, etc are all "popular music")
    if parent_genre_id == 'Q373342':  # wikidata id of "popular music"
        top_genre = row
        top_genre_dict[genre_name] = top_genre
        # return current genre as the top genre
        return top_genre_dict
    
    # check if current genre has a parent genre
    try:
        parent_genre_row = wikidata_df.loc[wikidata_df['genre_id'] == parent_genre_id]
    # if parent is not a music genre/ if parent does not have a parent genre
    except KeyError:
        top_genre = row
        top_genre_dict[genre_name] = top_genre
        # return current genre as the top genre
        return top_genre_dict
    
    # number of parent genres for current genre
    nbr_parent_genres = len(parent_genre_row.index)
    
    # if current genre has more than one parent genre (ex: edm has electronic and dance as parent genres)
    if nbr_parent_genres > 1:
        # get the parent genre of each parent genre
        for i in range(nbr_parent_genres):
            parent_genre_row_i = parent_genre_row.iloc[i]
            top_genre_dict = get_parent_genre(wikidata_df, parent_genre_row_i, top_genre_dict)
    # if current genre has only one parent genre (ex: electonica only has electronic parent genre)
    elif nbr_parent_genres == 1:
        # get the parent genre of the parent genre
        top_genre_dict = get_parent_genre(wikidata_df, parent_genre_row.iloc[0], top_genre_dict)
    # if current genre does not have a parent genre
    else:
        # set the current genre as the top genre
        top_genre = row
        top_genre_dict[genre_name] = top_genre
        
    return top_genre_dict

In [28]:
wikidata_top_df = wikidata_df.copy()

for idx, row in tqdm(wikidata_top_df.iterrows()):
    top_genre_dict = dict()
    genre_name = row['genre_name']
    
    # get the top (most parent) genre(s) for the current genre
    top_genre_dict = get_parent_genre(wikidata_df, row, top_genre_dict)
    
    # iterate on all top genres
    list_idx = 0
    for key, top_genre in top_genre_dict.items():
        # get wikidata id and name of top genre
        top_genre_id = top_genre['genre_id']
        top_genre_name = top_genre['genre_name']
        
        # for the first top genre, add top_genre columns to the current row
        if list_idx == 0:
            wikidata_top_df.loc[idx, 'top_genre_id'] = top_genre_id 
            wikidata_top_df.loc[idx, 'top_genre_name'] = top_genre_name
            
        # if there is more than one top genre, add a row at end of df with top_genre columns
        else:
            row['top_genre_id'] = top_genre_id
            row['top_genre_name'] = top_genre_name
            wikidata_top_df = wikidata_top_df.append(row, ignore_index=True)
                 
        list_idx += 1
        
wikidata_top_df

6305it [00:50, 124.70it/s]


Unnamed: 0,genre_id,genre_name,parent_genre_id,parent_genre_name,everynoise_id,discogs_genre_name,discogs_style_name,top_genre_id,top_genre_name
0,Q209498,2 tone,Q54365,ska,,,,Q54365,ska
1,Q1751409,2-step garage,Q1165777,UK garage,,,,Q9778,electronic music
2,Q1338153,20th-century classical music,Q1583807,art music,,,,Q1583807,art music
3,Q4631020,21st-century classical music,Q1583807,art music,21stcenturyclassical,,,Q1583807,art music
4,Q4637208,4-beat,Q663519,breakbeat,,,,Q9778,electronic music
...,...,...,...,...,...,...,...,...,...
8591,Q8072085,Zimbabwean hip hop,Q11401,hip hop music,zimhiphop,,,Q205049,world music
8592,Q106654137,zinli,Q98528185,African folk music,,,,Q205049,world music
8593,Q965834,Znamenny chant,Q23072435,chant,,,,Q26897135,functional music
8594,Q245296,Zydeco,Q1026089,Cajun music,zydeco,,zydeco,Q1541229,American folk music


In [29]:
# drop parent_genre columns from df
wikidata_top_df = wikidata_top_df.drop(['parent_genre_id', 'parent_genre_name'], axis=1)

# set genre names to lower case
wikidata_top_df['genre_name'] = wikidata_top_df['genre_name'].str.lower()
wikidata_top_df['top_genre_name'] = wikidata_top_df['top_genre_name'].str.lower()

# sort df by values in genre_name column
wikidata_top_df = wikidata_top_df.sort_values(by=['genre_name'])

# reset index following sorting
wikidata_top_df = wikidata_top_df.reset_index(drop=True)
wikidata_top_df

Unnamed: 0,genre_id,genre_name,everynoise_id,discogs_genre_name,discogs_style_name,top_genre_id,top_genre_name
0,Q209498,2 tone,,,,Q54365,ska
1,Q1751409,2-step garage,,,,Q316930,dance music
2,Q1751409,2-step garage,,,,Q9778,electronic music
3,Q1338153,20th-century classical music,,,,Q1583807,art music
4,Q4631020,21st-century classical music,21stcenturyclassical,,,Q1583807,art music
...,...,...,...,...,...,...,...
8591,Q17272400,şarkı,,,,Q1583807,art music
8592,Q1103669,țara călatei folk music and dance,,,,Q205049,world music
8593,Q1103669,țara călatei folk music and dance,,,,Q1103669,țara călatei folk music and dance
8594,Q1103669,țara călatei folk music and dance,,,,Q205049,world music


In [112]:
wikidata_top_df = backup_wikidata_df.copy()
wikidata_top_df

Unnamed: 0,genre_id,genre_name,everynoise_id,discogs_genre_name,discogs_style_name,top_genre_id,top_genre_name
0,Q209498,2 tone,,,,Q54365,ska
1,Q1751409,2-step garage,,,,Q316930,dance music
2,Q1751409,2-step garage,,,,Q9778,electronic music
3,Q1338153,20th-century classical music,,,,Q1583807,art music
4,Q4631020,21st-century classical music,21stcenturyclassical,,,Q1583807,art music
...,...,...,...,...,...,...,...
8591,Q17272400,şarkı,,,,Q1583807,art music
8592,Q1103669,țara călatei folk music and dance,,,,Q205049,world music
8593,Q1103669,țara călatei folk music and dance,,,,Q1103669,țara călatei folk music and dance
8594,Q1103669,țara călatei folk music and dance,,,,Q205049,world music


In [113]:
# delete duplicate rows (same genre and same top genre)

for idx, row in tqdm(wikidata_top_df.iloc[:].iterrows()):
    genre_name = row['genre_name']
    top_genre_name = row['top_genre_name']
    
    # get df with rows that have the same genre
    same_genre_df = wikidata_top_df.loc[wikidata_top_df['genre_name'] == genre_name]
    
    # from df with same genre, get df with 
    same_top_genre_df = same_genre_df.loc[same_genre_df['top_genre_name'] == top_genre_name]
    
    if len(same_top_genre_df) > 1:
        wikidata_top_df = wikidata_top_df.drop([idx])

wikidata_top_df = wikidata_top_df.reset_index(drop=True)
wikidata_top_df

8596it [00:15, 564.66it/s]


Unnamed: 0,genre_id,genre_name,everynoise_id,discogs_genre_name,discogs_style_name,top_genre_id,top_genre_name
0,Q209498,2 tone,,,,Q54365,ska
1,Q1751409,2-step garage,,,,Q316930,dance music
2,Q1751409,2-step garage,,,,Q9778,electronic music
3,Q1338153,20th-century classical music,,,,Q1583807,art music
4,Q4631020,21st-century classical music,21stcenturyclassical,,,Q1583807,art music
...,...,...,...,...,...,...,...
7525,Q17272400,şarkı,,,,Q205049,world music
7526,Q17272400,şarkı,,,,Q1583807,art music
7527,Q1103669,țara călatei folk music and dance,,,,Q1103669,țara călatei folk music and dance
7528,Q1103669,țara călatei folk music and dance,,,,Q205049,world music


In [94]:
'folk' in wikidata_top_df['top_genre_name'].values

False

In [114]:
# delete rows of false top genre (genre with multiple top genres including itself)
# example: rock and roll has top genre 'rock and roll' and 'rock music' -> delete row that has 'rock and roll' as the top genre (not a top genre since there is one parent genre)

false_top_genres_set = set()
genres_to_preserve_list = ['hip hop music', 'country music', 'opera', 'soul music', 'funk']

for idx, row in tqdm(wikidata_top_df.iloc[:].iterrows()):
    genre_name = row['genre_name']
    top_genre_name = row['top_genre_name']
    genre_in_df = wikidata_top_df.loc[wikidata_top_df['genre_name'] == genre_name]
    
    if genre_name == top_genre_name and len(genre_in_df) > 1 and genre_name not in genres_to_preserve_list:
        false_top_genres_set.add(genre_name)
        wikidata_top_df = wikidata_top_df.drop(idx)

wikidata_top_df = wikidata_top_df.reset_index(drop=True)
print(len(false_top_genres_set))
wikidata_top_df

7530it [00:08, 850.52it/s]

309





Unnamed: 0,genre_id,genre_name,everynoise_id,discogs_genre_name,discogs_style_name,top_genre_id,top_genre_name
0,Q209498,2 tone,,,,Q54365,ska
1,Q1751409,2-step garage,,,,Q316930,dance music
2,Q1751409,2-step garage,,,,Q9778,electronic music
3,Q1338153,20th-century classical music,,,,Q1583807,art music
4,Q4631020,21st-century classical music,21stcenturyclassical,,,Q1583807,art music
...,...,...,...,...,...,...,...
7216,Q2748579,čoček,,,,Q2748579,čoček
7217,Q17272400,şarkı,,,,Q205049,world music
7218,Q17272400,şarkı,,,,Q1583807,art music
7219,Q1103669,țara călatei folk music and dance,,,,Q205049,world music


In [115]:
# delete rows that have false top genre as one of their top genres (not deleting genre with a single top genre)

for idx, row in tqdm(wikidata_top_df.iterrows()):
    genre_name = row['genre_name']
    top_genre_name = row['top_genre_name']
    
    if top_genre_name in false_top_genres_set:
        wikidata_top_df = wikidata_top_df.drop(idx)
        
    if genre_name not in wikidata_top_df['genre_name'].values:
        print(genre_name)
        
wikidata_top_df = wikidata_top_df.reset_index(drop=True)
wikidata_top_df

7221it [00:02, 3251.20it/s]


Unnamed: 0,genre_id,genre_name,everynoise_id,discogs_genre_name,discogs_style_name,top_genre_id,top_genre_name
0,Q209498,2 tone,,,,Q54365,ska
1,Q1751409,2-step garage,,,,Q316930,dance music
2,Q1751409,2-step garage,,,,Q9778,electronic music
3,Q1338153,20th-century classical music,,,,Q1583807,art music
4,Q4631020,21st-century classical music,21stcenturyclassical,,,Q1583807,art music
...,...,...,...,...,...,...,...
6839,Q2748579,čoček,,,,Q2748579,čoček
6840,Q17272400,şarkı,,,,Q205049,world music
6841,Q17272400,şarkı,,,,Q1583807,art music
6842,Q1103669,țara călatei folk music and dance,,,,Q205049,world music


In [116]:
# get the list of top genres that only appear once (unique top genres)
unique_top_genres_list = list()
top_genres_list = list()

# get the occurences of top genres
top_genres_dict = wikidata_top_df['top_genre_name'].value_counts().to_dict()

for genre, count in top_genres_dict.items():
    if count == 1:
        unique_top_genres_list.append(genre)
    else:
        top_genres_list.append(genre)

print(len(unique_top_genres_list))
print(len(top_genres_list))
unique_top_genres_list[:10]

1323
80


['paschen',
 'pasodoble',
 'paso huanquilla',
 'passacaglia',
 'party-hop',
 'nu-funk',
 'percussive pagodas',
 'pastorella',
 'patacoré',
 'patriotic music']

In [117]:
# add a bool column to df: false if top genre is not unique, else otherwise

for idx, row in wikidata_top_df.iterrows():
    top_genre_name = row['top_genre_name']
    if top_genre_name in unique_top_genres_list:
        wikidata_top_df.loc[idx, 'unique_top_genre'] = 1
    else:
        wikidata_top_df.loc[idx, 'unique_top_genre'] = 0
        
wikidata_top_df

Unnamed: 0,genre_id,genre_name,everynoise_id,discogs_genre_name,discogs_style_name,top_genre_id,top_genre_name,unique_top_genre
0,Q209498,2 tone,,,,Q54365,ska,0.0
1,Q1751409,2-step garage,,,,Q316930,dance music,0.0
2,Q1751409,2-step garage,,,,Q9778,electronic music,0.0
3,Q1338153,20th-century classical music,,,,Q1583807,art music,0.0
4,Q4631020,21st-century classical music,21stcenturyclassical,,,Q1583807,art music,0.0
...,...,...,...,...,...,...,...,...
6839,Q2748579,čoček,,,,Q2748579,čoček,1.0
6840,Q17272400,şarkı,,,,Q205049,world music,0.0
6841,Q17272400,şarkı,,,,Q1583807,art music,0.0
6842,Q1103669,țara călatei folk music and dance,,,,Q205049,world music,0.0


In [118]:
# edit discogs genre and style name

for idx, row in wikidata_top_df.iterrows():
    discogs_genre_name = row['discogs_genre_name']
    discogs_style_name = row['discogs_style_name']
    
    if pd.notna(discogs_genre_name):
        discogs_genre_name = re.sub(r'\+', ' ', discogs_genre_name)
        wikidata_top_df.loc[idx, 'discogs_genre_name'] = discogs_genre_name
        
    if pd.notna(discogs_style_name):
        discogs_style_name = re.sub(r'\+', ' ', discogs_style_name)
        discogs_style_name = re.sub(r'&', 'and', discogs_style_name)
        discogs_style_name = re.sub(r'\.', ' ', discogs_style_name)
        wikidata_top_df.loc[idx, 'discogs_style_name'] = discogs_style_name
        
wikidata_top_df

Unnamed: 0,genre_id,genre_name,everynoise_id,discogs_genre_name,discogs_style_name,top_genre_id,top_genre_name,unique_top_genre
0,Q209498,2 tone,,,,Q54365,ska,0.0
1,Q1751409,2-step garage,,,,Q316930,dance music,0.0
2,Q1751409,2-step garage,,,,Q9778,electronic music,0.0
3,Q1338153,20th-century classical music,,,,Q1583807,art music,0.0
4,Q4631020,21st-century classical music,21stcenturyclassical,,,Q1583807,art music,0.0
...,...,...,...,...,...,...,...,...
6839,Q2748579,čoček,,,,Q2748579,čoček,1.0
6840,Q17272400,şarkı,,,,Q205049,world music,0.0
6841,Q17272400,şarkı,,,,Q1583807,art music,0.0
6842,Q1103669,țara călatei folk music and dance,,,,Q205049,world music,0.0


In [119]:
# create dict from everynoise_df

everynoise_dict = dict()

for key, value in everynoise_df.to_dict(orient='index').items():
    everynoise_id = value['everynoise_id']
    everynoise_name = value['everynoise_name']
    everynoise_dict[everynoise_id] = everynoise_name
    
len(everynoise_dict)

5602

In [120]:
# match wikidata everynoise_id with data from everynoise_df

for idx, row in wikidata_top_df.iterrows():
    wiki_everynoise_id = row['everynoise_id']
    if pd.notna(wiki_everynoise_id):
        for everynoise_id, everynoise_name in everynoise_dict.items():
            if wiki_everynoise_id == everynoise_id:
                wikidata_top_df.loc[idx, 'everynoise_name'] = everynoise_name

wikidata_top_df

Unnamed: 0,genre_id,genre_name,everynoise_id,discogs_genre_name,discogs_style_name,top_genre_id,top_genre_name,unique_top_genre,everynoise_name
0,Q209498,2 tone,,,,Q54365,ska,0.0,
1,Q1751409,2-step garage,,,,Q316930,dance music,0.0,
2,Q1751409,2-step garage,,,,Q9778,electronic music,0.0,
3,Q1338153,20th-century classical music,,,,Q1583807,art music,0.0,
4,Q4631020,21st-century classical music,21stcenturyclassical,,,Q1583807,art music,0.0,21st century classical
...,...,...,...,...,...,...,...,...,...
6839,Q2748579,čoček,,,,Q2748579,čoček,1.0,
6840,Q17272400,şarkı,,,,Q205049,world music,0.0,
6841,Q17272400,şarkı,,,,Q1583807,art music,0.0,
6842,Q1103669,țara călatei folk music and dance,,,,Q205049,world music,0.0,


In [122]:
# edit genre_name to remove "music" and replace '-' by a space when present

for idx, row in wikidata_top_df.iterrows():
    genre_name = row['genre_name']
    genre_name_edited = genre_name
    
    if 'music' in genre_name and 'musical' not in genre_name:
        genre_name_edited = re.sub(r'\smusic$', '', genre_name)  # remove 'music' at end of genre name
    
    genre_name_edited = re.sub(r'-', ' ', genre_name_edited)  # replace '-' by a space
    wikidata_top_df.loc[idx, 'genre_name_edited'] = genre_name_edited
        
wikidata_top_df

Unnamed: 0,genre_id,genre_name,everynoise_id,discogs_genre_name,discogs_style_name,top_genre_id,top_genre_name,unique_top_genre,everynoise_name,genre_name_edited
0,Q209498,2 tone,,,,Q54365,ska,0.0,,2 tone
1,Q1751409,2-step garage,,,,Q316930,dance music,0.0,,2 step garage
2,Q1751409,2-step garage,,,,Q9778,electronic music,0.0,,2 step garage
3,Q1338153,20th-century classical music,,,,Q1583807,art music,0.0,,20th century classical
4,Q4631020,21st-century classical music,21stcenturyclassical,,,Q1583807,art music,0.0,21st century classical,21st century classical
...,...,...,...,...,...,...,...,...,...,...
6839,Q2748579,čoček,,,,Q2748579,čoček,1.0,,čoček
6840,Q17272400,şarkı,,,,Q205049,world music,0.0,,şarkı
6841,Q17272400,şarkı,,,,Q1583807,art music,0.0,,şarkı
6842,Q1103669,țara călatei folk music and dance,,,,Q205049,world music,0.0,,țara călatei folk music and dance


In [123]:
# rearrange the column order

columns = ['genre_id', 'genre_name', 'genre_name_edited', 'top_genre_id', 'top_genre_name', 'unique_top_genre', 'everynoise_id', 'everynoise_name', 'discogs_genre_name', 'discogs_style_name']
wikidata_top_df = wikidata_top_df[columns]
wikidata_top_df

Unnamed: 0,genre_id,genre_name,genre_name_edited,top_genre_id,top_genre_name,unique_top_genre,everynoise_id,everynoise_name,discogs_genre_name,discogs_style_name
0,Q209498,2 tone,2 tone,Q54365,ska,0.0,,,,
1,Q1751409,2-step garage,2 step garage,Q316930,dance music,0.0,,,,
2,Q1751409,2-step garage,2 step garage,Q9778,electronic music,0.0,,,,
3,Q1338153,20th-century classical music,20th century classical,Q1583807,art music,0.0,,,,
4,Q4631020,21st-century classical music,21st century classical,Q1583807,art music,0.0,21stcenturyclassical,21st century classical,,
...,...,...,...,...,...,...,...,...,...,...
6839,Q2748579,čoček,čoček,Q2748579,čoček,1.0,,,,
6840,Q17272400,şarkı,şarkı,Q205049,world music,0.0,,,,
6841,Q17272400,şarkı,şarkı,Q1583807,art music,0.0,,,,
6842,Q1103669,țara călatei folk music and dance,țara călatei folk music and dance,Q205049,world music,0.0,,,,


In [124]:
# nbr of values for each column

wikidata_top_df.count()

genre_id              6844
genre_name            6844
genre_name_edited     6844
top_genre_id          6844
top_genre_name        6844
unique_top_genre      6844
everynoise_id         2367
everynoise_name       2367
discogs_genre_name      22
discogs_style_name     904
dtype: int64

In [125]:
wikidata_top_df.to_csv('data/data_wikidata_top_genres.csv', encoding='utf-8')

In [410]:
"""
SPARQL query to fetch data from wikidata graph

SELECT ?genre ?genreLabel ?parent_genre ?parent_genreLabel ?everynoise_id ?discogs_genre_id ?discogs_style_id WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language ",en". }
  ?genre wdt:P31 wd:Q188451.
  OPTIONAL { ?genre wdt:P279 ?parent_genre. }
  OPTIONAL { ?genre wdt:P9881 ?everynoise_id. }
  OPTIONAL { ?genre wdt:P9218 ?discogs_genre_id. }
  OPTIONAL { ?genre wdt:P9219 ?discogs_style_id. }
}
"""

'\nSPARQL query to fetch data from wikidata graph\n\nSELECT ?genre ?genreLabel ?parent_genre ?parent_genreLabel ?everynoise_id ?discogs_genre_id ?discogs_style_id WHERE {\n  SERVICE wikibase:label { bd:serviceParam wikibase:language ",en". }\n  ?genre wdt:P31 wd:Q188451.\n  OPTIONAL { ?genre wdt:P279 ?parent_genre. }\n  OPTIONAL { ?genre wdt:P9881 ?everynoise_id. }\n  OPTIONAL { ?genre wdt:P9218 ?discogs_genre_id. }\n  OPTIONAL { ?genre wdt:P9219 ?discogs_style_id. }\n}\n'