In [1]:
import requests
import time
import pickle
from bs4 import BeautifulSoup
import re
import json
import pandas as pd
import os
from tqdm import tqdm

In [2]:
# open data files

with open('data/data_discogs_styles.csv', encoding='utf-8') as file_discogs:
    discogs_df = pd.read_csv(file_discogs, index_col=0)
    discogs_df.dropna(subset=['genre_url'], inplace=True)

with open('data/data_spotify_genres.csv', encoding='utf-8') as file_spotify:
    spotify_df = pd.read_csv(file_spotify, index_col=0)
    
with open('data/data_wikidata_top_genres.csv', encoding='utf-8') as file_wikidata:
    wikidata_df = pd.read_csv(file_wikidata, index_col=0)
    
wikidata_df

Unnamed: 0,genre_id,genre_name,genre_name_edited,top_genre_id,top_genre_name,unique_top_genre,everynoise_id,everynoise_name,discogs_genre_name,discogs_style_name
0,Q209498,2 tone,2 tone,Q54365,ska,0.0,,,,
1,Q1751409,2-step garage,2 step garage,Q316930,dance music,0.0,,,,
2,Q1751409,2-step garage,2 step garage,Q9778,electronic music,0.0,,,,
3,Q1338153,20th-century classical music,20th century classical,Q1583807,art music,0.0,,,,
4,Q4631020,21st-century classical music,21st century classical,Q1583807,art music,0.0,21stcenturyclassical,21st century classical,,
...,...,...,...,...,...,...,...,...,...,...
6839,Q2748579,čoček,čoček,Q2748579,čoček,1.0,,,,
6840,Q17272400,şarkı,şarkı,Q205049,world music,0.0,,,,
6841,Q17272400,şarkı,şarkı,Q1583807,art music,0.0,,,,
6842,Q1103669,țara călatei folk music and dance,țara călatei folk music and dance,Q205049,world music,0.0,,,,


In [3]:
# create dict from spotify_df

spotify_dict = dict()

for idx, row in spotify_df.iterrows():
    spotify_dict[idx] = row['occurrences']
    
print(len(spotify_dict))
spotify_dict

3684


{'21st century classical': 6,
 '5th wave emo': 6,
 'a cappella': 13,
 'aarhus indie': 5,
 'aberdeen indie': 1,
 'abstract': 4,
 'abstract beats': 8,
 'abstract hip hop': 43,
 'abstract idm': 1,
 'abstractro': 5,
 'accordeon': 2,
 'accordion': 7,
 'acid house': 19,
 'acid idm': 5,
 'acid jazz': 3,
 'acid rock': 4,
 'acid techno': 11,
 'acid trance': 3,
 'acousmatic': 3,
 'acoustic blues': 5,
 'acoustic cover': 3,
 'acoustic pop': 37,
 'acoustic punk': 12,
 'acoustic rock': 13,
 'action rock': 19,
 'adoracao': 1,
 'adult standards': 48,
 'aesthetic rap': 4,
 'afghan rap': 1,
 'african electronic': 6,
 'african experimental': 3,
 'african metal': 1,
 'african percussion': 3,
 'african reggae': 5,
 'african rock': 6,
 'african-american classical': 2,
 'afrikaans': 1,
 'afro dancehall': 11,
 'afro house': 14,
 'afro psych': 1,
 'afro r&b': 1,
 'afro soul': 4,
 'afro-cuban percussion': 3,
 'afro-funk': 5,
 'afrobeat': 32,
 'afrobeat brasileiro': 1,
 'afrobeat fusion': 4,
 'afrofuturism': 25,

In [4]:
# create dict from wikidata_df
wikidata_dict = dict()

for idx, row in wikidata_df.iterrows():
    genre_id = row['genre_id']
    genre_name = row['genre_name']
    genre_name_edited = row['genre_name_edited']
    top_genre_id = row['top_genre_id']
    top_genre_name = row['top_genre_name']
    unique_top_genre = int(row['unique_top_genre'])
    everynoise_name = row['everynoise_name']
    discogs_style_name = row['discogs_style_name']
    discogs_genre_name = row['discogs_genre_name']
    
    if not pd.notna(everynoise_name):
        everynoise_name = ''
    if not pd.notna(discogs_style_name):
        discogs_style_name = ''
    
    try:
        wikidata_dict[genre_name]['top_genre'][top_genre_name] = {'top_genre_id': top_genre_id, 'unique_top_genre': unique_top_genre}
    except KeyError:    
        wikidata_dict[genre_name] = {
            'genre_id': genre_id,
            'genre_name_edited': genre_name_edited,
            'everynoise_name': everynoise_name,
            'discogs_style_name': discogs_style_name,
            'discogs_genre_name': discogs_genre_name,
            'top_genre': {
                top_genre_name: {
                    'top_genre_id': top_genre_id, 
                    'unique_top_genre': unique_top_genre,
                }}}   

print(len(wikidata_dict))
wikidata_dict

4574


{'2 tone': {'genre_id': 'Q209498',
  'genre_name_edited': '2 tone',
  'everynoise_name': '',
  'discogs_style_name': '',
  'discogs_genre_name': nan,
  'top_genre': {'ska': {'top_genre_id': 'Q54365', 'unique_top_genre': 0}}},
 '2-step garage': {'genre_id': 'Q1751409',
  'genre_name_edited': '2 step garage',
  'everynoise_name': '',
  'discogs_style_name': '',
  'discogs_genre_name': nan,
  'top_genre': {'dance music': {'top_genre_id': 'Q316930',
    'unique_top_genre': 0},
   'electronic music': {'top_genre_id': 'Q9778', 'unique_top_genre': 0}}},
 '20th-century classical music': {'genre_id': 'Q1338153',
  'genre_name_edited': '20th century classical',
  'everynoise_name': '',
  'discogs_style_name': '',
  'discogs_genre_name': nan,
  'top_genre': {'art music': {'top_genre_id': 'Q1583807',
    'unique_top_genre': 0}}},
 '21st-century classical music': {'genre_id': 'Q4631020',
  'genre_name_edited': '21st century classical',
  'everynoise_name': '21st century classical',
  'discogs_style

In [43]:
# print sample of result dict

def print_sample_dict(spotify_genres_match_dict):
    print(len(spotify_genres_match_dict))
    counter = 0
    for spotify_genre, wikidata_data in spotify_genres_match_dict.items():
        print(spotify_genre)
        print(wikidata_data)
        print('-------------')
        counter += 1
        if counter > 10:
            break

In [6]:
match_counter = 0
spotify_genres_match_dict = dict()

for spotify_genre, occurrences in tqdm(spotify_dict.items()):
    spotify_genres_match_dict[spotify_genre] = {'occurrences': occurrences}
    matched = False
    
    for wiki_genre_name, wikidata_genre_data in wikidata_dict.items():
        wiki_genre_id = wikidata_genre_data['genre_id']
        wiki_genre_name_edited = wikidata_genre_data['genre_name_edited']
        wiki_everynoise_name = wikidata_genre_data['everynoise_name']
        wiki_discogs_style_name = wikidata_genre_data['discogs_style_name']
        wiki_discogs_genre_name = wikidata_genre_data['discogs_genre_name']
        wiki_top_genre_list = list(wikidata_genre_data['top_genre'].keys())
        
        # find if a wiki genre name exists within spotify genre name
        pattern_matched = False
        for name in [wiki_genre_name, wiki_genre_name_edited, wiki_everynoise_name, wiki_discogs_genre_name, wiki_discogs_style_name]:
            if pd.notna(name) and len(name) > 0:
                pattern = r'\b' + name + r'\b'
                result = re.search(pattern, spotify_genre)
                if result:
                    pattern_matched = True
        
        if spotify_genre in [wiki_genre_name, wiki_genre_name_edited, wiki_everynoise_name, wiki_discogs_genre_name, wiki_discogs_style_name] :
            #print(spotify_genre, '======', genre_name)
            spotify_genres_match_dict[spotify_genre] = {'occurrences': occurrences, wiki_genre_name: wiki_top_genre_list}
            matched = True
            break
        elif pattern_matched:
            #print(spotify_genre, '-', genre_name_edited)
            spotify_genres_match_dict[spotify_genre][wiki_genre_name] = wiki_top_genre_list
            matched = True
            
    if matched:
        match_counter += 1

print(match_counter)
print_sample_dict(spotify_genres_match_dict)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3684/3684 [20:11<00:00,  3.04it/s]


2972

In [48]:
# get top genres for each spotify genre

def get_spotify_top_genres(spotify_genres_match_dict):
    spotify_top_genres_dict = dict()

    for spotify_genre, wiki_genre_data in spotify_genres_match_dict.items():
        spotify_top_genres_dict[spotify_genre] = set()
        for wiki_genre_name in wiki_genre_data.keys():
            if wiki_genre_name == 'occurrences':
                continue
            else:
                [spotify_top_genres_dict[spotify_genre].add(top_genre) for top_genre in wiki_genre_data[wiki_genre_name]]
    
    return spotify_top_genres_dict

In [44]:
# get list of spotify genres without a match

def get_no_match(spotify_genres_match_dict):
    no_match_dict = dict()

    for spotify_genre, genre_data in spotify_genres_match_dict.items():
        if len(genre_data) < 2:
            occurrences = genre_data['occurrences']
            no_match_dict[spotify_genre] = occurrences
    
    return no_match_dict

In [39]:
with open('data/data_spotify_artists.csv', encoding='utf-8') as artists_file:
    spotify_artists_df = pd.read_csv(artists_file, index_col=0)

# get dataframe of all genres for all spotify artists
columns_list = ['spotify_genre_' + str(x) for x in range(1, 20)]
genres_df = spotify_artists_df[spotify_artists_df.columns.intersection(columns_list)]

In [49]:
# for unmatched genre, get the top genres of related genres (genres associated with artists linked to unmatched genre)

spotify_top_genres_dict = get_spotify_top_genres(spotify_genres_match_dict)
no_match_dict = get_no_match(spotify_genres_match_dict)
remaining_no_match_genres_dict = dict()

# for each unmatched genre
for no_match_genre, no_match_occurrences in tqdm(no_match_dict.items()):
    #print(no_match_genre)
    
    # get rows of spotify artists with current unmatched genre
    no_match_genre_df = genres_df[(genres_df == no_match_genre).any(axis=1)]
    
    # get all genres related to current unmatched genre (= genres associated to same artists linked to current unmatched genre)
    related_genres_dict = no_match_genre_df.stack().value_counts().to_dict()
    
    # remove current unmatched genre from dict of related genres
    related_genres_dict.pop(no_match_genre)
    
    # if there is at least one genre related to current unmatched genre
    if related_genres_dict:
        top_genres_dict = dict()
        
        # get the top genre for current unmatched genre (= most frequent top genre in related genres)
        for genre_name, occurrences in related_genres_dict.items():
            # get the list of top genres for current related genre
            top_genres_set = spotify_top_genres_dict[genre_name]
            
            # if there is at least one top genre for current related genre
            if top_genres_set:
                # get the occurrences of that top genre
                for top_genre in top_genres_set:
                    try:
                        top_genre_occurrences = top_genres_dict[top_genre]
                        top_genres_dict[top_genre] = top_genre_occurrences + occurrences
                    except KeyError:
                        top_genres_dict[top_genre] = occurrences
                
        # if at least one related genre has a top genre (= it is not in no_match list)
        if top_genres_dict:
            # sort dict of top genres by most occurrences (= most frequent top genres among related genres)
            top_genres_list = sorted(top_genres_dict.items(), key = lambda kv:(kv[1], kv[0]), reverse=True)
            
            # add most frequent top genre to a list
            most_frequent_top_genres_set = {top_genres_list[0][0]}
            
            # if the most frequent top genre is 'world music', add second top genre, if there is one ('world music' is very frequent and broad)
            if top_genres_list[0][0] == 'world music' and len(top_genres_list) > 1:
                most_frequent_top_genres_set.add(top_genres_list[1][0])
            
            # if the most frequent top genre has same occurrence as others, add all top genres with same occurrence
            for top_genre in top_genres_list:
                if top_genre[1] == top_genres_list[0][1]:
                    most_frequent_top_genres_set.add(top_genre[0])
            
            for most_frequent_genre in most_frequent_top_genres_set:
                spotify_genres_match_dict[no_match_genre][most_frequent_genre] = [most_frequent_genre]
            
        # if no related genre has a top genre (= related genre is in no_match list)
        else:
            #print('xxxxxxxxxxxxxxxxxxxxxxxxx')
            remaining_no_match_genres_dict[no_match_genre] = no_match_occurrences
    else:
        #print('-------------------------------')
        remaining_no_match_genres_dict[no_match_genre] = no_match_occurrences

print(len(remaining_no_match_genres_dict))
print(len(get_spotify_top_genres(spotify_genres_match_dict)))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 93/93 [00:01<00:00, 49.41it/s]

93
3684





In [52]:
spotify_top_genres_dict = get_spotify_top_genres(spotify_genres_match_dict)
spotify_top_genres_df = pd.DataFrame.from_dict(spotify_top_genres_dict, orient='index')
spotify_top_genres_df.to_csv('data/data_spotify_top_genres.csv', encoding='utf-8')
spotify_top_genres_df

Unnamed: 0,0,1,2,3,4,5
21st century classical,art music,,,,,
5th wave emo,world music,electronic music,hip hop music,experimental music,rock music,
a cappella,vocal music,,,,,
aarhus indie,rock music,,,,,
aberdeen indie,rock music,,,,,
...,...,...,...,...,...,...
zolo,rock music,,,,,
zouglou,zouglou,,,,,
zouk,world music,,,,,
zouk riddim,electronic music,world music,instrumental music,dance music,,


In [31]:
with open('data/data_wikidata_bigcities.csv', encoding='utf-8') as file_city:
    city_df = pd.read_csv(file_city)
    
with open('data/data_wikidata_demonyms.csv', encoding='utf-8') as file_demonym:
    demonym_df = pd.read_csv(file_demonym)

with open('data/data_wikidata_usstates.csv', encoding='utf-8') as file_us_state:
    us_state_df = pd.read_csv(file_us_state)
    
us_state_df

Unnamed: 0,state_id,state,abbrev
0,Q173,Alabama,AL
1,Q173,Alabama,Ala.
2,Q797,Alaska,AK
3,Q816,Arizona,AZ
4,Q816,Arizona,Ariz.
...,...,...,...
91,Q1371,West Virginia,W.Va.
92,Q1537,Wisconsin,WI
93,Q1537,Wisconsin,Wis.
94,Q1214,Wyoming,WY


In [32]:
"""
SPARQL query for Wikidata items with demonym property:

SELECT (SAMPLE(?item) AS ?item) ?demonym WHERE {
  ?item wdt:P1549 ?demonym.
  FILTER((LANG(?demonym)) = "en")
}
GROUP BY ?demonym

"""

demonym_list = sorted(set(demonym_df['demonym']))
demonym_list = [demonym.lower() for demonym in demonym_list]
print(len(demonym_list))
demonym_list[:10]

796


['/b/tards',
 '/pol/acks',
 '/v/ermin',
 '/v/irgins',
 'abidjanese',
 'abkhaz',
 'abkhazian',
 'abu dhabian',
 'accran',
 'achaemenian']

In [33]:
"""
SPARQL query for Wikidata items with big city instance:

SELECT DISTINCT ?item ?itemLabel WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". }
  {
    SELECT DISTINCT ?item WHERE {
      ?item p:P31 ?statement0.
      ?statement0 (ps:P31/(wdt:P279*)) wd:Q1549591.
    }
  }
}
"""
city_list = sorted(set(city_df['city']))
city_list = [city.lower() for city in city_list]
print(len(city_list))
city_list[:10]

3548


["'s-hertogenbosch",
 '6th of october city',
 'a coruña',
 'aachen',
 'aalborg',
 'aarhus',
 'aba',
 'abadan',
 'abaetetuba',
 'abakan']

In [34]:
"""
SPARQL query for Wikidata items with US State instance:

SELECT DISTINCT ?item ?itemLabel WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". }
  {
    SELECT DISTINCT ?item WHERE {
      ?item p:P31 ?statement0.
      ?statement0 (ps:P31/(wdt:P279*)) wd:Q30.
    }
  }
}
"""
us_state_set = set(us_state_df['abbrev'])
us_state_set.update(us_state_df['state'])
us_state_list = sorted(us_state_set)
us_state_list = [state.lower() for state in us_state_list]
print(len(us_state_list))
us_state_list[:10]

145


['ak',
 'al',
 'ar',
 'az',
 'ala.',
 'alabama',
 'alaska',
 'ariz.',
 'arizona',
 'ark.']

In [35]:
words_no_match_dict = dict()
city_set = set()
demonym_set = set()
state_set = set()
words_set = set()

for genre, occurences in remaining_no_match_genres_dict.items():
    words = re.findall(r'\b[\w&-]+\b', genre)
    for word in words:
        words_set.add(word)
        if word in city_list :
            city_set.add(word)
            continue
        elif word in demonym_list:
            demonym_set.add(word)
            continue
        elif word in us_state_list:
            state_set.add(word)
            continue
        else:
            try:
                counter = words_no_match_dict[word]['counter']
                word_occurrences = words_no_match_dict[word]['occurrences']
                
                words_no_match_dict[word]['counter'] = counter + 1
                words_no_match_dict[word]['occurrences'] = word_occurrences + occurrences
                words_no_match_dict[word]['genres'].append(genre)
            except KeyError:
                words_no_match_dict[word] = {'counter': 1, 'occurrences': occurences}
                words_no_match_dict[word] = {'counter': 1, 'occurrences': occurences, 'genres': [genre]}
            
print('nbr words no match', len(words_no_match_dict))
print('nbr city words', len(city_set))
print('nbr demonym words', len(demonym_set))
print('nbr state words', len(state_set))
print('nbr total words:', len(words_set))
words_no_match_dict

nbr words no match 127
nbr city words 8
nbr demonym words 9
nbr state words 5
nbr total words: 149


{'afrikaans': {'counter': 1, 'occurrences': 1, 'genres': ['afrikaans']},
 'backing': {'counter': 1, 'occurrences': 1, 'genres': ['backing track']},
 'track': {'counter': 1, 'occurrences': 1, 'genres': ['backing track']},
 'barnmusik': {'counter': 1, 'occurrences': 2, 'genres': ['barnmusik']},
 'belo': {'counter': 1, 'occurrences': 1, 'genres': ['belo horizonte indie']},
 'horizonte': {'counter': 1,
  'occurrences': 1,
  'genres': ['belo horizonte indie']},
 'indie': {'counter': 33,
  'occurrences': 33,
  'genres': ['belo horizonte indie',
   'bulgarian indie',
   'charlotte nc indie',
   'cornwall indie',
   'delaware indie',
   'euskal indie',
   'galway indie',
   'hong kong indie',
   'hungarian indie',
   'indie boliviano',
   'indie liguria',
   'indie rockism',
   'indie tico',
   'indie triste',
   'kansai indie',
   'limerick indie',
   'lincoln ne indie',
   'milwaukee indie',
   'minneapolis indie',
   'missouri indie',
   'normal indie',
   'north dakota indie',
   'ottawa i

In [36]:
words_no_match_df = pd.DataFrame.from_dict(words_no_match_dict, orient='index')
words_no_match_df.sort_values(by=['occurrences'], ascending=False, inplace=True)
words_no_match_df.to_csv('data/data_spotify_genres_no_match.csv', encoding='utf-8')
words_no_match_df.index.name = 'word'
words_no_match_df.reset_index(inplace=True)
words_no_match_df

Unnamed: 0,word,counter,occurrences,genres
0,indie,33,33,"[belo horizonte indie, bulgarian indie, charlo..."
1,blaskapelle,1,5,[blaskapelle]
2,fake,1,5,[fake]
3,viola,2,5,"[viola, viola da gamba]"
4,tribute,1,4,[tribute]
...,...,...,...,...
122,ccb,1,1,[hinos ccb]
123,hinos,1,1,[hinos ccb]
124,guqin,1,1,[guqin]
125,galway,1,1,[galway indie]


In [214]:
# create dict from discogs_df
# keep only discogs styles linked to a discogs genre

discogs_dict = dict()
        
for idx, row in discogs_df.iterrows():
    genre_name = row['genre_name']
    genre_url = row['genre_url']
    style_name = row['style_name']
    style_url = idx
    
    if pd.notna(genre_url):
        style_name = style_name.lower()  # convert name to lowercase
        genre_name = genre_name.lower()
        discogs_dict[style_name] = {'type': 'style', 'genre_name': genre_name}
        discogs_dict[genre_name] = {'type': 'genre', 'genre_name': genre_name}

print(len(spotify_list), len(wikidata_dict), len(discogs_dict))

3684 4574 460


In [199]:
discogs_counter = 0
wikidata_counter = 0
genres_matches_dict = dict()

for spotify_genre in spotify_genres_list[:]:
    genres_matches_dict[spotify_genre] = {'discogs': dict(), 'wikidata': dict(),}
    
    for discogs_style_name, discogs_style_info in discogs_dict.items():
        discogs_genre_name = discogs_style_info['genre_name']
        
        if spotify_genre == discogs_style_name:    
            genres_matches_dict[spotify_genre]['discogs'] = {discogs_style_name: discogs_genre_name}
            discogs_counter += 1
            break
        
        # find if discogs style name exists within spotify genre name
        pattern = r'\b' + discogs_style_name + r'\b'
        match = re.search(pattern, spotify_genre)
        
        if match:
            genres_matches_dict[spotify_genre]['discogs'][discogs_style_name] = discogs_genre_name
            discogs_counter += 1
            
    for wikidata_genre_name, wikidata_genre_info in wikidata_dict.items():
        wikidata_genre_id = wikidata_genre_info['genre_id']
        wikidata_top_genre_list = list(wikidata_genre_info['top_genre'].keys())
        
        if spotify_genre == wikidata_genre_name:
            genres_matches_dict[spotify_genre]['wikidata'] = {wikidata_genre_name: wikidata_top_genre_list}
            wikidata_counter += 1
            break
                
print(discogs_counter, wikidata_counter)
genres_matches_dict

2755 769


{'21st century classical': {'discogs': {'classical': 'classical'},
  'wikidata': {}},
 '5th wave emo': {'discogs': {'emo': 'rock'}, 'wikidata': {}},
 'a cappella': {'discogs': {}, 'wikidata': {'a cappella': ['vocal music']}},
 'aarhus indie': {'discogs': {}, 'wikidata': {}},
 'aberdeen indie': {'discogs': {}, 'wikidata': {}},
 'abstract': {'discogs': {'abstract': 'electronic'}, 'wikidata': {}},
 'abstract beats': {'discogs': {'abstract': 'electronic'}, 'wikidata': {}},
 'abstract hip hop': {'discogs': {'abstract': 'electronic',
   'hip hop': 'hip hop'},
  'wikidata': {'abstract hip hop': ['hip hop music', 'world music']}},
 'abstract idm': {'discogs': {'abstract': 'electronic', 'idm': 'electronic'},
  'wikidata': {}},
 'abstractro': {'discogs': {}, 'wikidata': {}},
 'accordeon': {'discogs': {}, 'wikidata': {}},
 'accordion': {'discogs': {}, 'wikidata': {}},
 'acid house': {'discogs': {'acid house': 'electronic'},
  'wikidata': {'acid house': ['dance music', 'electronic music']}},
 'aci

In [180]:
counter = 0
no_match_list = list()

for genre_name, genre_matches in genres_matches_dict.items():
    discogs_genre = genre_matches['discogs']
    wikidata_genre = genre_matches['wikidata']
    if not discogs_genre and not wikidata_genre:
        no_match_list.append(genre_name)
        counter += 1
        
print(counter)
no_match_list[:10]

1295


['aarhus indie',
 'aberdeen indie',
 'abstractro',
 'accordeon',
 'accordion',
 'acousmatic',
 'adoracao',
 'adult standards',
 'aesthetic rap',
 'afghan rap']

In [181]:
no_match_dict = dict()

for genre in no_match_list:
    words = re.findall(r'\b[\w&-]+\b', genre)
    for word in words:
        try:
            counter = no_match_dict[word]
            no_match_dict[word] = counter + 1
        except KeyError:
            no_match_dict[word] = 1
            
len(no_match_dict)

1159

In [168]:
for word, count in no_match_dict.items():
    if count > 5:
        print({word: '',})

{'indie': ''}
{'rap': ''}
{'psych': ''}
{'r&b': ''}
{'metal': ''}
{'alternative': ''}
{'americana': ''}
{'orchestra': ''}
{'lo-fi': ''}
{'worship': ''}
{'groove': ''}
{'dnb': ''}
{'doom': ''}
{'drill': ''}
{'singer-songwriter': ''}
{'australian': ''}
{'dance': ''}
{'electropop': ''}
{'traditional': ''}
{'belgian': ''}
{'edm': ''}
{'new': ''}
{'wave': ''}
{'room': ''}
{'deathcore': ''}
{'brazilian': ''}
{'progressive': ''}
{'canadian': ''}
{'deep': ''}
{'german': ''}
{'dutch': ''}
{'prog': ''}
{'electronica': ''}
{'finnish': ''}
{'folklore': ''}
{'world': ''}
{'french': ''}
{'synthpop': ''}
{'irish': ''}
{'italian': ''}
{'modern': ''}
{'japanese': ''}
{'roots': ''}
{'melodic': ''}
{'musica': ''}
{'musique': ''}
{'polish': ''}
{'russian': ''}
{'scottish': ''}
{'swedish': ''}


In [170]:
no_match_genres = {'indie': '/genre/rock',
'rap': '/genre/hip-hop',
'psych': '/genre/rock',
'r&b': '/genre/hip-hop',
'metal': '/genre/rock',
'alternative': '/genre/rock',
'americana': '/genre/folk-world-country',
'orchestra': '/genre/classical',
'lo-fi': '/genre/rock',
'worship': '/genre/folk-world-country',
'groove': '/genre/funk-soul',
'dnb': '/genre/electronic',
'doom': '/genre/electronic',
'drill': '/genre/hip-hop',
'singer-songwriter': '/genre/folk-world-country',
'dance': '/genre/electronic',
'electropop': '/genre/pop',
'traditional': '/genre/folk-world-country',
'edm': '/genre/electronic',
'deathcore': '/genre/rock',
'progressive': '/genre/rock',
'prog': '/genre/rock',
'electronica': '/genre/electronic',
'folklore': '/genre/folk-world-country',
'world': '/genre/folk-world-country',
'synthpop': '/genre/pop',
'roots': '/genre/folk-world-country'}

In [184]:
genres_matches_complete_dict = genres_matches_dict.copy()

for genre in no_match_list:
    words = re.findall(r'\b[\w&-]+\b', genre)
    for word in words:
        for key, value in no_match_genres.items():
            if word in key:
                genres_matches_complete_dict[genre]['discogs'][key] = value
                
genres_matches_dict

{'21st century classical': {'discogs': {'classical': '/genre/classical'},
  'wikidata': {}},
 '5th wave emo': {'discogs': {'emo': '/style/emo'}, 'wikidata': {}},
 'a cappella': {'discogs': {}, 'wikidata': {'a cappella': 'Q185298'}},
 'aarhus indie': {'discogs': {'indie': '/genre/rock'}, 'wikidata': {}},
 'aberdeen indie': {'discogs': {'indie': '/genre/rock'}, 'wikidata': {}},
 'abstract': {'discogs': {'abstract': '/style/abstract'}, 'wikidata': {}},
 'abstract beats': {'discogs': {'abstract': '/style/abstract'},
  'wikidata': {}},
 'abstract hip hop': {'discogs': {'abstract': '/style/abstract',
   'hip hop': '/genre/hip-hop'},
  'wikidata': {'abstract hip hop': 'Q98528482'}},
 'abstract idm': {'discogs': {'abstract': '/style/abstract',
   'idm': '/style/idm'},
  'wikidata': {}},
 'abstractro': {'discogs': {}, 'wikidata': {}},
 'accordeon': {'discogs': {}, 'wikidata': {}},
 'accordion': {'discogs': {}, 'wikidata': {}},
 'acid house': {'discogs': {'acid house': '/style/acid-house'},
  'w