In [1]:
import pandas as pd
import numpy as np
import ast

### Genres:

In [2]:
df = pd.read_csv("data/tracks_with_genres_lang_emotion.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,id,name,popularity,duration_ms,explicit,id_artists,release_date,danceability,energy,...,song_name_artist,year,artists_count,artist_1,artist_2,artist_3,artist_4,genres,language,y_kmeans
0,0,6catF1lDhNTjjGa2GxRQNN,You'll Never Walk Alone - Mono; 2002 Remaster,56,160187,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.484,0.265,...,You'll Never Walk Alone - Mono; 2002 Remaster[...,2008,1,Gerry & The Pacemakers,,,,"['adult standards', 'bubblegum pop', 'merseybe...",en,1
1,1,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,['6lH5PpuiMa5SpfjoIOlwCS'],2020-03-13,0.671,0.867,...,A Lover's Concerto['The Toys'],2020,1,The Toys,,,,[],en,0
2,2,4aSw1QJIMwYSoDEgzgdCJL,Ferry Cross the Mersey - Mono; 2002 Remaster,40,141987,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.405,0.365,...,Ferry Cross the Mersey - Mono; 2002 Remaster['...,2008,1,Gerry & The Pacemakers,,,,"['adult standards', 'bubblegum pop', 'merseybe...",en,0
3,3,0ZMMtH875IR2TfkyC4PolD,Don't Let the Sun Catch You Crying (Main) - Mono,34,157093,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.477,0.352,...,Don't Let the Sun Catch You Crying (Main) - Mo...,2008,1,Gerry & The Pacemakers,,,,"['adult standards', 'bubblegum pop', 'merseybe...",en,1
4,4,1hx7X9cMXHWJjknb9O6Ava,The September Of My Years - Live At The Sands ...,26,187333,0,['1Mxqyy3pSjf8kZZL4QVxS0'],2018-05-04,0.319,0.201,...,The September Of My Years - Live At The Sands ...,2018,1,Frank Sinatra,,,,"['easy listening', 'adult standards', 'lounge']",en,1


In [3]:
# Converting string array into regular array.
df['genres'] = df['genres'].apply(ast.literal_eval)

df['genres'].head()

0    [adult standards, bubblegum pop, merseybeat, r...
1                                                   []
2    [adult standards, bubblegum pop, merseybeat, r...
3    [adult standards, bubblegum pop, merseybeat, r...
4            [easy listening, adult standards, lounge]
Name: genres, dtype: object

In [4]:
# Determining the frequency of the genre. Giving the input song's genre priority to go first.
genre_extract = df['genres'].reset_index()
genre_wo_index = genre_extract['genres']
tally = genre_wo_index.count()
genre_dict = {}

for i in range (tally):
    input = genre_wo_index.loc[i]
    if len(input) > 0:
        for n in input:
            if n in genre_dict.keys():
                genre_dict[n] += 1
            else:
                genre_dict[n] = 1

print(genre_dict)

{'adult standards': 1216, 'bubblegum pop': 416, 'merseybeat': 529, 'rock-and-roll': 650, 'british invasion': 495, 'classic uk pop': 704, 'brill building pop': 763, 'rockabilly': 550, 'folk rock': 933, 'easy listening': 126, 'lounge': 283, 'british blues': 265, 'canadian blues': 54, 'singer-songwriter': 312, 'lilith': 307, 'canadian singer-songwriter': 107, 'folk': 393, 'rock': 3193, 'mellow gold': 1141, 'jazz': 304, 'swing': 56, 'big band': 94, 'jazz piano': 73, 'stride': 75, 'vocal jazz': 355, 'cool jazz': 274, 'harlem renaissance': 23, 'jazz saxophone': 71, 'contemporary jazz': 73, 'jazz quartet': 20, 'jazz fusion': 297, 'swedish jazz': 134, 'free jazz': 58, 'avant-garde jazz': 49, 'contemporary post-bop': 57, 'italian jazz': 15, 'soundtrack': 456, 'italian soundtrack': 28, 'vintage italian soundtrack': 34, 'classic soundtrack': 89, 'jazz trumpet': 176, 'hard bop': 204, 'roots rock': 250, 'psychedelic rock': 592, 'classic rock': 1358, 'acid rock': 67, 'electric blues': 227, 'country 

In [5]:
genres_df = pd.Series(genre_dict)
genres_df.head()

adult standards     1216
bubblegum pop        416
merseybeat           529
rock-and-roll        650
british invasion     495
dtype: int64

In [6]:
# Creating the genre supersets, with priority to the input song's genre.
# Since the input song is the first one to be run, its genre will be able to start the classification of other sub-genres under it.

# This can be further improved. Allowing for later genres to take over earlier genres as the superset if it more general.
# E.g.: Detroit Hip Hop is found before general hip hop. Currently they will be seperated into 2 seperate genres because the first word 'detroit' is
# not 'hip hop' and is thus overlooked.

def generate_superset_mapping(genre_dict):
    superset_mapping = {}

    for genre, frequency in genre_dict.items():
        subset_added = False
        for superset in list(superset_mapping.keys()):
            if genre in superset:
                superset_mapping[genre] = [genre]
                superset_mapping[genre].extend(superset_mapping[superset])
                del superset_mapping[superset]
                subset_added = True
                break
            elif superset in genre:
                if superset in superset_mapping:
                    superset_mapping[superset].append(genre)
                    subset_added = True
                    break
        if not subset_added:
            superset_mapping[genre] = [genre]

    return superset_mapping

# Generate superset mapping
superset_mapping = generate_superset_mapping(genre_dict)

# Output the superset mapping
for superset, subsets in superset_mapping.items():
    print(f"{superset}: {subsets}")


adult standards: ['adult standards', 'deep adult standards']
merseybeat: ['merseybeat']
british invasion: ['british invasion']
brill building pop: ['brill building pop']
rockabilly: ['rockabilly', 'uk rockabilly', 'finnish rockabilly', 'german rockabilly', 'neo-rockabilly', 'swedish rockabilly', 'spanish rockabilly', 'rockabilly en espanol', 'japanese rockabilly', 'canadian rockabilly']
easy listening: ['easy listening']
lounge: ['lounge', 'lounge house', 'chill lounge', 'italian lounge', 'sunset lounge']
canadian blues: ['canadian blues']
singer-songwriter: ['singer-songwriter', 'canadian singer-songwriter', 'nashville singer-songwriter', 'taiwan singer-songwriter', 'singaporean singer-songwriter', 'indian singer-songwriter', 'scottish singer-songwriter', 'japanese singer-songwriter', 'swedish singer-songwriter', 'british singer-songwriter', 'turkish singer-songwriter', 'irish singer-songwriter', 'norwegian singer-songwriter', 'nz singer-songwriter', 'korean singer-songwriter', 'neo-s

In [7]:
genres_arr = list(superset_mapping.keys())
genres_arr

['adult standards',
 'merseybeat',
 'british invasion',
 'brill building pop',
 'rockabilly',
 'easy listening',
 'lounge',
 'canadian blues',
 'singer-songwriter',
 'lilith',
 'folk',
 'rock',
 'mellow gold',
 'jazz',
 'swing',
 'big band',
 'stride',
 'contemporary post-bop',
 'hard bop',
 'electric blues',
 'soul',
 'motown',
 'sunshine pop',
 'beatlesque',
 'cabaret',
 'freakbeat',
 'experimental',
 'german romanticism',
 'late romantic era',
 'classical',
 'french orchestra',
 'rhythm and blues',
 'funk',
 'nashville sound',
 'pop',
 'boy band',
 'nu metal',
 'hip hop',
 'rap',
 'ancient mediterranean',
 'glam metal',
 'urban contemporary',
 'hyphy',
 'wrestling',
 'eurodance',
 'new romantic',
 'zolo',
 'new wave',
 'post-punk',
 'miami indie',
 'edm',
 'brostep',
 'bassline',
 'country road',
 'country',
 'roots reggae',
 'reggae',
 'ska',
 'dub',
 'norteno',
 'ranchera',
 'grupera',
 'banda',
 'regional mexican',
 'middle earth',
 'celtic',
 'latin',
 'r&b',
 'neo mellow',
 'co

In [8]:
genres_df = pd.Series(genres_arr)
genres_df.head()

0       adult standards
1            merseybeat
2      british invasion
3    brill building pop
4            rockabilly
dtype: object

In [9]:
genres_df.to_csv('data/extracted_superset_genres.csv')

### Language:

In [13]:
df['language'].unique()

array(['en', 'de', 'ja', 'da', 'sl', 'cy', 'pt', 'id', 'af', nan, 'nl',
       'ko', 'sv', 'tl', 'hu', 'no', 'fr', 'so', 'sq', 'fi', 'it', 'es',
       'tr', 'ro', 'et', 'sk', 'sw', 'lv', 'cs', 'pl', 'lt', 'ca', 'hr',
       'z1', 'mk', 'vi', 'ar', 'ru', 'hi', 'ms', 'th', 'nn', 'gl', 'ug',
       'zh-tw', 'zh-cn', 'm7', 'ia', 'co', 'ur', 'fa', 'eu', 'tt', 'el',
       'ta', 'bg', 'zu', 'xh', 'st', 'sn', 'tn', 'gn', 'sm', 'rn', 'ig',
       'br', 'he', 'is', 'ne', 'sr', 'ln', 'la', 'mi', 'ht', 'az', 'qu',
       'iw', 'uk', 'am'], dtype=object)