Notebook 1. Cleans the network-graph csv.
- corrects genre misspellings and alternate spellings
- removes non-cities, useless non-genres, and duplicate genres.

## Set up

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import csv
import pandas as pd
from itertools import combinations
from tqdm.notebook import tqdm

In [3]:
# USER CONTROLS

# set path for csv file here
path = "/Users/geistling/Documents/1_Projects/bandcamp/network_graph/"

In [4]:
# set pandas display options
pd.set_option("display.max_rows", 350)
pd.set_option('display.max_colwidth', 500)
pd.set_option("display.max_columns", 200)

In [5]:
# start with from_country,to_country,count
conv_file = "{}network-graph.csv".format(path)

In [9]:
reader = pd.read_csv(conv_file) #, index_col=0
reader.count()

standard_name    477892
genre            477892
dtype: int64

In [10]:
reader.head()

Unnamed: 0,standard_name,genre
0,"Adelaide SA, Australia","electronic,industrial,ebm,electronica"
1,"Adelaide SA, Australia","electronica,industrial,ebm,electronic"
2,"Adelaide SA, Australia","electronica,ebm,industrial,electronic"
3,"Adelaide SA, Australia","industrial,electronic,electronica,ebm"
4,"Adelaide SA, Australia","electronica,ebm,electronic,industrial"


# Preliminary cleaning

## 1. remove non-cities from the set

In [11]:
# reader.standard_name.unique()
# 
notcities = [
    'Antarctica',
    'Argentina',
    'Arizona, USA',
    'Australia', 
    'Austria',
    'Belgium',
    'British Columbia, Canada',
    'Brazil',
    'Bulgaria',
    'Chile',
    'California, USA',
    'Colombia',
    'Colorado, USA',
    'Connecticut, USA',
    'Denmark',
    'England, UK',
    'Finland',
    'Florida, USA',
    'France',
    'Germany',
    'Greece',
    'Jamaica', 
    'Japan',
    'Hawaii, USA',
    'Hungary',
    'Iceland', 
    'Indiana, USA', 
    'India',
    'Italy', 
    'Lithuania',
    'Maryland, USA',
    'Massachusetts, USA',
    'Mexico',
    'Michigan, USA',
    'Minnesota, USA',
    'Netherlands',
    'New Jersey, USA',
    'New York, USA', 
    'New Zealand',
    'North Carolina, USA',
    'North Rhine-Westphalia, Germany',
    'Norway',
    'Ohio, USA',
    'Ontario, Canada',
    'Oregon, USA', 
    'Pennsylvania, USA',
    'Poland',
    'Portugal',
    'Romania',
    'Russia',
    'Scotland, UK',
    'Slovenia',
    'South Africa',
    'Spain',
    'Sweden',
    'Switzerland',
    'Texas, USA',
    'United Kingdom',
    'Ukraine', 
    'Virginia, USA',
    'Wales, UK',
    'Washington, USA',
    'Wisconsin, USA'
]

reader = reader.loc[~reader['standard_name'].isin(notcities)].copy()
reader.count()
# first file: 317741

standard_name    477892
genre            477892
dtype: int64

## 2. rename incorrect genres

In [13]:
# list of aliases. Phrase on the left is replaced with the one on the right. 
# Objective is to correct misspellings and combine genres that have multiple recognized spellings.
genre_aliases = [
    [' bpm',''],
#     ['140 bpm','140'],
#     ['160 bpm', '160'],
    ['1980s', '80s'],
    ['1990s', '90s'],
    ['a capella', 'a cappella'],
    ['acid tekno', 'acid techno'],
    ['afro beats','afro beat'],
    ['alt county','alt country'],
    ['alternativehip hop', 'alternative hip hop'],
    ['avant gard', 'avant garde'],
    ['ambiant','ambient'],
    ['audio book','audiobooks'],
    ['backing track', 'backing tracks'],
    ['backing trackss', 'backing tracks'],
    ['ballads', 'ballad'],
    ['beat tapes', 'beat tape'],
    ['berlin school electronic music', 'berlin school'],
    ['break beat', 'breakbeat'],
    ['breakbeats', 'breakbeat'],
    ['cassettes', 'cassette'],
    ['cassette tape', 'cassette'],
    ['comedy songs', 'comedy'],
    ['chiptunes', 'chiptune'],
    ['chip tune', 'chiptune'],
    ['covers','cover'],
    ['dark techno industrial', 'dark techno,industrial'],
    ['dub step', 'dubstep'],
    ['tapes', 'tape'],
    ['christmas music', 'christmas'],
    ['dance hall', 'dancehall'],
    ['dance music', 'dance'],
    ['dark psy darkpsy', 'dark psy'],
    ['demo', 'demos'],
    ['digeridoo','didgeridoo'],
    ['disco edit', 'disco edits'],
    ['dnb', 'drum & bass'],
    ['down tempo', 'downtempo'],
    ['drone ambient', 'ambient drone'],
    ['drum n bass', 'drum & bass'],
    ['drumandbass', 'drum & bass'],
    ['d&b', 'drum & bass'],
    ['dub step', 'dubstep'],
    ['dubstep & electronic','dubstep,electronic'],
    ['electronic dance music', 'edm'],
    ['eighties', '80s'],
    ['electronic body music', 'ebm'],
    ['eletronic', 'electronic'],
    ['electronic music', 'electronic'],
    ['electronica/dance','electronica,dance'],
    ['experiemental', 'experimental'],
    ['expiremental', 'experimental'],
    ['female vocalist', 'female vocals'],
    ['field recording', 'field recordings'],
    ['fingerstyle', 'fingerstyle guitar'],
    ['free download', 'free'],
    ['free tekno','techno'],
    ['free improv', 'free improvisation'],
    ['hard core', 'hardcore'],
    ['harmonies', 'harmony'],
    ['healing & meditation','healing,meditation'],
    ['heavy metal   hard rock', 'heavy metal,hard rock'],
    ['hip hop instrumental', 'hiphop instrumental'],
    ['hip hop instrumentals', 'hiphop instrumental'],
    ['hip hop rap', 'hip hop,rap'],
    ['hiphop rap', 'hip hop,rap'],
    ['hip hop/rap', 'hip hop,rap'],
    ['hip_hop', 'hip hop'],
    ['hip hop (real shit)', 'hip hop'],
    ['house techno minimal tech deep', 'house,techno,minimal,tech,deep'],
    ['indie pop rock','indie pop,indie rock'],
    ['indie rock pop','indie pop,indie rock'],
    ['instrumentalhip hop', 'instrumental hip hop'],
    ['jam track', 'jam'],
    ['jam tracks', 'jam'],
    ['jams', 'jam'],
    ['jazz & improvised music','jazz,improvised music'],
    ['left field', 'leftfield'],
    ['lgbt', 'lgbtq'],
    ['liquid dnb', 'liquid drum & bass'],
    ['live music', 'live'],
    ['live recording', 'live'],
    ['lo fi house', 'lo fihouse'],
    ['lofi hip hop', 'lo fi hip hop'],
    ['lofi hiphop', 'lo fi hip hop'],
    ['loop', 'loops'],
    ['love song', 'love songs'],
    ['mantras','mantra'],
    ['mash up', 'mash ups'],
    ['meditation music', 'meditation'],
    ['melancholic','melancholy'],
    ['metal core', 'metalcore'],
    ['montreal', 'montréal'],
    ['modular synth', 'modular synthesizer'],
    ['music for youtube', 'youtube'],
    ['nineties', '90s'],
    ['nostalgic', 'nostalgia'],
    ['oi','oi!'],
    ['podcasts', 'podcast'],
    ['post rock instrumental', 'post rock,instrumental'],
    ['psychadelic','psychedelic'],
    ['pyschedelic', 'psychedelic'],
    ['psyche','psych'],
    ['punk hardcore','hardcore punk'],
    ['r n b', 'r&b'],
    ['r & b', 'r&b'],
    ['rnb', 'r&b'],
    ['r&b/soul', 'r&b,soul'],
    ['rap & hip hop','rap,hip hop'],
    ['raga', 'ragga'],
    ['riot grrrl', 'riot grrl'],
    ['relaxation music', 'relaxation'],
    ['relaxing music', 'relaxing'],
    ['remixed','remix'],
    ['remixes', 'remix'],
    ['re edit', 're edits'],
    ['re issue', 'reissues'],
    ['rockandroll', 'rock & roll'],
    ['rock and roll', 'rock & roll'],
    ['rock n roll', 'rock & roll'],
    ['royalty free music', 'royalty free'],
    ['sample','samples'],
    ['sample based', 'sampling'],
    ['sci fi', 'science fiction'],
    ['scores','score'],
    ['scottish music', 'scottish'],
    ['shoegazing', 'shoegaze'],
    ['shoe gaze', 'shoegaze'],
    ['shoegazer','shoegaze'],
    ['soulfoul','soulful'],
    ['soundscape', 'soundscapes'],
    ['soundtracks', 'soundtrack'],
    ['soundtrack music', 'soundtrack'],
    ['spoken word poetry','spoken word'],
    ['steppas', 'steppa'],
    ['steppers', 'stepper'],
    ['synthwave 80s','synthwave,80s'],
    ['synthesizers', 'synthesizer'],
    ['technoandvariations', 'techno & variations'],
    ['tekno','techno'],
    ['techno hardcore','hardcore techno'],
    ['techno & variations', 'techno'],
    ['trap music', 'trap'],
    ['vgm', 'video game soundtrack'],
    ['videogame music', 'video game music'],
#     ['video game music', 'video game soundtrack'],
    ['world music', 'world'],
    ['yoga', 'yoga music']
]

In [14]:
for pair in genre_aliases:
    reader['genre'] = reader['genre'].str.replace(",{}(?=,|$)".format(pair[0]), ",{}".format(pair[1]), regex=True)
    reader['genre'] = reader['genre'].str.replace("^{}(?=,|$)".format(pair[0]), "{}".format(pair[1]), regex=True)
reader.count()

standard_name    477892
genre            477892
dtype: int64

## 3. remove non-genres from the set

In [11]:
# compile list of non-genres that don't belong in the set.
# Words here describe the album in a way that has nothing to do with the music
# (release type, production, organization, licensing, etc.)
# Or are so useless that they can't be justified for inclusion (e.g., "music", "everything", "etc")
ungenres = [
    'album',
    'awesome',
    'bootleg',
    'cassettes',
    'charity',
    'collaboration',
    'concept album',
    'conceptual',
    'creative commons',
    'demo',
    'demos',
    'diy record label',
    'diy',
    'download chill out music',
    'download',
    'download',
#     'duo',
    'ep',
    'etc',
    'everything',
#     'female',
    'free download',
    'free',
    'good music',
    'home recording',
    'homemade'
    'label',
    'live',
    'lyrics',
    'music',
#     'music for youtube',
    'multi genre',
    'new',
    'no copyright',
    'no copyright music',
    'no copyright sounds',
    'obscure',
    'original music',
    'original',
    'other',
    'production',
    'producer',
    'rare',
    'radio',
    "remix",
    're issue',
    "edit",
    'royalty free',
#     'solo',
#     'sounds',
    'songs',
#     'sound',
    'tape',
    'trax',
#     'trio',
    'unreleased',
    'various artists',
    'various',
    'records',
    'vinyl',
    'whatever'
#     'youtube'
]

for ungenre in ungenres:
    reader['genre'] = reader['genre'].str.replace(",{}(?=,|$)".format(ungenre), "", regex=True)
    reader['genre'] = reader['genre'].str.replace("^{}(,|$)".format(ungenre), "", regex=True)

## 4. Remove duplicate genres from the same row and strip blank spaces

In [15]:
# delete rows with blank genres
# place = reader.loc[reader['genre'] == '' ].copy()
reader = reader.loc[reader['genre'] != '' ].copy()
reader.count()

standard_name    477892
genre            477892
dtype: int64

In [16]:
with tqdm(total=reader.shape[0]) as pbar:    
    for index, row in reader.iterrows():
        pbar.update(1)
        genre_string = row['genre']
        # turn each "genre" field into alphabetized list
        genre_list = sorted(list(genre_string.split(",")))
        for item in genre_list:
            item.strip()
        # convert list into set to remove duplicates
        genre_set = set(genre_list)
        # save alphabetized set as genre
        reader.at[index,'genre'] = repr(genre_set)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=477892.0), HTML(value='')))




In [17]:
# reformats genre after 
reader['genre'] = reader['genre'].str.replace("'", "", regex=False)
reader['genre'] = reader['genre'].str.replace("{", "", regex=False)
reader['genre'] = reader['genre'].str.replace("}", "", regex=False)
reader['genre'] = reader['genre'].str.replace(", ", ",", regex=False)

reader.head()

Unnamed: 0,standard_name,genre
0,"Adelaide SA, Australia","electronic,industrial,ebm,electronica"
1,"Adelaide SA, Australia","electronic,industrial,ebm,electronica"
2,"Adelaide SA, Australia","electronic,industrial,ebm,electronica"
3,"Adelaide SA, Australia","electronic,industrial,ebm,electronica"
4,"Adelaide SA, Australia","electronic,industrial,ebm,electronica"


# Save cleaned set to csv

In [18]:
reader.to_csv('2_network_graph_cleaned.csv', index=False)