## Merge cleaned SSO 2018-2021 concert data with composer and conductor metadata from Wikipedia and Wikidata

This code is in Jupyter notebook format to enable more flexibility in executing different sections as needed.

(It would probably be more efficient to split some of this code into separate modules, but I haven't had the time or motivation to do so yet.)

In [None]:
from bs4 import BeautifulSoup, element
from collections import namedtuple
from datetime import datetime
from functools import reduce
from qwikidata.sparql import (get_subclasses_of_item, return_sparql_query_results)
from sso_utilities import file_utils
from typing import Dict, List, NamedTuple, Tuple, TypedDict
from unidecode import unidecode

import mwparserfromhell as mwparser
import pandas as pd
import re
import sys
import unicodedata

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### RULES USED FOR COMPOSER/CONDUCTOR NATIONALITY CLASSIFICATION ###
* For purposes of (imperfect) simplification:
  * 'Bohemian' composers are counted as 'Czech'. Don't @ me.
  * Certain double-listed Austrian/German composers (e.g. Mozart and Schubert) are counted as 'Austrian'.
  * Composers of disputed nationality (e.g. Gluck) are counted as the nationality most widely accepted in authoritative sources such as the New Grove Dictionary of Music.
* Other European composers should be counted as either (a) the nationality assigned to them in the Grove Dictionary or, (b) failing that (ambiguity and/or non-existence in Grove), the nationality whose language they primarily composed in (if opera/vocal) - e.g. Offenbach is counted as 'French' - or that of the country in which they spent the majority of their professional life.
* Soviet composers e.g. Khachaturian are counted as their ethnic nationality rather than 'Russian' or 'Soviet'.

## Wikidata and related utility methods

Methods for querying and manipulating conductor and composer metadata from Wikidata and merging it with Wikipedia and SSO data.

In [None]:
def deduplicate_sso_composers(composers: pd.Series) -> List[str]:
    """Deduplicate SSO composers.

    Flattens the per-concert composer lists, normalises every name to Unicode NFC,
    drops the 'Unknown' placeholder and returns the unique names in sorted order.

    :param composers: Iterable (e.g. a pandas Series) whose items are lists of composer names
        from cleaned SSO data
    :return: Sorted list of unique, NFC-normalised composer names (empty if there is no input)
    """
    # Normalise *before* deduplicating and sorting: two spellings that differ only in their
    # Unicode composition (e.g. 'ř' vs 'r' + combining caron) must collapse into one entry.
    unique_composers = {
        unicodedata.normalize('NFC', composer)
        for concert_composers in composers
        for composer in concert_composers
    }
    unique_composers.discard('Unknown')
    return sorted(unique_composers)

def impute_composer_gender(df_row: pd.Series) -> pd.Series:
    """Update SSO composer gender value with the corresponding Wikidata value if the Wikidata value is known.
    Assumes Wikidata to always be the authoritative source.

    The SSO value is kept when the Wikidata value is 'Unknown' or missing (NaN), which is the case
    for rows that had no Wikidata match in the preceding left merge.

    :param df_row: Row from merged SSO composer map and Wikidata composer query response.
        Must contain the 'Gender_sso' and 'Gender_wiki' fields.
    :return: The same row, with 'Gender_sso' overwritten where Wikidata supplied a gender
    """
    wikidata_gender = df_row['Gender_wiki']
    if pd.notna(wikidata_gender) and wikidata_gender != 'Unknown':
        # assignment, not comparison: the Wikidata value replaces the SSO value
        df_row['Gender_sso'] = wikidata_gender
    return df_row

def impute_composer_timeline_nationality(df_row: pd.Series) -> pd.Series:
    """Fill SSO/Wikipedia timeline, nationality and status from the Wikidata columns.

    Only rows flagged 'left_only' in '_merge' (composers that were not found in Wikipedia)
    are touched; every other row is returned unchanged.

    :param df_row: Row from merged SSO/Wikipedia composer data
    :return: The same row, updated in place where applicable
    """
    # composers that were matched in Wikipedia already carry their own values
    if df_row['_merge'] != 'left_only':
        return df_row

    df_row['Timeline_sso'] = df_row['Timeline_wiki']
    df_row['Nationality_sso'] = df_row['Nationality_wiki']
    # derive the living/dead status from the Wikidata timeline only when none is recorded yet
    status_missing = pd.isna(df_row['Status'])
    if status_missing:
        df_row['Status'] = return_animate_status(df_row['Composer'], df_row['Timeline_wiki'])
    return df_row

def return_animate_status(composer: str, timeline: str) -> str:
    """Given a timeline, determine whether a composer is still alive.

    :param composer: Full composer name
    :param timeline: String with composer dates of birth and/or death if known
    """
    # everyone is 'Dead' by default and is updated to 'Living' if there is no recorded date of death
    # (this is unfortunately efficient due to the imbalance between dead/living classical music composers)
    status = 'Dead'

    # patterns like '1970-' are more likely to indicate a living status
    # patterns like '1970-<some string>' more often than not indicate uncertainty, so we default to 'dead' in these cases
    if '-' in timeline:
        timeline_split = [ item.strip() for item in timeline.split('-') ]
        if not timeline_split[-1]:
            status = 'Living'
    # timeline patterns without a hyphen require some additional logic to determine status
    else:
        """
        edge case: 'b. <year>' or 'born <year>' or 'founded <year>' (for contemporary ensembles 
        that are counted as 'composers')
        we will only plausibly update status from 'Dead' to 'Living' for people born after ~1920
        (this range should yield a relatively minimal # of false negatives that can be manually
        fixed later as needed)
        """
        if timeline.startswith(('born', 'b.', 'founded')):
            # update anyone born on or after 1920 as 'Living', except for a few prominent known edge cases
            if re.search(r'(19[2-9][0-9])|(\b2[0-9]{3})', timeline) and not re.search(r'(d[.]?|died) (\d{4})\Z', timeline, flags=re.IGNORECASE):
                if composer not in ['Dominick Argento', 'David Baker', 'Dave Bartholomew', 'Cy Coleman', 'Allen Toussaint']:
                    status = 'Living'
        # assume status is 'Living' if the only timeline value is 19xx or 20xx
        elif re.match(r'(19[2-9][0-9])|(\b2[0-9]{3})', timeline):
            status = 'Living'
        else:
            pass
    # known edge case where no dates were provided in Wikipedia
    if composer == 'Stephen Ferguson':
        status = 'Living'
        
    return status

In [None]:
class WikidataUtils:
    """Methods for querying and parsing Wikidata responses."""

    # Raw strings: the backslash is a literal (Windows-style) path separator, and '\s' / '\w'
    # are not valid string escapes (newer Python versions warn about them).
    SSO_COMPOSER_NAME_MAP_FILE = r'data\sso_composer_name_map.csv'
    WIKIDATA_NATIONALITY_MAP_FILE = r'data\wikidata_country_name_map.csv'
    # layout of the date strings parsed out of the SPARQL response
    WIKIDATA_DATE_FORMAT = '%Y-%m-%dT%H:%M:%SZ'

    def __init__(self) -> None:
        pass

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _sparql_name_list(names) -> str:
        """Render names as a comma-separated list of double-quoted SPARQL string literals.

        Backslashes and double quotes are escaped so a name cannot terminate the literal early.
        """
        escaped_names = (str(name).replace('\\', '\\\\').replace('"', '\\"') for name in names)
        return ', '.join(f'"{name}"' for name in escaped_names)

    @staticmethod
    def _binding_year(row: dict, field: str) -> str:
        """Return the four-digit year of an optional date binding, or '' if absent or unparseable."""
        value = row.get(field, {}).get('value')
        if not value:
            return ''
        try:
            return datetime.strptime(value, WikidataUtils.WIKIDATA_DATE_FORMAT).strftime('%Y')
        except ValueError:
            # anything that is not a plain timestamp in the expected layout is treated as missing
            return ''

    @staticmethod
    def _binding_nationality_and_gender(row: dict, nationality_map: dict, nationality_override: str = None):
        """Return (nationality, gender) for one response row, defaulting each to 'Unknown'.

        :param nationality_override: Value to use instead of the mapped Wikidata nationality
            (only applied when the row actually carries a nationality)
        :raises KeyError: If the row's nationality is not a key of nationality_map
        """
        nationality = 'Unknown'
        if 'nationality' in row:
            nationality = nationality_override or nationality_map[row['nationality']['value']]
        gender = row['gender']['value'].title() if 'gender' in row else 'Unknown'
        return nationality, gender

    @staticmethod
    def _fill_unknown(df: pd.DataFrame, key_column: str, key: str, target_column: str, value: str) -> None:
        """Set target_column to value (in place) on rows where key_column == key and target_column == 'Unknown'.

        Does nothing when no row matches, and handles several matching rows.
        """
        mask = (df[key_column] == key) & (df[target_column] == 'Unknown')
        df.loc[mask, target_column] = value

    # ------------------------------------------------------------------ lookup tables

    @classmethod
    def composer_name_map(cls, sso_composer_name_map_file: str = SSO_COMPOSER_NAME_MAP_FILE) -> pd.DataFrame:
        """Load the SSO composer name map (full name + gender).

        :param sso_composer_name_map_file: CSV file with composer name mappings. Must contain the
            'ComposerFullName' and 'Gender' columns.
        :return: De-duplicated DataFrame with columns 'Composer' (renamed from 'ComposerFullName') and 'Gender'
        """
        sso_composer_name_map = pd.read_csv(
            sso_composer_name_map_file,
            usecols=['ComposerFullName', 'Gender'],
            encoding='utf-8',
        ).drop_duplicates().reset_index(drop=True)
        return sso_composer_name_map.rename(columns={'ComposerFullName': 'Composer'})

    @classmethod
    def nationality_map(cls, nationality_map_file: str = WIKIDATA_NATIONALITY_MAP_FILE) -> Dict:
        """Translates Wikidata country names into a more standard (and sometimes shorter) format.

        :param nationality_map_file: CSV file with Wikidata nationality mappings
        :return: Dictionary mapping each 'wikidataCountry' value to its 'Country' value
        :raises ValueError: If the loaded data lacks the 'wikidataCountry' or 'Country' column
        """
        nationality_map = file_utils.ProcessCSV().load_csv(nationality_map_file)
        if not (('wikidataCountry' in nationality_map) and ('Country' in nationality_map)):
            raise ValueError("Error: Did not find wikidataCountry or Country columns")
        return dict(zip(nationality_map['wikidataCountry'], nationality_map['Country']))

    # ------------------------------------------------------------------ composers

    @staticmethod
    def query_composer_nationalities_and_dates(cleaned_df: pd.DataFrame, wikipedia_df: pd.DataFrame) -> Dict:
        """Query Wikidata API for details on SSO composer nationality, dates of birth and/or death, and gender.

        :param cleaned_df: DataFrame with cleaned SSO data
        :param wikipedia_df: DataFrame with cleaned Wikipedia composer data
        :return: SPARQL query response as returned by return_sparql_query_results()
        """
        # we only care about composers that couldn't be matched with Wikipedia records
        merged_df = WikidataUtils.sso_wikipedia_composers_combined(cleaned_df, wikipedia_df)
        sso_composers = list(merged_df.loc[merged_df['_merge'] == 'left_only', 'Composer'])
        # joint entries such as 'X and Y' have no single Wikidata label, so they are not queried
        queryable_composers = [ composer for composer in sso_composers if (composer != 'Unknown') and (' and ' not in composer) ]
        composer_sparql_query = []
        composer_sparql_query.append("""
        SELECT DISTINCT ?composer ?composer_name ?date_of_birth ?date_of_death ?nationality ?gender WHERE {
            { ?composer (wdt:P106/(wdt:P279*)) wd:Q36834. }
            UNION
            { ?composer (wdt:P106/(wdt:P279*)) wd:Q15981151. }
            UNION
            { ?composer (wdt:P106/(wdt:P279*)) wd:Q1278335. }
            ?composer rdfs:label ?composer_name.
            OPTIONAL { ?composer wdt:P569 ?date_of_birth. }
            OPTIONAL { ?composer wdt:P570 ?date_of_death. }
            OPTIONAL { ?composer wdt:P27 ?country_of_citizenship. }
            OPTIONAL { ?composer wdt:P21 ?sex_or_gender. }
            FILTER((LANG(?composer_name)) = "en")
            FILTER(STR(?composer_name) IN (""")
        composer_sparql_query.append(WikidataUtils._sparql_name_list(queryable_composers))
        composer_sparql_query.append("""))
            SERVICE wikibase:label {
                bd:serviceParam wikibase:language "en".
                ?country_of_citizenship rdfs:label ?nationality.
                ?sex_or_gender rdfs:label ?gender.
            }
        }
        ORDER BY (?composer_name)
        """)
        response = return_sparql_query_results(''.join(composer_sparql_query))
        return response

    @staticmethod
    def parse_composer_nationalities_and_dates(nationality_map: dict, wikidata_resp: dict) -> Dict[str, List[str]]:
        """Extract composer nationality, dates of birth and/or death, and gender from Wikidata SPARQL query response.

        :param nationality_map: Wikidata country name map, as generated by nationality_map()
        :param wikidata_resp: Wikidata SPARQL query response, as generated by query_composer_nationalities_and_dates()
        :return: Column-oriented dictionary with equally long 'Composer', 'Timeline', 'Nationality'
            and 'Gender' lists. Timeline is '<birth>-<death>', 'born <birth>', or 'Unknown' when no
            usable date of birth is present (all date bindings are optional in the query).
        :raises KeyError: If a returned nationality is missing from nationality_map
        """
        wikidata_composers = {'Composer': [], 'Timeline': [], 'Nationality': [], 'Gender': []}
        seen_composers = set()
        for row in wikidata_resp['results']['bindings']:
            composer = row['composer_name']['value']
            # if there are multiple records for one composer, only use the first record
            if composer in seen_composers:
                continue
            seen_composers.add(composer)

            # edge case: impute correct nationality for conflicting records
            override = 'Czech Republic' if composer == 'Erwin Schulhoff' else None
            nationality, gender = WikidataUtils._binding_nationality_and_gender(row, nationality_map, override)

            # set timeline; 'Unknown' is the placeholder the manual imputation step looks for
            date_of_birth = WikidataUtils._binding_year(row, 'date_of_birth')
            date_of_death = WikidataUtils._binding_year(row, 'date_of_death')
            if not date_of_birth:
                timeline = 'Unknown'
            elif date_of_death:
                timeline = f"{date_of_birth}-{date_of_death}"
            else:
                timeline = f"born {date_of_birth}"

            wikidata_composers['Composer'].append(composer)
            wikidata_composers['Nationality'].append(nationality)
            wikidata_composers['Gender'].append(gender)
            wikidata_composers['Timeline'].append(timeline)
        return wikidata_composers

    @classmethod
    def sso_wikipedia_composers_combined(cls, cleaned_df: pd.DataFrame, wikipedia_df: pd.DataFrame) -> pd.DataFrame:
        """Merge SSO and Wikipedia composer data.

        :param cleaned_df: DataFrame with cleaned SSO data
        :param wikipedia_df: DataFrame with cleaned Wikipedia composer data. Required in order to generate delta composer list.
        :return: One row per deduplicated SSO composer with the Wikipedia 'Timeline', 'Nationality' and
            'Status' columns, plus the '_merge' indicator ('left_only' = not found in Wikipedia)
        """
        sso_composers = pd.DataFrame(deduplicate_sso_composers(cleaned_df['Composer']), columns=['Composer'])
        sso_wikipedia_merged_df = sso_composers.merge(wikipedia_df, how='left', left_on='Composer', right_on='Composer', indicator=True)
        return sso_wikipedia_merged_df[['Composer', 'Timeline', 'Nationality', 'Status', '_merge']]

    @classmethod
    def merge_wikidata_and_sso_composers(cls, cleaned_df: pd.DataFrame, wikipedia_df: pd.DataFrame, export: bool = False) -> pd.DataFrame:
        """Merge Wikidata and SSO composer data, and manually impute missing values.

        :param cleaned_df: DataFrame with cleaned SSO data
        :param wikipedia_df: DataFrame with cleaned Wikipedia composer data. Required in order to generate delta composer list.
        :param export: Marks whether to export the merged data to CSV [1 = yes, 0 = no]
        :return: DataFrame with one row per composer in the SSO composer name map
        """
        wikidata_nationality_map = cls.nationality_map()
        wikidata_sparql_resp = cls.query_composer_nationalities_and_dates(cleaned_df, wikipedia_df)
        # we only care about composers that couldn't be matched with existing Wikipedia records
        sso_wikipedia = cls.sso_wikipedia_composers_combined(cleaned_df, wikipedia_df)
        sso_composers = sso_wikipedia.loc[sso_wikipedia['_merge'] == 'left_only'][['Composer']]
        # extract Wikidata details for matched composers
        wikidata_composers = pd.DataFrame(cls.parse_composer_nationalities_and_dates(wikidata_nationality_map, wikidata_sparql_resp))

        # merge delta SSO composer records with Wikidata composer records. Fill missing values with 'Unknown'.
        wikidata_merged_df = sso_composers.merge(wikidata_composers, how='left', left_on='Composer', right_on='Composer').fillna('Unknown').drop_duplicates().reset_index(drop=True)

        # manually impute correct values for the known 'Unknown's
        nationality_missing_dict = {
            'Australia': ['Bryony Marks', 'Connor D’Netto', 'Ella Macens', 'Eskimo Joe', 'Harry Sdraulig', 'James Ledger', 'Katy Abbott', 'Lachlan Skipworth', 'Lee and Cleworth', 'Lisa Illean', 'Paul Stanhope', 'Peggy Polias', 'Timothy Constable'],
            'Canada': ['An-lun Huang'],
            'China': ['Liu Tieshan and Mao Yuan'],
            'Finland': ['Peter Grans'],
            'Germany': ['Detlef Reikow'],
            'Italy': ['Alessio Murgia'],
            'New Zealand': ['Ben Hoadley', 'Miriama Young'],
            'United States': ['Benj Pasek and Justin Paul', 'Evanescence', 'Kaoru Watanabe', 'Robert B. and Richard M. Sherman', 'Nathaniel Stookey']
        }
        timeline_missing_dict = {
            'Alessio Murgia': 'born 1965',
            'An-lun Huang': 'born 1949',
            'Ben Hoadley': 'born 197?', # status: alive
            'Benj Pasek and Justin Paul': 'born 1985',
            'Bryony Marks': 'born 1971',
            'Connor D’Netto': 'born 1994',
            'Detlef Reikow': 'born 19??', # status: alive
            'Ella Macens': 'born 1991',
            'Eskimo Joe': 'born 1997',
            'Evanescence': 'born 1995',
            'Harry Sdraulig': 'born 1992',
            'Kaoru Watanabe': 'born 1976',
            'Lachlan Skipworth': 'born 1982',
            'Lee and Cleworth': 'born 19??',
            'Liu Tieshan and Mao Yuan': 'born 192?',
            'Lisa Illean': 'born 1983',
            'Miriama Young': 'born 1975',
            'Paul Stanhope': 'born 1969',
            'Peggy Polias': 'born 1981',
            'Peter Grans': 'born 1954',
            'Robert B. and Richard M. Sherman': 'born 1925/8',
            'Nathaniel Stookey': 'born 1970',
            'Timothy Constable': 'born 1981'
        }
        gender_missing_dict = {
            'Female': ['Bryony Marks', 'Ella Macens', 'Lisa Illean', 'Miriama Young', 'Peggy Polias'],
            'Male': ['Alessio Murgia', 'An-lun Huang', 'Ben Hoadley', 'Benj Pasek and Justin Paul', 'Connor D’Netto', 'Detlef Reikow', 'Harry Sdraulig', 'Kaoru Watanabe', 'Lachlan Skipworth', 'Lee and Cleworth', 'Liu Tieshan and Mao Yuan', 'Paul Stanhope', 'Peter Grans', 'Robert B. and Richard M. Sherman', 'Nathaniel Stookey', 'Timothy Constable']
        }
        # only values that are still 'Unknown' after the Wikidata merge are overwritten
        for country, composers in nationality_missing_dict.items():
            for composer in composers:
                cls._fill_unknown(wikidata_merged_df, 'Composer', composer, 'Nationality', country)
        for composer, timeline in timeline_missing_dict.items():
            cls._fill_unknown(wikidata_merged_df, 'Composer', composer, 'Timeline', timeline)
        for gender, composers in gender_missing_dict.items():
            for composer in composers:
                cls._fill_unknown(wikidata_merged_df, 'Composer', composer, 'Gender', gender)

        # merge all the things
        sso_wiki_composer_map = sso_wikipedia.merge(cls.composer_name_map(), how='right', left_on='Composer', right_on='Composer')
        final_merged_df = sso_wiki_composer_map.merge(wikidata_merged_df, how='left', left_on='Composer', right_on='Composer', suffixes=('_sso', '_wiki'))
        final_merged_df = final_merged_df.apply(impute_composer_gender, axis=1)
        final_merged_df = final_merged_df.apply(impute_composer_timeline_nationality, axis=1)
        final_merged_df = final_merged_df.drop(columns=['_merge', 'Timeline_wiki', 'Nationality_wiki', 'Gender_wiki']).rename(columns={'Timeline_sso': 'Timeline', 'Nationality_sso': 'Nationality', 'Gender_sso': 'Gender'}).fillna('Unknown')

        # final ad-hoc fixes: their partial birth years ('197?', '19??', '192?') cannot be classified automatically
        composers_to_fix = ['Ben Hoadley', 'Liu Tieshan and Mao Yuan', 'Detlef Reikow']
        for composer in composers_to_fix:
            final_merged_df.loc[final_merged_df['Composer'] == composer, 'Status'] = 'Living'

        # write to CSV if export is set
        if export:
            out_file = 'sso_composer_nationality_map.csv'
            final_merged_df.to_csv(path_or_buf=out_file, encoding='utf-8', index=False)
            print(f"Wrote composer nationality map to: {out_file}")
        return final_merged_df

    # ------------------------------------------------------------------ conductors

    @staticmethod
    def query_conductor_nationalities(cleaned_df: pd.DataFrame) -> Dict:
        """Query Wikidata API for details on SSO conductor nationality and gender.

        :param cleaned_df: DataFrame with cleaned SSO data
        :return: SPARQL query response as returned by return_sparql_query_results()
        """
        queryable_conductors = [ conductor for conductor in sorted(set(cleaned_df['Conductor'])) if conductor != 'Unknown' ]
        conductor_sparql_query = []
        conductor_sparql_query.append("""
        SELECT DISTINCT ?conductor ?conductor_name ?nationality ?gender WHERE {
            ?conductor (wdt:P106/(wdt:P279*)) wd:Q1198887;
                                rdfs:label ?conductor_name.
            OPTIONAL { ?conductor wdt:P27 ?country_of_citizenship. }
            OPTIONAL { ?conductor wdt:P21 ?sex_or_gender. }
            FILTER(LANG(?conductor_name) = "en").
            FILTER(STR(?conductor_name) IN (""")
        conductor_sparql_query.append(WikidataUtils._sparql_name_list(queryable_conductors))
        conductor_sparql_query.append("""))
            SERVICE wikibase:label {
                bd:serviceParam wikibase:language "en".
                ?country_of_citizenship rdfs:label ?nationality.
                ?sex_or_gender rdfs:label ?gender.
            }
        }
        ORDER BY (?conductor_name)
        """)
        response = return_sparql_query_results(''.join(conductor_sparql_query))
        return response

    @staticmethod
    def parse_conductor_nationalities(nationality_map: dict, wikidata_resp: dict) -> Dict[str, List[str]]:
        """Extract conductor nationality and gender from Wikidata SPARQL query response.

        :param nationality_map: Wikidata country name map, as generated by nationality_map()
        :param wikidata_resp: Wikidata SPARQL query response, as generated by query_conductor_nationalities()
        :return: Column-oriented dictionary with equally long 'Conductor', 'Nationality' and 'Gender' lists
        :raises KeyError: If a returned nationality is missing from nationality_map
        """
        wikidata_conductors = {'Conductor': [], 'Nationality': [], 'Gender': []}
        seen_conductors = set()
        for row in wikidata_resp['results']['bindings']:
            conductor = row['conductor_name']['value']
            # if there are multiple records for one conductor, only use the first record
            if conductor in seen_conductors:
                continue
            seen_conductors.add(conductor)

            # edge case: correct erroneous nationality ('English')
            override = 'United States' if conductor == 'Marin Alsop' else None
            nationality, gender = WikidataUtils._binding_nationality_and_gender(row, nationality_map, override)

            wikidata_conductors['Conductor'].append(conductor)
            wikidata_conductors['Nationality'].append(nationality)
            wikidata_conductors['Gender'].append(gender)
        return wikidata_conductors

    @classmethod
    def merge_wikidata_and_sso_conductors(cls, cleaned_df: pd.DataFrame, export: bool = False) -> pd.DataFrame:
        """Merge Wikidata and SSO conductor data, and manually impute missing values.

        :param cleaned_df: DataFrame with cleaned SSO data
        :param export: Marks whether to export the merged data to CSV [1 = yes, 0 = no]
        :return: DataFrame with one row per distinct SSO conductor and its 'Nationality' and 'Gender'
        """
        wikidata_nationality_map = cls.nationality_map()
        wikidata_sparql_resp = cls.query_conductor_nationalities(cleaned_df)
        sso_conductors = pd.DataFrame(sorted(set(cleaned_df['Conductor'])), columns=['Conductor'])
        wikidata_conductors = pd.DataFrame(cls.parse_conductor_nationalities(wikidata_nationality_map, wikidata_sparql_resp))

        # merge SSO conductor records with Wikidata conductor records. Fill missing values with 'Unknown'.
        merged_df = sso_conductors.merge(wikidata_conductors, how='left', left_on='Conductor', right_on='Conductor').fillna('Unknown')
        # manually impute correct values for the known 'Unknown's
        nationality_missing_dict = {
            'Australia': ['Benjamin Northey', 'Brett Kelly', 'Brett Weymark', 'Dane Lam', 'Fabian Russell', 'Guy Noble', 'Iain Grandage', 'Nicholas Buc', 'Nicholas Carter', 'Vanessa Scammell'],
            'Italy': ['Umberto Clerici'],
            'New Zealand': ['Hamish McKeich'],
            'Singapore': ['Joshua Tan'],
            'United Kingdom': ['Andrew Haveron', 'Finnegan Downie Dear', 'Roger Benedict'],
            'United States': ['Erik Ochsner', 'Marc Taddei']
        }
        gender_missing_dict = {
            'Female': ['Vanessa Scammell'],
            'Male': ['Andrew Haveron', 'Brett Kelly', 'Brett Weymark', 'Dane Lam', 'Erik Ochsner', 'Fabian Russell', 'Finnegan Downie Dear', 'Guy Noble', 'Hamish McKeich', 'Iain Grandage', 'Joshua Tan', 'Nicholas Buc', 'Nicholas Carter', 'Roger Benedict', 'Umberto Clerici']
        }
        # only values that are still 'Unknown' after the Wikidata merge are overwritten
        for country, conductors in nationality_missing_dict.items():
            for conductor in conductors:
                cls._fill_unknown(merged_df, 'Conductor', conductor, 'Nationality', country)
        for gender, conductors in gender_missing_dict.items():
            for conductor in conductors:
                cls._fill_unknown(merged_df, 'Conductor', conductor, 'Gender', gender)

        # write to CSV if export is set
        if export:
            out_file = 'sso_conductor_nationality_map.csv'
            merged_df.to_csv(path_or_buf=out_file, encoding='utf-8', index=False)
            print(f"Wrote conductor nationality map to: {out_file}")
        return merged_df

## Wikipedia and related utility methods

Methods for querying and cleaning composer metadata from Wikipedia.

In [None]:
def extract_wikinode_string(node: mwparser.nodes.Node) -> str:
    """Return the text carried by a single Wikicode node.

    :param node: Accepted Wikicode types: [Template|Text|Wikilink]
    :return: The extracted string; '---DELETE---' marks a separator after which the
        remaining text is meant to be discarded later on
    """
    # wikilinks: use the link target
    if isinstance(node, mwparser.nodes.wikilink.Wikilink):
        return str(node.title)

    # templates: use the first parameter, unless it is a column-layout template
    if isinstance(node, mwparser.nodes.template.Template):
        if node.params and not node.has('colwidth'):
            return str(node.params[0])
        # a spaced en-dash template becomes a deletion marker; any other template yields nothing
        return '---DELETE---' if node.name in ['spaced ndash', 'snd'] else ''

    # everything else is treated as plain text
    text = node.value
    if ' – ' not in text:
        return str(text)
    # in strings like ' (1900 – 1975) – some extraneous text', the spaced dashes become deletion markers
    has_trailing_text = re.match(r' [(](\w.+)([ ]?[–-][ ]?)(\w.+)[)]( [–] \w.+)', text)
    if has_trailing_text:
        return str(text.replace(' – ', '---DELETE---'))
    # a legitimate date like '(1900 – 1975)' with no trailing text just gets a plain hyphen
    return str(text.replace(' – ', '-'))

def parse_composer_wiki_xml(country_name: str, country_xml_content: mwparser.wikicode.Wikicode) -> List[str]:
    """Parse Wikipedia composer-by-nationality XML into a list of individual composer name items.

    Two layouts are handled: content that mentions 'columns-list' (composers are read from the
    parameters of the 'columns-list' templates), and everything else (composers are read from the
    top-level Template/Text/Wikilink nodes, one composer per line).

    :param country_name: Country name (not used in the function body)
    :param country_xml_content: Wiki XML blob for the specified country
    :return: List of raw composer strings, typically 'Name (dates)'; may still contain
        '---DELETE---' markers inserted by extract_wikinode_string()
    """
    # newline with optional surrounding space: the separator between two composer entries
    str_clean_pat = re.compile(r'[ ]?\n[ ]?')
    composers = []
    if ('columns-list' in country_xml_content):
        # concatenate the parameter lists of every template named 'columns-list'
        # NOTE(review): reduce() has no initial value, so this raises if no template matches by name
        template_filter_list = reduce(lambda x, y: x+y, [ t.params for t in filter(lambda r: r.name == 'columns-list', country_xml_content.filter_templates()) ])
        # skip empty parameters and those starting with 'List' or 'colwidth'
        for composer_list in filter(lambda p: p and not p.startswith(('List', 'colwidth')), template_filter_list):
            # normalise {{ndash}} to '-', strip the wiki markup, then split into one string per line
            composer_list_cleaned = re.sub(str_clean_pat, r'|', mwparser.parse(composer_list.replace('{{ndash}}', '-')).strip_code().strip()).split('|')
            composers.extend(composer_list_cleaned)
    else:
        # filter out extraneous header, tag, <ref></ref>, [[File:...]] and other content (albeit in an ugly way)
        # keeps only non-empty Template/Text/Wikilink nodes that are not a bare ', ' and contain none of the listed markers
        node_list = [ node for node in filter(lambda n: (n not in [', '] and n) and isinstance(n, (mwparser.nodes.template.Template, mwparser.nodes.text.Text, mwparser.nodes.wikilink.Wikilink)), country_xml_content.nodes) if not any(match in node for match in ['<ref', '[[File:', 'Talk:', 'INSERT A PERSON INTO A SECTION', 'List of', 'Chronological list of', 'Only add names here']) ]

        # extract text from each node and then rejoin everything into a long string
        composer_list_cleaned = mwparser.parse(''.join(list(map(extract_wikinode_string, node_list))))
        # replace newlines with '|' and then split on '|' to generate a list of composer strings
        # (this works because each unique composer string is separated by '\n')
        composer_list_cleaned = re.sub(str_clean_pat, r'|', composer_list_cleaned.strip_code().strip()).split('|')
        # filter out empty strings
        composer_list_cleaned = filter(lambda c: c, composer_list_cleaned)

        # fix edge case with hyphen in unexpected place: 'Georg Friedrich Haas ---DELETE--- (b. 1953) Composer of contemporary classical music'
        # (for this one entry the marker is turned back into a space so the dates after it are kept)
        composers.extend([ composer.replace(' ---DELETE--- ', ' ') if composer.startswith('Georg Friedrich Haas') else composer for composer in list(composer_list_cleaned) ])
    return composers

In [None]:
def reformat_composer_string(composer_string: str) -> Dict[str, str]:
    """Reformat composer strings into {Composer: Name, Timeline: YYYY-YYYY} format.

    :param composer_string: Most common input format: 'Aram Khachaturian (1903-1978)'.
        A BeautifulSoup Tag is also accepted, in which case its text is used.
    :return: Dictionary with the keys 'Composer' (name) and 'Timeline' (dates, or 'Unknown'
        if no dates could be separated from the name)

    Correctly formats and splits *most* but not all composer strings. Generates a small number of
    false negatives that can be manually fixed later.

    Warning: Here lies lots of ugly regex...
    """
    timeline = 'Unknown'

    # 1. do some initial string normalisation
    # Reduce a Tag to its text once, so that every later check works on plain text
    # (membership tests and str methods do not operate on a Tag's text).
    if isinstance(composer_string, element.Tag):
        raw_string = composer_string.text
    else:
        raw_string = composer_string
    composer_cleaned = unicodedata.normalize('NFC', raw_string.replace('\xa0', ' ').strip())
    # remove extra spaces
    composer_cleaned = re.sub(r'[ ]{2,}', r' ', composer_cleaned)

    # 2. attempt to split original composer string into 'Composer Name|1900-1975' format
    # edge case (with no parentheses): 'Composer Name 1900-1980'
    if '(' not in composer_cleaned:
        composer_cleaned = re.sub(r'^(\w.+) (\d.+)', r'\1|\2', composer_cleaned)
    # assume everything else follows 'Composer Name (1900-1980)' or 'Composer Name (1900-1980' format
    else:
        # convert '(the younger)' to 'the younger'
        composer_cleaned = re.sub(r'[(](the younger)[)]', r'\1', composer_cleaned)
        # filter out any text after a '---DELETE---' marker
        if '---DELETE---' in composer_cleaned:
            composer_cleaned = re.sub(r'(\w.+)(---DELETE---.*)', r'\1', composer_cleaned)
        # filter out any content between parentheses that doesn't contain numbers
        # e.g. we care about '(born 1900)' and '(1900-1970)', but not '(extraneous text)'
        # NOTE(review): the '\1' backreference inside the parentheses looks unintended and
        # probably prevents most matches - left unchanged, confirm before touching
        composer_cleaned = re.sub(r'(\w.+)?([ ]{1,})([(]([^\d]+)\1[)])', r'\1', composer_cleaned)

        if ')' not in raw_string:
            composer_cleaned = re.sub(r'^(\w.+)[ ]?([(])(\w.+|\W.+)\Z', r'\1|\3', composer_cleaned)
        # edge case: 'Ernő Dohnányi (1877–1960), <lots of extraneous text>'
        elif raw_string.startswith('Ernő Dohnányi (1877–1960)'):
            composer_cleaned = re.sub(r'^(\w.+) ([(])(\w.+)([)])', r'\1|\3', raw_string.split(',')[0])
        else:
            composer_cleaned = re.sub(r'^(\w.+)[ ]?([(])(\w.+|\W.+)([)])([ ,]|\w|\Z)([ ]?\w.+)?', r'\1|\3', composer_cleaned)

    # edge case: remove any labels like '[de]'
    composer_cleaned = re.sub(r'^(\w.+) (\[\w.+\])?([|]\w.+)\Z', r'\1\3', composer_cleaned).rstrip(')')
    # filter edge case: 'May Brahe|née Mary Dixon 1884–1956'
    composer_cleaned = re.sub(r'[|](née \w.+) (\d.+)', r'|\2', composer_cleaned)

    # 3. update composer and timeline variables
    name_and_dates = composer_cleaned.split('|')
    composer = name_and_dates[0]
    if len(name_and_dates) > 1:
        timeline = unidecode(name_and_dates[1])

    # 4. clean up some additional edge cases
    # Edge cases:
    # 1. convert e.g. 'William Cornysh the younger' to 'William Cornysh II'
    # 2. truncate e.g. 'Thomas Linley the elder' to 'Thomas Linley'
    # 3. truncate e.g. 'Leopold I, Holy Roman Emperor' to just 'Leopold I'
    # 4. truncate extraneous text after semi-colon in e.g. 'John Hanboys (14th century); may be J. de Alto Bosco'
    suffix_dict = {'holy roman emperor': '', 'the elder': '', 'the younger': 'II'}
    # the pattern is case-insensitive, so the matched text is lower-cased before the lookup
    composer = re.sub(r'(holy roman emperor)|(the (elder|younger))\Z', lambda suffix: suffix_dict.get(suffix.group().lower(), ''), composer.strip('()'), flags=re.IGNORECASE).strip(', ')
    composer = re.sub(r'(\w.+)([ ])?([;])( \w.+)', r'\1', composer)

    # 5. add separated composer names and birth/death year details to a dictionary
    # if year is missing, Timeline defaults to 'Unknown'
    return {'Composer': composer, 'Timeline': timeline}

In [None]:
def split_composer_name(composer: str) -> NamedTuple:
    """Break a full composer name into surname, given names and up to three initials.

    :param composer: Full composer name
    :return: ``SplitName`` namedtuple with the fields ``LastName``, ``GivenNames``,
        ``Initial1``, ``Initial2`` and ``Initial3`` (initials that do not apply are ``None``)
    """
    SplitName = namedtuple('SplitName', 'LastName GivenNames Initial1 Initial2 Initial3')

    # parse the name into first / middle / last / suffix parts
    parsed = HumanName(composer)
    # a single-word name ends up in 'first': treat it as the last name instead
    if not parsed.last:
        parsed.last = parsed.first
        parsed.first = None

    # only the first two middle names matter, since at most three initials are kept
    middles = parsed.middle_list[:2]

    first_initial = second_initial = third_initial = None
    if middles:
        first_initial = parsed.first[0]
        second_initial = middles[0][0]
        if len(middles) > 1:
            third_initial = middles[1][0]
    elif parsed.first:
        first_initial = parsed.first[0]
    else:
        # no given name at all: fall back to the initial of the last name
        first_initial = parsed.last[0]

    # any suffix (e.g. 'II') is appended to the last name
    if parsed.suffix:
        surname = f"{parsed.last} {parsed.suffix}".strip()
    else:
        surname = parsed.last.strip()
    given_names = f"{parsed.first} {parsed.middle}".strip()

    return SplitName(
        LastName=surname,
        GivenNames=given_names,
        Initial1=first_initial,
        Initial2=second_initial,
        Initial3=third_initial,
    )

In [None]:
## parse Wikipedia omnibus Composers nationality lists
def _build_composer_record(raw_composer: str, country_name: str) -> dict:
    """Turn one raw wiki composer string into a dict with Composer, Timeline, Nationality and Status."""
    record = reformat_composer_string(raw_composer)
    record['Nationality'] = country_name
    record['Status'] = return_animate_status(record['Composer'], record['Timeline'])
    return record


def parse_composer_nationalities(master_xml: str, subsection_xml: str, mastersections: bool = True, subsections: bool = False) -> List[dict]:
    """Returns list of dicts of composers, dates of birth and/or death, nationalities, and status (living/dead).

    :param master_xml: Wiki XML file with master list of composers by nationality
    :param subsection_xml: Wiki XML file with per-country list of composers by nationality. For countries with larger numbers of composers that aren't included on the master wiki.
    :param mastersections: Toggle parsing the master list (default = True; 1/0 are accepted as well)
    :param subsections: Toggle parsing the per-country list (default = False; 1/0 are accepted as well)
    :return: One dict per composer with the keys 'Composer', 'Timeline', 'Nationality' and 'Status'
    :raises KeyError: If a sub-wiki page title yields a nationality adjective that is not mapped below
    """
    # countries for which we want to process separate wiki sub-pages - map adjectives -> nouns
    country_dict = {'American': 'United States', 'Armenian': 'Armenia', 'Australian': 'Australia', 'Austrian': 'Austria', 'Canadian': 'Canada', 'Czech': 'Czech Republic', 'Estonian': 'Estonia', 'French': 'France', 'German': 'Germany', 'Icelandic': 'Iceland', 'Italian': 'Italy', 'English': 'United Kingdom'}
    subsection_countries = sorted(set(country_dict.values()))
    # further master-wiki countries that are skipped altogether
    skipped_master_countries = ['Iran', 'Ireland', 'Japan', 'Lithuania', 'Mongolia', 'Philippines', 'Slovakia', 'Slovenia', 'South Africa', 'Sri Lanka']

    # section headers to exclude
    exclude_sections = ['Contents', 'External links', 'Further reading', 'References', 'See also']
    # negative-lookahead pattern matching every heading except the excluded ones; it is passed
    # to get_sections() as a plain pattern string, so there is no need to eval() it
    exclude_pattern = r'^(?!(([ ]{1,})?(' + '|'.join(exclude_sections) + r')([ ]{1,})?)).*'

    # master list for combining parsed data from the master wiki and sub-wikis
    master_composer_list = []

    # 1. loop through all countries from the master wiki and extract composer details
    if mastersections:
        master_text = file_utils.ProcessWikiXML().load_xml(master_xml).find('text', attrs={'xml:space': 'preserve'}).text
        # strip the HTML comment markers so that commented-out wiki text is parsed too
        master_wiki_sections = mwparser.parse(re.sub(r'<!--|-->', '', master_text))
        for master_country in master_wiki_sections.get_sections(matches=exclude_pattern):
            master_country_name = master_country.filter_headings()[0].title.strip()
            # skip countries with wiki sub-pages that we want to process separately:
            # we exclude e.g. the 'United States' section of the master wiki in favour of the
            # more comprehensive wiki sub-page
            if master_country_name in subsection_countries or master_country_name in skipped_master_countries:
                continue
            # construct dict for each composer and add to master composers list
            for raw_composer in parse_composer_wiki_xml(master_country_name, master_country):
                master_composer_list.append(_build_composer_record(raw_composer, master_country_name))

    # 2. loop through all countries from the subsection country wikis and extract composer details
    # 'subsection country' = country title is on the master wiki, but its actual full content is on a separate wiki
    if subsections:
        sub_wiki_soup = file_utils.ProcessWikiXML().load_xml(subsection_xml)
        page_titles = sub_wiki_soup.find_all('title')
        page_texts = sub_wiki_soup.find_all('text', attrs={'xml:space': 'preserve'})
        # titles and texts are paired positionally: (page title, page content)
        for title, blob in zip(page_titles, page_texts):
            # reduce e.g. 'List of French composers' to 'French', then map it to the country name
            adjective = re.sub(r'((\w+ )?List of )(\w+) ((\w+ )?composers)', r'\3', title.text, flags=re.IGNORECASE)
            sub_country_name = country_dict[adjective]
            sub_wiki_text = mwparser.parse(re.sub(r'<!--|-->', '', blob.text))
            for section in sub_wiki_text.get_sections(matches=exclude_pattern):
                # construct dict for each composer and add to master_composer_list
                for raw_composer in parse_composer_wiki_xml(sub_country_name, section):
                    master_composer_list.append(_build_composer_record(raw_composer, sub_country_name))

    # 3. return the aggregated list of composer details
    return master_composer_list

In [None]:
def wiki_remove_nationality_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """Remove duplicates and manually reconcile conflicting entries from parsed Wikipedia 'List of composers by nationality'.

    The input frame is not modified; a cleaned copy is returned.

    :param df: Master DataFrame of parsed Wiki XML data (uses the 'Composer', 'Timeline', 'Nationality' and 'Status' columns)
    :return: De-duplicated DataFrame, sorted by Composer/Timeline/Nationality, with a fresh 0..n-1 index
    """
    fixed_df = df.copy()

    # 1a. Sanitise composer strings with pattern 'Composer Name (extraneous text'
    # (a trailing, digit-free parenthesised remainder is cut off; other names are left untouched)
    trailing_text = re.compile(r' [(]([^\d]+)\Z')
    fixed_df['Composer'] = fixed_df['Composer'].apply(
        lambda name: trailing_text.sub('', name).strip() if trailing_text.search(name) else name
    )

    # 1b. Try a second-pass reformat of composer strings that still contain a parenthesis
    # also pre-sanitises edge case string ':fr:Jacques de la Presle (1888–1969'
    has_parenthesis = fixed_df['Composer'].str.contains(r'\(')
    for idx, name in fixed_df.loc[has_parenthesis, 'Composer'].items():
        reformatted = reformat_composer_string(re.sub(r'^[:]\w{1,2}[:]', r'', name))
        fixed_df.loc[idx, 'Composer'] = reformatted['Composer']
        fixed_df.loc[idx, 'Timeline'] = reformatted['Timeline']
        fixed_df.loc[idx, 'Status'] = return_animate_status(reformatted['Composer'], reformatted['Timeline'])

    # 1c. Drop full duplicates (having same Composer/Timeline/Nationality)
    fixed_df = fixed_df.drop_duplicates().sort_values(by=['Composer', 'Timeline', 'Nationality']).reset_index(drop=True)

    # 2. manually reconcile conflicting nationalities (same Composer/Timeline, but multiple Nationalities)
    composers_to_update = {
        'Argentina': ['Analia Llugdar'],
        'Armenia': ['Aram Khachaturian', 'Karen Khachaturian'],
        'Austria': ['Carl Czerny', 'Joseph Haydn', 'Johann Nepomuk Hummel', 'Gustav Mahler', 'Leopold Mozart', 'Wolfgang Amadeus Mozart', 'Arnold Schoenberg', 'Franz Schreker', 'Franz Schubert'],
        'Australia': ['Julian Cochran'],
        'Belgium': ['Jean-Baptiste Accolay'],
        'Bulgaria': ['Alexandra Fol'],
        'Canada': ['León Zuckert'],
        'Colombia': ['Kike Santander'],
        'Croatia': ['Elena Pucić-Sorkočević'],
        'Czech Republic': ['Franz Benda', 'Florian Leopold Gassmann', 'Karel Husa', 'Johann Pehel', 'Johann Baptist Wanhal', 'Jaromír Weinberger'],
        'Estonia': ['Boris Parsadanian', 'Helen Tobias-Duesberg'],
        'France': ['Arthur Honegger', 'Jean-Baptiste Lully', 'Jacques Offenbach', 'Alexandre Tansman'],
        'Germany': ['Christoph Willibald Gluck', 'George Frideric Handel', 'Johann Christoph Pepusch', 'Franz Xaver Richter', 'Carl Stamitz'],
        'Hungary': ['Franz Liszt'],
        'Netherlands': ['Fred Momotenko'],
        'Poland': ['Krzysztof Penderecki'],
        'Puerto Rico': ['Roberto Sierra'],
        'Romania': ['Mihail Jora'],
        'Russia': ['Alla Pavlova'],
        'United Kingdom': ['Samuel Coleridge-Taylor', 'Anthony de Countie', 'Eugene Goossens'],
        'United States': ['Leonardo Balada', 'Henry Brant', 'Douglas Knehans', 'Ernst Krenek', 'Gian Carlo Menotti', 'Conlon Nancarrow'],
    }
    for country, composers in composers_to_update.items():
        fixed_df.loc[fixed_df['Composer'].isin(composers), 'Nationality'] = country

    # 3. drop ad-hoc duplicates: the 'Unknown'-timeline record of these composers is discarded
    # (explicit copy so that the in-place fixes below write to an independent frame)
    is_adhoc_duplicate = fixed_df['Composer'].isin(['Ruby Claudia Davy', 'May Howlett']) & (fixed_df['Timeline'] == 'Unknown')
    fixed_df = fixed_df.loc[~is_adhoc_duplicate].copy()

    # 4. also fix erroneous timeline data for misc duplicate records
    timeline_fixes = {
        'Mona McBurney': '1862-1932',
        'Johann Christoph Pepusch': '1667-1752',
        'Giacomo Puccini': '1858-1924',
        'Carl Stamitz': '1745-1801',
        'May Summerbelle': '1867-1947',
    }
    for composer, timeline in timeline_fixes.items():
        fixed_df.loc[fixed_df['Composer'] == composer, 'Timeline'] = timeline

    # records made identical by steps 2 and 4 collapse into one
    return fixed_df.drop_duplicates().reset_index(drop=True)

def wiki_clean_nationality_data(df: pd.DataFrame) -> pd.DataFrame:
    """Apply ad-hoc manual fixes for nationality data edge cases that regex couldn't handle.

    :param df: De-duplicated master DataFrame of parsed Wiki XML data
    :return: Cleaned copy, de-duplicated and sorted by Composer/Timeline, with a fresh 0..n-1 index
    """
    ## drop bad record (composer name '3'); the copy leaves the caller's frame untouched
    cleaned_df = df.loc[df['Composer'] != '3'].copy()

    ## standardise Russian surname transliteration
    is_rachmaninoff = cleaned_df['Composer'] == 'Sergei Rachmaninoff'
    cleaned_df.loc[is_rachmaninoff, 'Composer'] = 'Sergei Rachmaninov'

    ## edge cases that regex couldn't handle: any composer string containing one of these
    ## names is collapsed to just that name, applied in the listed order
    canonical_names = (
        'Liu Shueh-Shuan',      # record with Chinese characters
        'Jatin-Lalit',          # India edge cases (mostly ensembles)
        'Laxmikant–Pyarelal',
        'Nadeem-Shravan',
        'Shankar–Ehsaan–Loy',
        'Wayan Yudane',         # New Zealand edge case
    )
    for canonical_name in canonical_names:
        contains_name = cleaned_df['Composer'].str.contains(canonical_name)
        cleaned_df.loc[contains_name, 'Composer'] = canonical_name

    deduplicated = cleaned_df.drop_duplicates()
    return deduplicated.sort_values(by=['Composer', 'Timeline']).reset_index(drop=True)

## Merge SSO data with generated composer and conductor metadata

In [None]:
def merge_conductor_composer_metadata(cleaned_df: pd.DataFrame, conductor_map_file: str, composer_map_file: str) -> pd.DataFrame:
    """Merge the cleaned SSO master data set with pre-aggregated composer and conductor metadata.

    The result has one row per concert/piece combination with the date of its first
    performance and the number of performances. The input frame is not modified.

    :param cleaned_df: DataFrame with cleaned SSO data
    :param conductor_map_file: Name of CSV with SSO conductor nationality/gender mappings
    :param composer_map_file: Name of CSV with SSO composer nationality/status/gender mappings
    :return: Merged DataFrame with the columns listed in ``ordered_cols`` below
    """
    # only needed here, to parse the stringified Artist_Metadata back into Python objects
    import ast

    def first_performance_date(dates: pd.Series) -> str:
        # dates are 'YYYY-MM-DD HH:MM' strings, so the lexicographic minimum is the earliest one
        return datetime.strptime(dates.min(), '%Y-%m-%d %H:%M').strftime('%Y-%m-%d')

    # work with a modified copy of the original DataFrame
    sso_df_copy = cleaned_df.copy()
    # temporarily convert Artist_Metadata into a string representation so that it can be added to a multiindex
    sso_df_copy['Artist_Metadata'] = sso_df_copy['Artist_Metadata'].apply(str)

    # load previously generated conductor and composer metadata maps
    conductor_nationality_map = file_utils.ProcessCSV().load_csv(conductor_map_file)
    composer_nationality_map = file_utils.ProcessCSV().load_csv(composer_map_file)

    # add conductor metadata
    # (a direct left merge keeps rows matched by key, independent of the input frame's index)
    conductor_columns = {'Nationality': 'ConductorNationality', 'Gender': 'ConductorGender'}
    sso_df_copy = sso_df_copy.merge(conductor_nationality_map.rename(columns=conductor_columns), how='left', on='Conductor')

    # explode Piece and Composer lists into separate rows (both columns are exploded in lockstep)
    concert_cols = ['Concert', 'Key', 'Date', 'Conductor', 'ConductorNationality', 'ConductorGender', 'Artist_Metadata']
    sso_df_copy = sso_df_copy.set_index(concert_cols)[['Piece', 'Composer']].apply(pd.Series.explode).reset_index()

    # add composer metadata
    composer_columns = {'Nationality': 'ComposerNationality', 'Timeline': 'ComposerTimeline', 'Status': 'ComposerStatus', 'Gender': 'ComposerGender'}
    sso_df_copy = sso_df_copy.merge(composer_nationality_map.rename(columns=composer_columns), how='left', on='Composer')

    # generate aggregate counts for each concert and piece: first date of performance, # times performed
    # NOTE(review): groupby drops rows whose key columns contain NaN by default, so pieces whose
    # composer or conductor is missing from the map files do not appear in the output - confirm intended
    group_cols = ['Key', 'Concert', 'Piece', 'Composer', 'ComposerNationality', 'ComposerTimeline', 'ComposerStatus', 'ComposerGender', 'Conductor', 'ConductorNationality', 'ConductorGender', 'Artist_Metadata']
    sso_df_copy = (
        sso_df_copy.groupby(group_cols)['Date']
        .agg(**{'First Date': first_performance_date, '# Performances': 'count'})
        .reset_index()
        .sort_values(by=['First Date', 'Key'])
    )

    # re-arrange the columns
    ordered_cols = ['Key', 'Concert', 'First Date', '# Performances', 'Piece', 'Composer', 'ComposerNationality', 'ComposerTimeline', 'ComposerStatus', 'ComposerGender', 'Conductor', 'ConductorNationality', 'ConductorGender', 'Artist_Metadata']
    sso_df_copy = sso_df_copy[ordered_cols].copy()

    # convert Artist_Metadata back into a list of tuples
    # (literal_eval only accepts Python literals, unlike eval which would execute arbitrary code)
    sso_df_copy['Artist_Metadata'] = sso_df_copy['Artist_Metadata'].apply(ast.literal_eval)

    return sso_df_copy

In [None]:
def main():
    """Load the cleaned SSO data, merge in conductor/composer metadata and write the result to Pickle and CSV.

    All input and output files live in the 'data' directory. The triple-quoted section in the
    middle is deliberately disabled code: run it only if the nationality map CSVs still need
    to be generated.
    """
    # check whether to use back- or forward-slash path separators, depending on platform (Windows or Unix-based)
    if sys.platform in ['cygwin', 'win32']:
        path_separator = '\\'
    else:
        path_separator = '/'

    def data_path(filename: str) -> str:
        # location of a file inside the 'data' directory
        return f"data{path_separator}{filename}"

    SSO_PICKLE_FILE = "sso_2018_2021_cleaned.pkl"
    WIKIPEDIA_MASTER_NATL_FILE = "Wikipedia-Composers_by_nationality.xml"
    WIKIPEDIA_SUB_NATL_FILE = "Wikipedia-Composers_by_sub_nationality.xml"

    COMPOSER_NATIONALITY_MAP_FILE = "sso_composer_nationality_map.csv"
    CONDUCTOR_NATIONALITY_MAP_FILE = "sso_conductor_nationality_map.csv"
    SSO_OUTFILE_PREFIX = "sso_2018_2021_cleaned_merged"

    # 1. load SSO pickle file into a DataFrame
    sso_cleaned_df = file_utils.ProcessPickle().load_pickle(data_path(SSO_PICKLE_FILE))
    print(f"{SSO_PICKLE_FILE} dimensions: {sso_cleaned_df.shape}")

    """
    ### (BEGIN) EXECUTE SECTION ONLY IF COMPOSER/CONDUCTOR NATIONALITY MAP FILES STILL NEED TO BE GENERATED (BEGIN) ###
    # 2a. (if needed) import and merge Wikipedia data
    wikipedia_df = pd.DataFrame(parse_composer_nationalities(WIKIPEDIA_MASTER_NATL_FILE, WIKIPEDIA_SUB_NATL_FILE, 1, 1))
    wikipedia_df = wiki_remove_nationality_duplicates(wikipedia_df)
    wikipedia_df = wiki_clean_nationality_data(wikipedia_df)

    # 2b. (optional) uncomment the following line to write wikipedia_df to a Pickle file
    #wikipedia_df.to_pickle(f"data{path_separator}wikipedia-composers_by_nationality.pkl")
    #print(f"Wrote Pickle file: data{path_separator}wikipedia-composers_by_nationality.pkl")

    ## If COMPOSER_NATIONALITY_MAP_FILE does not already exist, uncomment the 2nd line to generate it
    # 3. (if needed) generate composer nationality map CSV
    composer_nationality_map = WikidataUtils.merge_wikidata_and_sso_composers(sso_cleaned_df, wikipedia_df, export=1)

    ## If CONDUCTOR_NATIONALITY_MAP_FILE does not already exist, uncomment the 2nd line to generate it
    # 4. (if needed) generate conductor nationality map CSV
    conductor_nationality_map = WikidataUtils.merge_wikidata_and_sso_conductors(sso_cleaned_df, export=1)
    ### (END) EXECUTE SECTION ONLY IF COMPOSER/CONDUCTOR NATIONALITY MAP FILES STILL NEED TO BE GENERATED (END) ###
    """

    # 5. join cleaned SSO data and conductor and composer metadata
    sso_cleaned_merged_df = merge_conductor_composer_metadata(sso_cleaned_df, data_path(CONDUCTOR_NATIONALITY_MAP_FILE), data_path(COMPOSER_NATIONALITY_MAP_FILE))
    print(f"\nsso_cleaned_merged dimensions: {sso_cleaned_merged_df.shape}\n")
    print(sso_cleaned_merged_df.head(5))

    # 6. write merged data to Pickle and CSV
    pickle_outfile = data_path(f"{SSO_OUTFILE_PREFIX}.pkl")
    sso_cleaned_merged_df.to_pickle(pickle_outfile)
    print(f"\nWrote Pickle file: {pickle_outfile}")
    csv_outfile = data_path(f"{SSO_OUTFILE_PREFIX}.csv")
    sso_cleaned_merged_df.to_csv(path_or_buf=csv_outfile, index=False)
    print(f"Wrote CSV file: {csv_outfile}")

In [None]:
# Entry point: run the merge pipeline only when this cell/module is executed directly
# (i.e. not when it is imported from elsewhere).
if __name__ == '__main__':
    main()