In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from IPython.display import display, Markdown

import pandas as pd
import numpy as np

import json

np.__version__

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


'1.26.3'

# 0. Import & prep data

In [2]:
PERFORMER_SPLIT_FILE = "./data/22.4-OUTPUT-performer_list_str-with-multi-name-artists-merged.csv"

_all_performers_df = pd.read_csv(PERFORMER_SPLIT_FILE)
_all_performers_df

GENDER_LOOKUP_TABLE_PATH = './data/22.5-OUTPUT-1-gender-lookup-table.csv'
existing_gender_table = pd.read_csv(GENDER_LOOKUP_TABLE_PATH)
existing_gender_table

# get df of any performers in _all_performers_df that are not in existing_gender_table
new_performers_df = _all_performers_df[~_all_performers_df['performer'].isin(existing_gender_table['performer'])]
new_performers_df

raw_performers_df = new_performers_df.copy()
raw_performers_df

Unnamed: 0,performer,performers_list_str
13,Bill Medley & Jennifer Warnes,Bill Medley|Jennifer Warnes|
29,Daryl Hall John Oates,Daryl Hall|John Oates|
35,Dawn Featuring Tony Orlando,Dawn|Tony Orlando|
45,Elvis Presley With The Jordanaires,Elvis Presley|The Jordanaires|
46,Erykah Badu Featuring Common,Erykah Badu|Common|
...,...,...
2558,Tom Clay,Tom Clay|
2559,Bill Black's Combo,Bill Black's Combo|
2560,CJ,CJ|
2561,The Kid LAROI & Miley Cyrus,The Kid LAROI|Miley Cyrus|


In [3]:
performers_list = np.unique(
    np.concatenate(raw_performers_df['performers_list_str'].str.split("|").apply(lambda x: [i for i in x if i]).values)
)
len(performers_list)

904

In [4]:
# But, everything should END with "|" (this was a bug I caught)
assert(len(raw_performers_df[~raw_performers_df.performers_list_str.str.contains("|", regex=False)]) == 0)

# Setup GPT4 call

In [5]:
import os
from openai import OpenAI

# First, activate the './.env' file which sets the OPENAI_API_KEY environment variable using python-dotenv
from dotenv import load_dotenv
load_dotenv()

client = OpenAI(
    # This is the default and can be omitted
    api_key=os.getenv("OPENAI_API_KEY"),
)

In [6]:
LOVE_SONG_LABELING_PROMPT = """
You are a pop music expert, familiar with the songs and their performers featured in the US Billboard Top 10 hits.

You are also an expert in the gender identities of these performers, including those who have updated their genders, eg Sam Smith who is now non-binary.

Your task is, given a performer, label them with:
- "individual" or "group"
- their most recent gender identity, or, in the case of a group, the gender identity mix of the the group: "male", "female", "non-binary", "mixed gender group"
- If a performer is unclear or simply appears to be a mistake, label them as "unknown"

Here's an example input:

```json
[
	"Sam Smith",
    "Demi Lovato",
    "Adele",
    "SZA",
    "John Legend",
    "The Beatles",
    "Black Eyed Peas",
    "The Carpenters",
    "The Supremes",
    "Destiny's Child",
    "The Rolling Stones",
    "One Direction",
    "His Twangy Guitar",
    "asdf ghjkl"
]
```

Here's the sample corresponding output:

```json
{"output": [
	["Sam Smith", "individual", "non-binary"],
    ["Demi Lovato", "individual", "non-binary"],
    ["Adele", "individual", "female"],
    ["SZA", "individual", "female"],
    ["John Legend", "individual", "male"],
    ["The Beatles", "group", "male"],
    ["Black Eyed Peas", "group", "mixed gender group"],
    ["The Carpenters", "group", "mixed gender group"]
    ["The Supremes", "group", "female"],
    ["Destiny's Child", "group", "female"],
    ["The Rolling Stones", "group", "male"],
    ["One Direction", "group", "male"],
    ["His Twangy Guitar", "unknown", "unknown"],
    ["asdf ghjkl", "unknown", "unknown"]
]}
```

Ok, now generate the output for this input:
"""

In [7]:
import ast

def eval_this_str(s):
    return ast.literal_eval(s)

In [8]:
def get_love_song_labels_for_rows(row_subset_str):
    try: 
        completion = client.chat.completions.create(
          model="gpt-4o",
          response_format={'type': "json_object"},
          messages=[
            {"role": "user", "content": LOVE_SONG_LABELING_PROMPT},
            {"role": "user", "content": row_subset_str},
          ]
        )
        content = completion.choices[0].message.content
        # return content
        response_content = eval_this_str(content)['output']
        # print(f"Response contains {completion['usage']['completion_tokens']} tokens out of 4096 maximum allowed tokens.")
        return response_content
    except Exception as e:
        print('ERROR FETCHING', e)
        return []

### Setup API call batching code

In [9]:
def fetch_all_data_in_batches(performers_list, increment=50):
    gpt_labeled_result = []
    for i in range(101):
        subset = performers_list[i*increment : (i+1)*increment]
        if len(subset):
            print([i*increment, (i+1)*increment])
            print(subset)
            subset_as_json_string = json.dumps(subset.tolist())
            print(subset_as_json_string, '\n')
            # fetched_and_formatted_res = [['David Mora', 'individual', 'male']]
            fetched_and_formatted_res = get_love_song_labels_for_rows(subset_as_json_string)
            print(fetched_and_formatted_res)
            gpt_labeled_result = [*gpt_labeled_result, *fetched_and_formatted_res]
    return gpt_labeled_result

def get_file_path(postfix):
    return f'./data/22.5-TEMP-gpt4-label-gender-{postfix}.csv'

def save_to_temp_csv(gpt_labeled_result, run_num):
    first_label_pass_df = pd.DataFrame(
        gpt_labeled_result, 
        columns=['performer', 'type', 'gender']
    )
    first_label_pass_df.to_csv(get_file_path(run_num), index=False)

# (run a test call to be sure things look ok)
For 100 rows at once:
For GPT4 turbo: 3min, about 3400 tokens out of 4096 maximum allowed tokens 

In [10]:
TEST_POSTFIX = 'JUST A TEST, DELETE ME'
# save_to_temp_csv(
#     fetch_all_data_in_batches(
#         # just get first 4:
#         performers_list[900:950]
#     ), 
#     TEST_POSTFIX
# )

In [11]:
pd.read_csv(get_file_path(TEST_POSTFIX))

Unnamed: 0,performer,type,gender
0,the Blackhearts,group,mixed gender group
1,the Curls,unknown,unknown
2,the Delons,unknown,unknown
3,will.i.am,individual,male


# Execute calls (in batched batches in case of failure)

In [12]:
len(performers_list)

904

In [13]:
DATA_FILE_KEY_1 = 'thru-sep-2023-update'
# save_to_temp_csv(
#     fetch_all_data_in_batches(
#         performers_list[0:1000]
#     ), 
#     DATA_FILE_KEY_1
# )

In [14]:
# save_to_temp_csv(
#     fetch_all_data_in_batches(
#         performers_list[900:950] # an error apparently happened
#     ), 
#     2
# )

In [15]:
# save_to_temp_csv(
#     fetch_all_data_in_batches(
#         performers_list[2000:3000]
#     ), 
#     3
# )

# Merge the fetched data with the existing data. !NOTE! for convenience, I'm doing a no-no and renaming `gender_df` and `performers_list` to be the full dataset going forward. Honestly this should have been 2 separate notebooks.

In [16]:
new_gender_df = pd.concat([
    pd.read_csv(get_file_path(DATA_FILE_KEY_1)),
    pd.read_csv(get_file_path(2)),
    # pd.read_csv(get_file_path(3))
])


# make a df of any in performers_list NOT in gender_df

# Ok, merge with existing, removing any duplicates
gender_df = pd.concat([
    existing_gender_table,
    new_gender_df
])

assert(len(existing_gender_table) == len(gender_df) - len(new_gender_df))

# Now let's get a new 
raw_performers_df = _all_performers_df.copy()
performers_list = np.unique(
    np.concatenate(raw_performers_df['performers_list_str'].str.split("|").apply(lambda x: [i for i in x if i]).values)
)
len(performers_list)

2377

## Do some sanity checks & identify issues

In [17]:
gender_df.type.value_counts()


type
individual    2013
group         1117
unknown         52
Name: count, dtype: int64

In [18]:
gender_df.gender.value_counts()

gender
male                  2168
female                 697
mixed gender group     254
unknown                 54
non-binary               9
Name: count, dtype: int64

### Here is where we first caught the mistakes, which we fix below 

(why am I using "we"? It's just me, David. Hi.)

In [19]:
combnined = set([*performers_list, *gender_df['performer']])
# Print any performers that are not in both performers_list AND gender_df['performer']
culprit = None
for p in combnined:
    if p not in performers_list:
        culprit = p
        print(f"Performer '{p}' not in performers_list")

for performer in performers_list:
    if performer not in combnined:
        print(f"Performer '{performer}' not in combined")

gender_df[
    gender_df.performer.str.contains(culprit or '')
]

print('What IS in performers_list:', [performer for performer in performers_list if performer.startswith('Henry Mancini')])


# NA: write out WHAT the mistake is. Figure out how to fix it. (in this notebook of the last?)
# The mistakes is that I failed to TRIM the whitespace from the performer names, in the last notebook.
# 

Performer 'Angie Martinez' not in performers_list
Performer 'I-20' not in performers_list
Performer '? and the Mysterians' not in performers_list
Performer 'Diana Gordon' not in performers_list
Performer 'Randy Vanwarmer' not in performers_list
Performer 'James & Bobby Purify' not in performers_list
Performer 'Travis & Bob' not in performers_list
Performer 'Tha Chill' not in performers_list
Performer 'Jimmy Harnen with Synch' not in performers_list
Performer 'Derek and the Dominos' not in performers_list
Performer 'The Curls' not in performers_list
Performer 'Dale & Grace' not in performers_list
Performer 'Cornelius Brothers & Sister Rose' not in performers_list
Performer 'Disco-Tex and the Sex-O-Lettes' not in performers_list
Performer 'The Delons' not in performers_list
Performer 'Trina & Tamara' not in performers_list
Performer 'Freedom Williams' not in performers_list
Performer 'Dan + Shay' not in performers_list
Performer 'Paul & Paula' not in performers_list
What IS in performers

### We detected an underlying data mistake: several performers end in some variation of `" And "`

Note, however: these were all, helpfully, marketed as `unknown`. So we can address them within that larger type.

In [20]:
# The issue: seems GPT strugled with chopping of the ending "and ", thankfully pretty easy to identify.
[performer for performer in performers_list if performer.endswith(' ')]

[]

In [21]:
gender_df[
    gender_df.performer.str.endswith(' And ')
    | gender_df.performer.str.endswith(' and ')
    | gender_df.performer.str.endswith(' & ')
    | gender_df.performer.str.endswith(' And')
]

Unnamed: 0,performer,type,gender


# Make corrections

## Apply `unkown` fixes

### A. Export the `unknown`s, hand fix them in a spreadsheet, correct

Types of fixes provided by gsheet:
- It *is* a real performer. ACTION: Add the info, correct *just in the gender lookup table*
- Incorrectly *parsed* name. ACTION: fix the name, correct both *gender lookup and name split tables*
- The name shouldn't be in the dataset at all. ACTION: drop from BOTH *gender lookup and name split tables*


In [22]:
pd.set_option('display.max_rows', 30)
gender_df[
    gender_df.type == 'unknown'
]#.to_csv('./data/22.5-TEMP-unknown-gender-to-hand-correct.csv', index=False)

Unnamed: 0,performer,type,gender
0,"""The Sting""",unknown,unknown
2,100 Proof,unknown,unknown
14,?,unknown,unknown
29,Alice Bowie,unknown,unknown
36,Angie Mar,unknown,unknown
...,...,...,...
815,The Triumphs,unknown,unknown
816,The Twilights,unknown,unknown
821,The Wild Pair,unknown,unknown
823,Tijuana Brass,unknown,unknown


# PULL IN GOOGLE HAND FIXES

In [23]:
# Fetch the correction gSheet
G_SHEET_URL = "https://docs.google.com/spreadsheets/d/1xxE-vvzDUQTgYeigHMK_4lUznXAzDgNlAdY6JMOIOcc/export?format=csv&gid=1356757517"
gsheet_unknown_corrections_df = pd.read_csv(G_SHEET_URL)
gsheet_unknown_corrections_df

gsheet_unknown_corrections_df[gsheet_unknown_corrections_df.performer.str.contains('Henry Mancini')]

Unnamed: 0,performer,type,gender,correct_name,drop_it
28,Henry Mancini And His Orchestra,group,mixed gender group,Henry Mancini And His Orchestra,


In [24]:
# Fetch the incorrect split correction gSheet
INCORRECT_SPLIT_G_SHEET_URL = "https://docs.google.com/spreadsheets/d/1xxE-vvzDUQTgYeigHMK_4lUznXAzDgNlAdY6JMOIOcc/export?format=csv&gid=659955835"
gsheet_incorrect_split_corrections_df = pd.read_csv(INCORRECT_SPLIT_G_SHEET_URL)
gsheet_incorrect_split_corrections_df

Unnamed: 0,split_part_1,split_part_2,replace_with_this,type,gender
0,?,The Mysterians,? and the Mysterians,group,male
1,Travis,Bob,Travis & Bob,group,male
2,James,Bobby Purify,James & Bobby Purify,group,male
3,Dale,Grace,Dale & Grace,group,mixed gender group
4,Dan,Shay,Dan + Shay,group,male
5,Derek,The Dominos,Derek and the Dominos,group,male
6,Disco Tex,The Sex-O-Lettes,Disco-Tex and the Sex-O-Lettes,group,mixed gender group
7,Paul,Paula,Paul & Paula,group,mixed gender group
8,Cornelius Brothers,Sister Rose,Cornelius Brothers & Sister Rose,group,mixed gender group
9,Jimmy Harnen,Synch,Jimmy Harnen with Synch,group,male


### B. Finicky fixes listed below by removing the SPLIT names, replacing them with the FULL name

*Groups that were erroniously split* (these have been marked as `drop_it` in the gsheet)
- "? and the Mysterians" should be the entry, not a split out "?" and "The Mysterians"
- "Travis & Bob" is right, not "Travis" and "Bob"
- "James & Bobby Purify" is a group (don't split)
- "Dale and Grace" eg on I'm Leaving it Up to You
- "Dan + Shay" not "Dan" "Shay"
- "Derek and the Dominos" not "Derek" "The Dominos"
- "Disco-Tex and the Sex-O-Lettes"
- "Paul & Paula"
- "Cornelius Brothers & Sister Rose"
- "Jimmy Harnen with Synch"
- "Trina & Tamara"

# How to approach this bug:

## Break it down
1. Chose one NAME to track a time
2. Choose one CORRECTION DATASET to track at a time
3. Chose one FIXED dataset to track at a time


OPTION: "Soko", who was corrected in gsheet_unknown_corrections_df

## Leads on the potential cause:
- Only names from `gsheet_unknown_corrections_df` seem to be causing issues (so check that first)

In [25]:
# Create copies
fixed_gender_df = gender_df.copy()
fixed_raw_performers_df = raw_performers_df.copy()



# PART 1: apply hand fixes to incorrect splits

# The algorithm to update:
# 1. For each `row` in `gsheet_incorrect_split_corrections_df`:
# 2. For both `fixed_raw_performers_df` and `fixed_gender_df`, for every `performer` column that contains `row.split_part_1` or `row.split_part_2`
# a. Remove both `row.split_part_1` and/or `row.split_part_2`
# b. Append the `row.replace_with_this` (which is the corrected, unsplit version)
# c. If relevant, update with the `row.type` and `row.gender`

def add_delim(str):
    return f"{str}|"

def update_row_if_needed(performers_list_str, row):
    if add_delim(row['split_part_1']) in performers_list_str and add_delim(row['split_part_2']) in performers_list_str:
        updated = add_delim(row['replace_with_this']) + performers_list_str.replace(add_delim(row['split_part_1']), '').replace(add_delim(row['split_part_2']), '')
        print(f"{performers_list_str} => {updated}")
        return updated
    return performers_list_str

for index, row in gsheet_incorrect_split_corrections_df.iterrows():
    # Drop rows from `fixed_raw_performers_df` that contain `row.split_part_1` or `row.split_part_2`
    fixed_gender_df = fixed_gender_df[
        ~fixed_gender_df['performer'].str.contains(add_delim(row['split_part_1']), regex=False) &
        ~fixed_gender_df['performer'].str.contains(add_delim(row['split_part_2']), regex=False)
    ]
    # Append the corrected unsplit version
    fixed_gender_df = pd.concat([
        fixed_gender_df,
        pd.DataFrame({
            'performer': [row['replace_with_this']],
            'type': [row['type']],
            'gender': [row['gender']]
        })
    ], ignore_index=True)

    # Now handle the performers in `fixed_raw_performers_df`
    fixed_raw_performers_df['performers_list_str'] = fixed_raw_performers_df['performers_list_str'].apply(
        lambda performers_list_str: update_row_if_needed(performers_list_str, row)
    )



# PART 2: apply hand fixes to unkNowns
for index, row in gsheet_unknown_corrections_df.iterrows():
    performer = row['performer']
    # Clear, we'll leave it out or add it in later based on the corrections
    fixed_gender_df = fixed_gender_df[fixed_gender_df.performer != performer]
    # DROP MISTAKES
    if row['drop_it'] == 'y':
        # Go through each row in the fixed_raw_performers_df, removing the string f'{performer}|' from performers_list_str, if it exists
        fixed_raw_performers_df['performers_list_str'] = fixed_raw_performers_df['performers_list_str'].str.replace(f'{performer}|', '')
    elif row['type'] != 'unknown':
        if row['performer'] == 'Reg Owen & ':
            print('DEBUG')
        corrected_name = row['correct_name']
        performer_name = corrected_name if not pd.isnull(corrected_name) else performer
        fixed_gender_df = pd.concat([
            fixed_gender_df,
            pd.DataFrame(
                {
                    'performer': [performer_name],
                    'type': [row['type']],
                    'gender': [row['gender']]
                }
            )
        ])
        fixed_raw_performers_df['performers_list_str'] = fixed_raw_performers_df['performers_list_str'].str.replace(f'{performer}|', f'{performer_name}|')
    else:
        # Anything remaining should have been handled in PART 1
        if add_delim(row['performer']) in fixed_raw_performers_df['performers_list_str'].values:
            print('ERROR: unhandled')
        if add_delim(row['performer']) in fixed_gender_df['performer'].values:
            print('ERROR: unhandled')

# There are some gender labels which were incorrect when fed to chatGPT, so we can drop them after corrections
fixed_gender_df = fixed_gender_df[
    fixed_gender_df.type != 'unknown'
]

assert(len(fixed_gender_df) != len(gender_df))
assert(len(fixed_raw_performers_df) == len(raw_performers_df))


?|The Mysterians| => ? and the Mysterians|
Travis|Bob| => Travis & Bob|
James|Bobby Purify| => James & Bobby Purify|
Dale|Grace| => Dale & Grace|
Dan|Shay|Justin Bieber| => Dan + Shay|Justin Bieber|
Derek|The Dominos| => Derek and the Dominos|
Disco Tex|The Sex-O-Lettes|Sir Monti Rock III| => Disco-Tex and the Sex-O-Lettes|Sir Monti Rock III|
Paul|Paula| => Paul & Paula|
Cornelius Brothers|Sister Rose| => Cornelius Brothers & Sister Rose|
Jimmy Harnen|Synch| => Jimmy Harnen with Synch|
Somethin' For The People|Trina|Tamara| => Trina & Tamara|Somethin' For The People|
Apollo|Tom Parker| => Apollo featuring Tom Parker|
Billy Joe|The Checkmates| => Billy Joe & The Checkmates|
Rick Dees|His Cast Of Idiots| => Rick Dees & His Cast Of Idiots|
Santo|Johnny| => Santo & Johnny|
Herb Alpert|The Tijuana Brass| => Herb Alpert And Tijuana Brass|The 
Herb Alpert|Tijuana Brass| => Herb Alpert And Tijuana Brass|


## Handle special case fixes:

In [26]:
# Handle "Orchestra" special case: it's NOT a meaningful performer
# First, let's find it 
ORCHESTRA_WORD = 'Orchestra'

fixed_raw_performers_df[
    fixed_raw_performers_df['performers_list_str'].str.startswith(f'{ORCHESTRA_WORD}|') |
    fixed_raw_performers_df['performers_list_str'].str.contains(f'|{ORCHESTRA_WORD}|', regex=False)
]

# Manually remove the "Orchestra" performer: replace "|Orchestra|" with "|" in the performers_list_str
fixed_raw_performers_df['performers_list_str'] = fixed_raw_performers_df['performers_list_str'].str.replace(f'|{ORCHESTRA_WORD}|', '|')

In [27]:
# Handle "The " special case: it's NOT a meaningful performer
OFFENDING_WORD = 'The '

fixed_raw_performers_df[
    fixed_raw_performers_df['performers_list_str'].str.startswith(f'{OFFENDING_WORD}|') |
    fixed_raw_performers_df['performers_list_str'].str.contains(f'|{OFFENDING_WORD}|', regex=False) |
    fixed_raw_performers_df['performers_list_str'].str.endswith(f'|{OFFENDING_WORD}')
]

def remove_thingy(x):
    if x.endswith(f'|{OFFENDING_WORD}'):
        return x.replace(f'|{OFFENDING_WORD}', '|')
    else:
        return x

remove_thingy('Herb Alpert And Tijuana Brass|The ')

# Manually remove
fixed_raw_performers_df['performers_list_str'] = fixed_raw_performers_df['performers_list_str'].apply(remove_thingy)

# Sanity check our fixes

In [28]:
assert(len(fixed_raw_performers_df) == len(raw_performers_df))
fixed_performers_list = np.unique(
    np.concatenate(fixed_raw_performers_df['performers_list_str'].str.split("|").apply(lambda x: [i for i in x if i]).values)
)

unique_values, value_counts = np.unique(np.concatenate(fixed_raw_performers_df['performers_list_str'].str.split("|").apply(lambda x: [i for i in x if i]).values), return_counts=True)
sorted_output = sorted(zip(unique_values, value_counts), key=lambda x: x[1], reverse=True)
sorted_output # Note: this is not TOTAL SONG COUNT, this is just "total appearances in unique performer strings" which is a bit different

fixed_performers_list

array(['"Weird Al" Yankovic', "'N Sync", "'Til Tuesday", ...,
       'the Blackhearts', 'twenty one pilots', 'will.i.am'], dtype='<U43')

In [29]:
remaining_unknowns = fixed_gender_df[
    fixed_gender_df.type == 'unknown'
]
assert(len(remaining_unknowns) == 0)
fixed_gender_df.type.value_counts()

type
individual    2007
group         1132
Name: count, dtype: int64

In [30]:
FEMALE = 'female'
N0N_BINARY = 'non-binary'

remaining_unknowns = fixed_gender_df[
    fixed_gender_df.gender == 'unknown'
]
for index, row in remaining_unknowns.iterrows():
    print(row['performer'])
assert(len(remaining_unknowns) == 0)
print(len(remaining_unknowns))

fixed_gender_df.gender.value_counts()


0


gender
male                  2173
female                 699
mixed gender group     258
non-binary               9
Name: count, dtype: int64

In [31]:
# Finally, check that every name in fixed_raw_performers_df has a corresponding entry in fixed_gender_df

# Print any performers that are not in both performers_list
for p in fixed_performers_list:
    try:
        gender = fixed_gender_df[fixed_gender_df.performer == p].iloc[0]
        # print(gender)
    except:
        print(f"Performer '{p}' not found in gender table")
        print(_all_performers_df[_all_performers_df.performers_list_str.str.contains(p)], '\n')
        
        assert(False) # raise the alarm

# Gender non-conforming is tricky, so we're just gonna set it by hand

In [32]:
fixed_gender_df[
    fixed_gender_df.gender == N0N_BINARY
]

Unnamed: 0,performer,type,gender
180,Billie Eilish,individual,non-binary
485,Demi Lovato,individual,non-binary
744,Halsey,individual,non-binary
814,Jaden Smith,individual,non-binary
828,Janelle Monae,individual,non-binary
1579,Sam Smith,individual,non-binary
2475,Demi Lovato,individual,non-binary
2622,Janelle Monae,individual,non-binary
2960,Sam Smith,individual,non-binary


In [44]:
GENDER_NON_CONFORMING_PERFORMERS = [
    "Sam Smith",
    "Halsey",
    "Demi Lovato",
    "Jaden Smith",
    "Janelle Monae",
    "Harry Styles"
]

NON_BINARY_MISLABELS = {
    "Billie Eilish": FEMALE
}

def fix_mislabels(row):
    if row['performer'] in NON_BINARY_MISLABELS:
        print(row['performer'], '=>', NON_BINARY_MISLABELS[row['performer']])
        return NON_BINARY_MISLABELS[row['performer']]
    return row['gender']


# Fix the labels. We know there's only one mislable (Billie Eilish), so just fix that
fixed_gender_df.gender = fixed_gender_df.apply(fix_mislabels, axis=1)

assert(len(
    fixed_gender_df[
    (fixed_gender_df.gender == N0N_BINARY) &
    (~fixed_gender_df.performer.apply(lambda x: x in GENDER_NON_CONFORMING_PERFORMERS))
]) == 0)

Billie Eilish => female


# Ok, now let's create a gender column on the split table

Use 
- female: f
- male: m
- mixed gender group: x
- non-binary: n

In other words, for example:
- Taylor Swift|Sam Smith|
- f|n|

In [45]:
PERFORMER_TYPE_ABBREVIATIONS = {
    'individual': 'i',
    'group': 'g'
}
GENDER_ABBREVIATIONS = {
    'female': 'f',
    'male': 'm',
    'non-binary': 'n',
    'mixed gender group': 'x', # OPTIMIZATION: technically this is redundant, so we could drop "g" for this.
}

def create_type_and_gender_string(performers_list_str):
    performers = performers_list_str.split("|")
    type_and_gender_string = ''
    for performer in performers:
        if not performer:
            continue
        row = fixed_gender_df[fixed_gender_df.performer == performer].iloc[0]
        performer_type = row['type']
        gender = row['gender']
        type_and_gender_string += f"{PERFORMER_TYPE_ABBREVIATIONS[performer_type]}{GENDER_ABBREVIATIONS[gender]}|"
    return type_and_gender_string

        
create_type_and_gender_string('Taylor Swift|Sam Smith|The Beatles|The Black Eyed Peas|')

fixed_raw_performers_df['type_and_gender_list_str'] = fixed_raw_performers_df['performers_list_str'].apply(create_type_and_gender_string)

In [46]:
fixed_raw_performers_df

Unnamed: 0,performer,performers_list_str,type_and_gender_list_str
0,10cc,10cc|,gm|
1,ABBA,ABBA|,gx|
2,AWB,AWB|,gm|
3,Aaliyah,Aaliyah|,if|
4,Aaron Neville,Aaron Neville|,im|
...,...,...,...
2558,Tom Clay,Tom Clay|,im|
2559,Bill Black's Combo,Bill Black's Combo|,gm|
2560,CJ,CJ|,im|
2561,The Kid LAROI & Miley Cyrus,The Kid LAROI|Miley Cyrus|,im|if|


# Export as a key value lookup table & the corrected name split dataset

In [47]:
GENDER_LOOKUP_TABLE_PATH = './data/22.5-OUTPUT-1-gender-lookup-table-SEP-2023-UPDATE.csv'
fixed_gender_df.to_csv(GENDER_LOOKUP_TABLE_PATH, index=False)

CORRECTED_PERFORMER_NAME_SPLITS_AND_GENDER_PATH = './data/22.5-OUTPUT-2-corrected-name-splits-with_type_and_gender-SEP-2023-UPDATE.csv'
fixed_raw_performers_df.to_csv(CORRECTED_PERFORMER_NAME_SPLITS_AND_GENDER_PATH, index=False)


In [48]:
# # double checking :D
pd.read_csv(GENDER_LOOKUP_TABLE_PATH)
pd.read_csv(CORRECTED_PERFORMER_NAME_SPLITS_AND_GENDER_PATH)

Unnamed: 0,performer,performers_list_str,type_and_gender_list_str
0,10cc,10cc|,gm|
1,ABBA,ABBA|,gx|
2,AWB,AWB|,gm|
3,Aaliyah,Aaliyah|,if|
4,Aaron Neville,Aaron Neville|,im|
...,...,...,...
2558,Tom Clay,Tom Clay|,im|
2559,Bill Black's Combo,Bill Black's Combo|,gm|
2560,CJ,CJ|,im|
2561,The Kid LAROI & Miley Cyrus,The Kid LAROI|Miley Cyrus|,im|if|
