In [24]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from IPython.display import display, Markdown

import pandas as pd
import numpy as np

import json

np.__version__

'1.26.3'

# 0. Import & prep data

In [129]:
PERFORMER_SPLIT_FILE = "./data/22.4-OUTPUT-performer_list_str-with-multi-name-artists-merged.csv"

raw_performers_df = pd.read_csv(PERFORMER_SPLIT_FILE)
raw_performers_df

performers_list = raw_performers_df['performers_list_str'].str.split("|").apply(lambda x: [i for i in x if i]).values
performers_list = np.unique(
    np.concatenate(performers_list)
)
raw_performers_df

Unnamed: 0,performer,performers_list_str
0,10cc,10cc|
1,ABBA,ABBA|
2,AWB,AWB|
3,Aaliyah,Aaliyah|
4,Aaron Neville,Aaron Neville|
...,...,...
2457,Michael Jackson & Justin Timberlake,Michael Jackson|Justin Timberlake|
2458,Owl City,Owl City|
2459,"Pitbull Featuring Ne-Yo, Afrojack & Nayer",Pitbull|Ne-Yo|Afrojack|Nayer|
2460,Soko,Soko|


# Setup GPT4 call

In [26]:
import os
from openai import OpenAI

# First, activate the './.env' file which sets the OPENAI_API_KEY environment variable using python-dotenv
from dotenv import load_dotenv
load_dotenv()

client = OpenAI(
    # This is the default and can be omitted
    api_key=os.getenv("OPENAI_API_KEY"),
)

In [38]:
LOVE_SONG_LABELING_PROMPT = """
You are a pop music expert, familiar with the songs and their performers featured in the US Billboard Top 10 hits.

You are also an expert in the gender identities of these performers, including those who have updated their genders, eg Sam Smith who is now non-binary.

Your task is, given a performer, label them with:
- "individual" or "group"
- their most recent gender identity, or, in the case of a group, the gender identity mix of the the group: "male", "female", "non-binary", "mixed gender group"
- If a performer is unclear or simply appears to be a mistake, label them as "unknown"

Here's an example input:

```json
[
	"Sam Smith",
    "Demi Lovato",
    "Adele",
    "SZA",
    "John Legend",
    "The Beatles",
    "Black Eyed Peas",
    "The Carpenters",
    "The Supremes",
    "Destiny's Child",
    "The Rolling Stones",
    "One Direction",
    "His Twangy Guitar",
    "asdf ghjkl"
]
```

Here's the sample corresponding output:

```json
{"output": [
	["Sam Smith", "individual", "non-binary"],
    ["Demi Lovato", "individual", "non-binary"],
    ["Adele", "individual", "female"],
    ["SZA", "individual", "female"],
    ["John Legend", "individual", "male"],
    ["The Beatles", "group", "male"],
    ["Black Eyed Peas", "group", "mixed gender group"],
    ["The Carpenters", "group", "mixed gender group"]
    ["The Supremes", "group", "female"],
    ["Destiny's Child", "group", "female"],
    ["The Rolling Stones", "group", "male"],
    ["One Direction", "group", "male"],
    ["His Twangy Guitar", "unknown", "unknown"],
    ["asdf ghjkl", "unknown", "unknown"]
]}
```

Ok, now generate the output for this input:
"""

In [39]:
import ast

def eval_this_str(s):
    return ast.literal_eval(s)

In [40]:
def get_love_song_labels_for_rows(row_subset_str):
    try: 
        completion = client.chat.completions.create(
          model="gpt-4-1106-preview",
          response_format={'type': "json_object"},
          messages=[
            {"role": "user", "content": LOVE_SONG_LABELING_PROMPT},
            {"role": "user", "content": row_subset_str},
          ]
        )
        content = completion.choices[0].message.content
        # return content
        response_content = eval_this_str(content)['output']
        # print(f"Response contains {completion['usage']['completion_tokens']} tokens out of 4096 maximum allowed tokens.")
        return response_content
    except Exception as e:
        print('ERROR FETCHING', e)
        return []

### Setup API call batching code

In [41]:
# Convert to a string:
json.dumps(performers_list[:5].tolist())

'["\\"The Sting\\"", "\\"Weird Al\\" Yankovic", "\'N Sync", "\'Til Tuesday", "100 Proof"]'

In [42]:
def fetch_all_data_in_batches(performers_list, increment=50):
    gpt_labeled_result = []
    for i in range(101):
        subset = performers_list[i*increment : (i+1)*increment]
        if len(subset):
            print([i*increment, (i+1)*increment])
            print(subset)
            subset_as_json_string = json.dumps(subset.tolist())
            print(subset_as_json_string, '\n')
            # fetched_and_formatted_res = [['David Mora', 'individual', 'male']]
            fetched_and_formatted_res = get_love_song_labels_for_rows(subset_as_json_string)
            print(fetched_and_formatted_res)
            gpt_labeled_result = [*gpt_labeled_result, *fetched_and_formatted_res]
    return gpt_labeled_result

def get_file_path(postfix):
    return f'./data/22.5-TEMP-gpt4-label-gender-{postfix}.csv'

def save_to_temp_csv(gpt_labeled_result, run_num):
    first_label_pass_df = pd.DataFrame(
        gpt_labeled_result, 
        columns=['performer', 'type', 'gender']
    )
    first_label_pass_df.to_csv(get_file_path(run_num), index=False)

# (run a test call to be sure things look ok)
For 100 rows at once:
For GPT4 turbo: 3min, about 3400 tokens out of 4096 maximum allowed tokens 

In [43]:
TEST_POSTFIX = 'JUST A TEST, DELETE ME'
save_to_temp_csv(
    fetch_all_data_in_batches(
        # just get first 4:
        performers_list[:4]
    ), 
    TEST_POSTFIX
)

[0, 50]
['"The Sting"' '"Weird Al" Yankovic' "'N Sync" "'Til Tuesday"]
["\"The Sting\"", "\"Weird Al\" Yankovic", "'N Sync", "'Til Tuesday"] 

[['The Sting', 'unknown', 'unknown'], ['Weird Al Yankovic', 'individual', 'male'], ["'N Sync", 'group', 'male'], ["'Til Tuesday", 'group', 'mixed gender group']]


In [44]:
pd.read_csv(get_file_path(TEST_POSTFIX))

Unnamed: 0,performer,type,gender
0,The Sting,unknown,unknown
1,Weird Al Yankovic,individual,male
2,'N Sync,group,male
3,'Til Tuesday,group,mixed gender group


# Execute calls (in batched batches in case of failure)

In [46]:
len(performers_list)
performers_list[0:10]

array(['"The Sting"', '"Weird Al" Yankovic', "'N Sync", "'Til Tuesday",
       '100 Proof', '10cc', '112', '1910 Fruitgum Co.', '2 Chainz',
       '21 Savage'], dtype='<U43')

In [49]:
# save_to_temp_csv(
#     fetch_all_data_in_batches(
#         performers_list[0:1000]
#     ), 
#     1
# )

In [55]:
# save_to_temp_csv(
#     fetch_all_data_in_batches(
#         performers_list[1000:2000]
#     ), 
#     2
# )

In [56]:
# save_to_temp_csv(
#     fetch_all_data_in_batches(
#         performers_list[2000:3000]
#     ), 
#     3
# )

In [53]:
print('done!')



done!


# Merge & clean the fetched data

In [57]:
gender_df = pd.concat([
    pd.read_csv(get_file_path(1)),
    pd.read_csv(get_file_path(2)),
    pd.read_csv(get_file_path(3))
])
gender_df


Unnamed: 0,performer,type,gender
0,"""The Sting""",unknown,unknown
1,"""Weird Al"" Yankovic",individual,male
2,'N Sync,group,male
3,'Til Tuesday,group,mixed gender group
4,100 Proof,group,mixed gender group
...,...,...,...
281,the Blackhearts,group,mixed gender group
282,the Curls,unknown,unknown
283,the Delons,unknown,unknown
284,twenty one pilots,group,male


## Do some sanity checks & identify issues

In [107]:
gender_df.type.value_counts()
gender_df.gender.value_counts()

gender
male                  1538
female                 461
mixed gender group     199
unknown                 79
non-binary               9
Name: count, dtype: int64

In [82]:
assert(len(performers_list) == len(gender_df))
# Assert that a set of all performers has the same length as performers_lsit
# assert(
#     len(performers_list) == len(
#             set([*performers_list, *gender_df['performer']])
#         )
# )


gender_df.gender.value_counts()
combnined = set([*performers_list, *gender_df['performer']])
# Print any performers that are not in both performers_list AND gender_df['performer']
culprit = None
for p in combnined:
    if p not in performers_list:
        culprit = p
        print(f"Performer '{p}' not in performers_list")

for performer in performers_list:
    if performer not in combnined:
        print(f"Performer '{performer}' not in combined")

gender_df[
    gender_df.performer.str.contains(culprit)
]

print('What IS in performers_list:', [performer for performer in performers_list if performer.startswith('Henry Mancini')])


# NA: write out WHAT the mistake is. Figure out how to fix it. (in this notebook of the last?)
# The mistakes is that I failed to TRIM the whitespace from the performer names, in the last notebook.
# 

Performer 'Henry Mancini And' not in performers_list
What IS in performers_list: ['Henry Mancini And ']


### We detected an underlying data mistake: several performers end in some variation of `" And "`

Note, however: these were all, helpfully, marketed as `unknown`. So we can address them within that larger type.

In [86]:
# The issue: seems GPT strugled with chopping of the ending "and ", thankfully pretty easy to identify.
[performer for performer in performers_list if performer.endswith(' ')]

['David Rose and ',
 'Henry Mancini And ',
 'Lawrence Welk And ',
 'Lloyd Price and ',
 'Reg Owen & ']

In [85]:
gender_df[
    gender_df.performer.str.endswith(' And ')
    | gender_df.performer.str.endswith(' and ')
    | gender_df.performer.str.endswith(' & ')
    | gender_df.performer.str.endswith(' And')
]

Unnamed: 0,performer,type,gender
479,David Rose and,unknown,unknown
787,Henry Mancini And,unknown,unknown
92,Lawrence Welk And,unknown,unknown
157,Lloyd Price and,unknown,unknown
551,Reg Owen &,unknown,unknown


# Make corrections

## Apply `unkown` fixes

### A. Export the `unknown`s, hand fix them in a spreadsheet, correct

Types of fixes provided by gsheet:
- It *is* a real performer. ACTION: Add the info, correct *just in the gender lookup table*
- Incorrectly *parsed* name. ACTION: fix the name, correct both *gender lookup and name split tables*
- The name shouldn't be in the dataset at all. ACTION: drop from BOTH *gender lookup and name split tables*


In [93]:
pd.set_option('display.max_rows', 100)
gender_df[
    gender_df.type == 'unknown'
]#.to_csv('./data/22.5-TEMP-unknown-gender-to-hand-correct.csv', index=False)

In [133]:
# Fetch the correction gSheet
G_SHEET_URL = "https://docs.google.com/spreadsheets/d/1xxE-vvzDUQTgYeigHMK_4lUznXAzDgNlAdY6JMOIOcc/export?format=csv&gid=1356757517"
gsheet_unknown_corrections_df = pd.read_csv(G_SHEET_URL)
gsheet_unknown_corrections_df.head(15)

Unnamed: 0,performer,type,gender,correct_name,drop_it
0,"""The Sting""",unknown,unknown,,y
1,?,unknown,unknown,,
2,Ali,individual,male,,
3,Alice Bowie,individual,male,,
4,Amanda Reifer,individual,female,,
5,Angie Mar,individual,female,Angie Martinez,
6,B-Rock & The Bizz,group,male,,
7,Bill Parsons,individual,male,,
8,Bob,unknown,unknown,,
9,Bobby Purify,unknown,unknown,,


In [132]:
# Fetch the incorrect split correction gSheet
INCORRECT_SPLIT_G_SHEET_URL = "https://docs.google.com/spreadsheets/d/1xxE-vvzDUQTgYeigHMK_4lUznXAzDgNlAdY6JMOIOcc/export?format=csv&gid=659955835"
gsheet_incorrect_split_corrections_df = pd.read_csv(INCORRECT_SPLIT_G_SHEET_URL)
gsheet_incorrect_split_corrections_df.head(15)

Unnamed: 0,split_part_1,split_part_2,replace_with_this,type,gender
0,?,The Mysterians,? and the Mysterians,group,male
1,Travis,Bob,Travis & Bob,group,male
2,James,Bobby Purify,James & Bobby Purify,group,male
3,Dale,Grace,Dale & Grace,group,mixed gender group
4,Dan,Shay,Dan + Shay,group,male
5,Derek,The Dominos,Derek and the Dominos,group,male
6,Disco Tex,The Sex-O-Lettes,Disco-Tex and the Sex-O-Lettes,group,mixed gender group
7,Paul,Paula,Paul & Paula,group,mixed gender group
8,Cornelius Brothers,Sister Rose,Cornelius Brothers & Sister Rose,group,mixed gender group
9,Jimmy Harnen,Synch,Jimmy Harnen with Synch,group,male


In [113]:
gender_df

Unnamed: 0,performer,type,gender
0,"""The Sting""",unknown,unknown
1,"""Weird Al"" Yankovic",individual,male
2,'N Sync,group,male
3,'Til Tuesday,group,mixed gender group
4,100 Proof,group,mixed gender group
...,...,...,...
281,the Blackhearts,group,mixed gender group
282,the Curls,unknown,unknown
283,the Delons,unknown,unknown
284,twenty one pilots,group,male


### B. Finicky fixes listed below by removing the SPLIT names, replacing them with the FULL name

*Groups that were erroniously split* (these have been marked as `drop_it` in the gsheet)
- "? and the Mysterians" should be the entry, not a split out "?" and "The Mysterians"
- "Travis & Bob" is right, not "Travis" and "Bob"
- "James & Bobby Purify" is a group (don't split)
- "Dale and Grace" eg on I'm Leaving it Up to You
- "Dan + Shay" not "Dan" "Shay"
- "Derek and the Dominos" not "Derek" "The Dominos"
- "Disco-Tex and the Sex-O-Lettes"
- "Paul & Paula"
- "Cornelius Brothers & Sister Rose"
- "Jimmy Harnen with Synch"
- "Trina & Tamara"

In [160]:
# Create copies
fixed_gender_df = gender_df.copy()
fixed_raw_performers_df = raw_performers_df.copy()



# PART 1: apply hand fixes to incorrect splits

# The algorithm to update:
# 1. For each `row` in `gsheet_incorrect_split_corrections_df`:
# 2. For both `fixed_raw_performers_df` and `fixed_gender_df`, for every `performer` column that contains `row.split_part_1` or `row.split_part_2`
# a. Remove both `row.split_part_1` and/or `row.split_part_2`
# b. Append the `row.replace_with_this` (which is the corrected, unsplit version)
# c. If relevant, update with the `row.type` and `row.gender`

def add_delim(str):
    return f"{str}|"

def update_row_if_needed(performers_list_str, row):
    if add_delim(row['split_part_1']) in performers_list_str and add_delim(row['split_part_2']) in performers_list_str:
        updated = add_delim(row['replace_with_this']) + performers_list_str.replace(add_delim(row['split_part_1']), '').replace(add_delim(row['split_part_2']), '')
        print(f"{performers_list_str} => {updated}")
        return updated
    return performers_list_str

for index, row in gsheet_incorrect_split_corrections_df.iterrows():
    # Drop rows from `fixed_raw_performers_df` that contain `row.split_part_1` or `row.split_part_2`
    fixed_gender_df = fixed_gender_df[
        ~fixed_gender_df['performer'].str.contains(add_delim(row['split_part_1']), regex=False) &
        ~fixed_gender_df['performer'].str.contains(add_delim(row['split_part_2']), regex=False)
    ]
    # Append the corrected unsplit version
    fixed_gender_df = pd.concat([
        fixed_gender_df,
        pd.DataFrame({
            'performer': [row['replace_with_this']],
            'type': [row['type']],
            'gender': [row['gender']]
        })
    ], ignore_index=True)

    # Now handle the performers in `fixed_raw_performers_df`
    fixed_raw_performers_df['performers_list_str'] = fixed_raw_performers_df['performers_list_str'].apply(
        lambda performers_list_str: update_row_if_needed(performers_list_str, row)
    )



# PART 2: apply hand fixes to unkNowns
for index, row in gsheet_unknown_corrections_df.iterrows():
    performer = row['performer']
    # Clear, we'll leave it out or add it in later based on the corrections
    fixed_gender_df = fixed_gender_df[fixed_gender_df.performer != performer]
    # DROP MISTAKES
    if row['drop_it'] == 'y':
        # Go through each row in the fixed_raw_performers_df, removing the string f'{performer}|' from performers_list_str, if it exists
        fixed_raw_performers_df['performers_list_str'] = fixed_raw_performers_df['performers_list_str'].str.replace(f'{performer}|', '')
    elif row['type'] != 'unknown':
        corrected_name = row['correct_name']
        performer_name = corrected_name if not pd.isnull(corrected_name) else performer
        fixed_gender_df = pd.concat([
            fixed_gender_df,
            pd.DataFrame(
                [
                    performer_name,
                    row['type'],
                    row['gender']
                ]
            )
        ])
        fixed_raw_performers_df['performers_list_str'] = fixed_raw_performers_df['performers_list_str'].str.replace(f'{performer}|', f'{performer_name}|')
    else:
        # Anything remaining should have been handled in PART 1
        if add_delim(row['performer']) in fixed_raw_performers_df['performers_list_str'].values:
            print('ERROR: unhandled')
        if add_delim(row['performer']) in fixed_gender_df['performer'].values:
            print('ERROR: unhandled')

assert(len(fixed_gender_df) != len(gender_df))
assert(len(fixed_raw_performers_df) == len(raw_performers_df))

?|The Mysterians| => ? and the Mysterians|
Travis|Bob| => Travis & Bob|
James|Bobby Purify| => James & Bobby Purify|
Dale|Grace| => Dale & Grace|
Dan|Shay|Justin Bieber| => Dan + Shay|Justin Bieber|
Derek|The Dominos| => Derek and the Dominos|
Disco Tex|The Sex-O-Lettes|Sir Monti Rock III| => Disco-Tex and the Sex-O-Lettes|Sir Monti Rock III|
Paul|Paula| => Paul & Paula|
Cornelius Brothers|Sister Rose| => Cornelius Brothers & Sister Rose|
Jimmy Harnen|Synch| => Jimmy Harnen with Synch|
Somethin' For The People|Trina|Tamara| => Trina & Tamara|Somethin' For The People|


# Export as a key value lookup table & the corrected name split dataset

In [53]:
OUTPUT_PATH = './data/22.5-OUTPUTu.csv'
merged_df.to_csv(OUTPUT_PATH, index=False)

In [54]:
pd.read_csv(OUTPUT_PATH)

Unnamed: 0,performer,song,justification,love_song_sub_type
0,10cc,I'm Not In Love,"despite constant affirmations, the speaker den...",It's Complicated
1,3OH!3 Featuring Ke$ha,My First Kiss,"recounts the physical act of a first kiss, foc...",
2,ABBA,Waterloo,depicts one's surrender to love as a moment of...,Serenade
3,AWB,Cut The Cake,uses wedding imagery as metaphor for a desirab...,
4,Aaliyah,Back & Forth,focuses on the youthful experience of going ou...,
...,...,...,...,...
5011,matchbox twenty,Unwell,"though the speaker does not feel well, he reas...",Courtship & Anticipation
5012,twenty one pilots,Heathens,"about social outcasts, not romance",
5013,twenty one pilots,Ride,"introspective about life's journey, not focuse...",
5014,twenty one pilots,Stressed Out,reflects on the pressures of adulthood and lon...,
