In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from IPython.display import display, Markdown

import pandas as pd
import numpy as np

import json

np.__version__

'1.19.5'

# Compress data to unique songs, add a "contains 'love'" column for reference

In [4]:
# Load the per-lyric-line table produced by the earlier gender-labelling step.
gender_df = pd.read_csv('./data/11-OUTPUT-with-gender-from-gpt4.csv')

# Rescale the normalized debut date onto a fractional calendar year spanning
# 1959-2023.
# NOTE(review): the column is named "0_to_1" but is divided by 100 here, so it
# is presumably on a 0-100 scale — confirm against the upstream notebook.
normalized_debut = gender_df['chart_debut_date_normalized_0_to_1']
date_as_decimal = (normalized_debut / 100) * (2023 - 1959) + 1959
gender_df['date_as_decimal'] = date_as_decimal

gender_df

Unnamed: 0,performer,song,generic_genre,lyric_line,chart_debut,x,y,chart_debut_date_normalized_0_to_1,genre_position_band,near_neighbors_count,gender,date_as_decimal
0,John Lennon,#9 Dream,ROCK,So long ago,1974-12-21,62.357605,41.475302,25.450,10.0,14,m,1975.28800
1,John Lennon,#9 Dream,ROCK,"Was it in a dream, was it just a dream?",1974-12-21,55.684884,27.032071,25.450,10.0,14,m,1975.28800
2,John Lennon,#9 Dream,ROCK,"I know, yes I know",1974-12-21,69.964586,46.613546,25.450,10.0,0,m,1975.28800
3,John Lennon,#9 Dream,ROCK,"Seemed so very real, it seemed so real to me",1974-12-21,60.181471,69.527274,25.450,10.0,14,m,1975.28800
4,John Lennon,#9 Dream,ROCK,Took a walk down the street,1974-12-21,49.938720,41.965371,25.450,10.0,0,m,1975.28800
...,...,...,...,...,...,...,...,...,...,...,...,...
167355,J. Cole & Lil Baby,pride.is.the.devil,HIP_HOP,"I'm addicted to promethazine, it's crazy, yeah...",2021-05-29,41.610927,50.449968,97.562,20.0,0,m,2021.43968
167356,J. Cole & Lil Baby,pride.is.the.devil,HIP_HOP,"All this money coming in, it drive me crazy no...",2021-05-29,45.775471,52.982471,97.562,20.0,0,m,2021.43968
167357,J. Cole & Lil Baby,pride.is.the.devil,HIP_HOP,I'll be crazy if I blow it,2021-05-29,45.995316,47.344963,97.562,20.0,5,m,2021.43968
167358,J. Cole & Lil Baby,pride.is.the.devil,HIP_HOP,And it left so many R.I.P.,2021-05-29,57.715991,46.589529,97.562,20.0,0,m,2021.43968


In [5]:
# Name of the boolean output column that marks songs whose lyrics mention "love".
IS_LOVE_SONG_COL = 'is_love_song'


def contains_love(lyric_lines):
    """Return True if any lyric line contains "love" (case-insensitive).

    Non-string entries (e.g. NaN for a missing lyric) are skipped instead of
    raising ``AttributeError``. ``any`` short-circuits on the first match.
    """
    return any(
        isinstance(lyric, str) and 'love' in lyric.lower()
        for lyric in lyric_lines
    )


def list_unique_songs_labeled_if_they_contain_the_word_love(df):
    """Collapse a lyric-line table into one row per (performer, song).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``performer``, ``song`` and ``lyric_line`` columns. The frame is
        NOT modified; the intermediate ``contains_love`` column lives on a copy.

    Returns
    -------
    pandas.DataFrame
        One row per (performer, song), sorted by those keys, with a fresh
        0..n-1 index. Every input column except ``lyric_line`` is kept (first
        non-null value within the song), in the input's column order, followed
        by ``IS_LOVE_SONG_COL``: True when any lyric line of the song contains
        "love" (case-insensitive substring match).
    """
    group_keys = ['performer', 'song']

    # na=False: a missing lyric line counts as "no match" rather than NaN, so
    # the 'any' aggregation below does not depend on how pandas treats NaN.
    labeled = df.assign(
        contains_love=df['lyric_line'].str.contains('love', case=False, na=False)
    )

    # Columns that are carried through unchanged (keys included), in input order.
    carried_cols = [
        col for col in labeled.columns
        if col not in ('contains_love', 'lyric_line')
    ]

    # The group keys come back from the groupby itself; everything else takes
    # the first value in the group, and the love flag is OR-ed across lines.
    agg_functions = {col: 'first' for col in carried_cols if col not in group_keys}
    agg_functions['contains_love'] = 'any'

    result = labeled.groupby(group_keys, as_index=False).agg(agg_functions)

    # Restore the input's column order, with the flag last.
    result = result[carried_cols + ['contains_love']]
    return result.rename(columns={'contains_love': IS_LOVE_SONG_COL})

# Toy lyric table (two lyric lines per song, plus extra columns) used to
# eyeball what the aggregation helper returns.
example_columns = ['lyric_line', 'performer', 'song', 'album', 'year']
example_rows = [
    ('I love you', 'Artist1', 'Song1', 'Album1', 2000),
    ('This is a test', 'Artist1', 'Song1', 'Album1', 2000),
    ('Love is in the air', 'Artist2', 'Song2', 'Album2', 2005),
    ('No mentions here', 'Artist2', 'Song2', 'Album2', 2005),
    ('Love', 'Artist3', 'Song3', 'Album3', 2010),
    ('Something else', 'Artist3', 'Song3', 'Album3', 2010),
]

# Pivot the rows into the column -> list-of-values dict that the frame is built from.
data = {
    column: list(values)
    for column, values in zip(example_columns, zip(*example_rows))
}

df = pd.DataFrame(data)

# One output row per (performer, song).
list_unique_songs_labeled_if_they_contain_the_word_love(df)

Unnamed: 0,performer,song,album,year,is_love_song
0,Artist1,Song1,Album1,2000,True
1,Artist2,Song2,Album2,2005,True
2,Artist3,Song3,Album3,2010,True


In [6]:
# Collapse the per-lyric-line table to one row per (performer, song), flagging
# the songs whose lyrics contain the word "love".
contains_love_df = list_unique_songs_labeled_if_they_contain_the_word_love(gender_df)
contains_love_df

Unnamed: 0,performer,song,generic_genre,chart_debut,x,y,chart_debut_date_normalized_0_to_1,genre_position_band,near_neighbors_count,gender,date_as_decimal,is_love_song
0,"""Weird Al"" Yankovic",White & Nerdy,ROCK,2006-10-14,50.912490,44.870985,74.845,10.0,0,m,2006.90080,False
1,'N Sync,(God Must Have Spent) A Little More Time On You,POP,1998-12-05,81.288336,39.382591,62.655,0.0,12,m,1999.09920,True
2,'N Sync,Bye Bye Bye,POP,2000-01-29,71.402847,30.184848,64.441,0.0,14,m,2000.24224,True
3,'N Sync,It's Gonna Be Me,POP,2000-05-06,68.789522,53.089582,64.860,0.0,0,m,2000.51040,True
4,'N Sync,This I Promise You,POP,2000-09-30,74.214520,30.421922,65.481,0.0,14,m,2000.90784,True
...,...,...,...,...,...,...,...,...,...,...,...,...
5011,matchbox twenty,Unwell,ROCK,2003-03-22,56.149341,35.176103,69.317,10.0,7,m,2003.36288,False
5012,twenty one pilots,Heathens,ROCK,2016-07-09,58.202752,54.398868,89.969,10.0,0,m,2016.58016,True
5013,twenty one pilots,Ride,ROCK,2016-04-02,48.387025,35.286120,89.550,10.0,1,m,2016.31200,False
5014,twenty one pilots,Stressed Out,ROCK,2015-05-16,60.782772,37.133766,88.183,10.0,1,m,2015.43712,False


In [7]:
# Look up the row(s) for the song titled 'WAP'.
contains_love_df.loc[contains_love_df['song'] == 'WAP']

Unnamed: 0,performer,song,generic_genre,chart_debut,x,y,chart_debut_date_normalized_0_to_1,genre_position_band,near_neighbors_count,gender,date_as_decimal,is_love_song
752,Cardi B Featuring Megan Thee Stallion,WAP,POP,2020-08-22,38.099264,42.503936,96.366,0.0,13,f,2020.67424,False


# Set up ChatGPT API calls

In [8]:
import os
import openai
# openai.organization = "personal"
# Take the API key from the OPENAI_API_KEY environment variable instead of
# hard-coding it (os.getenv returns None when the variable is unset).
openai.api_key = os.getenv("OPENAI_API_KEY")
# Show the ids of the models the API lists for this key.
[model.id for model in openai.Model.list()['data']]

['babbage',
 'davinci',
 'text-davinci-edit-001',
 'babbage-code-search-code',
 'text-similarity-babbage-001',
 'code-davinci-edit-001',
 'ada',
 'babbage-code-search-text',
 'babbage-similarity',
 'gpt-3.5-turbo-16k-0613',
 'code-search-babbage-text-001',
 'text-curie-001',
 'gpt-3.5-turbo-0301',
 'gpt-3.5-turbo-16k',
 'code-search-babbage-code-001',
 'text-ada-001',
 'text-davinci-003',
 'text-similarity-ada-001',
 'text-davinci-002',
 'curie-instruct-beta',
 'ada-code-search-code',
 'ada-similarity',
 'code-search-ada-text-001',
 'text-search-ada-query-001',
 'davinci-search-document',
 'whisper-1',
 'ada-code-search-text',
 'text-search-ada-doc-001',
 'davinci-instruct-beta',
 'text-similarity-curie-001',
 'code-search-ada-code-001',
 'ada-search-query',
 'text-search-davinci-query-001',
 'curie-search-query',
 'davinci-search-query',
 'babbage-search-document',
 'ada-search-document',
 'text-search-curie-query-001',
 'text-babbage-001',
 'text-search-babbage-doc-001',
 'curie-sear

# Fetch love-song labels with ChatGPT

In [16]:
# Few-shot instructions for labelling songs as "love" / "sex" / "no".
# The rows to classify are deliberately NOT embedded here: they are sent as a
# separate follow-up user message, so the same prompt works for every batch
# instead of conflicting with one hard-coded list.
LOVE_SONG_LABELING_PROMPT = """
I will give you a Python list where each item is a list whose first item is the performer and whose second item is the name of one of their songs from the Billboard Top 10 from 1958-2021.

Here's an example input:

```
TEST_SET_ROWS = [
    ['twenty one pilots', 'Heathens'],
    ['"Weird Al" Yankovic', 'White & Nerdy'],
    ['will.i.am & Britney Spears','Scream & Shout'],
    ["21 Savage & Metro Boomin", "Runnin"],
    ["6ix9ine Featuring Nicki Minaj & Murda Beatz", "FEFE"],
    ['2Pac', "Dear Mama/Old School"],

    ["'N Sync", 'This I Promise You'],
    ["'N Sync & Gloria Estefan", "Music Of My Heart"],
    ["112", "Peaches & Cream"],
    ["10cc","I'm Not In Love"],
    ["50 Cent Featuring Nate Dogg", "21 Questions"],
]
```

In each array, I want you to respond by
1. inserting, as the first item, "love" if it is a love song, "no" if it is not, and "sex" if the song primarily revolves around human sexuality or sexual desire.
2. inserting, as the second item, a brief justification for your classification
3. leaving the performer and song name after them, unchanged

I want you to use Wikipedia's definition of a "love song": "A love song is a song about romantic love, falling in love, heartbreak after a breakup, and the feelings that these experiences bring."

Here's an example correct response given the input above.

```
VALIDATION_SET_ROWS = [
    ['no', 'about social outcasts', 'twenty one pilots', 'Heathens'],
    ['no', 'parody about nerd culture', '"Weird Al" Yankovic', 'White & Nerdy'],
    ['no', 'generic dance tune', 'will.i.am & Britney Spears','Scream & Shout'],
    ['no', 'gangster rap', "21 Savage & Metro Boomin", "Runnin"],
    ['sex', 'about sexual prowess and uses many sexual references', "6ix9ine Featuring Nicki Minaj & Murda Beatz", "FEFE"],
    ['no', 'love for his mother, not romantic love', '2Pac', "Dear Mama/Old School"],

    ['love', 'promising to always love someone',"'N Sync", 'This I Promise You'],
    ['love', 'gratitude for how a lover has changed them', "'N Sync & Gloria Estefan", "Music Of My Heart"],
    ['sex', 'a song about sex & being "addicted" to a "tasty" woman', "112", "Peaches & Cream"],
    ['love', 'the title is ironic, he is actually in love with her (his wife)', "10cc","I'm Not In Love"],
    ['love', 'asking for fidelity & support no matter what', "50 Cent Featuring Nate Dogg", "21 Questions"],
]
```

It is crucial that you respond ONLY with the correct array; do not include anything else in your response. In other words, your response should be able to be compiled to valid Python, so do not include extraneous text outside the response array.

The list to classify is in my next message.
"""

In [17]:
import ast
# Sanity check: literal_eval turns the text of a Python list literal into a real list.
ast.literal_eval("['Michael Jackson']")

['Michael Jackson']

In [18]:


def get_gender_for_performers(performers, model="gpt-3.5-turbo"):
    """Ask the chat model to label each [performer, song] row as love / sex / no.

    NOTE(review): the name looks like a leftover from an earlier
    gender-labelling step; it is kept so existing calls keep working.

    Parameters
    ----------
    performers : sequence of [performer, song] pairs
        Rows to classify. They are sent as a second user message after
        LOVE_SONG_LABELING_PROMPT. Objects exposing ``.tolist()`` (NumPy
        arrays, pandas Series) are converted to plain lists first, because
        ``str()`` of an array drops the commas and elides long arrays with
        "...", which is not a valid Python literal.
    model : str, optional
        Chat model id. Defaults to the previously hard-coded "gpt-3.5-turbo".

    Returns
    -------
    str or dict
        On success, the raw text of the model's first reply (expected to be a
        Python list literal; parse it with ``ast.literal_eval``). On any
        exception the error is printed and an empty dict is returned, so check
        for a falsy result before parsing.
    """
    # Plain lists stringify to a valid Python literal; arrays do not.
    rows = performers.tolist() if hasattr(performers, 'tolist') else performers

    try:
        completion = openai.ChatCompletion.create(
            model=model,
            messages=[
                {"role": "user", "content": LOVE_SONG_LABELING_PROMPT},
                {"role": "user", "content": str(rows)},
            ],
        )
        return completion.choices[0].message['content']
    except Exception as e:
        # Best-effort: report the failure and hand back an empty, falsy value
        # so a batch loop can carry on.
        print('ERROR FETCHING', e)
        return {}
    
# Hand-picked [performer, song] rows for a quick smoke test of the labelling prompt.
TEST_SET_ROWS = [
    ["100 Proof Aged in Soul", "Somebody's Been Sleeping"],
    ["50 Cent", "Disco Inferno"],
    ["24kGoldn Featuring iann dior", "Mood"],
    ["'N Sync Featuring Nelly", "Girlfriend"],
    ["702", "Get It Together"],
    ["10cc", "The Things We Do For Love"],
    ["Cardi B Featuring Megan Thee Stallion", "WAP"],
]

# Raw reply text from the model (an empty dict if the request failed).
res = get_gender_for_performers(TEST_SET_ROWS)

In [19]:
# Parse the reply text into nested Python lists. literal_eval only accepts
# Python literals, so a reply with extra prose around the list raises here.
ast.literal_eval(res)

[['no',
  'about infidelity and suspicion',
  '100 Proof Aged in Soul',
  "Somebody's Been Sleeping"],
 ['no', 'a party and club anthem', '50 Cent', 'Disco Inferno'],
 ['love',
  'a song about love and romantic feelings',
  '24kGoldn Featuring iann dior',
  'Mood'],
 ['love',
  'a song about romantic pursuit and attraction',
  "'N Sync Featuring Nelly",
  'Girlfriend'],
 ['no',
  'a song about relationship struggles and finding a solution',
  '702',
  'Get It Together'],
 ['love',
  'expressing love and the things done for it',
  '10cc',
  'The Things We Do For Love'],
 ['sex',
  'a sexually explicit song',
  'Cardi B Featuring Megan Thee Stallion',
  'WAP']]

In [None]:
# INCREMENT = 60

# unique_performers = df.performer.unique()#[0:120]

# gender_dictionary = {}

# for i in range(42):
#     subset = unique_performers[i*INCREMENT : (i+1)*INCREMENT]
#     if subset.any():
#         print([i*INCREMENT, (i+1)*INCREMENT])
#         print(subset, '\n')
#         res = get_gender_for_performers(subset)
#         gender_dictionary = {**gender_dictionary, **res}