In [4]:
# this version reads a chunk of posts with the score embedded
# ranks by summing scores

import os
import glob
import pickle
from datetime import datetime
import time
import dotenv
import re
from tqdm import tqdm
from schema import Schema
import csv

import pandas as pd
import pandas_dedupe

import requests
import requests.auth

import praw

import openai
import tiktoken

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# load secrets from .env into environment variables
dotenv.load_dotenv()

print(f"{'Praw:':<20} {praw.__version__ :>10}")
print(f"{'OpenAI:':<20} {openai.version.VERSION :>10}")


Praw:                     7.7.0
OpenAI:                  0.27.4


See README.md
 - objective is to use OpenAI for named entity extraction to extract all the songs from [this reddit thread](https://www.reddit.com/r/AskReddit/comments/12viv4v/what_is_the_prettiest_song_you_ever_heard_in_your/) and make a Spotify playlist
 - use Reddit PRAW API to download all the comments (get [Reddit API key](https://www.reddit.com/prefs/apps))
 - use OpenAI API with a prompt like "extract all the songs from this text to CSV" (get [OpenAI API key](https://platform.openai.com/account/api-keys))
 - use Spotify API to make a playlist (get [Spotify API key](https://developer.spotify.com/documentation/web-api/tutorials/getting-started))
 - works, needed a lot of scrubbing, but about 1 day of work, wouldn't have been possible to do a 700-song playlist manually without a team of Mechanical Turks or something
 - If I wanted to go nuts, I would process comments individually and save a file for each comment's extracted songs; that would make it easier to track down what OpenAI gets wrong and give a resumable, retryable, repeatable process
 - Spotify playlist is [here](https://open.spotify.com/playlist/08YFkbtTV6GBfNtjJ4PHDu?si=f4761d983ac84091) 
 
 needs a .env file per dot-env-template
 

# Configs

In [8]:
# name of the OpenAI model; also used further down to select the matching tiktoken encoder
gptmodel = 'gpt-3.5-turbo'

# id of the Reddit submission (thread) whose comments are processed
submission = "12viv4v"

# minimum karma (comment score) a reply needs in order to be processed
minkarma = 1

# an output file to accumulate all the responses
savefile = 'bronze.txt'

# Main extraction prompt: shows the model one example post (delimited by ===) and the
# exact CSV schema expected back (delimited by ---), then ends with "The input is:".
# NOTE(review): presumably the real posts, delimited by ~~~, are appended after this
# prefix by code that is not in this part of the notebook — confirm.
prompt_prefix1="""You will act as a research assistant extracting all the artists and track titles mentioned in a series of posts about music.
Your goal is to extract structured information from a series of posts in the form below and return them in a structured CSV format.
Define a post delimited below by ===
===
post_id: "abcdefg"
post_score: "6996"
I love Yesterday by the Beatles. Also Hotel California from The Eagles. And Bruce Springsteen's Born To Run!
===

When extracting information please make sure it matches the CSV format below exactly. Do not add any attributes that do not appear in the schema below delimited by ---
---
"post_id","post_score","artist","track"
"abcdefg","6996","The Beatles","Yesterday"
"abcdefg","6996","The Eagles","Hotel California"
"abcdefg","6996","Bruce Springsteen","Born To Run"
---

You will extract all artists and tracks from each post below delimited by ~~~ .
You will return a list of records containing the artist and track extracted from the input, and the post_id and post_score of the post the artist and track is mentioned in.
You will return the records in a CSV format.
The header row should contain `"post_id","post_score","artist","track"`. 
The input is:
"""

# directories for generated output files and logs
outdir = 'out'
logdir = 'logs'

# NOTE: `savefile` ('bronze.txt') is already assigned earlier in this cell; the
# identical second assignment that used to sit here was redundant and has been removed.

# fail fast with a KeyError if the key is missing from the environment / .env file
openai.api_key = os.environ["OPENAI_API_KEY"]

# To speed things up, posts are accumulated into one chunk until the chunk reaches
# `nposts` posts or the size budget, whichever comes first.
# NOTE(review): the size budget was originally `maxchars` (now commented out);
# presumably `maxtokens` replaced it — confirm against the chunking code.
max_post_size = 300  # redditor needs to put any songs in 1st couple hundred chars
maxtokens = 1024     # max tokens to send to get_response (with room for response)
# maxchars = 6000  # max tokens (words/fragments) is 4096 but I think stuffing the prompt maybe reduces quality?
nposts = 1000        # max posts to combine into a chunk


# Get all comments from a reddit posting

In [9]:
# ask the API which models are available and print each model id on its own line
model_lst = openai.Model.list()

for model_entry in model_lst['data']:
    print(model_entry['id'])
    

davinci
text-davinci-001
text-search-curie-query-001
gpt-3.5-turbo
babbage
text-babbage-001
curie-instruct-beta
davinci-similarity
code-davinci-edit-001
text-similarity-curie-001
ada-code-search-text
gpt-3.5-turbo-0613
text-search-ada-query-001
gpt-3.5-turbo-16k-0613
gpt-4-0314
babbage-search-query
ada-similarity
text-curie-001
gpt-4
gpt-3.5-turbo-16k
text-search-ada-doc-001
text-search-babbage-query-001
code-search-ada-code-001
curie-search-document
davinci-002
text-search-davinci-query-001
text-search-curie-doc-001
babbage-search-document
babbage-002
babbage-code-search-text
text-embedding-ada-002
davinci-instruct-beta
davinci-search-query
text-similarity-babbage-001
text-davinci-002
code-search-babbage-text-001
text-davinci-003
text-search-davinci-doc-001
code-search-ada-text-001
ada-search-query
text-similarity-ada-001
gpt-4-0613
ada-code-search-code
whisper-1
text-davinci-edit-001
davinci-search-document
curie-search-query
babbage-similarity
ada
ada-search-document
text-ada-001
te

In [None]:
# keep only the listing entries whose id is exactly 'gpt-4'
z = list(filter(lambda entry: entry['id'] == 'gpt-4', model_lst['data']))

In [None]:
z

In [None]:
def getPraw():
    """Build a PRAW Reddit client.

    The client id and secret are read from the CLIENT_ID and CLIENT_SECRET
    environment variables (None is passed through when a variable is unset).

    Returns:
        The object returned by `praw.Reddit(...)`.
    """
    credentials = {
        "client_id": os.getenv('CLIENT_ID'),
        "client_secret": os.getenv('CLIENT_SECRET'),
    }
    reddit_client = praw.Reddit(user_agent="prettiest_song/0.001", **credentials)
    return reddit_client


def getAll(r, submissionId, verbose=True):
    """Return every comment of a submission as a flat list.

    Args:
        r: client object exposing `submission(id)` (see getPraw).
        submissionId: id of the submission to fetch.
        verbose: accepted for interface compatibility; not used by this function.

    Returns:
        The result of `comments.list()` after all "more comments" placeholders
        have been expanded via `replace_more(limit=None)`.
    """
    thread = r.submission(submissionId)
    # limit=None asks for every placeholder to be replaced, not just the first few
    thread.comments.replace_more(limit=None)
    return thread.comments.list()


In [None]:
# print(datetime.now())
# r = getPraw()
# res = getAll(r, submission)
# print(datetime.now())

# print("retrieved ", len(res), 'comments')


In [None]:
# # we have a list of comment objects
# # filter comments with at least some karma
# res3 = [r for r in res if r.score >= minkarma]
# print('filtered to ', len(res3), 'comments')
# res3[0].body, res3[0].score


In [None]:
# save so we can reload it later without downloading

# with open('reddit_full.pkl', 'wb') as f:
#     pickle.dump(res3, f)
    
# reload the previously saved comment list instead of downloading it again
# NOTE: unpickling can execute arbitrary code — only load a pickle you produced yourself
with open('reddit_full.pkl', 'rb') as pkl_file:
    res3 = pickle.loads(pkl_file.read())


# Extract artists and song titles using OpenAI

In [None]:
# scan comment body lengths: record indices of very short (< 3 chars)
# and very long (> 1024 chars) comments, printing each as it is found
shorties = []
big_ones = []
for idx, comment in enumerate(res3):
    if len(comment.body) < 3:
        print (idx, comment.body)
        shorties.append(idx)
    if len(comment.body) > 1024:
        print(idx, len(comment.body))
        big_ones.append(idx)
        

In [None]:
# mean comment body length, in characters
sum(len(comment.body) for comment in res3) / len(res3)

In [None]:
# indices of comments whose score is zero or negative
[idx for idx, comment in enumerate(res3) if comment.score <= 0]

In [None]:
# Preview the first 500 characters of the first over-long comment.
# Original note: "already truncated" — presumably the bodies were cut before pickling; TODO confirm.
# Guarded so the cell does not raise IndexError when no comment exceeds 1024 characters.
if big_ones:
    print (res3[big_ones[0]].body[:500])
else:
    print("no comments longer than 1024 characters")

In [None]:
# Verbose-mode regex matching a single CSV field: optional surrounding whitespace,
# then either an unquoted run without commas/quotes, a double-quoted string, or a
# single-quoted string (backslash escapes allowed inside quotes), terminated by a
# comma or the end of the string. The field itself is captured in group 1.
csv_validate_re = re.compile(r'''
    \s*                # Any whitespace.
    (                  # Start capturing here.
      [^,"']+?         # Either a series of non-comma non-quote characters.
      |                # OR
      "(?:             # A double-quote followed by a string of characters...
          [^"\\]|\\.   # That are either non-quotes or escaped...
       )*              # ...repeated any number of times.
      "                # Followed by a closing double-quote.
      |                # OR
      '(?:[^'\\]|\\.)*'# Same as above, for single quotes.
    )                  # Done capturing.
    \s*                # Allow arbitrary space before the comma.
    (?:,|$)            # Followed by a comma or the end of a string.
    ''', re.VERBOSE)


In [None]:
# Use the real tokenizer so token counts are exact rather than estimated from characters.

# Build the tiktoken encoder that corresponds to the configured model (`gptmodel`).
enc = tiktoken.encoding_for_model(gptmodel)
# sanity check: encoding then decoding a simple string must round-trip unchanged
assert enc.decode(enc.encode("hello world")) == "hello world"

def count_tokens(s):
    """Return the number of tokens the module-level encoder `enc` produces for `s`."""
    token_ids = enc.encode(s)
    return len(token_ids)

count_tokens('four score and 7 years go our forefathers brought forth')

In [None]:
def grab_tokens(s, c):
    """Return the text obtained by decoding only the first `c` tokens of `s`.

    Uses the module-level encoder `enc` for both encoding and decoding.
    """
    return enc.decode(enc.encode(s)[:c])

grab_tokens('four score and 7 years go our forefathers brought forth', 4)


In [None]:
# (re)read the API key from the environment; None if the variable is unset
openai.api_key = os.getenv('OPENAI_API_KEY')

# list the available models as (index, id) pairs, then show the entry at index 2
models = openai.Model.list()
print(list(enumerate(m.id for m in models["data"])))
models['data'][2]

In [None]:
MAX_INPUT_TOKENS = 4096   # intended input cap; NOTE: get_response does not truncate yet
MAX_OUTPUT_TOKENS = 500   # pass this to OpenAI 
MAX_RETRIES = 3
TEMPERATURE= 0.4
MODEL='gpt-3.5-turbo'
# for future consideration
# start, end character to strip starting, ending boilerplate
# list of regexps to apply to e.g. strip boilerplate
# additional llm kwargs: top_p, n, presence_penalty, frequency_penalty, logit_bias
# support streaming into browser


def get_response(messages, 
                 verbose=False,
                 model = MODEL,
                 max_input_tokens = MAX_INPUT_TOKENS,
                 max_output_tokens = MAX_OUTPUT_TOKENS,
                 max_retries = MAX_RETRIES,
                 temperature = TEMPERATURE,
                 retry_delay = 5,
                ):
    """Send a chat completion request to OpenAI, retrying on any exception.

    Args:
        messages: list of chat message dicts, or any other value (typically a
            single string), which is wrapped as one user message for convenience.
        verbose: if True, print the messages before sending them.
        model: model name passed to the API.
        max_input_tokens: accepted for interface compatibility; input truncation
            is not implemented, so this value is currently unused.
        max_output_tokens: passed to the API as `max_tokens`.
        max_retries: maximum number of attempts before giving up.
        temperature: sampling temperature passed to the API.
        retry_delay: seconds to wait between failed attempts.

    Returns:
        The response object from `openai.ChatCompletion.create`, or None when
        every attempt raised an exception.
    """
    if not isinstance(messages, list):   # allow passing one string for convenience
        messages = [{"role": "user",
                     "content": messages}]

    if verbose:
        print("\n".join(str(msg) for msg in messages))

    # retry loop, have received untrapped 500 errors like too busy
    for attempt in range(1, max_retries + 1):
        if attempt > 1:
            print(f"Attempt {attempt}...")
        try:
            return openai.ChatCompletion.create(
                model=model,
                messages=messages,
                # honor the caller's value; this used to be hard-coded to 0
                temperature=temperature,
                max_tokens=max_output_tokens
            )
        except Exception as error:
            # deliberate best-effort: report the failure and try again
            print(f"An exception occurred on attempt {attempt}:", error)
            if attempt < max_retries:
                # no point sleeping after the final attempt
                time.sleep(retry_delay)

    print("Retries exceeded.")
    return None



In [None]:
BIGTEXT = """
please summarize the following: 
Stefani Joanne Angelina Germanotta (/ˈstɛfəni ˌdʒɜːrməˈnɒtə/ (listen) STEF-ən-ee JUR-mə-NOT-ə; born March 28, 1986), known professionally as Lady Gaga, is an American singer, songwriter, and actress. She is known for her image reinventions and versatility in the entertainment industry. Gaga began performing as a teenager, singing at open mic nights and acting in school plays. She studied at Collaborative Arts Project 21, through the New York University Tisch School of the Arts, before dropping out to pursue a career in music. After Def Jam Recordings canceled her contract, she worked as a songwriter for Sony/ATV Music Publishing, where she signed a joint deal with Interscope Records and KonLive Distribution, in 2007. Gaga had her breakthrough the following year with her debut studio album, The Fame, and its chart-topping singles "Just Dance" and "Poker Face". The album was later reissued to include the extended play The Fame Monster (2009), which yielded the successful singles "Bad Romance", "Telephone", and "Alejandro".

Gaga's five succeeding albums all debuted atop the US Billboard 200. Her second full-length album, Born This Way (2011), explored electronic rock and techno-pop and sold more than one million copies in its first week. The title track became the fastest-selling song on the iTunes Store, with over one million downloads in less than a week. Following her EDM-influenced third album, Artpop (2013), and its lead single "Applause", Gaga released the jazz album Cheek to Cheek (2014) with Tony Bennett, and the soft rock album Joanne (2016). She ventured into acting, winning awards for her leading roles in the miniseries American Horror Story: Hotel (2015–2016) and the musical film A Star Is Born (2018). Her contributions to the latter's soundtrack, which spawned the chart-topping single "Shallow", made her the first woman to win an Academy Award, BAFTA Award, Golden Globe Award, and Grammy Award in one year. Gaga returned to dance-pop with her sixth studio album, Chromatica (2020), which yielded the number-one single "Rain on Me". She both released her second collaborative album with Bennett, Love for Sale, and starred in the biopic House of Gucci in 2021.

Having sold an estimated 170 million records, Gaga is one of the world's best-selling music artists and the only female artist to achieve four singles each selling at least 10 million copies globally. Her accolades include 13 Grammy Awards, two Golden Globe Awards, 18 MTV Video Music Awards, awards from the Songwriters Hall of Fame and the Council of Fashion Designers of America, and recognition as Billboard's Artist of the Year (2010) and Woman of the Year (2015). She has also been included in several Forbes' power rankings and ranked fourth on VH1's Greatest Women in Music (2012). Time magazine named her one of the 100 most influential people in the world in 2010 and 2019 and placed her on their All-Time 100 Fashion Icons list. Her philanthropy and activism focus on mental health awareness and LGBT rights; she has her own non-profit organization, the Born This Way Foundation, which supports the wellness of young people. Gaga's business ventures include Haus Labs, a vegan cosmetics brand launched in 2019.

Life and career
1986–2004: Early life
Stefani Joanne Angelina Germanotta was born on March 28, 1986, at Lenox Hill Hospital in Manhattan, New York City,[1] to an upper middle class Catholic family. Both of her parents have Italian ancestry.[2] Her parents are Cynthia Louise (née Bissett), a philanthropist and business executive, and Internet entrepreneur Joseph Germanotta,[3] and she has a younger sister named Natali.[4] Brought up on the Upper West Side of Manhattan, Gaga said in an interview that her parents came from lower-class families and worked hard for everything.[5][6] From age 11, she attended the Convent of the Sacred Heart, a private all-girls Roman Catholic school.[7] Gaga has described her high-school self as "very dedicated, very studious, very disciplined" but also "a bit insecure". She considered herself a misfit and was mocked for "being either too provocative or too eccentric".[8]

Gaga began playing the piano at age four when her mother insisted she become "a cultured young woman". She took piano lessons and practiced through her childhood. The lessons taught her to create music by ear, which Gaga preferred over reading sheet music. Her parents encouraged her to pursue music and enrolled her in Creative Arts Camp.[9] As a teenager, she played at open mic nights.[10] Gaga played the lead roles of Adelaide in the play Guys and Dolls and Philia in the play A Funny Thing Happened on the Way to the Forum at Regis High School.[11] She also studied method acting at the Lee Strasberg Theatre and Film Institute for ten years.[12] Gaga unsuccessfully auditioned for New York shows, though did appear in a small background role as a high-school student in a 2001 episode of The Sopranos titled "The Telltale Moozadell".[13][14] She later said of her inclination towards music:

I don't know exactly where my affinity for music comes from, but it is the thing that comes easiest to me. When I was like three years old, I may have been even younger, my mom always tells this really embarrassing story of me propping myself up and playing the keys like this because I was too young and short to get all the way up there. Just go like this on the low end of the piano ... I was really, really good at piano, so my first instincts were to work so hard at practicing piano, and I might not have been a natural dancer, but I am a natural musician. That is the thing that I believe I am the greatest at.[15]

In 2003, at age 17, Gaga gained early admission to Collaborative Arts Project 21, a music school at New York University (NYU)'s Tisch School of the Arts, and lived in an NYU dorm. She studied music there, and improved her songwriting skills by writing essays on art, religion, social issues and politics, including a thesis on pop artists Spencer Tunick and Damien Hirst.[16][17] In 2005, Gaga withdrew from school during the second semester of her second year to focus on her music career.[18] That year, she also played an unsuspecting diner customer for MTV's Boiling Points, a prank reality television show.[19]

In a 2014 interview, Gaga said she had been raped at age 19, and later underwent mental and physical therapy for this.[20] She has post-traumatic stress disorder (PTSD) and attributes it to the incident, and has credited support from doctors, family and friends with helping her.[21] Gaga later gave additional details about the rape, including that "the person who raped me dropped me off pregnant on a corner at my parents' house because I was vomiting and sick. Because I'd been being abused. I was locked away in a studio for months."[22]

2005–2007: Career beginnings
In 2005, Gaga recorded two songs with hip-hop artist Melle Mel for an audio book accompanying Cricket Casey's children's novel The Portal in the Park.[23] She also formed a band called the SGBand with some friends from NYU.[11][24] They played gigs around New York and became a fixture of the downtown Lower East Side club scene.[11] After the 2006 Songwriters Hall of Fame New Songwriters Showcase at the Cutting Room in June, talent scout Wendy Starland recommended her to music producer Rob Fusari.[25] Fusari collaborated with Gaga, who traveled daily to New Jersey, helping to develop her songs and compose new material.[26] The producer said they began dating in May 2006, and claimed to have been the first person to call her "Lady Gaga", which was derived from Queen's song "Radio Ga Ga".[27] Their relationship lasted until January 2007.[28]

A scantily-clad Gaga singing on a stage. She has a microphone and black stockings.
Gaga performing at Lollapalooza in 2007
Fusari and Gaga established a company called "Team Lovechild, LLC" to promote her career.[27] They recorded and produced electropop tracks, sending them to music industry executives. Joshua Sarubin, the head of Artists and repertoire (A&R) at Def Jam Recordings, responded positively and, after approval from Sarubin's boss Antonio "L.A." Reid, Gaga was signed to Def Jam in September 2006.[29][30] She was dropped from the label three months later[31] and returned to her family home for Christmas. Gaga began performing at neo-burlesque shows, which according to her represented freedom.[32] During this time, she met performance artist Lady Starlight, who helped mold her onstage persona.[33] The pair began performing at downtown club venues like the Mercury Lounge, the Bitter End, and the Rockwood Music Hall. Their live performance art piece, known as "Lady Gaga and the Starlight Revue" and billed as "The Ultimate Pop Burlesque Rockshow", was a tribute to 1970s variety acts.[34][35] They performed at the 2007 Lollapalooza music festival.[34]

Having initially focused on avant-garde electronic dance music, Gaga began to incorporate pop melodies and the glam rock style of David Bowie and Queen into her songs. While Gaga and Starlight were performing, Fusari continued to develop the songs he had created with her, sending them to the producer and record executive Vincent Herbert.[36] In November 2007, Herbert signed Gaga to his label Streamline Records, an imprint of Interscope Records, established that month.[37] Gaga later credited Herbert as the man who discovered her.[38] Having served as an apprentice songwriter during an internship at Famous Music Publishing, Gaga struck a music publishing deal with Sony/ATV. As a result, she was hired to write songs for Britney Spears, New Kids on the Block, Fergie, and the Pussycat Dolls.[39] At Interscope, musician Akon was impressed with her singing abilities when she sang a reference vocal for one of his tracks in studio.[40] Akon convinced Jimmy Iovine, chairman and CEO of Interscope Geffen A&M Records (a brother company for Def Jam), to form a joint deal by having Gaga also sign with his own label KonLive, making her his "franchise player".[31][41]

In late 2007, Gaga met with songwriter and producer RedOne.[42] She collaborated with him in the recording studio for a week on her debut album, signing with Cherrytree Records, an Interscope imprint established by producer and songwriter Martin Kierszenbaum; she also wrote four songs with Kierszenbaum.[39] Despite securing a record deal, she said that some radio stations found her music too "racy", "dance-oriented", and "underground" for the mainstream market, to which she replied: "My name is Lady Gaga, I've been on the music scene for years, and I'm telling you, this is what's next."[7]

2008–2010: Breakthrough with The Fame and The Fame Monster
By 2008, Gaga had relocated to Los Angeles to work extensively with her record label to complete her debut album, The Fame, and to set up her own creative team called the Haus of Gaga, modeled on Andy Warhol's The Factory.[43][44] The Fame was released on August 19, 2008,[45] and reached number one in Austria, Canada, Germany, Ireland, Switzerland and the UK, as well as the top five in Australia and the US.[46][47] Its first two singles, "Just Dance" and "Poker Face",[48] reached number one in the United States,[49] Australia,[50] Canada[51] and the UK.[52] The latter was also the world's best-selling single of 2009, with 9.8 million copies sold that year, and spent a record 83 weeks on Billboard magazine's Digital Songs chart.[53][54] Three other singles, "Eh, Eh (Nothing Else I Can Say)", "LoveGame" and "Paparazzi", were released from the album;[55] the lattermost reached number one in Germany.[56] Remixed versions of the singles from The Fame, except "Eh, Eh (Nothing Else I Can Say)", were included on Hitmixes in August 2009.[57] At the 52nd Annual Grammy Awards, The Fame and "Poker Face" won Best Dance/Electronica Album and Best Dance Recording, respectively.[58]

A young woman on stage. She's wearing a leopard printed shirt.
Gaga on The Monster Ball Tour in 2010. It grossed $227 million and became the highest-grossing concert tour for a debut headlining artist.[59]
Following her opening act on the Pussycat Dolls' 2009 Doll Domination Tour in Europe and Oceania, Gaga headlined her worldwide The Fame Ball Tour, which ran from March to September 2009.[60] While traveling the globe, she wrote eight songs for The Fame Monster, a reissue of The Fame.[61] Those new songs were also released as a standalone EP on November 18, 2009.[62] Its first single, "Bad Romance", was released one month earlier[63] and went number one in Canada[51] and the UK,[52] and number two in the US,[49] Australia[64] and New Zealand.[65] "Telephone", with Beyoncé, followed as the second single from the EP and became Gaga's fourth UK number one.[66][67] Its third single was "Alejandro",[68] which reached number one in Finland[69] and attracted controversy when its music video was deemed blasphemous by the Catholic League.[70] Both tracks reached the top five in the US.[49] The video for "Bad Romance" became the most watched on YouTube in April 2010, and that October, Gaga became the first person with more than one billion combined views.[71][72] At the 2010 MTV Video Music Awards, she won eight awards from 13 nominations, including Video of the Year for "Bad Romance".[73] She was the most nominated artist for a single year, and the first woman to receive two nominations for Video of the Year at the same ceremony.[74] The Fame Monster won the Grammy Award for Best Pop Vocal Album, and "Bad Romance" won Best Female Pop Vocal Performance and Best Short Form Music Video at the 53rd Annual Grammy Awards.[75]

In 2009, Gaga spent a record 150 weeks on the UK Singles Chart and became the most downloaded female act in a year in the US, with 11.1 million downloads sold, earning an entry in the Guinness Book of World Records.[76][77] Worldwide, The Fame and The Fame Monster together have sold more than 15 million copies, and the latter was 2010's second best-selling album.[78][79][80] Its success allowed Gaga to start her second worldwide concert tour, The Monster Ball Tour, and release The Remix, her final record with Cherrytree Records[81] and among the best-selling remix albums of all time.[82][83] The Monster Ball Tour ran from November 2009 to May 2011 and grossed $227.4 million, making it the highest-grossing concert tour for a debut headlining artist.[59][84] Concerts performed at Madison Square Garden in New York City were filmed for an HBO television special, Lady Gaga Presents the Monster Ball Tour: At Madison Square Garden.[85] Gaga also performed songs from her albums at the 2009 Royal Variety Performance, the 52nd Annual Grammy Awards, and the 2010 Brit Awards.[86] Before Michael Jackson's death, Gaga was set to take part in his canceled This Is It concert series at the O2 Arena in the UK.[87]

During this era, Gaga ventured into business, collaborating with consumer electronics company Monster Cable Products to create in-ear, jewel-encrusted headphones called Heartbeats by Lady Gaga.[88] She also partnered with Polaroid in January 2010 as their creative director and announced a suite of photo-capture products called Grey Label.[89][90] Her collaboration with her past record producer and ex-boyfriend Rob Fusari led to a lawsuit against her production team, Mermaid Music LLC.[a] At this time, Gaga was tested borderline positive for lupus, but claimed not to be affected by the symptoms and hoped to maintain a healthy lifestyle.[93][94]

2011–2014: Born This Way, Artpop, and Cheek to Cheek
In February 2011, Gaga released "Born This Way", the lead single from her studio album of the same name. The song sold more than one million copies within five days, earning the Guinness World Record for the fastest selling single on iTunes.[95] It debuted atop the Billboard Hot 100, becoming the 1,000th number-one single in the history of the charts.[96] Its second single "Judas" followed two months later,[97] and "The Edge of Glory" served as its third single.[98] Both reached the top 10 in the US and the UK.[49][52] Her music video for "The Edge of Glory", unlike her previous work, portrays her dancing on a fire escape and walking on a lonely street, without intricate choreography and back-up dancers.[99]

Gaga performing onstage wearing black leather jacket and bodysuit. She has blue hair
Gaga promoting Born This Way with performances in Sydney, Australia
Born This Way was released on May 23, 2011,[97] and debuted atop the Billboard 200 with first-week sales of 1.1 million copies.[100] The album sold eight million copies worldwide and received three Grammy nominations, including Gaga's third consecutive nomination for Album of the Year.[101][102] Rolling Stone listed it among "The 500 Greatest Albums of All Time" in 2020.[103] Born This Way's following singles were "You and I" and "Marry the Night",[104] which reached numbers six and 29 in the US, respectively.[49] While filming the former's music video, Gaga met and started dating actor Taylor Kinney in July 2011, who played her love interest.[105][106] She also embarked on the Born This Way Ball tour in April 2012, which was scheduled to conclude the following March, but ended one month earlier when Gaga canceled the remaining dates due to a labral tear of her right hip that required surgery.[107] While refunds for the cancellations were estimated to be worth $25 million,[108] the tour grossed $183.9 million globally.[109]

In 2011, Gaga also worked with Tony Bennett on a jazz version of "The Lady Is a Tramp",[110] with Elton John on "Hello Hello" for the animated feature film Gnomeo & Juliet,[111] and with The Lonely Island and Justin Timberlake on "3-Way (The Golden Rule)".[112] She also performed a concert at the Sydney Town Hall in Australia that year to promote Born This Way and to celebrate former US President Bill Clinton's 65th birthday.[113] In November, she was featured in a Thanksgiving television special titled A Very Gaga Thanksgiving, which attracted 5.7 million American viewers and spawned the release of her fourth EP, A Very Gaga Holiday.[114] In 2012, Gaga guest-starred as an animated version of herself in an episode of The Simpsons called "Lisa Goes Gaga",[115] and released her first fragrance, Lady Gaga Fame, followed by a second one, Eau de Gaga, in 2014.[b]

Gaga began work on her third studio album, Artpop, in early 2012, during the Born This Way Ball tour; she crafted the album to mirror "a night at the club".[118][119][120] In August 2013, Gaga released the album's lead single "Applause",[121] which reached number one in Hungary, number four in the US, and number five in the UK.[52][49][122] A lyric video for Artpop track "Aura" followed in October to accompany Robert Rodriguez's Machete Kills, where she plays an assassin named La Chameleon.[123] The film received generally mixed reviews and earned less than half of its $33 million budget.[124][125] The second Artpop single, "Do What U Want", featured singer R. Kelly and was released later that month,[126] topping the charts in Hungary and reaching number 13 in the US.[49][127] Artpop was released on November 6, 2013, to mixed reviews.[128] Helen Brown in The Daily Telegraph criticized Gaga for making another album about her fame and doubted the record's originality, but found it "great for dancing".[129] The album debuted atop the Billboard 200 chart, and sold more than 2.5 million copies worldwide as of July 2014.[130][131] "G.U.Y." was released as the third single in March 2014 and peaked at number 76 in the US.[49][132]

A man and a woman standing closely together. The man (left) is wearing a grey suit, white shirt and a black tie while the woman (right) is wearing a black gown, black gloves and a black headpiece. They both hold a microphone in their left hand.
With the Cheek to Cheek era, Gaga (seen here performing on the Cheek to Cheek Tour alongside Tony Bennett) ushered in an overhaul of her image.[133]
Gaga hosted an episode of Saturday Night Live in November 2013.[134] After holding her second Thanksgiving Day television special on ABC, Lady Gaga and the Muppets Holiday Spectacular, she performed a special rendition of "Do What U Want" with Christina Aguilera on the fifth season of the American reality talent show The Voice.[135][136] In March 2014, Gaga had a seven-day concert residency commemorating the last performance at New York's Roseland Ballroom before its closure.[137] Two months later, she embarked on the ArtRave: The Artpop Ball tour, building on concepts from her ArtRave promotional event. Earning $83 million, the tour included cities canceled from the Born This Way Ball tour itinerary.[138] In the meantime, Gaga split from longtime manager Troy Carter over "creative differences",[139] and by June 2014, she and new manager Bobby Campbell joined Artist Nation, the artist management division of Live Nation Entertainment.[140] She briefly appeared in Rodriguez's Sin City: A Dame to Kill For, and was confirmed as Versace's spring-summer 2014 ambassador with a campaign called "Lady Gaga For Versace".[141][142]

In September 2014, Gaga released a collaborative jazz album with Tony Bennett titled Cheek to Cheek. The inspiration behind the album came from her friendship with Bennett, and fascination with jazz music since her childhood.[143] He stated that Gaga is "the most talented artist I have ever met".[144] Before the album was released, it produced the singles "Anything Goes" and "I Can't Give You Anything but Love".[145] Cheek to Cheek received generally favorable reviews;[146] The Guardian's Caroline Sullivan praised Gaga's vocals and Howard Reich of the Chicago Tribune wrote that "Cheek to Cheek serves up the real thing, start to finish".[147][148] The record was Gaga's third consecutive number-one album on the Billboard 200,[149] and won a Grammy Award for Best Traditional Pop Vocal Album.[150] The duo recorded the concert special Tony Bennett and Lady Gaga: Cheek to Cheek Live!,[151] and embarked on the Cheek to Cheek Tour from December 2014 to August 2015.[152]

2015–2017: American Horror Story, Joanne, and Super Bowl performances
In February 2015, Gaga became engaged to Taylor Kinney.[153] After the lukewarm response to Artpop, Gaga began to reinvent her image and style. According to Billboard, this shift started with the release of Cheek to Cheek and the attention she received for her performance at the 87th Academy Awards, where she sang a medley of songs from The Sound of Music in a tribute to Julie Andrews.[133] Considered one of her best performances by Billboard, it triggered more than 214,000 interactions per minute globally on Facebook.[154][155] She and Diane Warren co-wrote the song "Til It Happens to You" for the documentary The Hunting Ground, which earned them the Satellite Award for Best Original Song and an Academy Award nomination in the same category.[156] Gaga won Billboard Woman of the Year and Contemporary Icon Award at the 2015 Annual Songwriters Hall of Fame Awards.[157][158]

Gaga had spent much of her early life wanting to be an actress, and achieved her goal when she starred in American Horror Story: Hotel.[159] Running from October 2015 to January 2016, Hotel is the fifth season of the television anthology horror series, American Horror Story, in which Gaga played a hotel owner named Elizabeth.[160][161] At the 73rd Golden Globe Awards, Gaga received the Best Actress in a Miniseries or Television Film award for her work on the season.[159] She appeared in Nick Knight's 2015 fashion film for Tom Ford's 2016 spring campaign[162] and was guest editor for V fashion magazine's 99th issue in January 2016, which featured 16 different covers.[163] She received Editor of the Year award at the Fashion Los Angeles Awards.[164]

Lady Gaga standing behind a microphone stand with a pink guitar in her hands, wearing black leather fringe.
Gaga performing on the Joanne World Tour in 2017
In February 2016, Gaga sang the US national anthem at Super Bowl 50,[165] partnered with Intel and Nile Rodgers for a tribute performance to the late David Bowie at the 58th Annual Grammy Awards,[166] and sang "Til It Happens to You" at the 88th Academy Awards, where she was introduced by Joe Biden and was accompanied on-stage by 50 people who had suffered from sexual assault.[167] She was honored that April with the Artist Award at the Jane Ortner Education Awards by The Grammy Museum, which recognizes artists who have demonstrated passion and dedication to education through the arts.[168] Her engagement to Taylor Kinney ended in July; she later said her career had interfered with their relationship.[169]

Gaga played a witch named Scathach in American Horror Story: Roanoke, the series' sixth season,[170] which ran from September to November 2016.[171][172] Her role in the fifth season of the show ultimately influenced her future music, prompting her to feature "the art of darkness".[173] In September 2016, she released her fifth album's lead single, "Perfect Illusion", which topped the charts in France and reached number 15 in the US.[174][175][176] The album, titled Joanne, was named after Gaga's late aunt, who was an inspiration for the music.[177] It was released on October 21, 2016, and became Gaga's fourth number one album on the Billboard 200, making her the first woman to reach the US chart's summit four times in the 2010s.[178] The album's second single, "Million Reasons", followed the next month and reached number four in the US.[176][179] She later released a piano version of the album's title track in 2018,[180] which won a Grammy for Best Pop Solo Performance.[181] To promote the album, Gaga embarked on the three-date Dive Bar Tour.[182]

Gaga performed as the headlining act during the Super Bowl LI halftime show on February 5, 2017. Her performance featured a group of hundreds of lighted drones forming various shapes in the sky above Houston's NRG Stadium—the first time robotic aircraft appeared in a Super Bowl program.[183] It attracted 117.5 million viewers in the United States, exceeding the game's 111.3 million viewers and making it the second most-watched Super Bowl halftime show to date.[184] The performance led to a surge of 410,000 song downloads in the United States for Gaga and earned her an Emmy nomination in the Outstanding Special Class Program category.[185][186] CBS Sports included her performance as the second best in the history of Super Bowl halftime shows.[187] In April, Gaga headlined the Coachella Valley Music and Arts Festival.[188] She also released a standalone single, "The Cure", which reached the top 10 in Australia.[189][190] Four months later, Gaga began the Joanne World Tour, which she announced after the Super Bowl LI halftime show.[191] Gaga's creation of Joanne and preparation for her halftime show performance were featured in the documentary Gaga: Five Foot Two, which premiered on Netflix that September.[192] Throughout the film, she was seen suffering from chronic pain, which was later revealed to be the effect of a long-term condition called fibromyalgia.[193] In February 2018, it prompted Gaga to cancel the last ten shows of the Joanne World Tour, which ultimately grossed $95 million from 842,000 tickets sold.[194][195]

2018–2019: A Star Is Born and Las Vegas residency
A picture of Lady Gaga in a burgundy one shoulder dress, looking to the right.
Gaga at the 2018 Toronto International Film Festival prior to the screening of A Star Is Born, which was her first lead role in a film.[196]
In March 2018, Gaga supported the March for Our Lives gun-control rally in Washington, D.C.,[197] and released a cover of Elton John's "Your Song" for his tribute album Revamp.[198] Later that year, she starred as struggling singer Ally in Bradley Cooper's musical romantic drama A Star Is Born, a remake of the 1937 film of the same name. The film follows Ally's relationship with singer Jackson Maine (played by Cooper), which becomes strained after her career begins to overshadow his. It received acclaim from critics, with a consensus that the movie had "appealing leads, deft direction, and an affecting love story".[199] Cooper approached Gaga after seeing her perform at a cancer research fundraiser. An admirer of Cooper's work, Gaga agreed to the project due to its portrayal of addiction and depression.[200][201] A Star Is Born premiered at the 2018 Venice Film Festival, and was released worldwide that October.[202] Gaga's performance was acclaimed by film critics, with Peter Bradshaw of The Guardian labeling the film "outrageously watchable" and stating that "Gaga's ability to be part ordinary person, part extraterrestrial celebrity empress functions at the highest level";[203] Stephanie Zacharek of Time magazine similarly highlighted her "knockout performance" and found her to be "charismatic" without her usual makeup, wigs and costumes.[204] For the role, Gaga won the National Board of Review and Critics' Choice awards for Best Actress, in addition to receiving nominations for the Academy Award, Golden Globe Award, Screen Actors Guild Award and BAFTA Award for Best Actress.[205]

Gaga and Cooper co-wrote and produced most of the songs on the soundtrack for A Star Is Born, which she insisted they perform live in the film.[206] Its lead single, "Shallow", performed by the two, was released on September 27, 2018[207] and topped the charts in various countries including Australia, the UK and the US.[208] The soundtrack contains 34 tracks, including 17 original songs, and received generally positive reviews;[209] Mark Kennedy of The Washington Post called it a "five-star marvel" and Ben Beaumont-Thomas of The Guardian termed it an "instant classics full of Gaga's emotional might".[210][211] Commercially, the soundtrack debuted at number one in the US, making Gaga the first woman with five US number-one albums in the 2010s, and breaking her tie with Taylor Swift as the most for any female artist this decade;[212] Swift tied with her again in 2019.[213] It additionally topped the charts in Australia, Canada, Ireland, New Zealand, Switzerland and the UK.[214] As of June 2019, the soundtrack had sold over six million copies worldwide.[215] The album won Gaga four Grammy Awards—Best Compilation Soundtrack for Visual Media and Best Pop Duo/Group Performance and Best Song Written for Visual Media for "Shallow", as well as the latter category for "I'll Never Love Again"—and a BAFTA Award for Best Film Music.[181][216][217] "Shallow" also won her the Academy Award, Golden Globe Award, Critics' Choice Award, and Satellite Award for Best Original Song.[205] Gaga gave live performances of the song at the 61st Annual Grammy Awards and the 91st Academy Awards.[218][219]

In October, Gaga announced her engagement to talent agent Christian Carino, whom she had met in early 2017.[220] They ended the engagement in February 2019.[221] Gaga signed a concert residency, named Lady Gaga Enigma + Jazz & Piano, to perform at the MGM Park Theater in Las Vegas.[222] The residency consisted of two types of shows: Enigma, which focused on theatricality and included Gaga's biggest hits,[223] and Jazz & Piano, which involved tracks from the Great American Songbook and stripped-down versions of Gaga's songs. The Enigma show opened in December 2018 and the Jazz & Piano in January 2019.[224] Gaga launched her vegan makeup line, Haus Laboratories, in September 2019 exclusively on Amazon. Consisting of 40 products, including liquid eyeliners, lip glosses and face mask stickers, it reached number one on Amazon's list of best-selling lipsticks.[225]

2020–present: Chromatica, Love for Sale, and House of Gucci
In February 2020, Gaga began a relationship with entrepreneur Michael Polansky.[226] Her sixth studio album, Chromatica, was released on May 29, 2020, to positive reviews.[227][228] It debuted atop the US charts, becoming her sixth consecutive number-one album in the country, and reached the top spot in more than a dozen other territories including Australia, Canada, France, Italy and the UK.[229] Chromatica was preceded by two singles, "Stupid Love", on February 28, 2020,[230] and "Rain on Me", with Ariana Grande, on May 22.[231] The latter won the Best Pop Duo/Group Performance at the 63rd Annual Grammy Awards, and debuted at number one in the US, making Gaga the third person to top the country's chart in the 2000s, 2010s and 2020s.[232][233] At the 2020 MTV Video Music Awards, Gaga won five awards, including the inaugural Tricon Award recognizing artists accomplished in different areas of the entertainment industry.[234] In September 2020, she appeared in the video campaign for Valentino's Voce Viva fragrance, singing a stripped-down version of Chromatica track "Sine from Above", along with a group of models.[235]

A blonde woman with wet-look hairstyle singing to a microphone on stage. She is wearing a black leather jacket.
Gaga performing on The Chromatica Ball in 2022, which is her first all-stadium concert tour.[236]
During the inauguration of Joe Biden as the 46th President of the United States on January 20, 2021, Gaga sang the US national anthem.[237] In February 2021, her dog walker Ryan Fischer was hospitalized after getting shot in Hollywood. Two of her French Bulldogs, Koji and Gustav, were taken while a third dog named Miss Asia escaped and was subsequently recovered by police. Gaga later offered a $500,000 reward for the return of her pets.[238][239] Two days later, on February 26, a woman brought the dogs to a police station in Los Angeles. Both were unharmed. Los Angeles Police initially said the woman who dropped off the dogs did not appear to be involved with the shooting,[240] but on April 29, she was one of five people charged in connection with the shooting and theft.[241] In December 2022, James Howard Jackson, the man who shot Fischer, was sentenced to 21 years in prison.[242]

In April 2021, Gaga teamed up with Champagne brand Dom Pérignon, and appeared in an ad shot by Nick Knight.[243] On September 3, she released her third remix album, Dawn of Chromatica.[244] This was followed by her second collaborative album with Tony Bennett, titled Love for Sale, on September 30.[245] The record received generally favorable reviews, and debuted at number eight in the US.[246][247] The album's promotional rollout included the television special One Last Time: An Evening with Tony Bennett and Lady Gaga, released in November 2021, on CBS, which featured select performances from the duo's August 3 and 5 performances at Radio City Music Hall.[248][249] Another taped performance by the duo recorded for MTV Unplugged was released that December.[250] At the 64th Annual Grammy Awards, Love for Sale won Gaga and Bennett the award for Best Traditional Pop Vocal Album.[251]

After an appearance in the television special Friends: The Reunion, in which Gaga sang "Smelly Cat" with Lisa Kudrow,[252] she portrayed Patrizia Reggiani, who was convicted of hiring a hitman to murder her ex-husband and former head of the Gucci fashion house Maurizio Gucci (played by Adam Driver), in Ridley Scott's biographical crime film titled House of Gucci.[253][254] For the part, Gaga learned to speak with an Italian accent. She also stayed in character for 18 months, speaking with an accent for nine months during that period.[255] Her method acting approach took a toll on her mental wellbeing, and towards the end of filming she had to be accompanied on-set by a psychiatric nurse.[256] The film was released on November 24, 2021, to mixed reviews, though critics praised Gaga's performance as "note-perfect".[257] She earned the New York Film Critics Circle Award, and nominations for the BAFTA Award, Critics' Choice Award, Golden Globe Award and Screen Actors Guild Award for Best Actress.[258] Gaga co-wrote the song "Hold My Hand" for the 2022 film Top Gun: Maverick,[259] and also composed the score alongside Hans Zimmer and Harold Faltermeyer.[260] She performed "Hold My Hand" live at the 95th Academy Awards, where it was nominated for Best Original Song.[261] The track also earned Gaga her third win for Best Original Song at the Satellite Awards.[262] In July 2022, she embarked on The Chromatica Ball stadium tour,[263] which had twenty dates and grossed $112.4 million from 834,000 tickets sold.[236] By the end of the year, she became the highest grossing female artist touring in 2022.[264]

In April 2023, Gaga was appointed as co-chair of the President's Committee on the Arts and Humanities by President Joe Biden.[265] A documentary called The Lady and the Legend, which will include footage from the making of Gaga and Tony Bennett's two collaborative albums and "chronicles their 10-year relationship", will be available exclusively on Paramount+ in September 2023.[249][266] Gaga is set to star with Joaquin Phoenix in Joker: Folie à Deux, which will be released in 2024.[267]

Artistry
Influences
A woman with plaited hair, blue eyes and red lipstick wearing a colorful dress and guitar strap.
A man smiling; he has brown hair and wears a suit jacket and vest, and a white shirt open at the collar. His blue tie is not fastened.
Musicians such as Madonna and David Bowie have influenced Gaga.
Gaga grew up listening to artists such as Michael Jackson,[268] the Beatles, Stevie Wonder, Queen, Bruce Springsteen, Pink Floyd, Led Zeppelin, Whitney Houston,[269] Elton John, Prince,[270][271] En Vogue, TLC,[272] Christina Aguilera,[273] Janet Jackson,[274] and Blondie,[275] who have all influenced her music.[276] Gaga's musical inspiration varies from dance-pop singers such as Madonna and Michael Jackson to glam rock artists such as David Bowie and Freddie Mercury, as well as the theatrics of the pop artist Andy Warhol and her own performance roots in musical theater.[31][277] She has been compared to Madonna, who has said that she sees herself reflected in Gaga.[278] Gaga says that she wants to revolutionize pop music as Madonna has.[279] Gaga has also cited heavy metal bands as an influence, specifically Iron Maiden, Black Sabbath and Marilyn Manson.[280][281][282][283] She credits Beyoncé as a key inspiration to pursue a musical career.[284]

Gaga was inspired by her mother to be interested in fashion, which she now says is a major influence and integrated with her music.[18][285] Stylistically, Gaga has been compared to Leigh Bowery, Isabella Blow, and Cher;[286][287] she once commented that as a child, she absorbed Cher's fashion sense and made it her own.[287] Gaga became friends with British fashion designer Alexander McQueen shortly before his suicide in 2010, and became known for wearing his designs, particularly his towering armadillo shoes.[93][288] She considers fashion designer Donatella Versace her muse; Versace has called Lady Gaga "the fresh Donatella".[289][290] Gaga has also been influenced by Princess Diana, whom she has admired since her childhood.[291]

Gaga has called the Indian alternative medicine advocate Deepak Chopra a "true inspiration",[292] and has also quoted Indian leader Osho's book Creativity on Twitter. Gaga says she was influenced by Osho's work in valuing rebellion through creativity and equality.[293]

Musical style and themes
Critics have analyzed and scrutinized Gaga's musical and performance style, as she has experimented with new ideas and images throughout her career. She says the continual reinvention, which she has been drawn to since childhood, is "liberating".[294] Gaga combines a variety of music genres, particularly incorporating elements of rock into her pop and dance music. She has also branched out into jazz and other non-pop musical genres.[295] Gaga is a contralto, with a range spanning from B♭2 to B5.[296][297][298] She has changed her vocal style regularly, and considers Born This Way "much more vocally up to par with what I've always been capable of".[299][300] In summing up her voice, Entertainment Weekly wrote: "There's an immense emotional intelligence behind the way she uses her voice. Almost never does she overwhelm a song with her vocal ability, recognizing instead that artistry is to be found in nuance rather than lung power."[301]

According to Evan Sawdey of PopMatters, Gaga "manage[s] to get you moving and grooving at an almost effortless pace".[302] Gaga believes that "all good music can be played on a piano and still sound like a hit".[303] Simon Reynolds wrote in 2010, "Everything about Gaga came from electroclash, except the music, which wasn't particularly 1980s, just ruthlessly catchy naughties pop glazed with Auto-Tune and undergirded with R&B-ish beats."[304]

Gaga's songs have covered a wide variety of concepts; The Fame discusses the lust for stardom, while the follow-up The Fame Monster expresses fame's dark side through monster metaphors. The Fame is an electropop and dance-pop album that has influences of 1980s pop and 1990s Europop,[305] whereas The Fame Monster displays Gaga's taste for pastiche, drawing on "Seventies arena glam, perky ABBA disco, and sugary throwbacks like Stacey Q".[306] Born This Way has lyrics in English, French, German, and Spanish and features themes common to Gaga's controversial songwriting such as sex, love, religion, money, drugs, identity, liberation, sexuality, freedom, and individualism.[307] The album explores new genres, such as electronic rock and techno.[308]

The themes in Artpop revolve around Gaga's personal views of fame, love, sex, feminism, self-empowerment, overcoming addiction, and reactions to media scrutiny.[309] Billboard describes Artpop as "coherently channeling R&B, techno, disco and rock music".[310] With Cheek to Cheek, Gaga dabbled in the jazz genre.[311] Joanne, exploring the genres of country, funk, pop, dance, rock, electronic music and folk, was influenced by her personal life.[312] The A Star Is Born soundtrack contains elements of blues rock, country and bubblegum pop.[210] Billboard says its lyrics are about wanting change, its struggle, love, romance, and bonding, describing the music as "timeless, emotional, gritty and earnest. They sound like songs written by artists who, quite frankly, are supremely messed up but hit to the core of the listener."[313] On Chromatica, Gaga returned to her dance-pop roots, and discussed her struggles with mental health.[314] Her second album with Tony Bennett, Love for Sale, consists of a tribute to Cole Porter.[315]

Videos and stage
A pale-skinned woman holding her hands crossed and intertwined in the air. She has yellow hair and wears a low-cut bodysuit. Her chest is marked with red liquid.
Gaga during a "blood soaked" performance in 2010
Featuring constant costume changes and provocative visuals, Gaga's music videos are often described as short films.[316] The video for "Telephone" earned Gaga the Guinness World Record for Most Product Placement in a Video.[317] According to author Curtis Fogel, she explores bondage and sadomasochism and highlights prevalent feminist themes. The main themes of her music videos are sex, violence, and power. She calls herself "a little bit of a feminist" and asserts that she is "sexually empowering women".[318] Billboard ranked her sixth on its list of "The 100 Greatest Music Video Artists of All Time" in 2020, stating that "the name 'Lady Gaga' will forever be synonymous with culture-shifting music videos".[319]

Regarded as "one of the greatest living musical performers" by Rolling Stone,[320] Gaga has called herself a perfectionist when it comes to her elaborate shows.[321] Her performances have been described as "highly entertaining and innovative";[322] the blood-spurting performance of "Paparazzi" at the 2009 MTV Video Music Awards was described as "eye-popping" by MTV News.[323] She continued the blood-soaked theme during The Monster Ball Tour, causing protests in England from family groups and fans in the aftermath of the Cumbria shootings, in which a taxi driver had killed 12 people, then himself.[324] At the 2011 MTV Video Music Awards, Gaga appeared in drag as her male alter ego, Jo Calderone, and delivered a lovesick monologue before a performance of her song "You and I".[325] As Gaga's choreographer and creative director, Laurieann Gibson provided material for her shows and videos for four years before she was replaced by her assistant Richard Jackson in 2014.[326]

In an October 2018 article for Billboard, Rebecca Schiller traced back Gaga's videography from "Just Dance" to the release of A Star Is Born. Schiller noted that following the Artpop era, Gaga's stripped-down approach to music was reflected in the clips for the singles from Joanne, taking the example of the music video of lead single "Perfect Illusion" where she eschewed "the elaborate outfits for shorts and a tee-shirt as she performed the song at a desert party". It continued with her performances in the film as well as her stage persona.[327] Reviewing The Chromatica Ball in 2022, Chris Willman of Variety wrote that Gaga "could have further played the authenticity card for all it's worth" after the release of Joanne and A Star Is Born, but instead "has determined to keep herself weird — or just weird enough to provide necessarily ballast to her more earnest inclinations".[328]

Public image
A realistic mannequin of a pale-skinned woman with blonde hair wearing a hat in the design of an old-fashioned telephone.
In 2010, eight wax figures of Gaga were installed at the museum Madame Tussauds.[329]
Public reception of her music, fashion sense, and persona is polarized. Because of her influence on modern culture, and her rise to global fame, sociologist Mathieu Deflem of the University of South Carolina has offered a course titled "Lady Gaga and the Sociology of the Fame" since early 2011 with the objective of unraveling "some of the sociologically relevant dimensions of the fame of Lady Gaga".[330] When Gaga met briefly with then-president Barack Obama at a Human Rights Campaign fundraiser, he found the interaction "intimidating" as she was dressed in 16-inch heels, making her the tallest woman in the room.[331] When interviewed by Barbara Walters for her annual ABC News special 10 Most Fascinating People in 2009, Gaga dismissed the claim that she is intersex as an urban legend. Responding to a question on this issue, she expressed her fondness for androgyny.[332]

Gaga's outlandish fashion sense has also served as an important aspect of her character.[286][289] During her early career, members of the media compared her fashion choices to those of Christina Aguilera.[289] In 2011, 121 women gathered at the Grammy Awards dressed in costumes similar to those worn by Gaga, earning the 2011 Guinness World Record for Largest Gathering of Lady Gaga Impersonators.[95] The Global Language Monitor named "Lady Gaga" as the Top Fashion Buzzword with her trademark "no pants" a close third.[333] Entertainment Weekly put her outfits on its end of the decade "best-of" list, saying that she "brought performance art into the mainstream".[334] People ranked her number one on their "Best Dressed Stars of 2021" list, writing that Gaga "strutted the streets in high-fashion designs, from a sculptural seersucker number to a black lace corseted gown—accessorizing each with elegant updos, sky-high heels and retro shades—like it was no sweat."[335]

Time placed Gaga on their All-Time 100 Fashion Icons list, stating: "Lady Gaga is just as notorious for her outrageous style as she is for her pop hits ... [Gaga] has sported outfits made from plastic bubbles, Kermit the Frog dolls, and raw meat."[336] Gaga wore a dress made of raw beef to the 2010 MTV Video Music Awards, which was supplemented by boots, a purse, and a hat also made out of raw beef.[337] Partly awarded in recognition of the dress, Vogue named her one of the Best Dressed people of 2010 and Time named the dress the Fashion Statement of the year.[338][339] It attracted the attention of worldwide media; the animal rights organization PETA found it offensive.[340] The meat dress was displayed at the National Museum of Women in the Arts in 2012,[341] and entered the Rock and Roll Hall of Fame in September 2015.[342]

Gaga's fans call her "Mother Monster", and she often refers to them as "Little Monsters", a phrase she had tattooed on herself in dedication.[343] In his article "Lady Gaga Pioneered Online Fandom Culture As We Know It" for Vice, Jake Hall wrote that Gaga inspired several subsequent fan-brandings, such as those of Taylor Swift, Rihanna and Justin Bieber.[344] In July 2012, Gaga also co-founded the social networking service LittleMonsters.com, devoted to her fans.[345] Scott Hardy, CEO of Polaroid, praised Gaga for inspiring fans and for her close interactions with them on social media.[346]

Censorship
In 2011, the Ministry of Culture of the People's Republic of China, acting on behalf of the State Administration of Radio, Film and Television, banned Gaga for "being vulgar."[347] The ban was lifted in 2014. However, for Artpop to go on sale legally in China, conditions were placed on the album artwork, requiring that her almost naked body be covered. Officials also changed the title of the song "Sexxx Dreams" to "X Dreams."[348]

In 2016, Gaga was banned in China again after she publicly talked with the Dalai Lama.[349][350] The Chinese government added Gaga to a list of hostile foreign forces, and Chinese websites and media organizations were ordered to stop distributing her songs. The Publicity Department of the Chinese Communist Party also issued an order for state-controlled media to condemn this meeting.[351] In the following years, Gaga's image was blacked out in reporting of the 91st Academy Awards in China and her appearance was cut from Friends: The Reunion; both incidents received backlash from her Chinese fans.[352][353]

Activism
Philanthropy
Having declined, because of rehearsals for her tour, an invitation to appear on "We Are the World 25 for Haiti"—a single to benefit victims of the 2010 Haiti earthquake—Gaga donated the proceeds of her January 2010 Radio City Music Hall concert to the country's reconstruction relief fund.[354] All profits from her online store that day were also donated, and Gaga announced that $500,000 was collected for the fund.[355] Hours after the 2011 Tōhoku earthquake and tsunami hit Japan, Gaga tweeted a link to Japan Prayer Bracelets. All revenue from a bracelet she designed in conjunction with the company was donated to relief efforts;[356] these raised $1.5 million.[357] In June 2011, Gaga performed at MTV Japan's charity show in Makuhari Messe, which benefited the Japanese Red Cross.[358]

In 2012, Gaga joined the campaign group Artists Against Fracking.[359] That October, Yoko Ono gave Gaga and four other activists the LennonOno Grant for Peace in Reykjavík, Iceland.[360] The following month, Gaga pledged to donate $1 million to the American Red Cross to help the victims of Hurricane Sandy. Gaga also contributes to the fight against HIV and AIDS, focusing on educating young women about the risks of the disease. In collaboration with Cyndi Lauper, Gaga joined forces with MAC Cosmetics to launch a line of lipstick under their supplementary cosmetic line, Viva Glam.[361] Sales have raised more than $202 million to fight HIV and AIDS.[362]

In April 2016, Gaga joined Vice President Joe Biden at the University of Nevada, Las Vegas to support Biden's It's On Us campaign as he traveled to colleges on behalf of the organization, which has seen 250,000 students from more than 530 colleges sign a pledge of solidarity and activism.[363] Two months later, Gaga attended the 84th Annual US Conference of Mayors in Indianapolis where she joined with the Dalai Lama to talk about the power of kindness and how to make the world a more compassionate place.[364]

In April 2020, Gaga curated the televised benefit concert, One World: Together at Home, a collaboration with Global Citizen to benefit the World Health Organization's COVID-19 Solidarity Response Fund.[365][366] The special raised $127 million, which according to Forbes "puts it on par with the other legendary fundraiser, Live Aid, as the highest grossing charity concert in history."[367] In recognition of her contribution to the Black Lives Matter movement, Gaga received the Yolanda Denise King High Ground Award from the King Center's Beloved Community Awards in January 2021. In her acceptance speech, she denounced racism and white supremacy and addressed her social responsibility as a high-profile artist and white woman.[368]

Born This Way Foundation
Main article: Born This Way Foundation
Refer to caption.
Gaga during an event for the Born This Way Foundation in Europe, 2013
In 2012, Gaga launched the Born This Way Foundation (BTWF), a non-profit organization that focuses on youth empowerment. It takes its name from her 2011 single and album. Media proprietor Oprah Winfrey, writer Deepak Chopra, and US Secretary of Health and Human Services Kathleen Sebelius spoke at the foundation's inauguration at Harvard University.[369] The foundation's original funding included $1.2 million from Gaga, $500,000 from the MacArthur Foundation, and $850,000 from Barneys New York.[370] In July 2012, the BTWF partnered with Office Depot, which donated 25% of the sales of a series of limited edition back-to-school products, with a minimum of $1 million.[371] The foundation's initiatives have included the "Born Brave Bus", an anti-bullying youth drop-in center that followed her on tour.[372][373]

In October 2015, at the Yale Center for Emotional Intelligence, Gaga joined 200 high school students, policy makers, and academic officials, including Peter Salovey, to discuss ways to recognize and channel emotions for positive outcomes.[374] In 2016, the foundation partnered with Intel, Vox Media, and Recode to fight online harassment.[375] The sales revenue of the 99th issue of V magazine, which featured Gaga and Kinney, was donated to the foundation.[163] Gaga and Elton John released the clothing and accessories line Love Bravery at Macy's in May. Twenty-five percent of each purchase supports Gaga's foundation and the Elton John AIDS Foundation.[376] Gaga partnered with Starbucks for a week in June 2017 with the "Cups of Kindness" campaign, where the company donated 25 cents from some of the beverages sold to the foundation.[377] She also appeared in a video by Staples Inc. to raise funds for the foundation and DonorsChoose.org.[378]

On the 2018 World Kindness Day, Gaga partnered with the foundation to bring food and relief to a Red Cross shelter for people who had been forced to evacuate their homes due to the California wildfires. The foundation also partnered with Starbucks and SoulCycle to thank California firefighters for their relief work during the crisis. The singer had previously had to evacuate her own home during the Woolsey Fire, which spread through parts of Malibu.[379]

In March 2019, she penned a letter to supporters of the Born This Way Foundation, announcing the launch of a new pilot program for a teen mental health first aid project with the National Council for Behavioral Health. Gaga revealed her personal struggles with mental health in her letter and how she was able to get support which saved her life: "I know what it means to have someone support me and understand what I'm going through, and every young person in the world should have someone to turn to when they're hurting. It saved my life, and it will save theirs."[380][381] In September 2020, Gaga released an anthology book, Channel Kindness: Stories of Kindness and Community, featuring fifty-one stories about kindness, bravery, and resilience from young people all over the world collected by the Born This Way Foundation, and introduced by herself.[382] She had been promoting it with a 21 days of kindness challenge on her social media, using the "BeKind21" hashtag.[383] In 2021, Gaga collaborated with the Champagne house Dom Pérignon to release a limited edition of Rosé Vintage 2005 bottles along with a sculpture designed by her. The 110 exclusive pieces will be sold at private sales, and the profits will benefit the foundation.[384] On the 2021 World Kindness Day, Gaga released a 30-minute special, titled The Power of Kindness, as part of the foundation's Channel Kindness program, in which together with a mental health expert and a group of eleven young people, she explored the connection between kindness and mental health.[385]

LGBT advocacy
A woman with blonde hair speaking at a podium into several microphones. She wears large glasses. The background is a series of red and white horizontal stripes.
Gaga speaking against "don't ask, don't tell" in Portland, Maine (2010)
A bisexual woman,[c] Gaga actively supports LGBT rights worldwide.[386] She attributes much of her early success as a mainstream artist to her gay fans and is considered a gay icon.[387][388] Early in her career, Gaga had difficulty getting radio airplay, and stated, "The turning point for me was the gay community."[389] She thanked FlyLife, a Manhattan-based LGBT marketing company with whom her label Interscope works, in the liner notes of The Fame.[390] One of her first televised performances was in May 2008 at the NewNowNext Awards, an awards show aired by the LGBT television network Logo.[391]

Gaga spoke at the 2009 National Equality March in Washington, D.C. to support the LGBT rights movement.[392] She attended the 2010 MTV Video Music Awards accompanied by four gay and lesbian former members of the United States Armed Forces who had been unable to serve openly under the US military's "don't ask, don't tell" policy, which banned open homosexuality in the military.[393] Gaga urged her fans via YouTube to contact their senators in an effort to overturn the policy. In September 2010, she spoke at a Servicemembers Legal Defense Network's rally in Portland, Maine. Following this event, The Advocate named her a "fierce advocate" for gays and lesbians.[394]

Gaga appeared at Europride, an international event dedicated to LGBT pride, in Rome in June 2011. She criticized the poor state of gay rights in many European countries and described gay people as "revolutionaries of love".[395] Later that year, she was referenced by teenager Jamey Rodemeyer in the hours prior to his death, with Rodemeyer having tweeted "@ladygaga bye mother monster, thank you for all you have done, paws up forever". Rodemeyer's suicide prompted Gaga to meet with then-President Barack Obama in order to address anti-gay bullying in American schools.[396] In 2011, she was also ordained as a minister by the Universal Life Church Monastery so that she could officiate the wedding of two female friends.[397]

In June 2016, during a vigil held in Los Angeles for victims of the attack at the gay nightclub Pulse in Orlando, Gaga read aloud the names of the 49 people killed in the attack, and gave a speech.[398] Later that month, Gaga appeared in Human Rights Campaign's tribute video to the victims of the attack.[399] She opposed the presidency of Donald Trump and his military transgender ban.[400][401] She supported former Secretary of State Hillary Clinton for president in 2016.[402] In 2018, a leaked memo from Trump's office revealed that his administration wanted to change the legal definition of sex to exclude transgender Americans. Gaga was one of the many celebrities to call him out and spread the #WontBeErased campaign to her 77 million Twitter followers.[403][404] In January 2019, during one of her Enigma shows, she criticized Vice President Mike Pence for his wife Karen Pence working at an evangelical Christian school where LGBTQ people are turned away, calling him "the worst representation of what it means to be a Christian". Gaga also stated "I am a Christian woman, and what I do know about Christianity is that we bear no prejudice, and everybody is welcome".[405] Gaga made a congratulatory speech commemorating the 50th anniversary of the Stonewall riots and the LGBTQ+ community's accomplishments at WorldPride NYC 2019 outside the Stonewall Inn, birthplace of the modern gay rights movement.[406]

Legacy
Gaga kneeling down wearing a shiny black upper garment, fishnet stockings and black high-heeled boots. Her hair is pale yellow.
Gaga performing on the ArtRave: The Artpop Ball tour in 2014
Gaga was named the "Queen of Pop" in a 2011 ranking by Rolling Stone based on record sales and social media metrics. In 2012, she ranked fourth in VH1's Greatest Women in Music[407][408] and became a feature of the temporary exhibition The Elevated. From the Pharaoh to Lady Gaga, which marked the 150th anniversary of the National Museum in Warsaw.[409]

Gaga has often been praised for using controversy to bring attention to various issues.[410][411] According to Frankie Graddon of The Independent, Gaga—who wore a meat dress to highlight her distaste for the US military's "don't ask, don't tell" policy—influenced protest dressing on the red carpet.[412] Billboard named her "the Greatest Pop Star of 2009", asserting that "to say that her one-year rise from rookie to MVP was meteoric doesn't quite cut it, as she wasn't just successful, but game-changing—thanks to her voracious appetite for reinvention."[413] Because of The Fame's success—it was listed as one of the 100 Greatest Debut Albums of All-Time by Rolling Stone in 2013[414]—Gaga has been credited as one of the musicians that popularized synth-pop in the late 2000s and early 2010s.[415]

According to Kelefa Sanneh of The New Yorker, "Lady Gaga blazed a trail for truculent pop stars by treating her own celebrity as an evolving art project."[416] Including Born This Way as one of the 50 best female albums of all time, Rolling Stone's Rob Sheffield considers it "hard to remember a world where we didn't have Gaga, although we're pretty sure it was a lot more boring".[417] In 2015, Time also noted that Gaga had "practically invented the current era of pop music as spectacle".[418] A 2017 study published in Psychology of Aesthetics, Creativity, and the Arts examining structural patterns in melodies of earworm songs compiled lists of catchiest tracks from 3,000 participants, in which Gaga's "Bad Romance", "Alejandro", and "Poker Face" ranked number one, eight, and nine, respectively.[419] In 2018, NPR named her the second most influential female artist of the 21st century, noting her as "one of the first big artists of the 'Internet age'".[420] Gaga and her work have influenced various artists including Miley Cyrus,[421] Nicki Minaj,[422] Ellie Goulding,[423] Halsey,[424] Jennifer Lopez,[425] Beyoncé,[426] Nick Jonas,[427] Sam Smith,[428] Noah Cyrus,[429] Katherine Langford,[430] MGMT,[431] Allie X,[432] Greyson Chance,[433] Cardi B,[434] Rina Sawayama,[435] Blackpink,[436] Madison Beer,[437] Ren,[438] Slayyyter,[439] Bebe Rexha,[440] Bree Runway,[441] Celeste,[442] Kim Petras,[443] Jojo Siwa,[444] Pabllo Vittar,[445] Ava Max,[446] Doja Cat,[447] Chaeyoung of Twice,[448] Kanye West,[449] Rachel Zegler,[450] SZA,[451] Raye,[452] and Grace Gaustad.[453]

A new genus of ferns, Gaga, and three species, G. germanotta, G. monstraparva and Kaikaia gaga, have been named in her honor. The name monstraparva alluded to Gaga's fans, known as Little Monsters, since their symbol is the outstretched "monster claw" hand, which resembles a tightly rolled young fern leaf prior to unfurling.[454][455] Gaga also has an extinct mammal, Gagadon minimonstrum,[456] and a parasitic wasp, Aleiodes gaga, named for her.[457][458]

In Taichung, Taiwan, July 3 is designated as "Lady Gaga Day" marking the first day Gaga visited the country in 2011.[459] In May 2021, to celebrate the tenth anniversary of Born This Way and its cultural impact, West Hollywood mayor, Lindsey P. Horvath, presented a key to the city to Gaga and declared May 23 as "Born This Way Day." A street painting with the Daniel Quasar's version of the pride flag featuring the album's title was also unveiled on Robertson Boulevard as a tribute to the album, and how it has inspired the LGBT community over the years.[460]

Achievements
See also: List of awards and nominations received by Lady Gaga
Gaga has won thirteen Grammy Awards,[461] an Academy Award,[205] two Golden Globe Awards,[462] a BAFTA Award,[205] three Brit Awards,[463] sixteen Guinness World Records,[464] and the inaugural Songwriters Hall of Fame's Contemporary Icon Award.[158] She received a National Arts Awards' Young Artist Award, which honors individuals who have shown accomplishments and leadership early in their career,[465] the Jane Ortner Artist Award from the Grammy Museum in 2016,[168] and a National Board of Review Award for Best Actress in 2018.[205] Gaga has also been recognized by the Council of Fashion Designers of America (CFDA) with the Fashion Icon award.[466] In 2019, she became the first woman to win an Academy Award, a BAFTA Award, a Golden Globe Award and a Grammy Award in one year for her contribution to A Star Is Born's soundtrack.[467] At the 2020 MTV Video Music Awards, she was honored with the inaugural Tricon Award representing achievement in three (or more) fields of entertainment.[234]

Acknowledged by Billboard as the Greatest Pop Star in 2009, with honorable mention in 2010 and 2011, and Woman of the Year in 2015, Gaga has consecutively appeared on the magazine's Artists of the Year chart (scoring the definitive title in 2010), and ranked 11th on its Top Artists of the 2010s chart.[468][469][470] She is the longest-reigning act of Billboard's Dance/Electronic Albums chart with 244 weeks at number one, while The Fame (2008) holds the record for the most time on top in the chart's history, with 175 non-consecutive weeks.[471][472] Her album Born This Way (2011) featured on Rolling Stone's 2020 revision of their 500 Greatest Albums of All Time, and the song "Bad Romance" and its music video were among Rolling Stone's 500 Greatest Songs of All Time and 100 Greatest Music Videos of All Time, respectively, in 2021.[473] In 2023, the magazine included Gaga among the 200 Greatest Singers of All Time.[474]

With estimated sales of 170 million records as of 2018,[475] Gaga is one of the world's best-selling music artists, and has produced some of the best-selling singles of all time.[476] As of 2022, she has grossed more than $689.5 million in revenue from concert tours and residencies with attendance of 6.3 million, being the fifth woman to pass the half-billion total as reported to Billboard Boxscore,[236][477] receiving the Pollstar Award for Pop Touring Artist of the Decade (2010s).[478] She is the fourteenth top digital singles artist in the US, with 87.5 million equivalent units certified according to Recording Industry Association of America (RIAA),[d] was the first woman to receive the Digital Diamond Award certification from RIAA, one of the few artists with at least three Diamond certified songs ("Bad Romance", "Poker Face" and "Just Dance"),[480][481] and the first and only artist to have two songs pass seven million downloads ("Poker Face" and "Just Dance").[482] In 2020, she became the first female artist to have four singles ("Just Dance", "Poker Face", "Bad Romance" and "Shallow") sell at least 10 million copies globally.[483]

According to Guinness World Records, she was the most followed person on Twitter from 2011 to 2013,[484] the most famous celebrity in 2013,[485] and the most powerful popstar in 2014.[486] She was included on Forbes' Celebrity 100 from 2010 to 2015 and then from 2018 to 2020, having topped the list in 2011. She earned $62 million, $90 million, $52 million, $80 million, $33 million, and $59 million from 2010 through 2015, and $50 million, $39 million and $38 million between 2018 and 2020.[487][488] Gaga also appeared on their list of the World's Most Powerful Women from 2010 to 2014.[489][490] She was named one of the 100 most influential people in the world by Time magazine in 2010 and 2019,[491][492] and ranked second in its most influential people of the past ten years readers' poll in 2013.[493]

In March 2012, Gaga was ranked fourth on Billboard's list of top moneymakers of 2011 with earnings of $25 million, which included sales from Born This Way and her Monster Ball Tour.[494] The following year, she topped Forbes' List of Top-Earning Celebs Under 30,[488] which she also topped in 2011,[495] and in February 2016, the magazine estimated her net worth to be $275 million.[496] In December 2019, Gaga placed 10th on Forbes' list of Top-Earning Musicians of the Decade with earnings of $500 million in the 2010s. She was the fourth highest-earning female musician on the list.[497]


"""

In [None]:

count_tokens(BIGTEXT)

In [None]:
three = grab_tokens(BIGTEXT, 3589)


In [None]:
z = get_response([
              {"role": "user", "content": three}
             ])

In [None]:
count_tokens(z['choices'][0]['message']['content'])

In [None]:
z['choices'][0]['message']['content']

In [None]:
get_response([{"role": "system", "content": "respond in the voice of John Cleese"},
              {"role": "user", "content": "what is the airspeed velocity of an unladen swallow"}
             ])

In [None]:
MAX_TOKENS = 4096   # https://platform.openai.com/docs/models

def get_response(messages, prompt_prefix="", verbose=False):
    """Send one prompt to the chat model and return the reply text.

    The prompt is `prompt_prefix` followed by `messages`. A list of messages
    is flattened into a single user prompt, each item fenced by `~~~` lines;
    any other value is appended to the prefix as-is (so it must be a string).

    Args:
        messages: a list of message strings, or a single string.
        prompt_prefix: instruction text placed before the messages.
        verbose: if True, print the assembled prompt and the reply message.

    Returns:
        The reply content, or None if every attempt raised an exception.
    """
    prompt = prompt_prefix

    if isinstance(messages, list):
        for msg in messages:
            prompt += f"""
~~~
{msg}
~~~
"""
    else:
        prompt += messages

    if verbose:
        print(prompt)

    # warn only: the request is still sent when the prompt is over the limit
    n_tokens = count_tokens(prompt)
    if n_tokens > MAX_TOKENS:
        print("WARNING: %d tokens > %d" % (n_tokens, MAX_TOKENS))

    # retry loop, have received untrapped 502 error
    RETRIES = 3
    RETRY_DELAY = 5  # seconds between attempts
    response = None
    for attempt in range(RETRIES):
        try:
            response = openai.ChatCompletion.create(
                model=gptmodel,
                messages=[{"role":"user",
                           "content": prompt}],
                temperature=0,
            )
            break  # no exception thrown
        except Exception as error:
            print("An exception occurred:", error)
            # only announce and wait when another attempt will follow
            if attempt < RETRIES - 1:
                print("Retrying get_response...")
                time.sleep(RETRY_DELAY)

    if response is None:
        return None

    # check response payload for any error message?
    response_msg = response['choices'][0]['message']
    if not response_msg['content']:
        print("there was a problem, content is empty, full payload follows:")
        print(response)
    if verbose:
        print(response_msg)
    return response_msg['content']



In [None]:
def file_validate(response, header_array=None):
    """Check that the first line of a CSV response is the expected header.

    Args:
        response: raw CSV text.
        header_array: expected header fields, in order, spelled the way
            `csv_validate_re.findall` extracts them. When empty or None no
            check is made (this is the default, and matches the previous
            behaviour of always returning True).

    Returns:
        True if no header is expected or the first line matches it field for
        field after stripping whitespace; otherwise prints a diagnostic and
        returns False.
    """
    if not header_array:
        return True

    first_line = response.split("\n", 1)[0]
    found = [field.strip() for field in csv_validate_re.findall(first_line)]
    if found != list(header_array):
        print("bad header: ")
        print("got:    ", first_line)
        print("expected:", ",".join(header_array))
        return False
    return True

def row_validate(row, header_array=None, schema=None):
    """Validate one CSV row by field count and, optionally, by schema.

    Args:
        row: the raw row text; its fields are extracted with `csv_validate_re`.
        header_array: when truthy, the row must have exactly this many fields.
        schema: when truthy, `schema.validate([row])` must not raise.

    Returns:
        True if every requested check passes, False otherwise. A schema
        failure also prints the row and the error.
    """
    fields = csv_validate_re.findall(row)
    if header_array and len(fields) != len(header_array):
        return False

    if not schema:
        return True

    try:
        schema.validate([row])
    except Exception as exc:
        print(row)
        print(exc)
        return False
    return True

def get_csv_from_chat_gpt(message, header_array):
    """Ask the model for CSV output and return its lines once the header checks out.

    Makes up to three attempts. An attempt fails when `get_response` returns
    None, or when `header_array` is given and the first line of the reply does
    not match it field for field.

    Args:
        message: message string(s) passed through to `get_response` together
            with the module-level `prompt_prefix1`.
        header_array: expected header fields as extracted by
            `csv_validate_re`; falsy to skip the header check.

    Returns:
        A tuple `(csv_valid, csv_err, response)`.
        On success `csv_valid` holds every line of the reply, `csv_err` is
        empty and `response` is the raw reply.
        If all attempts fail, `csv_valid` is empty, `csv_err` holds the lines
        of the last reply received (if any) and `response` is that reply or
        "" -- so callers can always unpack and log the result.
    """
    # maybe make more general by passing 2 validation functions, file_validate, row_validate
    # could use csv module and pydantic to validate, pass only pydantic class, construct expected header from pydantic
    ATTEMPTS = 3
    last_response = ""
    for i in range(ATTEMPTS):
        if i > 0:
            print(f"attempt {i}")
        # use the argument, not whatever global happens to be named `messages`
        response = get_response(message, prompt_prefix1, verbose=False)
        if response is None:   # FAIL - retries exhausted
            print('retrying get_csv')
            continue
        last_response = response

        # check first line is the expected header and doesn't reorder columns
        lines = response.split("\n")
        if header_array:
            inp_array = csv_validate_re.findall(lines[0])
            if len(header_array) != len(inp_array) \
                or any(validation != inp.strip() for (validation, inp) in zip(header_array, inp_array)):
                print("bad header: ")
                print("got:    ", lines[0])
                print("expected:", ",".join(header_array))
                continue

        # Row-level checks are deliberately deferred: every line is kept and
        # the combined file is schema-validated later, at the cost of keeping
        # some lines like 'nothing found'.
        return lines, [], response

    # every attempt failed: hand back what we have instead of an implicit None
    csv_err = last_response.split("\n") if last_response else []
    return [], csv_err, last_response


In [None]:
# for each comment object we will extract the body
# then submit as part of a prompt to chatgpt
# Posts are packed into chunks; each chunk is sent as one prompt and produces
# one NNNN.csv (plus NNNN.err if any) in outdir and one NNNN.log in logdir.
print(datetime.now())

nposts = 1000
slist = res3.copy()
total_posts = len(slist)
print("processing %d posts" % total_posts)

# make sure out and logs are empty
for f in glob.glob('%s/*' % outdir):
    os.remove(f)
for f in glob.glob('%s/*' % logdir):
    os.remove(f)
file_index = 0
maxtokens=2048
expected_header = ['"post_id"', '"post_score"', '"artist"', '"track"']

while slist:  # still comments to process
    tokens_to_date = count_tokens(prompt_prefix1)
    reply_ids = []
    messages = []
    while slist and len(messages) < nposts:  # add up to this many posts to the prompt
        # make sure no single post > max_post_size, truncate in place as nec
        slist[0].body = slist[0].body[:max_post_size]
        fits = tokens_to_date + count_tokens(slist[0].body) < maxtokens
        if not fits:
            if messages:
                break  # chunk is full; this post starts the next chunk
            # A post that does not fit even in an empty chunk is sent on its
            # own, otherwise nothing would ever be popped and the outer loop
            # would never terminate.
            print("WARNING: post %s exceeds the %d-token budget, sending it alone"
                  % (slist[0].id, maxtokens))
        reply = slist.pop(0)
        reply_ids.append(reply.id)
        messages.append(f"""
post_id: "{reply.id}"
post_score: "{reply.score}"
{reply.body}
"""
        )
        tokens_to_date += count_tokens(messages[-1])

    result = get_csv_from_chat_gpt(messages, header_array=expected_header)
    if result is None:
        # helper gave up without a result; keep going and leave a log trail
        print("no usable response for posts", reply_ids)
        csv_valid, csv_err, response = [], [], ""
    else:
        csv_valid, csv_err, response = result
    csv_output = "\n".join(csv_valid)

    with open("%s/%04d.csv" % (outdir, file_index), 'w') as outfile:
        outfile.write(csv_output)

    if csv_err:
        with open("%s/%04d.err" % (outdir, file_index), 'w') as outfile:
            outfile.write("\n".join(csv_err))

    with open("%s/%04d.log" % (logdir, file_index), 'w') as logfile:
        logfile.write(str(reply_ids))
        logfile.write('\n\n===== raw prompt =====\n\n')
        logfile.write("\n=====\n".join(messages))
        logfile.write('\n\n===== raw response =====\n\n')
        logfile.write(response or "")
        logfile.write('\n\n===== failed validation =====\n\n')
        logfile.write("\n".join(csv_err))

    file_index += 1
    outcount = total_posts-len(slist)
    print(outcount, end=' ')


print()
print(datetime.now())



In [None]:
## concatenate outputs as bronze.txt
# may still have to tweak the files to get them to load


In [None]:
# filelist = glob.glob('%s/*.csv' % outdir)

# output_df = None
# count = 0
# for f in sorted(filelist):
#     print(f)
#     try:
#         tempdf = pd.read_csv("%s" % (f), header=None)
#     except Exception as exc:
#         print(str(exc))
#         continue
#     colcount = len(tempdf.columns)
#     if len(tempdf.columns) != 4:
#         print('%s has %d columns, skipped' % (f, colcount))
#         continue
        
#     # ok
#     # truncate header row if it looks like a header
#     if tempdf.iloc[0][0]=='post_id':
#         tempdf = tempdf[1:]
#     # set the header explicitly
#     tempdf.columns=["post_id","post_score","artist","track"]

#     if output_df is not None:        
#         output_df = pd.concat([output_df, tempdf], axis=0)
#     else:
#         output_df = tempdf
#     count += 1
#     if count % 10 == 0:
#         print(count, end=' ')

        
        
        

In [None]:
def valid_post_id(s):
    """Return True when the whitespace-stripped id is 4 to 9 characters long."""
    return 3 < len(s.strip()) < 10
# validator.add_record_check(check_post_id)

def valid_post_score(s):
    """Return True if `s` is a string holding a non-negative integer below 99999.

    Surrounding whitespace is ignored. Anything else -- an empty string, a
    value with non-digit characters, or a non-string such as None -- is
    reported as invalid rather than raising.
    """
    if not isinstance(s, str):
        return False
    s = s.strip()
    # isdecimal() is False for "" and for digit-like characters (e.g. "²")
    # that int() cannot parse, so int() below never raises
    return s.isdecimal() and int(s) < 99999
    
# every row must be a dict with exactly these four keys
schema = Schema([{'post_id': valid_post_id,
                  'post_score': valid_post_score,
                  'artist': str,
                  'track': str,
                 }])

# sorted so files are read -- and the first bad file is reported -- in the
# same order on every run (glob order is otherwise arbitrary)
filelist = sorted(glob.glob('%s/*.csv' % outdir))

objlist = []
# NOTE(review): savefile is opened for writing but nothing is ever written to
# it, so this only creates/truncates the file -- confirm whether the
# concatenated rows were meant to be written here.
with open(savefile, 'w') as outfile:
    for f in tqdm(filelist, desc = 'File concat'):
        # newline='' lets the csv module do its own newline handling
        with open(f, 'r', newline='') as csvfile:
            reader = csv.DictReader(csvfile, delimiter=",", fieldnames=[
                "post_id",
                "post_score",
                "artist",
                "track"
            ])
            data = list(reader)

        # the first row of each file is skipped (treated as the header row)
        data = data[1:]

        try:
            objlist.extend(schema.validate(data))
        except Exception as error:
            # stop at the first file that fails validation so it can be fixed
            print(f)
            print(error)
            break



In [None]:
# Build the frame with the columns selected by name: an empty objlist still
# yields the four expected (empty) columns, and columns can never be
# mislabelled by a positional rename.
tempdf = pd.DataFrame(objlist, columns=['post_id','post_score','artist','track'])

# snapshot of the validated rows before any cleaning
with open('tempdf.pkl', 'wb') as f:
    pickle.dump(tempdf, f)


tempdf


In [None]:
def fix_leading_trailing(s):
    """Strip whitespace and matching enclosing punctuation from a value.

    The value is converted with str() and stripped. While the text starts
    with a non-alphanumeric character and ends with either the same character
    or its closing partner (brackets, curly quotes), that outer pair is
    removed and the remainder stripped again.
    """
    # opening character -> closing character
    pairs = {'(': ')',
             '“':'”',
             '‘':'’',
             '{': '}',
             '[': ']',
             '<': '>'}
    text = str(s).strip()
    while True:
        if len(text) < 2:
            break
        first, last = text[0], text[-1]
        if first.isalnum():
            break
        if first != last and pairs.get(first) != last:
            break
        text = text[1:-1].strip()

    return text


In [None]:
# Clean the validated rows: remove header/junk rows, normalise the text
# fields and turn post_score into an int.
text_cols = ['post_id', 'post_score', 'artist', 'track']
sort_keys = ["post_score", "artist", "track"]

def _post_score_to_int(raw):
    """Keep only the digits, at most the last five; no digits counts as 1."""
    digits = "".join(ch for ch in raw if ch.isdigit())[-5:]
    return int(digits) if digits else 1

tempdf = tempdf.drop_duplicates()
tempdf = tempdf.sort_values(sort_keys, ascending=False)

# drop header row
is_header = tempdf['post_id'].str.strip() == 'post_id'
tempdf = tempdf.loc[~is_header]

for col in text_cols:
    # na to ""
    tempdf.loc[tempdf[col].isna(), col] = ''
    # strip spaces and enclosing quotes/brackets
    tempdf[col] = tempdf[col].apply(fix_leading_trailing)

# clean up post_score to valid int
tempdf['post_score'] = tempdf['post_score'].apply(_post_score_to_int)

# drop missing tracks and placeholder track names
junk_track = (tempdf['track'] == '') | tempdf['track'].str.lower().isin(['unknown', 'n/a', 'track'])
tempdf = tempdf.drop(tempdf.index[junk_track])

tempdf = tempdf.sort_values(sort_keys, ascending=False)
tempdf.loc[tempdf['post_score']==0, 'post_score'] = 1

# any test examples
is_test_row = tempdf['post_id'] == 'abcdefg'
tempdf = tempdf.loc[~is_test_row]
tempdf


In [None]:
# df is the same object as tempdf (not a copy), so the edits below show up in both
df = tempdf

# blank out placeholder artists so they can be imputed later
artist_lc = df['artist'].str.lower()
is_placeholder = (
    (df['artist'] == 'N/A')
    | artist_lc.str.startswith('unknown')
    | artist_lc.str.startswith('various')
)
df.loc[is_placeholder, 'artist'] = ''

# number of rows per artist
tracks_per_artist = df[['artist','track']].groupby('artist').count()
artist_df = tracks_per_artist.sort_values(["artist", "track"], ascending=False).reset_index()

artist_df


In [None]:
# Persist the cleaned rows. bronze.csv and silver.csv are written from the
# same frame at this point, so their contents are identical here.
tempdf.to_csv('bronze.csv', index=False)

tempdf.to_csv('silver.csv', index=False)
# pickled copy of the same frame
with open('silver.pkl', 'wb') as f:
    pickle.dump(tempdf, f)

# row count after cleaning
len(tempdf)



# Impute missing artists
If someone names only a track, e.g. 'Clair de Lune' or 'Let It Be', without specifying the artist, we can try to impute the artist.

In [None]:
# Load previously saved track -> artist imputations, if any.
missing_map = {}
try:
    artist_map = pd.read_csv("missing_artists.csv")
except FileNotFoundError:
    # nothing saved yet; a malformed file is no longer silently ignored
    pass
else:
    missing_map = dict(zip(artist_map['track'],artist_map['artist']))

missing_map

In [None]:
df.loc[df['artist']=='Claire de lune', 'track']='Clair de Lune'
df.loc[df['track']=='Claire de Lune', 'track']='Clair de Lune'
df.loc[df['track']=='Clair de Lune', 'artist']='Claude Debussy'


In [None]:
df['artist2'] = df.apply(lambda row: missing_map[row.track.lower().strip()] if row.artist=="" and row.track.lower().strip() in missing_map else row.artist, axis=1)
df.loc[df['artist'] != df['artist2']]



In [None]:
df['artist'] = df.apply(lambda row: missing_map[row.track.lower().strip()] if row.artist=="" and row.track.lower().strip() in missing_map else row.artist, axis=1)


In [None]:
missing_artist_df = df.loc[(df['artist']=='')]
missing_artist_df


In [None]:
prompt_prefix3 = """I will provide a list of well-known recordings. For each recording, you will review and provide the name of the artist most closely associated with the recording. You will provide the results in CSV format, one record per line in the following order: recording, artist. Enclose each field in double-quotes.

The input is:

"""
missing_artist_df = df.loc[(df['artist']=='')]

def missing_artists(missing_artist_df):
    """Ask the model which artist goes with each track that has no artist.

    The distinct, lower-cased, stripped track titles from
    `missing_artist_df['track']` are sent in chunks: `prompt_prefix3` followed
    by one double-quoted title per line, limited by a 1024-token budget as
    measured by `count_tokens`. Each reply line is expected to hold two CSV
    fields (recording, artist) as extracted by `csv_validate_re`.

    Args:
        missing_artist_df: frame with a 'track' column.

    Returns:
        dict mapping track title (lower-cased, stripped) to artist. Lines
        with an empty, 'unknown...' or 'n/a' artist, or without exactly two
        fields, are skipped.
    """
    CHUNK_TOKENS = 1024

    missing_track_map = {}

    slist = missing_artist_df['track'] \
        .dropna() \
        .str.lower() \
        .str.strip() \
        .drop_duplicates() \
        .tolist()

    slist.sort()
    n_missing = len(slist)

    while slist:  # still tracks to process
        print(datetime.now(), end=" ")

        tokens_to_date = count_tokens(prompt_prefix3)
        prompt = ''
        rows = 0
        for _ in range(nposts):  # add up to nposts titles to the prompt
            if not slist:
                break
            fits = tokens_to_date + count_tokens(slist[0]) < CHUNK_TOKENS
            if not fits and rows > 0:
                break
            # a title that does not fit even alone is still sent by itself,
            # otherwise the list would never shrink and the loop never end
            track = f'"{slist.pop(0)}"\n'
            prompt += track
            tokens_to_date += count_tokens(track)
            rows += 1
        print(f"sending {rows} rows...", end=" ")
        response = get_response(prompt, prompt_prefix3, verbose=False)

        if response is None:   # FAIL - retries exhausted
            print('Bailing to next chunk')
            continue

        if not response:
            print("nothing returned ... check returned dict for errors")

        lines = response.split("\n")
        print(f"received {len(lines)} lines...")
        c=0
        for line in lines:
            if not line.strip():
                continue  # blank line, nothing to parse
            try:
                csv_values = csv_validate_re.findall(line)
                if len(csv_values) != 2:
                    print(f"{len(csv_values)} values found: ", line)
                    continue
                # Clean both fields the same way the df columns were cleaned
                # (whitespace, enclosing quotes/parens), and lower-case the
                # title so it matches the `row.track.lower().strip()` lookup
                # used when the map is applied.
                track_input = fix_leading_trailing(csv_values[0]).lower()
                artist_correct = fix_leading_trailing(csv_values[1])
                # if it wasn't found then skip
                if len(artist_correct) == 0:
                    print("empty artist returned")
                    continue
                if artist_correct.lower().startswith('unknown'):
                    continue
                if artist_correct.lower() == 'n/a':
                    continue
                # store in dict to update df
                c += 1
                missing_track_map[track_input]=artist_correct
                print(f'{track_input}: {artist_correct}')
            except Exception as error:
                print('error', line)
                print(error)
                continue

        print(f"{c} lines processed, total {n_missing-len(slist)}, {len(slist)} of {n_missing} remaining")

    return missing_track_map
                
missing_track_map = missing_artists(missing_artist_df)



In [None]:
missing_track_map 


In [None]:
# check for reasonableness, clean up and apply
df['track']=df['track'].astype(str)
df['artist2'] = df.apply(lambda row: missing_track_map[row.track.lower().strip()] if row.artist=="" and row.track.lower().strip() in missing_track_map else row.artist, axis=1)
df.loc[df['artist'] != df['artist2']].head(20)



In [None]:
def _impute_artist(row):
    """Return the imputed artist for a row with no artist, else the row's own artist."""
    # same key normalisation (.lower().strip()) as the preview cell, so the
    # rows shown there are exactly the rows changed here
    key = row.track.lower().strip()
    if row.artist == "" and key in missing_track_map:
        return missing_track_map[key]
    return row.artist

df['artist'] = df.apply(_impute_artist, axis=1)



In [None]:
uniques = {k:v for k,v in missing_track_map.items() if k not in missing_map}
dupes = {k:v for k,v in missing_track_map.items() if k in missing_map}

# these should be equal since we already applied missing_map
len(missing_track_map), len(uniques), len(dupes)

In [None]:
# add new ones to missing_artists.csv
temp = pd.DataFrame({'track': uniques.keys(),
              'artist': uniques.values()}) \
    .sort_values(["artist", "track"])

temp.to_csv('missing_artists_new.csv',index=False)

# Fix typos, abbreviations, etc. using ChatGPT

In [None]:
# Load previously saved artist corrections (original spelling -> corrected).
artist_map = {}
try:
    artist_map_df = pd.read_csv("artist_map.csv")
except FileNotFoundError:
    pass  # no saved corrections yet
else:
    # This notebook reads artist_map.csv with two different header spellings,
    # so accept either. The frame is kept in its own variable so artist_map
    # is always a dict, never a half-loaded DataFrame.
    for orig_col, fixed_col in (('artist_orig', 'artist_corrected'), ('artist', 'map')):
        if {orig_col, fixed_col} <= set(artist_map_df.columns):
            artist_map = dict(zip(artist_map_df[orig_col], artist_map_df[fixed_col]))
            break
    else:
        raise KeyError("artist_map.csv has unexpected columns: %s" % list(artist_map_df.columns))
artist_map

In [None]:
df['artist2'] = df['artist'].apply(lambda s: artist_map[s] if s in artist_map else s)
df.loc[df['artist'] != df['artist2']]


In [None]:
# apply the map
df['artist'] = df['artist'].apply(lambda s: artist_map[s] if s in artist_map else s)


In [None]:
prompt_prefix2 = """You will act as a proofreader. I will provide you a list of recording artists or composers.
You will review each input artist for any spelling errors or abbreviations and provide the corrected full artist without abbreviation. 
You will provide them in CSV format, one record per line in the following order: input_artist, corrected_artist. Enclose each field in double-quotes.
The input is:

"""


In [None]:
# proofread / dedupe artists
# may want to run this whole sequence a couple of times and update df, silver.csv

def dedupe_artists(artist_df):
    """Ask the model to proofread artist names and record the corrections.

    The sorted names from `artist_df['artist']` are sent in chunks:
    `prompt_prefix2` followed by one name per line, limited by a 1024-token
    budget as measured by `count_tokens`. Each reply line is expected to hold
    two CSV fields (input_artist, corrected_artist) as extracted by
    `csv_validate_re`.

    A correction is recorded only when the corrected name differs from the
    input ignoring case and is not empty, 'unknown...', 'n/a' or
    'no correction needed'.

    Args:
        artist_df: frame with an 'artist' column.

    Returns:
        The module-level `artist_map` dict, which is updated in place with
        input name -> corrected name.
    """
    nposts = 1000          # upper bound on names per request
    CHUNK_TOKENS = 1024

    slist = sorted(artist_df['artist'].tolist())
    n_artists=len(slist)

    while slist:  # still artists to process
        print(datetime.now(), end=" ")

        prompt = ""
        tokens_to_date = count_tokens(prompt_prefix2)
        rows = 0
        for _ in range(nposts):  # add up to nposts names to the prompt
            if not slist:
                break
            fits = tokens_to_date + count_tokens(slist[0]) < CHUNK_TOKENS
            if not fits and rows > 0:
                break  # chunk is full
            # a name that does not fit even alone is still sent by itself,
            # otherwise the list would never shrink and the loop never end
            artist = f'{slist.pop(0)}\n'
            prompt += artist
            tokens_to_date += count_tokens(artist)
            rows += 1
        print(f"sending {rows} rows...", end=" ")

        response = get_response(prompt, prompt_prefix2, verbose=False)
        if response is None:   # FAIL - retries exhausted
            print('Bailing to next chunk')
            continue

        if not response:
            print("there was a problem, check the payload")

        lines = response.split("\n")
        print(f"received {len(lines)} lines...")
        # the number of lines sometimes doesn't match the number sent: the model skips some

        c=0
        for line in lines:
            try:
                csv_values = csv_validate_re.findall(line)
                if len(csv_values) != 2:
                    print('%d values found' % len(csv_values), line)
                    continue
                # fix names enclosed in quotes, parens, etc. -- same cleaning
                # as applied to df['artist'], so keys match the df values
                artist_input = fix_leading_trailing(csv_values[0])
                artist_correct = fix_leading_trailing(csv_values[1])
                # if it matches modulo case then skip
                if artist_input.lower() == artist_correct.lower():
                    continue
                # if it wasn't found then skip
                if len(artist_correct) == 0:
                    print("empty artist returned")
                    continue
                if artist_correct.lower().startswith('unknown'):
                    continue
                if artist_correct.lower() == 'n/a':
                    continue
                if artist_correct.lower() == "no correction needed":
                    continue
                # store in dict to update df
                c+=1
                artist_map[artist_input]=artist_correct
                print(f'"{artist_input}", "{artist_correct}"')
            except Exception as error:
                print('error', line)
                print(error)
                continue
        print(f"{c} lines processed, total {n_artists-len(slist)}, {len(slist)} of {n_artists} remaining")

    return artist_map

artist_map=dedupe_artists(artist_df)

print(datetime.now())


In [None]:
print(artist_map)


In [None]:
len(old_artist_map)

In [None]:
len(artist_map)

In [None]:
# save in artist_map.csv but no dupes:
# keep only the corrections whose key is not already in the saved file
saved_map_df = pd.read_csv("artist_map.csv")
old_artist_map = dict(zip(saved_map_df['artist'], saved_map_df['map']))
not_dupes = {name: fixed for name, fixed in artist_map.items() if name not in old_artist_map}
len(not_dupes)


In [None]:
pd.DataFrame({'artist': not_dupes.keys(), 'map': not_dupes.values()}).to_csv('artist_map_new.csv', index=False)

In [None]:
# check the map for reasonableness
# it does pretty smart stuff like map nin to Nine Inch Nails 
# but if it screws up that artist probably won't show up in spotify
df['artist2'] = df['artist'].apply(lambda s: artist_map[s] if s in artist_map else s)
df.loc[df['artist'] != df['artist2']]


In [None]:
# run again if desired

In [None]:
# Remove rows whose artist is missing or a placeholder, then recount rows per artist.
df.loc[df['artist'].isna(), 'artist'] = ""

artist_lc = df['artist'].str.lower()
unusable = (
    artist_lc.str.startswith('unknown')
    | artist_lc.str.startswith('various')
    | artist_lc.str.startswith('no correction needed')
    | (df['artist'] == '')
)
df = df.drop(df.index[unusable])

rows_per_artist = df[['artist','track']].groupby('artist').count()
artist_df = rows_per_artist.sort_values(["artist", "track"]).reset_index()

artist_df.head(20)


# Dedupe with pandas_dedupe

In [None]:
# Build the lower-cased matching key and drop rows whose key is a placeholder.
df['artist'] = df['artist'].apply(fix_leading_trailing)
df['artist_dedupe'] = df['artist'].str.lower()

dedupe_key = df['artist_dedupe']
placeholder_key = (
    dedupe_key.str.startswith('unknown')
    | dedupe_key.str.startswith('various')
    | dedupe_key.str.startswith('no artist found')
    | dedupe_key.isin(['null', 'none', '', 'post_score'])
)
df = df.drop(df.index[placeholder_key])



In [None]:
df['artist_dedupe'] = df['artist_dedupe'].apply(lambda s: s[4:] if s[:4].lower()=='the ' else s)

df.loc[df['artist_dedupe']=='band', 'artist_dedupe']='the band'



In [None]:
dedupe_df = df[['artist', 'artist_dedupe', 'post_score']] \
    .groupby(['artist', 'artist_dedupe']) \
    .count() \
    .sort_values('post_score', ascending=False) \
    .reset_index() \
    .reset_index() 

dedupe_df


In [None]:
# reset dedupe learned settings
!rm dedupe_dataframe_learned_settings 
!rm dedupe_dataframe_training.json   
dedupe_df2 = pandas_dedupe.dedupe_dataframe(dedupe_df, ['artist_dedupe'])



In [None]:
dedupe_df2


In [None]:
dedupe_df['cluster id'] = dedupe_df2['cluster id']
name2i = {a: i for i, a in zip(dedupe_df['cluster id'].tolist(), dedupe_df['artist_dedupe'].tolist())}
df['artist_index'] = df['artist_dedupe'].apply(lambda s: name2i[s])
df



In [None]:
tempdf1 = df[['artist', 'artist_index', 'post_score']] \
    .groupby(['artist', 'artist_index']) \
    .agg(post_score=('post_score', 'sum'),
        count=('post_score', 'count')) \
    .reset_index() \
    .reset_index(drop=True) \
    .reset_index() \
    .sort_values('post_score', ascending=False)

tempdf1 



In [None]:
with pd.option_context("display.max_rows", 9999, "display.max_cols", 999):
        display(tempdf1.loc[tempdf1['count']>1])

        

In [None]:
# artist_index -> artist name; rows are visited in ascending post_score order,
# so for an index shared by several names the highest-scoring name wins
by_score = tempdf1[['artist','artist_index', 'post_score']].sort_values('post_score')
i2name = dict(zip(by_score['artist_index'], by_score['artist']))
len(i2name)

In [None]:
i2name[3]

In [None]:
df['artist2'] = df.apply(lambda r: i2name[r.artist_index], axis=1)
(df.loc[df['artist'] !=  df['artist2']])[['artist', 'artist2']] \
    .drop_duplicates() \
    .sort_values('artist2').to_csv('z.csv', index=False)


In [None]:
artist_map2=pd.read_csv('artist_map2.csv')
artist_map2 = dict(zip(artist_map2['artist'], artist_map2['artist2']))
df['artist2'] = df.apply(lambda r: artist_map2[r['artist']] if r['artist'] in artist_map2 else r['artist'], axis=1)
df.loc[df['artist'] !=  df['artist2']]


In [None]:
df['artist'] = df.apply(lambda r: artist_map2[r['artist']] if r['artist'] in artist_map2 else r['artist'], axis=1)


In [None]:
df.loc[(df['artist_index'].isna())]
df.loc[(df['artist_index']==0)]

In [None]:
dedupe_df

In [None]:
dedupe_df2.loc[dedupe_df2['cluster id'].isin(z['cluster id'])].sort_values(['cluster id', 'post_score']).head(20)

In [None]:
dedupe_df2.loc[dedupe_df2['cluster id'].isin(z)].sort_values(["confidence","cluster id"]).head(20)

In [None]:
tempdf=dedupe_df2.groupby(['cluster id', 'artist_dedupe']).count().reset_index()
z = tempdf.loc[tempdf['index'] > 1]
z

In [None]:
df

In [None]:
# map to artist 
tempdf = dedupe_df[['artist_dedupe', 'artist', 'cluster id', 'post_score']] \
    .groupby(['artist_dedupe', 'cluster id']) \
    .agg( \
         count=('post_score', 'count'), \
         artist=('artist', 'first') \
        ) \
    .reset_index() \
    .sort_values('count', ascending=False) 
with pd.option_context("display.max_rows", 9999):
    display(tempdf.head(100))

In [None]:
tempdf.loc[tempdf['cluster id']==6]

In [None]:
# cluster id -> artist spelling, taken from the aggregated cluster table; then show
# the rows whose current artist differs from the spelling stored for their artist_index.
i2name = dict(zip(tempdf['cluster id'].tolist(), tempdf['artist'].tolist()))
df['artist2'] = [i2name[ix] for ix in df['artist_index']]
df.loc[df['artist'] != df['artist2']]

In [None]:
# Case-insensitive substring search for 'carp' in the artist column;
# rows whose artist is missing are treated as non-matching.
df.loc[df['artist'].str.lower().str.contains('carp', regex=False, na=False)]

In [None]:
# Top 20 tracks by number of rows, ranked on the count of non-null 'artist' values.
track_counts = df.groupby('track').count().reset_index()
track_counts.sort_values('artist', ascending=False).head(20)



In [None]:
df['track2'] = df['track'].str.lower()


In [None]:
# Remove rows whose lower-cased track name is a placeholder rather than a real title.
junk_prefixes = ['unknown', 'no track', 'no artist', 'various']
junk_exact = ['cover', 'version', 'anything', 'none', '']

is_junk = df['track2'].isin(junk_exact)
for prefix in junk_prefixes:
    # na=False: a missing track2 counts as "no match" instead of putting NaN in the mask,
    # which .loc refuses to index with.
    is_junk |= df['track2'].str.startswith(prefix, na=False)

# Filtering with the mask (rather than dropping by index label) removes exactly the
# matching rows even if index labels repeat; .copy() keeps df an independent frame
# so later column assignments do not warn about writing to a slice.
df = df.loc[~is_junk].copy()
len(df)

In [None]:
# Total post_score per (artist, lower-cased track), keeping the first original
# track spelling of each group, highest total first.
(
    df[['artist', 'track', 'post_score', 'track2']]
    .groupby(['artist', 'track2'])
    .agg(sum=('post_score', 'sum'), track=('track', 'first'))
    .reset_index()
    .sort_values('sum', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'sum': 'score'})
)



In [None]:
# Distinct (artist, track, track2) combinations with how often each occurs, most
# frequent first. The second reset_index() turns the fresh 0..n-1 row numbers into
# an 'index' column.
dedupe_track_df = (
    df[['artist', 'track', 'track2', 'post_score']]
    .groupby(['artist', 'track', 'track2'])
    .count()
    .sort_values('post_score', ascending=False)
    .reset_index()
    .reset_index()
    .rename(columns={'post_score': 'count'})
)

dedupe_track_df


In [None]:
# Start from a clean slate: delete the settings/training files left by a previous
# pandas_dedupe run, if any (no error when they do not exist).
for stale_file in ('dedupe_dataframe_learned_settings', 'dedupe_dataframe_training.json'):
    if os.path.exists(stale_file):
        os.remove(stale_file)

# Cluster near-duplicate (artist, track2) pairs. The stray "y" tokens that followed this
# call were a syntax error and have been removed.
dedupe_track_df2 = pandas_dedupe.dedupe_dataframe(dedupe_track_df[['artist', 'track2']],
                                                  ['artist', 'track2'],
                                                  canonicalize=True,)


In [None]:
dedupe_track_df2


In [None]:
# Copy the cluster id (as 'track_id') and the match confidence from the dedupe
# output onto the track table.
# NOTE(review): column assignment aligns on the index — this assumes dedupe_track_df2
# kept dedupe_track_df's row index; confirm.
dedupe_track_df['track_id']=dedupe_track_df2['cluster id']
dedupe_track_df['confidence']=dedupe_track_df2['confidence']
dedupe_track_df

In [None]:
dedupe_track_df3 = dedupe_track_df.loc[dedupe_track_df['confidence'] > 0.4][['artist', 'track', 'track_id']]
dedupe_track_df3


In [None]:
# (artist, track) -> track_id for the rows kept in dedupe_track_df3
names2i = dict(zip(
    zip(dedupe_track_df3['artist'], dedupe_track_df3['track']),
    dedupe_track_df3['track_id'],
))
names2i

In [None]:
def _track_key(row):
    """Return the stringified track id for a known (artist, track) pair, else the track name."""
    pair = (row['artist'], row['track'])
    if pair in names2i:
        return str(names2i[pair])
    return row['track']

df['track_index'] = df.apply(_track_key, axis=1)

In [None]:
df

In [None]:
# Total post_score per artist (with the first track seen for that artist),
# highest-scoring artists first.
tempdf = (
    df[['artist', 'track', 'post_score']]
    .groupby(['artist'])
    .agg(sum=('post_score', 'sum'), track=('track', 'first'))
    .reset_index()
    .sort_values(['sum'], ascending=False)
    .rename(columns={'sum': 'score'})
    .reset_index(drop=True)
)
tempdf

In [None]:
df

In [None]:
tempdf = tempdf[['artist', 'track', 'score']]
display(tempdf.loc[tempdf['score'] > 4].head(20))
display(tempdf.loc[tempdf['score'] > 4].tail(20))


In [None]:
df = tempdf.loc[tempdf['score'] > 4]
df.to_csv('silver.csv', index=False)
with open('silver.pkl', 'wb') as f:
    pickle.dump(df, f)


In [None]:
# I think dedupe may not be matching most popular form of artist
# Manual alias table: artist string as extracted -> the single artist name to file it under.
# Each key appears once (duplicate entries removed), and the canonical spellings
# 'Henry Mancini', 'John Coltrane' and 'Billie Eilish' are corrected so they merge with
# the same artist elsewhere (artist grouping downstream is case- and spelling-sensitive).
z = {'Hans Zimmer Radiohead':'Hans Zimmer',
'Fleetwood Mac / Peter Green':'Fleetwood Mac',
'Jeremy Soule featuring Asja':'Jeremy Soule',
'Cardi B featuring Megan Thee Stallion':'Cardi B',
'FWM':'Fleetwood Mac',
'Eric Whitacre ft. Voces8':'Eric Whitacre',
'Gorillaz feat. Little Dragon':'Gorillaz',
'RAC (feat. Katie Herzig)':'RAC',
'Jamie XX featuring Romy':'Jamie XX',
'Spell Songs featuring Julie Fowlis':'Spell Songs',
'Kenny G featuring Aaron Neville':'Kenny G',
'Charlie Haden Quartet featuring Norah Jones':'Charlie Haden Quartet',
'Wiz Khalifa feat. Charlie Puth':'Wiz Khalifa',
'Marshmello feat. Khalid':'Marshmello',
'YOSHIKI feat. HYDE':'YOSHIKI',
'Charlie Hunter Quartet featuring Norah Jones':'Charlie Hunter Quartet',
'Alison Krauss featuring Natalie MacMaster':'Alison Krauss',
'Hodson featuring Jay-Z':'Hodson',
'Nujabes feat. Cise Starr':'Nujabes',
'Flume feat. Tove Lo':'Flume',
'Drake feat. Yebba':'Drake',
'Radwimps featuring Toaka':'Radwimps',
'Weezer featuring Hayley Williams':'Weezer',
'Post Malone featuring Swae Lee':'Post Malone',
'Polyphia featuring Ichika':'Polyphia',
'Eminem featuring Dido':'Eminem',
'PJ Morton featuring Yebba':'PJ Morton',
'Aurora featuring Pomme':'Aurora',
'Kaskade featuring Haley':'Kaskade',
'XXXTENTACION featuring Scott James':'XXXTENTACION',
'Snow Patrol feat. Martha Wainwright':'Snow Patrol',
'Direct featuring Danyka Nadeau':'Direct',
'Erra feat. Courtney LaPlante from Spirit Box':'Erra',
'Ursine Vulpine featuring Annaca':'Ursine Vulpine',
'Howard Shore featuring Sir James Galway':'Howard Shore',
'Daft Punk feat. Paul Williams':'Daft Punk',
'Wooli featuring Delaney Kai':'Wooli',
'UNKLE featuring Thom Yorke':'UNKLE',
'Bastille feat. The Chamber Orchestra of London':'Bastille',
"Marty O'Donnell, Stan LePard, & Michael Salvatori":"Marty O'Donnell",
"Stan Getz, João Gilberto":"Stan Getz",
'Taylor Swift ft. Bon Iver':'Taylor Swift',
'Calvin Harris ft. Florence Welch':'Calvin Harris',
'Brad Paisley ft. Alison Krauss':'Brad Paisley',
'Delirium ft. Sarah McLachlan':'Delirium',
'T-Pain ft. Akon, Mary J. Blige':'T-Pain',
'Manchester Orchestra ft. Daniel Radcliffe and Paul Dano':'Manchester Orchestra',
'Nujabes ft. MINMI':'Nujabes',
'Samurai Champloo ft. MINMI & Nujabes':'Samurai Champloo',
'MINMI ft. Nujabes':'MINMI',
'ODESZA ft. MARO':'ODESZA',
'Sharon Jones & The Dap-Kings ft. Lee Fields':'Sharon Jones & The Dap-Kings',
'Sarah Barrios ft. Eric Nam':'Sarah Barrios',
'David Arkenstone ft. Charlee Brooks':'David Arkenstone',
'XXYYXX ft. Anneka':'XXYYXX',
'Leonard Cohen, Brandi Carlile':'Leonard Cohen',
'Harry Waters Jr., Marvin Berry, and the Starlighters':'Harry Waters Jr.',
'Roberta Flack, Donny Hathaway':'Roberta Flack',
'Zedd, Maren Morris, Grey':'Zedd',
'Frank Ocean, James Blake':'Frank Ocean',
'Susan Suh, Robert Koch':'Susan Suh',
# NOTE(review): this key looks like a single band name containing a comma rather than
# two artists — confirm the truncated target is intended.
'Black Country, New Road':'Black Country',
'Khruangbin, Leon Bridges':'Khruangbin',
'Friendship, Emily Warren':'Friendship',
'Jonsi, Alex':'Jonsi',
'Khalid, Future':'Khalid',
'Duke Ellington, John Coltrane':'Duke Ellington',
'Clams Casino, Imogen Heap':'Clams Casino',
'Burial, Sacred Tapestry':'Burial',
'foudeqush, Ludwig Goransson':'foudeqush',
'Conjure One, Poe':'Conjure One',
'Cyril Giroux, Chloé Lacan':'Cyril Giroux',
'Carrie Underwood, Travis Cottrell, Debby Boone':'Carrie Underwood',
'Aska, Chage':'Aska',
'Dan Balan, Katerina Begu':'Dan Balan',
'Jose Padilla, Seal':'Jose Padilla',
'Ratso, Nick Cave':'Ratso',
# NOTE(review): same question as above — possibly one band name with a comma; confirm.
'Slaughter Beach, Dog':'Slaughter Beach',
'Paganini, Liszt':'Paganini',
'May Erlewine, Woody Goss':'May Erlewine',
'Cyua, Hiroyuki Sawano':'Cyua',
'Dan Zanes, Natalie Merchant':'Dan Zanes',
'Sting, Ray Chen':'Sting',
'Coco and Clair Clair, Okthxbb':'Coco and Clair Clair',
'Edgar Meyer, Mike Marshall, Bela Fleck':'Edgar Meyer',
'Ray Charles, Willie Nelson':'Ray Charles',
'Wildlight, The Polish Ambassador and Ayla Nereo':'Wildlight',
'Max Richter, Dinah Washington':'Max Richter',
'Steve Martin, Dolly Parton, Vince Gill':'Steve Martin',
'Elis Regina, Antonio Carlos Jobim':'Elis Regina',
'James Blunt, The Righteous Brothers, Brad Paisley':'James Blunt',
'Leprous, Dream Theater, Periphery':'Leprous',
'Frank Sinatra, Glenn Miller, Van Morrison':'Frank Sinatra',
'Pink, Willow Sage Hart':'Pink',
'Jessye Norman, Stephen Adams, Christopher Bowers-Broadbent':'Jessye Norman',
'Nujabes, MINMI, and Samurai Champloo':'Nujabes',
'Nujabes, Samurai Champloo, and MINMI':'Nujabes',
'Steve Martin, Steep Canyon Rangers':'Steve Martin',
'Celine Dion, Barbra Streisand':'Celine Dion',
'Bryan Adams, Luciano Pavarotti':'Bryan Adams',
'MUZZ (Mat Zo, Olan and A&B)':'MUZZ',
'Pink, Sage (The Gemini':'Pink',
'Moby, Sinead Oconnor':'Moby',
'I Vow to Thee, My Country':'I Vow to Thee',
'Jose Padilla, Kirsty Keach':'Jose Padilla',
'Don Francisco, Wendy Francisco, Jerry Palmer':'Don Francisco',
'Dave Grohl, Josh Homme, & Trent Reznor':'Dave Grohl',
'Steve Conte, Maaya Sakamoto':'Steve Conte',
'Sting, Stevie Wonder':'Sting',
'Jacob Collier, Lizzy McAlpine, John Mayer':'Jacob Collier',
'Joseph Shabason, Nicholas Krgovich, Shabason & Krgovich':'Joseph Shabason',
'Khalid, Benny Blanco, Halsey':'Khalid',
'Ed Sheeran, Andrea Bocelli':'Ed Sheeran',
'Kim Petras, Nicki Minaj':'Kim Petras',
'Bryce Dessner, James McAlister, Nico Muhly, Sufjan Stevens':'Bryce Dessner',
'Appleseed, YouSeeBigGirl, T:T':'Appleseed',
'Solarstone, Andy Bury':'Solarstone',
'Carti, Summertime Sadness':'Carti',
'Dolly Parton, Linda Ronstadt, and Emmylou Harris':'Dolly Parton',
'LMM, Hwasa':'LMM',
'Snowgoons, Viro the Virus':'Snowgoons',
'Sarah Class, Cantamus Choir':'Sarah Class',
'Boy meets Girl, Brian McKnight, Vanessa Williams, Bonnie Tyler, Jax':'Boy meets Girl',
'Jeff Buckley, The Righteous Brothers, Johann Pachelbel':'Jeff Buckley',
'Nate J, Nate Traveller':'Traveller',
'Debussy, Flight Facilities':'Flight Facilities',
'Death Cab for Cutie/The Postal Service':'Death Cab for Cutie',
'Sting (musician)':'Sting',
'Mick Hucknall Simply Red':'Simply Red',
'Johnny Cash and Bob Dylan':'Johnny Cash',
'John Prine and Bonnie Raitt':'Bonnie Raitt',
'Andrea Bocelli and Celine Dion':'Andrea Bocelli',
'Bob Dylan & Johnny Cash':'Bob Dylan',
'Minnie Riperton and Richard Rudolph':'Minnie Riperton',
'Eric Whitacre featuring Voces8':'Eric Whitacre',
'Hans Zimmer and Benjamin Wallfisch':'Hans Zimmer',
'Grover Washington Jr. & Bill Withers':'Grover Washington Jr.',
'Barry DeVorzon and Perry Botkin Jr.':'Barry DeVorzon',
'Porter Robinson and Madeon':'Porter Robinson',
'Ed Sheeran ft Yebba':'Ed Sheeran',
'Sara Bareilles & Josh Groban':'Sara Bareilles',
'St. Vincent (musician)':'St. Vincent',
'Glen Hansard and Marketa Irglova':'Glen Hansard',
'Willie Nelson & Ray Charles':'Willie Nelson',
'Alina Baraz Galimatias':'Alina Baraz',
'Hans Zimmer & Lisa Gerrard':'Hans Zimmer' ,
'Louis Armstrong Jr.': 'Louis Armstrong',
'Ludovico Einaudi ft. Greta Svabo Bech': 'Ludovico Einaudi',
'Henry Mancini & Audrey Hepburn': 'Henry Mancini',
'Bon Iver and St. Vincent': 'Bon Iver',
'Coldplay Avicii': 'Coldplay',
'Bruce Springsteen, Melissa Etheridge': 'Bruce Springsteen',
'Billie Eilish ft. Khalid': 'Billie Eilish',
'Andrea Bocelli and Josh Groban': 'Andrea Bocelli',
'Norah Jones and Danger Mouse': 'Norah Jones',
'Ennio Morricone & Joan Baez': 'Ennio Morricone',
"Des'ree": 'Desree',
'Porter Robinson, Madeon': 'Porter Robinson',
'Ray Charles & Willie Nelson': 'Ray Charles',
'Ludvig Forssell and Jenny Plant': 'Ludvig Forssell',
'Nicole Kidman & Ewan McGregor': 'Ewan McGregor',
'Deadmau5 and Kaskade': 'Deadmau5',
'Beabadoobee feat. Clairo': 'Beabadoobee',
'John Coltrane, Duke Ellington': 'John Coltrane',
'Nicholas Britell and Spring 1 - Max Richter': 'Nicholas Britell',
'Dave Matthews & Tim Reynolds': 'Dave Matthews',
'MINMI & Nujabes': 'MINMI',
'Nu Deco Ensemble Kishi Bashi': 'Kishi Bashi',
'Ana Carolina Seu Jorge': 'Ana Carolina',
'Skillet (band': 'Skillet',
'Soccer Mommy (Sophie Allison': 'Soccer Mommy',

}

In [None]:
# Rewrite every artist that exactly matches an alias in z to its target name,
# printing the alias and the number of rows affected.
for alias, canonical in z.items():
    hits = df['artist'] == alias
    n_hits = int(hits.sum())
    if n_hits > 0:
        print(alias, n_hits)
        df.loc[hits, 'artist'] = canonical


In [None]:
# tempdf = df[['artist', 'post_score']] \
#     .groupby('artist') \
#     .sum() \
#     .reset_index() 

# tempdf.loc[tempdf['post_score']> 2].to_csv('x.csv', index=False)

In [None]:
df

In [None]:
# Sum the score per (artist, track) and order by score, then artist, then track, all descending.
df = (
    df[['artist', 'track', 'score']]
    .groupby(["artist", "track"])
    .sum()
    .reset_index()
    .sort_values(["score", "artist", "track"], ascending=False)
)

df.head(20)



## Filter by minimum score


In [None]:
df = df.loc[df['score'] >4]
df

In [None]:
df.to_csv('silver.csv', index=False)
with open('silver.pkl', 'wb') as f:
    pickle.dump(df, f)


In [None]:
with open('silver.pkl', 'rb') as f:
    df = pickle.load(f)
df.head(20)

# Load into a Spotify playlist


In [None]:
# log in
# App-level (client credentials) session; the id and secret come from the environment.
spotify_client_id = os.getenv('SPOTIFY_CLIENT_ID')
spotify_client_secret = os.getenv('SPOTIFY_CLIENT_SECRET')
client_credentials_manager = SpotifyClientCredentials(client_id=spotify_client_id,
                                                      client_secret=spotify_client_secret)

sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)


In [None]:
# Re-read silver.csv, merge rows that share (artist, track) by summing,
# re-sort, and write the result back to the same file.
df = (
    pd.read_csv("silver.csv")
    .groupby(['artist', 'track'])
    .sum()
    .reset_index()
    .sort_values(["score", "artist", "track"], ascending=False)
)
df.to_csv('silver.csv', index=False)


In [None]:
# check artists
# update to spotify canonical name as necessary

df = pd.read_csv("silver.csv")
df = df.groupby(['artist','track']).sum().reset_index().sort_values(["score", "artist", "track"], ascending=False)

dedupe = {}       # artists already queried
fail_list = []    # (artist, title) pairs with no artist hit
artist_map = {}   # our spelling -> name of Spotify's first artist hit, when they differ
for index, artist, title, score in df.itertuples():
    artist = str(artist)
    if artist not in dedupe:
        dedupe[artist] = 1
        artist_results = sp.search(q='artist:%s' % (artist), type='artist', limit=3, offset=0, market='US')
        artist_names = [hit['name'] for hit in artist_results['artists']['items']]
        if not artist_names:
            fail_list.append((artist, title))
            print("not found:", artist, "-", title)
        elif artist.lower() != artist_names[0].lower():
            # only record a mapping when the top hit differs beyond letter case
            artist_map[artist] = artist_names[0]
            print(artist, '->', artist_names[0])

# then clean up manually as appropriate

In [None]:
[f[0] for f in fail_list]

In [None]:
artist_map

In [None]:
# Artists whose suggested replacement should NOT be applied: their entries are removed
# from artist_map so the original spelling is kept.
# The trailing comment on each entry appears to be the unwanted name Spotify returned.
# Each artist is listed once (the list previously repeated 'Priscilla', 'Nico' and
# 'La La Land Soundtrack', whose second removal always failed).
ignore_list = [
    '1',                           # -> One Direction
    'Hem',                         # -> Natalie Hemby
    'Priscilla',                   # -> Priscilla Chan / Priscilla Block
    'Drake',                       # -> Nick Drake
    'William Ackerman',            # -> Mark Ackerman, William James Ross
    'Jason',                       # -> Jason Mraz
    'Juice',                       # -> Juice WRLD
    'Origa',                       # -> Origami Angel
    'Nico',                        # -> Nico & Vinz / Nicki Nicole
    'Mako',                        # -> Mako Road
    'Low',                         # -> All Time Low
    'La La Land Soundtrack',       # -> LAND Soundtrack
    'Flamingos',                   # -> Flamingosis
    'BoA',                         # -> Boards of Canada
    'Traditional',                 # -> Chinese Traditional
    'Future',                      # -> Future Islands
    'ASAP Rocky',                  # -> Seth Narley feat. ASAP Rocky
    'Acoustic',                    # -> Acoustic Alchemy
    'Adeem',                       # -> Adeem the Artist
    'Al Stewart',                  # -> Alexander Stewart
    'Alpine',                      # -> Alpine Universe
    'America',                     # -> The All-American Rejects
    'Arrow',                       # -> Arrows in Action
    'Berlin',                      # -> Berliner Philharmoniker
    'Brian Wilson',                # -> Brian Courtney Wilson
    'CSNY',                        # -> Csnyee_
    'Choir Choir Choir!',          # -> Mav City Gospel Choir
    'Dallas Green',                # -> Jimmy Carter and Dallas County Green
    'Death',                       # -> Five Finger Death Punch
    'Dixie Chicks',                # -> Karaoke - Dixie Chicks
    'Eileen',                      # -> Eileen Walker
    'Eric Johnson',                # -> Eric D. Johnson
    'Frente',                      # -> Frente Cumbiero
    'IZ',                          # -> Izzamuzzic
    'Japanese House',              # -> The Japanese House
    'Jewel',                       # -> Run The Jewels
    'LP',                          # -> LP Giobbi
    'Live',                        # -> DPR LIVE
    'MCR',                         # -> Tate McRae
    'Meatloaf',                    # -> meatloafi
    'Múm',                         # -> Mumford & Sons
    'One',                         # -> One Direction
    'Phil',                        # -> Phil Collins
    'Pink',                        # -> PinkPantheress
    'Rainbow',                     # -> Rainbow Kitten Surprise
    'Seal',                        # -> Seals and Crofts
    'South Park',                  # -> South Park Mexican
    'The Band',                    # -> The Band CAMINO
    'The La’s',                    # -> The Kid LAROI
    'The Philadelphia Orchestra',  # -> The Philadelphia Virtuosi Chamber Orchestra
    'The Promise',                 # -> Lukas Nelson and Promise of the Real
    'Train',                       # -> Meghan Trainor
    'Vince',                       # -> Vince Staples
    'a-ha',                        # -> Daryl Hall & John Oates
]

for k in ignore_list:
    # pop() with a default cannot raise, so no blanket except is needed and
    # re-running the cell is harmless.
    suggested = artist_map.pop(k, None)
    if suggested is None:
        print('not in artist_map:', k)
    else:
        print(k, suggested)





In [None]:
artist_map.get('Train')

In [None]:
# Replace each artist that has an entry in artist_map; all others stay unchanged.
df['artist'] = df['artist'].map(lambda name: artist_map.get(name, name))
df.head(20)


In [None]:
df = df.groupby(['artist','track']).sum().reset_index().sort_values(["score", "artist", "track"], ascending=False)
df.to_csv('silver.csv', index=False)


In [None]:
# check tracks
# possibly update tracks to spotify canonical name
#
# For every (artist, track, score) row, take the first Spotify track hit and record
# it in the parallel lists below; rows with no hit go to fail_list.

# NOTE(review): this reads 'silver-0524.csv', not the 'silver.csv' written by the cells
# above — presumably a dated snapshot; confirm this is the intended input.
df = pd.read_csv("silver-0524.csv")

dedupe = {}      # Spotify track ids already collected
mylist = []
fail_list = []   # (artist, title) pairs with no search hit
artist_list, track_list, uri_list, album_list, score_list = [], [], [], [], []
orig_artist, orig_track = [], []

for index, artist, title, score in df.itertuples():
    # CSV cells are not guaranteed to be strings (numeric-looking names, empty cells);
    # coerce both, as the artist check does, so the .lower() comparison cannot raise.
    artist, title = str(artist), str(title)
    query_str = 'artist:%s track:%s' % (artist, title)
    track_results = sp.search(q=query_str, type='track', limit=1, offset=0, market='US')
    results = track_results['tracks']['items']

    if not results:
        fail_list.append((artist, title))
        print("not found:", artist, "-", title)
        continue

    r = results[0]
    # failsafe to never put same track twice
    if r['id'] in dedupe:
        continue
    dedupe[r['id']] = True
    # report hits whose title differs from ours so they can be eyeballed
    if title.lower() != r['name'].lower():
        print("%04d %s|%s : %s|%s" % (index, artist, title, r['artists'][0]['name'], r['name']))
    uri_list.append(r['uri'])
    artist_list.append(r['artists'][0]['name'])
    track_list.append(r['name'])
    album_list.append(r['album']['name'])
    orig_artist.append(artist)
    orig_track.append(title)
    score_list.append(score)
        

In [None]:
print(len(fail_list))
fail_list



## Save gold.csv


In [None]:

# Assemble the lookup results: our input artist/track next to what Spotify returned.
gold_columns = {
    'score': score_list,
    'input_artist': orig_artist,
    'artist': artist_list,
    'input_track': orig_track,
    'track': track_list,
    'album': album_list,
    'uri': uri_list,
}
gold_df = pd.DataFrame(gold_columns)

with pd.option_context("display.max_rows", 9999):
    display(gold_df)



In [None]:
# inspect rows where the artist Spotify returned differs from the input artist
# (compared case-insensitively on the first 8 characters only)
with pd.option_context("display.max_rows", 999):
    display(gold_df.loc[gold_df['input_artist'].str.lower().str[:8] != gold_df['artist'].str.lower().str[:8]])
    

In [None]:
# these are songs that look like covers or otherwise not the expected response from spotify search
# (which is a bit wonky, doesn't like quotes and such)
# remove from df and add manually
bad_lookups = [421, 494, 557, 598, 669, 823]

# print each flagged row (selected by position) for a last look before it is removed
for row_pos in bad_lookups:
    print(gold_df.iloc[row_pos])
    
# add manually, plus 'not found'


In [None]:
# Remove the flagged rows by index label.
gold_df = gold_df.drop(index=bad_lookups)


In [None]:
gold_df

In [None]:
# this you could upload and make a new playlist
# existing playlist is result of multiple iterations

gold_df[['artist', 'track', 'score']].to_csv('gold.csv', index=False)

with pd.option_context("display.max_rows", 999):
    display(gold_df)

# Get Spotify playlist and add songs

In [None]:
# must follow an oauth workflow to write a playlist in Spotify
# running this cell should request a spotify login and then redirect to an url
# paste whole url with id into form to authenticate

scope = "playlist-modify-public"

oauth_manager = spotipy.SpotifyOAuth(
    scope=scope,
    client_id=os.getenv('SPOTIFY_CLIENT_ID'),
    client_secret=os.getenv('SPOTIFY_CLIENT_SECRET'),
    redirect_uri="https://druce.ai",
)
sp = spotipy.Spotify(auth_manager=oauth_manager)


In [None]:
# get playlist id
# first create a playlist in Spotify UI to load songs
def get_playlist_id(playlist_name, verbose=False):
    """Return the id of the SPOTIFY_USERNAME user's playlist called `playlist_name`.

    Walks the user's playlists page by page and returns the id of the first
    exact name match, or None when no page contains one. With verbose=True the
    match's name, 1-based position and URI are printed.
    """
    page = sp.user_playlists(os.getenv('SPOTIFY_USERNAME'))
    while page:
        for position, entry in enumerate(page['items'], start=1):
            if entry['name'] != playlist_name:
                continue
            if verbose:
                print('"%s": offset %d, URI %s' % (entry['name'], position + page['offset'], entry['uri']))
            return entry['id']

        # not on this page: stop when there is no further page
        if not page['next']:
            return None
        page = sp.next(page)

playlist_id = get_playlist_id("RPS2")
print(playlist_id)


In [None]:
# add songs to playlist 

addlist = gold_df['uri'].to_list()
print (len(addlist))

# while(addlist):
#     sp.user_playlist_add_tracks(os.getenv('SPOTIFY_USERNAME'), 
#                                 playlist_id=playlist_id, 
#                                 tracks=addlist[-100:])
#     addlist = addlist[:-100]
#     print("added items, remaining ", len(addlist))


In [None]:
# compare to playlist

def get_playlist_df(playlist_name):
    """Return the tracks of the named playlist as a DataFrame.

    Parameters
    ----------
    playlist_name : str
        Exact name of a playlist owned by the SPOTIFY_USERNAME user.

    Returns
    -------
    pandas.DataFrame
        One row per track with columns artist (first listed artist), track,
        uri, id and popularity, in playlist order.

    Raises
    ------
    ValueError
        If no playlist with that name is found.
    """
    playlist_id = get_playlist_id(playlist_name)
    if playlist_id is None:
        # fail with a clear message instead of sending a request without a playlist id
        raise ValueError("playlist not found: %r" % (playlist_name,))

    results = sp.user_playlist(os.getenv('SPOTIFY_USERNAME'),
                               playlist_id,
                               fields='tracks,next,name')
    tracks = results['tracks']

    # get tracks, paging as needed
    track_list = []
    while tracks:
        for track_item in tracks['items']:
            track = track_item['track']
            # a playlist entry can come back without track data (e.g. an item that is
            # no longer available); skip it rather than fail when building the columns
            if track is not None:
                track_list.append(track)
        # more pages?
        tracks = sp.next(tracks) if tracks['next'] else None

    return pd.DataFrame({'artist': [track['artists'][0]['name'] for track in track_list],
                         'track': [track['name'] for track in track_list],
                         'uri': [track['uri'] for track in track_list],
                         'id': [track['id'] for track in track_list],
                         'popularity': [track['popularity'] for track in track_list],
                        })

rps2_df = get_playlist_df("RPS2")
rps2_df


In [None]:
set(gold_df['uri'].to_list()).difference(set(rps2_df['uri'].to_list()))

In [None]:
set(rps2_df['uri'].to_list()).difference(set(gold_df['uri'].to_list()))

In [None]:
# To-do notes (kept as comments so the cell is valid Python):
# clean up stuff that is in 'reddits prettiest but not in rps2'
# clean up stuff that is in rps but not rps2
# add remaining stuff from rps3 to rp2
# add rps2 to rps
# add rps to reddits prettiest
# delete rps3

# gold tracks whose URI is not in the RPS2 playlist
with pd.option_context("display.max_rows", 9999):
    display(gold_df.loc[~gold_df['uri'].isin(rps2_df['uri'])])

In [None]:
gold_df.loc[~gold_df['uri'].isin(rps2_df['uri'])]

In [None]:
rps3_df.loc[rps3_df['uri'].isin(rps2_df['uri'])]

In [None]:
len(rps3_df)

In [None]:
# manually add the ones that weren't found for some reason


# Compare Spotify playlist to gold data
After the initial population, we may want to run again and add new songs.


In [None]:
# compare to existing playlist
# can run again and add any new tracks, either because OpenAI is a bit random, or new replies in thread
results = sp.user_playlist(os.getenv('SPOTIFY_USERNAME'), playlist_id,
                           fields='tracks,next,name')
tracks = results['tracks']

# two lookups over the playlist: by URI, and by "first artist | first 15 chars of title"
playlist_dict_by_uri, playlist_dict_by_str = {}, {}

# parallel per-track columns, in playlist order
artist_list, track_list, uri_list = [], [], []
popularity_list, album_list = [], []

while True:
    for track_dict in (entry['track'] for entry in tracks['items']):
        lead_artist = track_dict['artists'][0]['name']
        uri = track_dict['uri']
        track_str = lead_artist + ' | ' + track_dict['name'][:15]
        # the same artist/title key seen before: print it as a possible duplicate
        if track_str in playlist_dict_by_str:
            print(track_str)
        playlist_dict_by_str[track_str] = uri
        playlist_dict_by_uri[uri] = track_str

        uri_list.append(uri)
        artist_list.append(lead_artist)
        track_list.append(track_dict['name'])
        album_list.append(track_dict['album']['name'])
        popularity_list.append(track_dict['popularity'])

    # stop after the last page, otherwise fetch the next one
    if not tracks['next']:
        break
    tracks = sp.next(tracks)

print (len(playlist_dict_by_str))
print (len(playlist_dict_by_uri))


In [None]:
with pd.option_context("display.max_rows", 9999):
    display(gold_df.loc[~gold_df['uri'].isin(playlist_dict_by_uri.keys())])
    

In [None]:
gold_df2 = gold_df.loc[~gold_df['uri'].isin(playlist_dict_by_uri.keys())]
dfz = gold_df2.loc[gold_df2.index> 500].copy()

In [None]:
# Add the URIs in dfz to the playlist in batches of at most 100 per request.
addlist = dfz['uri'].to_list()
print (len(addlist))
# Each pass sends the LAST (up to) 100 URIs still in addlist and then trims them off,
# so batches are submitted from the end of the list towards the front.
# NOTE(review): presumably 100 reflects a per-request cap of the Spotify API — confirm.
while(addlist):
    sp.user_playlist_add_tracks(os.getenv('SPOTIFY_USERNAME'), 
                                playlist_id=playlist_id, 
                                tracks=addlist[-100:])
    addlist = addlist[:-100]
    print("added items, remaining ", len(addlist))

In [None]:
dfz

In [None]:
playlist_df = pd.DataFrame({'artist': artist_list,
                           'track': track_list,
                           'album': album_list,
                           'popularity': popularity_list,
                           })



In [None]:
with pd.option_context("display.max_rows", 9999):
    display(playlist_df.sort_values('popularity'))
    

In [None]:
# Index the gold tracks by URI and by "artist | first 15 chars of title", and collect
# the ones whose artist/title key is not present in the existing playlist.
gold_dict_by_uri = {}
gold_dict_by_str = {}
addlist = []
for i, artist, track, uri in gold_df[['artist', 'track', 'uri']].itertuples():
    track_str = artist + ' | ' + track[:15]
    if track_str not in playlist_dict_by_str:
        addlist.append([artist, track, uri])
        print(artist, track, uri)
    gold_dict_by_uri[uri] = track_str
    # key on the computed string (the literal 'track_str' used before collapsed
    # every track into a single dictionary entry)
    gold_dict_by_str[track_str] = uri

print(len(gold_dict_by_str))
print(len(gold_dict_by_uri))

In [None]:
addlist

In [None]:
addlist = [['ABBA', 'One Of Us', 'spotify:track:6zgtBUEkAfilJ2YEOvNexR'],
 ['Gregorio Allegri',
  'Miserere mei, Deus',
  'spotify:track:6es7DmrhnDoKj5rsFvh3XU'],
 ['Amy Winehouse',
  'Love Is A Losing Game',
  'spotify:track:3uliGwmB52ZA7brgpZMzyH'],
 ['Barbara',
  "Ma plus belle histoire d'amour",
  'spotify:track:0qBVET4VkHsQAoboWlQ2pJ'],
 ['Ludwig van Beethoven',
  'Symphony No. 5 in C Minor, Op. 67: I. Allegro con brio',
  'spotify:track:2ygeBLTP9uu3OW3VTulD8N'],
 ['Benny Goodman', 'Sing, Sing, Sing', 'spotify:track:5L8ta4ECl5zeA6bGqY7G38'],
 ['Bill Withers', 'Lean on Me', 'spotify:track:3M8FzayQWtkvOhqMn2V4T2'],
 ['Billy Joel', 'Piano Man', 'spotify:track:70C4NyhjD5OZUMzvWZ3njJ'],
 ['Bob Dylan', 'Ballad of a Thin Man', 'spotify:track:0f5N14nB8xi0p3o4BlVvbx'],
 ['Bob Dylan', "Blowin' in the Wind", 'spotify:track:18GiV1BaXzPVYpp9rmOg0E'],
 ['Bob Dylan', 'Desolation Row', 'spotify:track:4n1ZGm3TxYmoYe1YR8cMus'],
 ['Bob Dylan', 'Duquesne Whistle', 'spotify:track:5kKW4bszhKSCYVPDO0sMbX'],
 ['Bob Dylan',
  'Forever Young - Slow Version',
  'spotify:track:4yWl0tnEanf3zmZzl9kbQn'],
 ['Bob Dylan', 'Gotta Serve Somebody', 'spotify:track:760420tYNmNjFgi8bWvbop'],
 ['Bob Dylan', 'Highway 61 Revisited', 'spotify:track:6os5B6xjuke9YfBKH3tu1e'],
 ['Bob Dylan',
  'I Shall Be Released - Studio Outtake - 1971',
  'spotify:track:5vyw005QQ42hrzrLxb3xEX'],
 ['Bob Dylan', 'I Want You', 'spotify:track:7tJQ4Ekp2vN3NlI3vJJW3v'],
 ['Bob Dylan', "It Ain't Me Babe", 'spotify:track:5nbNWAfT1S6V1vqj3snHxS'],
 ['Bob Dylan', 'Jokerman', 'spotify:track:6cuHkcRUqtQhtJ4sWCkd1q'],
 ['Bob Dylan',
  "Knockin' On Heaven's Door",
  'spotify:track:6HSXNV0b4M4cLJ7ljgVVeh'],
 ['Bob Dylan', 'Lay, Lady, Lay', 'spotify:track:4uYwlMp841PLJmj1gJJwIq'],
 ['Bob Dylan', 'Like a Rolling Stone', 'spotify:track:3AhXZa8sUQht0UEdBJgpGc'],
 ['Bob Dylan', 'Love Sick', 'spotify:track:3O1hpSOaJDW4SelgUG2XT3'],
 ['Bob Dylan', "Maggie's Farm", 'spotify:track:5rGD8FFgHw74cp3RPhucyg'],
 ['Bob Dylan',
  'Make You Feel My Love',
  'spotify:track:6rfGPGghQL7SJmZPXprXIc'],
 ['Bob Dylan',
  'Mississippi - Version 2',
  'spotify:track:6JWHNd8QMxTvojYkmZtKGI'],
 ['Bob Dylan', 'Mr. Tambourine Man', 'spotify:track:3RkQ3UwOyPqpIiIvGVewuU'],
 ['Bob Dylan', 'Murder Most Foul', 'spotify:track:1LfTvT9JPYuuZanwxLtZCr'],
 ['Bob Dylan', 'Not Dark Yet', 'spotify:track:1qbn6QrHG8XfnqVFKgNzKP'],
 ['Bob Dylan',
  'Rainy Day Women #12 & 35',
  'spotify:track:7BkAlVpGwXXl3sYNn5OoJ7'],
 ['Bob Dylan',
  'Sad-Eyed Lady of the Lowlands',
  'spotify:track:4jdtLLyEL7wY0TlCdMKhxq'],
 ['Bob Dylan', 'She Belongs to Me', 'spotify:track:2itBkHBUxGl4VfDj4HNyoD'],
 ['Bob Dylan',
  'Stuck Inside of Mobile with the Memphis Blues Again',
  'spotify:track:1NYTj6JEw3IOh4ggiBh82h'],
 ['Bob Dylan',
  'Subterranean Homesick Blues',
  'spotify:track:6k9DUKMJpWvu6eFG3O64Lg'],
 ['Bob Dylan', 'Tangled up in Blue', 'spotify:track:6Vcwr9tb3ZLO63F8DL8cqu'],
 ['Bob Dylan', 'Tempest', 'spotify:track:19scNzd4ogVsHrNWsms8Rg'],
 ['Bob Dylan',
  "The Times They Are A-Changin'",
  'spotify:track:52vA3CYKZqZVdQnzRrdZt6'],
 ['Bob Dylan',
  'Things Have Changed - Single Version',
  'spotify:track:5KOi77ameCimkAdw0DMNoy'],
 ['Bob Dylan',
  'Thunder on the Mountain',
  'spotify:track:4wo2eRp6aHcAlmhmfwiTAH'],
 ['Bob Dylan', 'Visions of Johanna', 'spotify:track:2rslQV48gNv3r9pPrQFPW1'],
 ['Brian Wilson', 'God Only Knows', 'spotify:track:2SznAUigFh6rMdGpcS5d7e'],
 ['Bright Eyes',
  'First Day of My Life',
  'spotify:track:0eBryM7ePQH3Klt3jz8xZd'],
 ['Crowded House',
  'Don’t Dream It’s Over - Home Demo',
  'spotify:track:0fiSpF9mvRFQWy0ca64d1g'],
 ['Léo Delibes', 'Flower Duet', 'spotify:track:5K8jqeLAxZIqHR6e5w5so1'],
 ['Dire Straits', 'Brothers In Arms', 'spotify:track:6XYBbVpu455ZdGWZNRLGbG'],
 ['Don McLean',
  'Vincent (Starry, Starry Night)',
  'spotify:track:2YDyH60Vro33KkDtNZCXIk'],
 ['Ed Sheeran', 'Photograph', 'spotify:track:41xNsY82OWtWbIfnRMK2ky'],
 ['Elvis Presley',
  'Can’t Help Falling in Love - Acoustic Cover',
  'spotify:track:0ghQkNDYLSl4GsqfkjTjWx'],
 ['Enya', 'Amarantine', 'spotify:track:0VmzazQQ0Mo1vJldr5NxTW'],
 ['Evan Rachel Wood', 'If I Fell', 'spotify:track:0gd3hRBQAEAw096YOcUrmR'],
 ['Fleetwood Mac', 'Rhiannon', 'spotify:track:05oETzWbd4SI33qK2gbJfR'],
 ['George Harrison',
  'All Things Must Pass - 2014 Remaster',
  'spotify:track:16OwZQuzMqnwn3FZsCBZly'],
 ['George Harrison',
  'Apple Scruffs - 2014 Remaster',
  'spotify:track:2K7WhpfZX3TCCMiwebp0W7'],
 ['George Harrison',
  'Art of Dying - 2014 Remaster',
  'spotify:track:6Jod7qrtYBhU3HcUmKk4hX'],
 ['George Harrison',
  'Awaiting on You All - 2014 Remaster',
  'spotify:track:0b65WkrBrg2qOkzQeDtQ9d'],
 ['George Harrison',
  'Ballad of Sir Frankie Crisp (Let It Roll) - 2014 Remaster',
  'spotify:track:0FWeRrB8T5R6maHbWQw4Kk'],
 ['George Harrison',
  'Behind That Locked Door',
  'spotify:track:2VVbLn8nMcWJzjcL1tZsUr'],
 ['George Harrison',
  'Beware of Darkness - 2014 Remaster',
  'spotify:track:606MCyZFMBlc52Ojnn1nvU'],
 ['George Harrison',
  'Give Me Love (Give Me Peace on Earth)',
  'spotify:track:71fXxvXqo1zxWDtBmjoEVk'],
 ['George Harrison',
  'Hear Me Lord - 2014 Remaster',
  'spotify:track:3kopbNyRj10XO1actGZexP'],
 ['George Harrison',
  'I Dig Love - 2014 Remaster',
  'spotify:track:42yK1Wy62c7malKSRwy0Qk'],
 ['George Harrison',
  'I Remember Jeep - 2014 Remaster',
  'spotify:track:058AE5M3ifbCh8VWOV7903'],
 ['George Harrison',
  "It's Johnny's Birthday - 2014 Remaster",
  'spotify:track:6Cv05rcW8HWwCC6wyEp1fC'],
 ['George Harrison',
  'Let It Down - 2014 Remaster',
  'spotify:track:5FFruMKbVg8AhwHnX4xBov'],
 ['George Harrison',
  'My Sweet Lord - 2014 Remaster',
  'spotify:track:6vE90mi4yKsQGY3YD2OOv1'],
 ['George Harrison',
  'Out of the Blue - 2014 Remaster',
  'spotify:track:1KHMyFaGvwVQ7ax4yjq4BZ'],
 ['George Harrison',
  'Plug Me In - 2014 Remaster',
  'spotify:track:0tyk2xHVjBd3nk16cGktTG'],
 ['George Harrison',
  'Run of the Mill - 2014 Remaster',
  'spotify:track:4uSlUBg3NVOA77E7wwKFTO'],
 ['George Harrison',
  'Thanks for the Pepperoni - 2014 Remaster',
  'spotify:track:3smkwfPqFsTmwfnBztMXaM'],
 ['George Harrison',
  'The Inner Light (Alternative Take) - Instrumental',
  'spotify:track:7gWPnvhaBFMlQsTBWEGcSC'],
 ['George Harrison',
  'Wah-Wah - 2014 Remaster',
  'spotify:track:5j3aqkMO2fl0s5eaSuVnQ8'],
 ['George Harrison',
  'What Is Life - 2014 Remaster',
  'spotify:track:44fw7RulJyj7dGIi9qR86N'],
 ['George Harrison',
  'While My Guitar Gently Weeps - Live At Madison Square Garden; 2009 Remaster',
  'spotify:track:4Egi6XuC0rbLlXfqmQeuFa'],
 ['Glenn Miller', 'In the Mood', 'spotify:track:1xsY8IFXUrxeet1Fcmk4oC'],
 ['Hans Zimmer', 'Cornfield Chase', 'spotify:track:6pWgRkpqVfxnj3WuIcJ7WP'],
 ['Hans Zimmer',
  'Day One (Interstellar Theme)',
  'spotify:track:4WmB04GBqS4xPMYN9dHgBw'],
 ["Israel Kamakawiwo'ole",
  'Maui Medley',
  'spotify:track:6TSJ3L9pBQsYIlCD5pk7ju'],
 ['James Taylor',
  'You’ve Got a Friend',
  'spotify:track:3nK4hWsTEr7fVXziI5bTmh'],
 ['Jay Ungar', 'Ashoken Farewell', 'spotify:track:2s6pqLeVialgt5l5TTSeas'],
 ['Jeff Buckley',
  'If You Knew - Live at Sin-é, New York, NY - July/August 1993',
  'spotify:track:1nd2JEHXbUuQFDiQzCBpsv'],
 ['Jimi Hendrix', 'One Rainy Wish', 'spotify:track:5Zyv0v4rPcrXjkaeImuodv'],
 ['Jimi Hendrix',
  'Spanish Castle Magic',
  'spotify:track:2KFE98Iw0X23sf4vJYcbLH'],
 ['Jimi Hendrix',
  'Wait Until Tomorrow',
  'spotify:track:2YtVzmZzew1ILUdNueyWd7'],
 ['John Lennon',
  'Imagine - Remastered 2010',
  'spotify:track:7pKfPomDEeI4TPT6EOYjn9'],
 ['John Mayer', 'Queen of California', 'spotify:track:0CETmgFGt8Ne8vLnaLcduU'],
 ['Johnny Cash',
  'I Walk The Line - Single Version',
  'spotify:track:1TKPfF2fvn6gVLVfp3iG4j'],
 ['Joni Mitchell',
  'Mitchell: Urge for Going (Instrumental Arrangement of the B-Side Track of the Joni Mitchell Single "You Turn Me on I\'m a Radio")',
  'spotify:track:1I1u9aTdxxQ7SDLgBB3V7b'],
 ['Kanye West', 'Come to Life', 'spotify:track:5xvXeuxISyXJDRbZZf4uzd'],
 ['Leonard Cohen', 'Chelsea Hotel #2', 'spotify:track:4krhCfJg0znykZoyjeMXRe'],
 ['Leonard Cohen', 'Dear Heather', 'spotify:track:3MTKMphPprAcBFG1uIhzPZ'],
 ['Leonard Cohen',
  "Death of a Ladies' Man",
  'spotify:track:5wrylUGwZugelovhryPYg2'],
 ['Leonard Cohen', 'The Future', 'spotify:track:5l8lYrnPEM1ln3J4XaTcy5'],
 ['Leonard Cohen',
  'You Want It Darker',
  'spotify:track:5zb7npjQqoJ7Kcpq4yD9qn'],
 ['Lingers.On', 'In Lingerie', 'spotify:track:6FH3kGlJbFVJDCG9RcERf7'],
 ['Louis Armstrong',
  'La vie en rose - Single Version',
  'spotify:track:3yYfoYGVpriV4fG9L1ogsD'],
 ['The Lovecats', 'The Lovecats', 'spotify:track:7iJUiiTfnuY5cTIeEBnqHr'],
 ['Ludovico Einaudi', 'Primavera', 'spotify:track:4BMHp3DkI8VLsuB9Kr0pzu'],
 ['Mazzy Star', 'Flowers In December', 'spotify:track:0G6Ws8Gbdt0S7pZeuYmkmm'],
 ['Metallica',
  'Fade To Black (Remastered)',
  'spotify:track:0dqGfCMAGyDgpUAgLNOjWd'],
 ['Wolfgang Amadeus Mozart',
  'Requiem in D Minor, K. 626: III. Sequenz No. 6, Lacrimosa dies illa',
  'spotify:track:4bvzJZXpkI3bkjxMCWOSu1'],
 ['My Chemical Romance',
  'The Light Behind Your Eyes',
  'spotify:track:3HyDpKAuR3e4l6QB7hSB2l'],
 ['Paul McCartney',
  'Here Today - Remixed 2015',
  'spotify:track:0QtnwXDziZN1K55fXuLN6q'],
 ['Paul McCartney',
  'I’ll Follow The Sun - Live At Amoeba 2007',
  'spotify:track:3xT59EeQdq0TPGtOlXXI8t'],
 ['Puscifer', 'The Humbling River', 'spotify:track:69GE6yPZZldvqtgBHrKXxg'],
 ['Ray LaMontagne',
  'Such A Simple Thing',
  'spotify:track:4PuUa8e5s7P3Zv1IdCGIsa'],
 ['Ray Manzarek',
  'Riders on the Storm',
  'spotify:track:3FvYcTXO2QtDY7kZQHku2d'],
 ['Red Hot Chili Peppers', 'Dosed', 'spotify:track:1iFIZUVDBCCkWe705FLXto'],
 ['Sky Cries Mary',
  "Don't Forget The Sky",
  'spotify:track:4sVpjCJRClVetRrdxVBolP'],
 ['Stevie Nicks', 'Landslide', 'spotify:track:5fprEY6WEN1wvFXkgfb22C'],
 ['Stevie Wonder', 'Isn’t She Lovely', 'spotify:track:6wGlAaMfyhKdEPr2zycAnN'],
 ['Taylor Swift',
  'Fearless (Taylor’s Version)',
  'spotify:track:77sMIMlNaSURUAXq5coCxE'],
 ['Taylor Swift',
  'the lakes - bonus track',
  'spotify:track:0eFQWVz0qIxDOvhLpZ40P7'],
 ['The Band',
  'When I Paint My Masterpiece - Remastered',
  'spotify:track:76WChUuOPeIK027IeUgr0l'],
 ['The Beach Boys',
  "I Just Wasn't Made For These Times - Mono",
  'spotify:track:4CuO8TINNqM3D7aUdNQ3zG'],
 ['The Beach Boys',
  "Let's Go Away For A While - Mono",
  'spotify:track:3GsgJI1aBrvUtqX8f3MhKT'],
 ['The Beatles',
  "Don't Let Me Down - Naked Version / Remastered 2013",
  'spotify:track:5BhMoGrz5KzG2fA5uzHjZ1'],
 ['The Beatles',
  'Love Me Do - Remastered 2009',
  'spotify:track:3VbGCXWRiouAq8VyMYN2MI'],
 ['The Chemical Brothers',
  'The Boxer',
  'spotify:track:1EUeDFq2zNP784GPaRs9aH'],
 ['The Cure',
  'A Night like This - 2006 Remaster',
  'spotify:track:7cKCz7gG84i1XLvDeM3ByT'],
 ['The Cure',
  'Disintegration - 2010 Remaster',
  'spotify:track:0zY8t5dC1KQXcPUKByWMJM'],
 ['The Cure',
  'From the Edge of the Deep Green Sea',
  'spotify:track:2vwBL9RVyr0vA4Og5VH0i3'],
 ['The Cure',
  'In Between Days - 2006 Remaster',
  'spotify:track:07CyrZF9eVd02zzIse7tZA'],
 ['The Cure', 'A Letter to Elise', 'spotify:track:4DdXOLc1VMAY34ourCn1Xa'],
 ['The Cure',
  'Lullaby - 2010 Remaster',
  'spotify:track:4d4oXk7O2lEhZ83ivV93li'],
 ['The Cure', 'Underneath The Stars', 'spotify:track:0PKVjYlKw7z3IvKAoxrYTR'],
 ['The Eagles', 'The Desperadoes', 'spotify:track:10ppF835WJMYI5v65gFLZ3'],
 ['The Helio Sequence',
  'Keep Your Eyes Ahead',
  'spotify:track:3yatRBsGMJ7wMoUIgDBzzo'],
 ['The Moldy Peaches',
  'Anyone Else But You',
  'spotify:track:2pKi1lRvXNASy7ybeQIDTy'],
 ['The Strokes', 'Someday', 'spotify:track:7hm4HTk9encxT0LYC0J6oI'],
 ['Traditional',
  'Scarborough Fair (Arr. Parkin)',
  'spotify:track:4wlNPczIullwvmwb4x0ltz'],
 ['Van Morrison',
  'Madame George - 1999 Remaster',
  'spotify:track:1N4MKISvC1ddfRCRQDXDd2'],
 ['Various Artists',
  'The Girl From Ipanema',
  'spotify:track:0JgH7g0kwsIs1THEVqhlUS'],
 ['Víg Mihály',
  'Öreg - From "Werckmeister Harmóniák"',
  'spotify:track:63wMgkXQuomlkW4an4O9b4'],
 ['Willie Nelson', 'Crazy', 'spotify:track:0xqtcLB45iKNfHroi5y1em']]


In [None]:
# Display how many entries addlist holds.
len(addlist)

In [None]:
# Take the third field of every addlist entry (presumably the
# 'spotify:track:...' URI -- verify against how addlist is built).
addlist2 = [a[2] for a in addlist]

print (len(addlist2), 'items')

# Spotify's add-items endpoint accepts at most 100 tracks per request, so the
# list is uploaded in batches. Batches are taken front to back so the playlist
# receives the tracks in the same order as addlist; taking batches off the
# tail would append the last 100 tracks first. addlist2 itself is left intact
# so it can still be inspected after the cell has run.
for batch_start in range(0, len(addlist2), 100):
    track_batch = addlist2[batch_start:batch_start + 100]
    sp.user_playlist_add_tracks(os.getenv('SPOTIFY_USERNAME'),
                                playlist_id=playlist_id,
                                tracks=track_batch)
    print("added items, remaining ", len(addlist2) - batch_start - len(track_batch))


In [None]:
# Load both artist-mapping tables from CSV files in the working directory:
# z1 from artist_map.csv, z2 from artist_map2.csv (read in that order).
z1, z2 = (pd.read_csv(csv_name) for csv_name in ('artist_map.csv', 'artist_map2.csv'))



In [None]:
# Build a lookup from each 'artist' value in z2 to the 'artist2' value on the
# same row; if an 'artist' value repeats, the last row wins.
zmap = {artist: artist2 for artist, artist2 in zip(z2['artist'], z2['artist2'])}
zmap


In [None]:
# Column '3' holds the zmap replacement for each 'artist_orig' value;
# names that have no entry in zmap are carried over unchanged.
z1['3'] = z1['artist_orig'].apply(lambda name: zmap.get(name, name))


In [None]:
# Write only the original/corrected artist columns to z1.csv, without the index.
# NOTE(review): in file order this export runs before the cell that sets
# z1['artist_corrected'] = z1['3'], so it writes the 'artist_corrected' values
# z1 already had (presumably those loaded from artist_map.csv) -- confirm the
# intended cell order.
z1.loc[:, ['artist_orig', 'artist_corrected']].to_csv('z1.csv', index=False)

In [None]:
# Overwrite 'artist_corrected' with the remapped names held in column '3'.
z1['artist_corrected']=z1['3']

In [None]:
# Shell out to diff to compare the exported z1.csv with artist_map.csv.
!diff z1.csv artist_map.csv