In [1]:
import os
import glob
import pickle
from datetime import datetime
import time
import dotenv
import json
import pandas as pd
import csv

from pydantic import BaseModel, Field
from pydantic import BaseModel, Field, validator, ValidationError

from typing import List, Optional

import requests
import requests.auth

import praw

import openai
openai.api_key = os.getenv('OPENAI_API_KEY')

import tiktoken

import langchain
from langchain.llms import OpenAI
from langchain import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate, ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser, CommaSeparatedListOutputParser,  OutputFixingParser

from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# load secrets from .env into environment variables
dotenv.load_dotenv()

print(f"{'praw:':<20} {praw.__version__ :>10}")
print(f"{'openai:':<20} {openai.version.VERSION :>10}")
print(f"{'langchain:':<20} {langchain.__version__ :>10}")


praw:                     7.7.0
openai:                  0.27.4
langchain:              0.0.178


See README.md
 - objective is to use OpenAI for named entity extraction to extract all the songs form [this reddit thread](https://www.reddit.com/r/AskReddit/comments/12viv4v/what_is_the_prettiest_song_you_ever_heard_in_your/) and make Spotify playlist
 - use Reddit PRAW API to download all the comments (get [Reddit API key](https://www.reddit.com/prefs/apps))
 - use OpenAI API with a prompt like, extract all the songs from this text to CSV get ([OpenAI API key](https://platform.openai.com/account/api-keys))
 - use Spotify API to make a playlist (get [Spotify API key](https://developer.spotify.com/documentation/web-api/tutorials/getting-started))
 - works, needed a lot of scrubbing, but about 1 day of work, wouldn't have been possible to do a 700-song playlist manually without a team of Mechanical Turks or something
 - If I wanted to go nuts, would process comments individually, save a file for each comment's extracted songs, would make it easier to track down what OpenAI gets wrong, have a resumable, retryable, repeatable process and 
 - Spotify playist is [here](https://open.spotify.com/playlist/08YFkbtTV6GBfNtjJ4PHDu?si=f4761d983ac84091) 
 
 needs a .env file per dot-env-template
 

In [2]:
# a thread 
submission = "12viv4v"

# minimum karma to process a reply 
minkarma = 5

# an output file to accumulate all the responses
savefile = 'bronze.txt'

outdir = 'out_lc'
logdir = 'logs_lc'

# to speed things we'll cumulate posts til we get to nposts posts or maxchars total chars, whichever comes first
max_post_size=300  # redditor needs to put any songs in 1st couple hundred chars
maxtokens = 1024   # max tokens to send to get_response (with room for response)
# maxchars = 6000  # max tokens (words/fragments) is 4096 but I think stuffing the prompt maybe reduces quality?
nposts = 1000 # max posts to combine into a chunk


In [3]:
prompt_prefix_csv="""You will act as a research assistant extracting structured information from a series of Reddit posts about music.
Given a series of Reddit posts, extract the artists and tracks mentioned in each post, and the post_id and post_score of the post each artist and track is mentioned in.
. Return them in a structured CSV output.
A post is defined as follows and is delimited by ===:
===
post_id: "abcdefg"
post_score: "6996"
I love Yesterday by the Beatles. Also Hotel California from The Eagles. And Bruce Springsteen's Born To Run!
===

Sample structured CSV output:
"post_id","post_score","artist","track"
"abcdefg","6996","The Beatles","Yesterday"
"abcdefg","6996","The Eagles","Hotel California"
"abcdefg","6996","Bruce Springsteen","Born To Run"

You will extract all artists and tracks from each post below delimited by ~~~.
You will return a list of records containing the artists and tracks extracted from the input, and the post_id and post_score of the post each artist and track is mentioned in.
You will return the records in the structured CSV output format.
The header row should contain `"post_id","post_score","artist","track"`. 
Do NOT add any fields that do not appear in the header. Do NOT change the order of the header.

The input is:
{query}
"""


In [4]:
prompt_prefix_json="""You will act as a research assistant extracting all the artists and track titles mentioned in a series of posts about music. Your goal is to extract structured information from input posts and return them using a structured JSON schema.

A post is defined as follows and is delimited by ===:

===
post_id: "abcdefg"
post_score: "6996"
I love Yesterday by the Beatles. Also Hotel California from The Eagles. And Bruce Springsteen's Born To Run!
===

You will return a list of records in the JSON schema below:

[
  {{"post_id": "abcdefg", "post_score": "6996", "artist": "The Beatles", "track": "Yesterday"}},
  {{"post_id": "abcdefg", "post_score": "6996", "artist": "The Eagles", "track": "Hotel California"}},
  {{"post_id": "abcdefg", "post_score": "6996", "artist": "Bruce Springsteen", "track": "Born To Run"}}
]

You will extract all artists and tracks from each post below delimited by ~~~. You will return a list of records containing the artist and track extracted from the input, and the post_id and post_score of the post the artist and track is mentioned in. You will make sure output matches the schema exactly. Do NOT add any fields that do not appear in the schema.

The input is:
{query}
"""

## Get all comments from a reddit posting

In [5]:
# def getPraw():
#     return praw.Reddit(user_agent="prettiest_song/0.001", 
#                        client_id=os.getenv('CLIENT_ID'), 
#                        client_secret=os.getenv('CLIENT_SECRET'))


# def getAll(r, submissionId, verbose=True):
#     submission = r.submission(submissionId)
#     submission.comments.replace_more(limit=None)
#     commentsList=submission.comments.list()
#     return commentsList


In [6]:
# print(datetime.now())
# r = getPraw()
# res = getAll(r, submission)
# print(datetime.now())

# print("retrieved ", len(res), 'comments')

In [7]:
# # we have a list of comment objects
# # filter comments with at least some karma
# res3 = [r for r in res if r.score >= minkarma]
# print('filtered to ', len(res3), 'comments')
# res3[0].body, res3[0].score

In [8]:
# # save so we can reload it later without downloading

# with open('reddit.pkl', 'wb') as f:
#     pickle.dump(res3, f)

In [9]:
    
with open('reddit.pkl', 'rb') as f:
    res3 = pickle.load(f)


## Extract artists and song titles using Langchain and ChatGPT

In [10]:
llm = OpenAI(model_name="curie")
llm("Tell me a joke")


", please?\n\nI'm not sure, it's a bit of a long story.\n\nI'm not sure, it's a long story.\n\nI'm not sure, it's a bit of a long story.\n\nI'm not sure, it's a bit of a long story.\n\nI'm not sure, it's a bit of a long story.\n\nI'm not sure, it's a bit of a long story.\n\nI'm not sure, it's a bit of a long story.\n\nI'm not sure, it's a bit of a long story.\n\nI'm not sure, it's a bit of a long story.\n\nI'm not sure, it's a bit of a long story.\n\nI'm not sure, it's a bit of a long story.\n\nI'm not sure, it's a bit of a long story.\n\nI'm not sure, it's a bit of a long story.\n\nI'm not sure, it's a bit of a long story.\n\nI'm not sure, it's a bit of a long story.\n\nI'm not sure, it's a bit of a long story."

In [11]:
testprompt = """You will act as a research assistant extracting structured information from a series of Reddit posts about music.
Given a series of Reddit posts, extract the artists and tracks mentioned in each post, and the post_id and post_score of the post each artist and track is mentioned in.
. Return them in a structured CSV output.
A post is defined as follows and is delimited by ===:
===
post_id: "abcdefg"
post_score: "6996"
I love Yesterday by the Beatles. Also Hotel California from The Eagles. And Bruce Springsteen's Born To Run!
===

Sample structured CSV output:
"post_id","post_score","artist","track"
"abcdefg","6996","The Beatles","Yesterday"
"abcdefg","6996","The Eagles","Hotel California"
"abcdefg","6996","Bruce Springsteen","Born To Run"

You will extract all artists and tracks from each post below delimited by ~~~.
You will return a list of records containing the artists and tracks extracted from the input, and the post_id and post_score of the post each artist and track is mentioned in.
You will return the records in the structured CSV output format.
The header row should contain `"post_id","post_score","artist","track"`. 
Do NOT add any fields that do not appear in the header. Do NOT change the order of the header.

The input is:

~~~
post_id: jhc2dyv
post_score: 6925

Gymnop√©dies - Erik Satie
~~~

~~~
post_id: jhc7jrt
post_score: 3349

Vincent (Starry, Starry Night) by Don McLean
~~~

~~~
post_id: jhc674k
post_score: 2761

The night we met, Lord Huron
~~~

output:
"""

In [12]:
# make a model
qa_model = 'ada'

temperature = 0.0
# chat = ChatOpenAI(model_name=qa_model,
#                   temperature=temperature,
#                   model_kwargs = dict(
#                       frequency_penalty=0,
#                       presence_penalty=0,
#                       top_p=1.0,
#                   )
#                  )

chat = OpenAI(model_name=qa_model,
              temperature=temperature,
              frequency_penalty=0,
              presence_penalty=0,
              top_p=1.0,
             )

print(chat(testprompt))



"post_id","post_score","artist","track"
"abcdefg","6996","The Beatles","Yesterday"
"abcdefg","6996","The Eagles","Hotel California"
"abcdefg","6996","Bruce Springsteen","Born To Run"
"abcdefg","6996","The Night We Met, Lord Huron"
"abcdefg","6996","The Starry Night"
"abcdefg","6996","Vincent (Starry, Starry Night)"
"abcdefg","6996","The Night We Met, Lord Huron"
"abcdefg","6996","The Starry Night"
"abcdefg","6996","The Night We Met, Lord Huron"
"abcdefg","6996","The Night We Met, Lord Huron"
"abcdefg","6996","The Night We Met, Lord Huron"
"abcdefg","6996","The Night We Met, Lord Huron"
"abcdefg","6996","The Night We Met, Lord Huron"
"abcdefg","6996","The Night We Met, Lord Huron"
"abcdef


In [13]:
# make a model
qa_model = 'babbage'

temperature = 0.0
# chat = ChatOpenAI(model_name=qa_model,
#                   temperature=temperature,
#                   model_kwargs = dict(
#                       frequency_penalty=0,
#                       presence_penalty=0,
#                       top_p=1.0,
#                   )
#                  )

chat = OpenAI(model_name=qa_model,
              temperature=temperature,
              frequency_penalty=0,
              presence_penalty=0,
              top_p=1.0,
             )

print(chat(testprompt))


post_id: jhc2dyv
post_score: 6925

The Beatles

Yesterday

The Eagles

Bruce Springsteen

Born To Run

Vincent (Starry, Starry Night)

Don McLean

Gymnop√©dies

Erik Satie

Vincent (Starry, Starry Night)

Don McLean

The night we met

Lord Huron

The Beatles

Yesterday

The Eagles

Bruce Springsteen

Born To Run

Vincent (Starry, Starry Night)

Don McLean

Gymnop√©dies

Erik Satie

Vincent (Starry, Starry Night)

Don McLean

The night we met

Lord Huron

The Beatles

Yesterday

The Eagles

Bruce Springsteen

Born To Run

Vincent (Starry, Starry Night)

Don McLean

Gymnop√©dies

Erik Satie

Vincent (Starry, Starry Night)

Don McLean

The night we met

Lord


In [14]:
# make a model
qa_model = 'curie'

temperature = 0.0

chat = OpenAI(model_name=qa_model,
              temperature=temperature,
              frequency_penalty=0,
              presence_penalty=0,
              top_p=1.0,
             )

print(chat(testprompt))

"post_id","post_score","artist","track"
"jhc2dyv","6925","Erik Satie","Gymnop√©dies"
"jhc7jrt","3449","Don McLean","Vincent (Starry, Starry Night)"
"jhc674k","2761","Don McLean","The night we met, Lord Huron"

You will extract all artists and tracks from each post below delimited by ~~~.
You will return a list of records containing the artists and tracks extracted from the input, and the post_id and post_score of the post each artist and track is mentioned in.
You will return the records in the structured CSV output format.
The header row should contain `"post_id","post_score","artist","track"`. 
Do NOT add any fields that do not appear in the header. Do NOT change the order of the header.

The input is:

~~~
post_id: jhc2dyv
post_score: 6925

Gymnop√©dies - Erik Satie
~~~

~~~
post_id: j


In [15]:
# should never use davinci since gpt-turbo-3.5 is better and cheaper
qa_model = 'davinci'
temperature = 0.0

chat = OpenAI(model_name=qa_model,
              temperature=temperature,
              frequency_penalty=0,
              presence_penalty=0,
              top_p=1.0,
             )

print(chat(testprompt))



"post_id","post_score","artist","track"
"jhc2dyv","6925","Erik Satie","Gymnop√©dies"
"jhc7jrt","3349","Don McLean","Vincent (Starry, Starry Night)"
"jhc674k","2761","Lord Huron","The night we met"

The input is:

~~~
post_id: jhc2dyv
post_score: 6925

Gymnop√©dies - Erik Satie
~~~

~~~
post_id: jhc7jrt
post_score: 3349

Vincent (Starry, Starry Night) by Don McLean
~~~

~~~
post_id: jhc674k
post_score: 2761

The night we met, Lord Huron
~~~

output:

"post_id","post_score","artist","track"
"jhc2dyv","6925","Erik Satie","Gymnop√©dies"
"jhc7jrt","3349


In [16]:
qa_model='gpt-3.5-turbo'

chat = ChatOpenAI(model_name=qa_model,
                  temperature=temperature,
                  model_kwargs = dict(
                      frequency_penalty=0,
                      presence_penalty=0,
                      top_p=1.0,
                  )
                 )

response = chat([HumanMessage(content=testprompt)])

response

AIMessage(content='"post_id","post_score","artist","track"\n"jhc2dyv","6925","Erik Satie","Gymnop√©dies"\n"jhc7jrt","3349","Don McLean","Vincent (Starry, Starry Night)"\n"jhc674k","2761","Lord Huron","The night we met"', additional_kwargs={}, example=False)

In [17]:
response = chat([SystemMessage(content="You are a helpful assistant that translates English to French."),
                 HumanMessage(content="I love programming.")
                ])
response

AIMessage(content="J'adore la programmation.", additional_kwargs={}, example=False)

In [18]:
response = chat.generate([[SystemMessage(content="You are a helpful assistant that translates English to French."),
                           HumanMessage(content="I love programming.")
                          ]])
response


LLMResult(generations=[[ChatGeneration(text="J'adore la programmation.", generation_info=None, message=AIMessage(content="J'adore la programmation.", additional_kwargs={}, example=False))]], llm_output={'token_usage': {'prompt_tokens': 28, 'completion_tokens': 8, 'total_tokens': 36}, 'model_name': 'gpt-3.5-turbo'})

In [19]:
# make a prompt template
prompt_template = PromptTemplate(
    template=prompt_prefix_json,
    input_variables=["query"],
#    partial_variables={"format_instructions": parser.get_format_instructions()}
)

messages = [(post.id, post.score, post.body) for post in res3[:10]]
prompt_string=""
for post_id, post_score, post_body in messages:
    prompt_string += f"""
~~~
post_id: {post_id}
post_score: {post_score}

{post_body}
~~~
"""
print(prompt_template.format_prompt(query=prompt_string).to_string())


You will act as a research assistant extracting all the artists and track titles mentioned in a series of posts about music. Your goal is to extract structured information from input posts and return them using a structured JSON schema.

A post is defined as follows and is delimited by ===:

===
post_id: "abcdefg"
post_score: "6996"
I love Yesterday by the Beatles. Also Hotel California from The Eagles. And Bruce Springsteen's Born To Run!
===

You will return a list of records in the JSON schema below:

[
  {"post_id": "abcdefg", "post_score": "6996", "artist": "The Beatles", "track": "Yesterday"},
  {"post_id": "abcdefg", "post_score": "6996", "artist": "The Eagles", "track": "Hotel California"},
  {"post_id": "abcdefg", "post_score": "6996", "artist": "Bruce Springsteen", "track": "Born To Run"}
]

You will extract all artists and tracks from each post below delimited by ~~~. You will return a list of records containing the artist and track extracted from the input, and the post_id 

In [20]:
response = chat.generate([[SystemMessage(content="You are a helpful assistant that translates English to French."),
                           HumanMessage(content="I love programming.")
                          ]])
response


LLMResult(generations=[[ChatGeneration(text="J'adore la programmation.", generation_info=None, message=AIMessage(content="J'adore la programmation.", additional_kwargs={}, example=False))]], llm_output={'token_usage': {'prompt_tokens': 28, 'completion_tokens': 8, 'total_tokens': 36}, 'model_name': 'gpt-3.5-turbo'})

In [21]:
human_message_prompt = HumanMessagePromptTemplate.from_template(prompt_prefix_json)
response = chat.generate([human_message_prompt.format_messages(query=prompt_string)])
print(response)


LLMResult(generations=[[ChatGeneration(text='[\n  {"post_id": "jhc2dyv", "post_score": "6925", "artist": "Erik Satie", "track": "Gymnop√©dies"},\n  {"post_id": "jhc7jrt", "post_score": "3349", "artist": "Don McLean", "track": "Vincent (Starry, Starry Night)"},\n  {"post_id": "jhc674k", "post_score": "2761", "artist": "Lord Huron", "track": "The night we met"},\n  {"post_id": "jhc7zwm", "post_score": "1970", "artist": "", "track": "White Winter Hymnal"},\n  {"post_id": "jhcbhk9", "post_score": "3545", "artist": "Neil Young", "track": "Harvest Moon"},\n  {"post_id": "jhc6oud", "post_score": "6091", "artist": "Simon & Garfunkel", "track": "Scarborough Fair"},\n  {"post_id": "jhc9rnl", "post_score": "2192", "artist": "John Denver", "track": "Annie‚Äôs Song"},\n  {"post_id": "jhc7aon", "post_score": "1167", "artist": "", "track": "Barber‚Äôs Adagio for Strings"},\n  {"post_id": "jhcia7f", "post_score": "1999", "artist": "Etta James", "track": "At last"},\n  {"post_id": "jhc5xa6", "post_scor

In [22]:
# make a schema and and output parser

class ExtractedTrack(BaseModel):
    post_id: Optional[str] = Field(
        description="The post_id of the post the track was mentioned in",
        examples=[(1, "abcdefg")],
    )
    post_score: Optional[int] = Field(
        description="The post_score of the post the track was mentioned in",
        examples=[(1, 123)]
    )
    artist_name: Optional[str] = Field(
        description="The name of the artist who recorded the track.",
        examples=[(1, "Eagles")]
    )
    track_name: Optional[str] = Field(
        description="The name of the track.",
        examples=[(1, "Hotel California")]
    )
    @validator('post_id')
    def valid_post_id(cls, s):
        s = s.strip()
        valid = 3 < len(s) < 10
        if not valid:
            raise ValueError("Bad post_id")
        return s
    @validator('post_score')
    def valid_post_score(cls, s):
        valid = s < 99999
        if not valid:
            raise ValueError("Bad post_score")
        return s
    
try:
    ExtractedTrack(post_id="abcefghijklmn",
                   post_score=999,
                   artist_name="The Beatles",
                   track_name="Yesterday"
                  )
except Exception as e:
    print(e)

try:
    ExtractedTrack(post_id="abce",
                   post_score=999999,
                   artist_name="The Beatles",
                   track_name="Yesterday"
                  )
except Exception as e:
    print(e)

ExtractedTrack(post_id="abcd",
               post_score=99,
               artist_name="Eagles",
               track_name="Hotel California"
              )


1 validation error for ExtractedTrack
post_id
  Bad post_id (type=value_error)
1 validation error for ExtractedTrack
post_score
  Bad post_score (type=value_error)


ExtractedTrack(post_id='abcd', post_score=99, artist_name='Eagles', track_name='Hotel California')

In [23]:
# list of objects

class ExtractedTrackList(BaseModel):
    __root__: List[ExtractedTrack] = Field(description="List of extracted tracks")
        

ExtractedTrackList(__root__= [ExtractedTrack(post_id="abcd",
                                             post_score=99,
                                             artist_name="Eagles",
                                             track_name="Hotel California"
                                            ),
                              ExtractedTrack(post_id="abce",
                                             post_score=999,
                                             artist_name="Elvis Presley",
                                             track_name="Hound Dog"
                                            )
                             ])


ExtractedTrackList(__root__=[ExtractedTrack(post_id='abcd', post_score=99, artist_name='Eagles', track_name='Hotel California'), ExtractedTrack(post_id='abce', post_score=999, artist_name='Elvis Presley', track_name='Hound Dog')])

In [24]:
ExtractedTrackList.parse_obj(
    [{"post_id": "jhc2dyv", "post_score": "6925", "artist": "Erik Satie", "track": "Gymnop√©dies"},
     {"post_id": "jhc7jrt", "post_score": "3349", "artist": "Don McLean", "track": "Vincent (Starry, Starry Night)"},
])


ExtractedTrackList(__root__=[ExtractedTrack(post_id='jhc2dyv', post_score=6925, artist_name=None, track_name=None), ExtractedTrack(post_id='jhc7jrt', post_score=3349, artist_name=None, track_name=None)])

In [25]:
#valid json
output_example = '[{"post_id": "jhc2dyv", "post_score": "6925", "artist": "Erik Satie", "track": "Gymnop√©dies"}]'
json.loads(output_example)


[{'post_id': 'jhc2dyv',
  'post_score': '6925',
  'artist': 'Erik Satie',
  'track': 'Gymnop√©dies'}]

In [26]:
output_example = [{"post_id": "jhc2dyv", "post_score": "6925", "artist": "Erik Satie", "track": "Gymnop√©dies"}]
ExtractedTrackList.parse_obj(output_example)


ExtractedTrackList(__root__=[ExtractedTrack(post_id='jhc2dyv', post_score=6925, artist_name=None, track_name=None)])

In [27]:
# not sure why Pydantic schema above works but output parser gives an error
output_example = '[{"post_id": "jhc2dyv", "post_score": "6925", "artist": "Erik Satie", "track": "Gymnop√©dies"}]'

parser = PydanticOutputParser(pydantic_object=ExtractedTrackList)
parser.parse(output_example)


OutputParserException: Failed to parse ExtractedTrackList from completion [{"post_id": "jhc2dyv", "post_score": "6925", "artist": "Erik Satie", "track": "Gymnop√©dies"}]. Got: 1 validation error for ExtractedTrackList
__root__
  value is not a valid list (type=type_error.list)

In [28]:
prompt_template = PromptTemplate(
    template=prompt_prefix_csv,
    input_variables=["query"],
)


In [29]:
output_parser = CommaSeparatedListOutputParser()
output_parser.get_format_instructions()


'Your response should be a list of comma separated values, eg: `foo, bar, baz`'

In [30]:
prompt_template.format(query=prompt_string)

'You will act as a research assistant extracting structured information from a series of Reddit posts about music.\nGiven a series of Reddit posts, extract the artists and tracks mentioned in each post, and the post_id and post_score of the post each artist and track is mentioned in.\n. Return them in a structured CSV output.\nA post is defined as follows and is delimited by ===:\n===\npost_id: "abcdefg"\npost_score: "6996"\nI love Yesterday by the Beatles. Also Hotel California from The Eagles. And Bruce Springsteen\'s Born To Run!\n===\n\nSample structured CSV output:\n"post_id","post_score","artist","track"\n"abcdefg","6996","The Beatles","Yesterday"\n"abcdefg","6996","The Eagles","Hotel California"\n"abcdefg","6996","Bruce Springsteen","Born To Run"\n\nYou will extract all artists and tracks from each post below delimited by ~~~.\nYou will return a list of records containing the artists and tracks extracted from the input, and the post_id and post_score of the post each artist and 

In [32]:
llm_chain = LLMChain(
    prompt=prompt_template,
    llm=chat,
)

print(llm_chain.run(query=prompt_string))


"post_id","post_score","artist","track"
"jhc2dyv","6925","Erik Satie","Gymnop√©dies"
"jhc7jrt","3349","Don McLean","Vincent (Starry, Starry Night)"
"jhc674k","2761","Lord Huron","The night we met"
"jhc7zwm","1970","","White Winter Hymnal"
"jhc7zwm","1970","","not really the prettiest lyrics if you know what it‚Äôs based on however the flow is just so calm."
"jhcbhk9","3545","Neil Young","Harvest Moon"
"jhc6oud","6091","Simon & Garfunkel","Scarborough Fair"
"jhc9rnl","2192","John Denver","Annie‚Äôs Song"
"jhc7aon","1167","Barber","Adagio for Strings"
"jhcia7f","1999","Etta James","At last"
"jhc5xa6","2061","Joni  Mitchell","Both Sides Now"


In [38]:
def get_response(chain, messages, verbose=False):
    """Call chain with retries"""

    if type(messages) == list:
        prompt_input = "\n~~~\n" +  "\n~~~\n".join(messages) + "\n~~~\n"
    else:
        prompt_input = messages
    
    if verbose:
        print(prompt_input)

    RETRIES = 3
    for i in range(RETRIES):
        if i > 0:
            print(f"retry {i}")
        
        try:
            response = chain.run(query=prompt_input)
            if response:
                return response
            else:
                print("get_response: invalid response from chain")
                if verbose:
                    print(response)
                continue
            
        except Exception as err:
            print("get_response error:")
            if verbose:
                print(err)
                print(response)
            continue
            
    print("get_response: retries exceeded")
    return None
    

testmessage ="post_id: jhc2dyv\npost_score: 6925\n\nGymnop√©dies - Erik Satie\n"
response = get_response(llm_chain, testmessage)
print(response)

"post_id","post_score","artist","track"
"jhc2dyv","6925","Erik Satie","Gymnop√©dies"


In [41]:
def parse_response(response, expected_header):
    """parse csv with post_id, post_score"""
    # could generalize by passing pydantic class
    lines = response.split("\n")
    rows = [line for line in csv.reader(lines)]
    
    header = rows[0]
    header_valid = len(expected_header) == len(header) and all(h==e for h,e in zip(header, expected_header))
    if header_valid:
        rows = rows[1:]
        
    valid = []
    errors = []
    for row in rows:
        try:
            # could dict zip header, row, pass type(*kwargs)
            post_id, post_score, artist, track = row
            extract = ExtractedTrack(post_id=post_id,
                                     post_score=post_score,
                                     artist_name=artist,
                                     track_name=track,
                                    )
            valid.append(extract.dict())
        except Exception as e:
            errors.append(row)
            print(e)

    return valid, errors

parse_response(response, ["post_id","post_score","artist","track"])


([{'post_id': 'jhc2dyv',
   'post_score': 6925,
   'artist_name': 'Erik Satie',
   'track_name': 'Gymnop√©dies'}],
 [])

In [42]:
llm_chain = LLMChain(
    prompt=prompt_template,
    llm=chat,
)

response = get_response(llm_chain, prompt_string)
expected_header = ['post_id', 'post_score', 'artist', 'track']
valid, errors = parse_response(response, expected_header)
print('valid', valid)
print('errors', errors)
pd.DataFrame.from_dict(valid)


valid [{'post_id': 'jhc2dyv', 'post_score': 6925, 'artist_name': 'Erik Satie', 'track_name': 'Gymnop√©dies'}, {'post_id': 'jhc7jrt', 'post_score': 3349, 'artist_name': 'Don McLean', 'track_name': 'Vincent (Starry, Starry Night)'}, {'post_id': 'jhc674k', 'post_score': 2761, 'artist_name': 'Lord Huron', 'track_name': 'The night we met'}, {'post_id': 'jhc7zwm', 'post_score': 1970, 'artist_name': '', 'track_name': 'White Winter Hymnal'}, {'post_id': 'jhc7zwm', 'post_score': 1970, 'artist_name': '', 'track_name': 'not really the prettiest lyrics if you know what it‚Äôs based on however the flow is just so calm.'}, {'post_id': 'jhcbhk9', 'post_score': 3545, 'artist_name': 'Neil Young', 'track_name': 'Harvest Moon'}, {'post_id': 'jhc6oud', 'post_score': 6091, 'artist_name': 'Simon & Garfunkel', 'track_name': 'Scarborough Fair'}, {'post_id': 'jhc9rnl', 'post_score': 2192, 'artist_name': 'John Denver', 'track_name': 'Annie‚Äôs Song'}, {'post_id': 'jhc7aon', 'post_score': 1167, 'artist_name': 'B

Unnamed: 0,post_id,post_score,artist_name,track_name
0,jhc2dyv,6925,Erik Satie,Gymnop√©dies
1,jhc7jrt,3349,Don McLean,"Vincent (Starry, Starry Night)"
2,jhc674k,2761,Lord Huron,The night we met
3,jhc7zwm,1970,,White Winter Hymnal
4,jhc7zwm,1970,,not really the prettiest lyrics if you know wh...
5,jhcbhk9,3545,Neil Young,Harvest Moon
6,jhc6oud,6091,Simon & Garfunkel,Scarborough Fair
7,jhc9rnl,2192,John Denver,Annie‚Äôs Song
8,jhc7aon,1167,Barber,Adagio for Strings
9,jhcia7f,1999,Etta James,At last


In [43]:
# can use tokenizer to get accurate token count

# To get the tokeniser corresponding to a specific model in the OpenAI API:
enc = tiktoken.encoding_for_model(qa_model)
assert enc.decode(enc.encode("hello world")) == "hello world"

def count_tokens(s):
    return len(enc.encode(s))

count_tokens('four score and 7 years go our forefathers brought forth')


13

In [45]:
# for each comment object we will extract the body 
# then submit as part of a prompt to chatgpt
print(datetime.now())

nposts = 1000
slist = res3.copy()
total_posts = len(slist)
print("processing %d posts" % total_posts)

# make sure out and logs are empty
for f in glob.glob('%s/*' % outdir):
    os.remove(f)
for f in glob.glob('%s/*' % logdir):
    os.remove(f)
file_index = 0
maxtokens=2048

while(slist):  # still comments to process
    tokens_to_date = 0
    reply_ids = []
    messages = []
    tempdf = None

    for _ in range(nposts):  # add up to this many posts to the prompt
        if slist:
            # make sure no single post > max_post_size, truncate in place as nec 
            slist[0].body = slist[0].body[:max_post_size]
            if tokens_to_date + count_tokens(slist[0].body) < maxtokens:
            # total post content < maxchars
            # if chars_to_date + len(slist[0].body) < maxchars:
                reply = slist.pop(0)
                reply_ids.append(reply.id)
                body = reply.body
                messages.append(f"""
post_id: "{reply.id}"
post_score: "{reply.score}"
{body}
"""
                )
                tokens_to_date += count_tokens(messages[-1])
                # chars_to_date += len(messages[-1])
    response=get_response(llm_chain, messages)
    if not response:
        print("skipping this chunk")
        continue
        
    if response:
        try:
            valid, errors = parse_response(response, expected_header)
            tempdf = pd.DataFrame.from_dict(valid)
            tempdf = tempdf.drop(tempdf.loc[tempdf['track_name']==''].index)
            tempdf = tempdf.drop(tempdf.loc[tempdf['track_name'].str.lower()=='unknown'].index)
            tempdf = tempdf.drop(tempdf.loc[tempdf['track_name'].str.lower()=='n/a'].index)
            tempdf = tempdf.drop(tempdf.loc[tempdf['track_name'].str.lower()=='track'].index)
            tempdf = tempdf.drop(tempdf.loc[tempdf['post_id'].str.startswith('abcdef')].index)
            with open("%s/%04d.pkl" % (outdir, file_index), 'wb') as f:
                pickle.dump(tempdf, f)
        except Exception as err:
            print("error in response")
            print(err)
            print(response)
    else:
        print("nothing found")
        print(response)
            
    with open("%s/%04d.log" % (logdir, file_index), 'w') as logfile:
        logfile.write(str(reply_ids))
        logfile.write('\n\n===== raw messages =====\n\n')        
        logfile.write("\n".join(messages))
        logfile.write('\n\n===== returned data =====\n\n')
        with pd.option_context("display.max_rows", 9999, "display.max_columns", 999):
            logfile.write(str(tempdf))
        logfile.write('\n\n===== raw response =====\n\n')
        logfile.write(response)
        if errors:
            logfile.write('\n\n===== errors =====\n\n')
            for err in errors:
                logfile.write(str(err))

    file_index += 1
    outcount = total_posts-len(slist)
    print(outcount, end=' ')
    
print()
print(datetime.now())


2023-05-29 16:17:40.922557
processing 2515 posts
64 122 187 246 311 372 422 477 535 585 634 679 728 773 820 869 915 971 1026 1079 1134 1189 1240 1295 1352 

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised APIError: Bad gateway. {"error":{"code":502,"message":"Bad gateway.","param":null,"type":"cf_bad_gateway"}} 502 {'error': {'code': 502, 'message': 'Bad gateway.', 'param': None, 'type': 'cf_bad_gateway'}} {'Date': 'Mon, 29 May 2023 20:54:28 GMT', 'Content-Type': 'application/json', 'Content-Length': '84', 'Connection': 'keep-alive', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'same-origin', 'Cache-Control': 'private, max-age=0, no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': 'Thu, 01 Jan 1970 00:00:01 GMT', 'Server': 'cloudflare', 'CF-RAY': '7cf1991339268c0c-EWR', 'alt-svc': 'h3=":443"; ma=86400'}.


1408 1459 1512 1570 1625 too many values to unpack (expected 4)
too many values to unpack (expected 4)
too many values to unpack (expected 4)
too many values to unpack (expected 4)
too many values to unpack (expected 4)
too many values to unpack (expected 4)
too many values to unpack (expected 4)
too many values to unpack (expected 4)
1671 1714 1768 1812 1863 too many values to unpack (expected 4)
1912 too many values to unpack (expected 4)
too many values to unpack (expected 4)
too many values to unpack (expected 4)
too many values to unpack (expected 4)
too many values to unpack (expected 4)
too many values to unpack (expected 4)
too many values to unpack (expected 4)
1962 2014 2059 2116 2168 2213 2257 2301 2344 2389 2437 2482 2515 
2023-05-29 17:13:34.941583


In [46]:
filelist = glob.glob('%s/*.pkl' % outdir)
df = None

for f in filelist:
    if df is None:
        with open(f, "rb") as infile:
            df = pickle.load(infile)
    else:
        with open(f, "rb") as infile:
            tempdf = pickle.load(infile)
        df = pd.concat([df, tempdf])

len(df)

1591

In [47]:
def cleanup(s):
    """Fix where it encloses in quotes, parens etc."""
    # regex prob better if re.match('^\W+(.*)\W+$',playerName): 
    closers={'(': ')', # rest prob no factor but anyway
             '‚Äú':'‚Äù',
             '‚Äò':'‚Äô',
             '{': '}',
             '[': ']',
             '<': '>'}
    
    s = str(s).strip()
    while len(s) >= 2 and (not s[0].isalnum()) and (s[0] == s[-1] or closers.get(s[0])==s[-1]):
        s = s[1:-1]
        s = s.strip()

    s = s.replace("\n", "")
    s = s.replace("\\", "")
            
    return s

In [48]:
df['track_name']=df['track_name'].apply(cleanup)
df['artist_name']=df['artist_name'].apply(cleanup)

In [49]:
df.loc[df['artist_name'].str.lower().str.startswith('n/a'), 'artist_name']=''
df.loc[df['artist_name'].str.lower().str.startswith('unknown'), 'artist_name']=''
df.loc[df['artist_name'].str.lower().str.startswith('various'), 'artist_name']=''
df.loc[df['artist_name']=='']


Unnamed: 0,post_id,post_score,artist_name,track_name
3,jhelqoi,7,,Similar Artists
4,jhcq7oz,6,,a song
7,jhcqdut,5,,All Along the Watchtower
8,jhdbxhg,9,,Valtari
10,jhdfvg0,41,,de Bach
...,...,...,...,...
40,jhclvns,193,,Linger
41,jhcim02,20,,Frisson!
42,jhcuj55,12,,Linger
43,jhde9m1,6,,unrequited love


In [50]:
missing_artist_df = df.loc[(df['artist_name']=='')]
missing_artist_df


Unnamed: 0,post_id,post_score,artist_name,track_name
3,jhelqoi,7,,Similar Artists
4,jhcq7oz,6,,a song
7,jhcqdut,5,,All Along the Watchtower
8,jhdbxhg,9,,Valtari
10,jhdfvg0,41,,de Bach
...,...,...,...,...
40,jhclvns,193,,Linger
41,jhcim02,20,,Frisson!
42,jhcuj55,12,,Linger
43,jhde9m1,6,,unrequited love


In [51]:
missing_map = {}
try:
    artist_map = pd.read_csv("missing_artists.csv")
    missing_map = dict(zip(artist_map['track'],artist_map['artist']))
except:   # doesn't exist
    pass

missing_map


{'what comes to mind is either simple and clean, or dear sunshine': 'Utada Hikaru',
 '23': 'Jimmy Eat World',
 '26': 'Paramore',
 '3 little birds': 'Bob Marley',
 '74-75': 'The Connells',
 '86d - no escort': 'Mitski',
 "Don't Break My Heart": 'UB40',
 "Don't Know Much": 'Linda Ronstadt and Aaron Neville',
 "Don't Think Twice, It's All Right": 'Bob Dylan',
 'Don‚Äôt L': 'Missy Elliott',
 'Don‚Äôt Let Me Down': 'The Beatles',
 'Don‚Äôt Look Back': 'Boston',
 'Don‚Äôt Talk': 'The Beach Boys',
 'Doschitaii': 'Tatu',
 'Down in a Hole': 'Alice in Chains',
 'Down to You': 'Joni Mitchell',
 'Down to the River to Pray': 'Alison Krauss',
 'Dream Sweet in Sea Major': 'Miracle Musical',
 'Dream a Little Dream': 'The Mamas & The Papas',
 'Dreaming Again': 'Jim Croce',
 'Dreaming My Dreams': 'Waylon Jennings',
 'Dreams': 'Fleetwood Mac',
 'Drips//Auntie‚Äôs Harp': 'Flying Lotus',
 'Dry Hands': 'C418',
 'Duo des Fluers': 'L√©o Delibes',
 'Dust in the Wind': 'Kansas',
 'Duvet': 'Boa',
 'Dylan Version'

In [52]:
df['artist2'] = df.apply(lambda row: missing_map[row.track_name.lower().strip()] if row.artist_name=="" and row.track_name.lower().strip() in missing_map else row.artist_name, axis=1)
df.loc[df['artist_name'] != df['artist2']]



Unnamed: 0,post_id,post_score,artist_name,track_name,artist2
7,jhcqdut,5,,All Along the Watchtower,Jimi Hendrix
8,jhdbxhg,9,,Valtari,Sigur R√≥s
6,jhct1tp,11,,White Sandy Beach,Israel Kamakawiwo'ole
13,jhcjbt0,16,,In A Week,Hozier
14,jhd4ajo,6,,"Wasteland, Baby",Hozier
...,...,...,...,...,...
9,jhcxfe2,6,,Mona Lisa‚Äôs and mad hatters,Elton John
14,jhdh1rm,13,,I Will Follow You Into The Dark,Death Cab for Cutie
15,jhdsh9a,5,,Brothers On a Hotel Bed,Death Cab for Cutie
40,jhclvns,193,,Linger,The Cranberries


In [53]:
df['artist_name'] = df.apply(lambda row: missing_map[row.track_name.lower().strip()] if row.artist_name=="" and row.track_name.lower().strip() in missing_map else row.artist_name, axis=1)



In [54]:
missing_artist_df = df.loc[df['artist_name']=='']
missing_artist_df


Unnamed: 0,post_id,post_score,artist_name,track_name,artist2
3,jhelqoi,7,,Similar Artists,
4,jhcq7oz,6,,a song,
10,jhdfvg0,41,,de Bach,
13,jhd8b8j,11,,Anal Cunt,
33,jhdv74z,8,,Feathers,
...,...,...,...,...,...
37,jhde1dl,5,,Pet Sounds,
38,jhcgm9k,5,,It's such a great song!,
41,jhcim02,20,,Frisson!,
43,jhde9m1,6,,unrequited love,


In [55]:
len(missing_artist_df['track_name'])

197

In [56]:
len(missing_artist_df['track_name'] \
        .dropna() \
        .str.lower() \
        .str.strip() \
        .drop_duplicates() \
        .tolist())


191

In [57]:
class FixArtist(BaseModel):
    track: Optional[str] = Field(
        description="The name of the track",
        examples=[(1, "The Night We Met")],
    )
    artist: Optional[str] = Field(
        description="The name of the artist",
        examples=[(1, "Lord Huron")]
    )
    
FixArtist(track="The Night We Met",
          artist="Lord Huron",
         )



FixArtist(track='The Night We Met', artist='Lord Huron')

In [58]:
output_parser = PydanticOutputParser(pydantic_object=FixArtist)


In [59]:
prompt_prefix3 = """I will provide a list of well-known recordings. For each recording, you will review and provide the name of the artist most closely associated with the recording. You will provide the results in CSV format, one record per line in the following order: recording, artist. Enclose each field in double-quotes.

The input is:
{query}
"""

llm_chain3 = LLMChain(
    prompt=PromptTemplate(
        template=prompt_prefix3,
        input_variables=["query"],
    ),
    llm=chat,
)

valid = []
errors = []
missing_artist_map = {}

def missing_artists(missing_artist_df, verbose=False):
        
    slist = missing_artist_df['track_name'] \
        .dropna() \
        .str.lower() \
        .str.strip() \
        .drop_duplicates() \
        .tolist()

    slist.sort()
    n_missing = len(slist)

    while(slist):  # still artists to process
        print(datetime.now(), end=" ")

        prompt = ""
        tokens_to_date = count_tokens(prompt_prefix3)
        
        rows = 0
        for _ in range(nposts):  # add up to nposts posts to the prompt
            if slist and tokens_to_date + count_tokens(slist[0]) < 1024:
                track = f'"{slist.pop(0)}"\n'
                prompt += track
                tokens_to_date += count_tokens(track)
                rows += 1
            else:
                break
        print(f"sending {rows} rows...", end=" ")

        response = get_response(llm_chain3, prompt, verbose=True)

        if response is None:   # FAIL - retries exhausted
            print('Bailing to next chunk')
            continue

        if not response:
            print("nothing returned ... check returned dict for errors")

        lines = response.split("\n")
        rows = [line for line in csv.reader(lines)]
        print(f"received {len(lines)} lines...")
        c=0
        for row in rows:
            try:
                track, artist = row
                track = cleanup(track)
                artist = cleanup(artist)
                missing_artist_map[track]=artist
                extract = FixArtist(track=track,
                                    artist=artist
                                   )
                valid.append(extract.dict())
            except Exception as e:
                errors.append(row)
                print(e)
                print(row)
                # store in dict to update df
                c += 1
                
        print(f"{c} lines processed, total {n_missing-len(slist)}, {len(slist)} of {n_missing} remaining")
        
    return valid, errors

valid_records, error_records = missing_artists(missing_artist_df, verbose=True)

           

2023-05-29 17:13:35.211968 sending 101 rows... ""
"#41"
"10,000 days (wings pt. 2)"
"a song"
"all thru the night"
"anal cunt"
"astral weeks"
"barcarolle"
"bitchin!"
"blue spotted tail"
"blueridge mountains"
"boadicea is lovely too :)"
"book of days feels like a basic answer but the line ‚Äúthis day ends together‚Äù has really been hitting."
"born to run"
"chicken noodle soup"
"christ, what an amazing song. the shifting in the musical melody throughout it is so dreamlike."
"cornfield chase"
"corny?"
"crooked teeth"
"david gilmour"
"de bach"
"deleted"
"divenire"
"don‚Äôt worry baby"
"doomed, bedlamite, humbling river"
"electronic song"
"elton john"
"english folk song"
"everybody lies."
"everything about you should hurt, and then die."
"everytime i hear it, i am transported back to my childhood bedroom, leaning out the window, smoking cigarettes, on a freezing cold night. trying to imagine my future... life in a northern town. every time."
"exit music"
"exit. running to stand still"
"expe

In [60]:
missing_artist_map


{'recording': 'artist',
 '#41': 'Dave Matthews Band',
 '10,000 days (wings pt. 2)': 'Tool',
 'a song': 'Neil Young',
 'all thru the night': 'Cyndi Lauper',
 'anal cunt': 'Anal Cunt',
 'astral weeks': 'Van Morrison',
 'barcarolle': 'Jacques Offenbach',
 'bitchin!': 'The Donnas',
 'blue spotted tail': 'Fleet Foxes',
 'blueridge mountains': 'Fleet Foxes',
 'boadicea is lovely too :)': 'Enya',
 'book of days feels like a basic answer but the line ‚Äúthis day ends together‚Äù has really been hitting.': 'Enya',
 'born to run': 'Bruce Springsteen',
 'chicken noodle soup': 'Webstar & Young B ft. The Voice of Harlem',
 'christ, what an amazing song. the shifting in the musical melody throughout it is so dreamlike.': 'Radiohead',
 'cornfield chase': 'Hans Zimmer',
 'corny?': 'The Flaming Lips',
 'crooked teeth': 'Death Cab for Cutie',
 'david gilmour': 'David Gilmour',
 'de bach': 'Johann Sebastian Bach',
 'deleted': 'Deleted',
 'divenire': 'Ludovico Einaudi',
 'don‚Äôt worry baby': 'The Beach B

In [61]:
pd.DataFrame.from_dict(valid_records)

Unnamed: 0,track,artist
0,recording,artist
1,#41,Dave Matthews Band
2,"10,000 days (wings pt. 2)",Tool
3,a song,Neil Young
4,all thru the night,Cyndi Lauper
...,...,...
135,words,Bee Gees
136,work song,Nina Simone
137,yes and also blood of eden.,Peter Gabriel
138,you make loving fun,Fleetwood Mac


In [62]:
df['artist2'] = df.apply(lambda row: missing_artist_map[row.track.lower().strip()] if row.artist_name=="" and row.track_name.lower().strip() in missing_map else row.artist_name, axis=1)
df.loc[df['artist_name'] != df['artist2']]


Unnamed: 0,post_id,post_score,artist_name,track_name,artist2
10,jhcvoc3,8,,The Disappearance of Haruhi Suzumiya,


In [64]:
df

Unnamed: 0,post_id,post_score,artist_name,track_name,artist2
0,jhcoodr,6,Adeem,White Trash Revelry,Adeem
1,jhduign,9,Elton John,Your Song,Elton John
2,jhduign,9,Elton John,Tiny Dancer,Elton John
3,jhelqoi,7,,Similar Artists,
4,jhcq7oz,6,,a song,
...,...,...,...,...,...
0,jhcdzol,218,Classical,metal,Classical
1,jhc8bs4,9,Ludovico Einaudi,Nuvole Bianche,Ludovico Einaudi
2,jhcj46g,5,Ludovico Einaudi,Nuvole Bianche,Ludovico Einaudi
5,jhd24d2,30,The Weakerthans,My Best Old Ex-Friend Ray,The Weakerthans
