In [35]:
import requests
import pandas as pd
from urllib.parse import urlparse
from dotenv import load_dotenv
import os
import re

# Load variables from a local .env file (if one is found) into the process
# environment; BONSAI_URL is read from the environment in the next cell.
load_dotenv()

True

In [13]:
# Get and parse the URL
# BONSAI_URL is expected to look like scheme://user:password@host[:port].
bonsai_url = os.getenv("BONSAI_URL")
if not bonsai_url:
    # Fail fast: urlparse(None) does not raise, it would silently produce a
    # bogus base_url and every later request would fail confusingly.
    raise RuntimeError("BONSAI_URL is not set; define it in .env or the environment")

parsed = urlparse(bonsai_url)
if not parsed.scheme or not parsed.hostname:
    # The URL itself is not echoed because it carries credentials.
    raise ValueError("BONSAI_URL must include a scheme and a hostname")

# Credentials are passed separately to requests (HTTP basic auth) so they are
# never embedded in the request URLs built from base_url.
auth = (parsed.username, parsed.password)

# Keep an explicit port if the URL has one; hostname alone would drop it.
base_url = f"{parsed.scheme}://{parsed.hostname}"
if parsed.port is not None:
    base_url = f"{base_url}:{parsed.port}"

In [63]:
# Load your GPT-enriched lyrics
# Read from a path relative to the notebook's working directory. Later cells
# read the "gpt_summary" column of this frame and post every column of each
# row to the search index.
lyrics_df = pd.read_csv("../data/lyrics_gpt_enriched.csv")

In [75]:
# Pick your index name
# Every lyrics request below builds its URL from `lyrics_index`.
# The commented-out line is presumably the earlier index name, kept for
# reference — confirm before deleting it.
#index_name = "bts_lyrics"
lyrics_index = "bts_lyrics_fixed"

In [76]:
# Explicit field mapping for the lyrics index.
mapping = {
    "mappings": {
        "properties": {
            "track": {"type": "keyword"},
            "lyric_excerpt": {"type": "text"},
            "gpt_summary": {"type": "text"},
            # keyword (not text) so the tags can be used in aggregations
            "gpt_emotion_tags": {"type": "keyword"}
        }
    }
}

# Create the index WITH the mapping. The mapping must be sent as the request
# body; a bare PUT creates the index without it and the field types are then
# guessed dynamically when the first documents arrive.
r = requests.put(f"{base_url}/{lyrics_index}", json=mapping, auth=auth)
print(f"Index creation status: {r.status_code}")
if not r.ok:
    # Typically a 400 "resource_already_exists_exception" when the index is
    # left over from an earlier run — in that case the mapping above was NOT
    # applied. Show the server's explanation instead of hiding it.
    print(r.text)

Index creation status: 200


In [77]:
# Every lyric is posted to the same auto-ID document endpoint, so the URL is
# assembled once up front rather than on each iteration.
lyrics_doc_url = f"{base_url}/{lyrics_index}/_doc"

# One POST per DataFrame row; the row's columns become the document fields.
# The printed counter is the row's index label plus one.
for i, row in lyrics_df.iterrows():
    doc = row.to_dict()
    r = requests.post(lyrics_doc_url, json=doc, auth=auth)
    print(f"Indexed lyric {i+1} with status {r.status_code}")


Indexed lyric 1 with status 201
Indexed lyric 2 with status 201
Indexed lyric 3 with status 201
Indexed lyric 4 with status 201
Indexed lyric 5 with status 201
Indexed lyric 6 with status 201


In [78]:
# Spot check: fetch up to three documents from the lyrics index and print the
# emotion tags stored on each one.
r = requests.get(f"{base_url}/{lyrics_index}/_search?size=3", auth=auth)
search_payload = r.json()
for hit in search_payload["hits"]["hits"]:
    stored_fields = hit["_source"]
    print(stored_fields["gpt_emotion_tags"])


['Apology', 'Regret', 'Fraternal Bond']
['Encouragement', 'Unity', 'Freedom']
['Longing', 'Uncertainty', 'Loneliness']


In [79]:
# Run a search with no query body against the lyrics index and print the whole
# decoded JSON response (shard info, hit count, and every returned document).
# NOTE(review): this dumps the full payload into the cell output; consider
# printing only the fields of interest before sharing the notebook.
r = requests.get(f"{base_url}/{lyrics_index}/_search", auth=auth)
print(r.json())

{'took': 0, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 6, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': 'bts_lyrics_fixed', '_type': '_doc', '_id': '1wUVq5cBTo8i77nQhpd9', '_score': 1.0, '_source': {'album': 'Wings', 'track': 'Stigma', 'lyric_excerpt': 'I’m sorry, I’m sorry, my brother', 'timestamp': '2016-10-10', 'gpt_summary': 'Keywords: Apology, Regret, Fraternal bond\nSummary: The emotion in this lyric is filled with deep regret and a sincere apology to a beloved brother, highlighting the strong bond between them.', 'gpt_emotion_tags': ['Apology', 'Regret', 'Fraternal Bond']}}, {'_index': 'bts_lyrics_fixed', '_type': '_doc', '_id': '2AUVq5cBTo8i77nQiJfy', '_score': 1.0, '_source': {'album': 'Map of the Soul: 7', 'track': 'Black Swan', 'lyric_excerpt': 'Do your thing, do your thing with me now', 'timestamp': '2020-02-21', 'gpt_summary': 'Keywords: encouragement, unity, freedom\nSummary: The lyric co

In [82]:
# Step: Extract emotion tags from GPT summaries
def extract_emotion_tags(summary):
    """Return the title-cased tags from the first "Keywords:" line of a summary.

    Parameters
    ----------
    summary : Any
        The summary text. Anything that is not a ``str`` (e.g. NaN from a
        missing CSV cell, or None) yields an empty list.

    Returns
    -------
    list[str]
        The comma-separated entries that follow the first colon on the first
        line whose stripped, lower-cased text starts with "keywords:". Each
        entry is stripped and title-cased; blank entries are dropped. Returns
        an empty list when no such line exists.
    """
    if isinstance(summary, str):
        keyword_lines = (
            candidate
            for candidate in summary.splitlines()
            if candidate.strip().lower().startswith("keywords:")
        )
        first_match = next(keyword_lines, None)
        if first_match is not None:
            # Only the first colon separates the label from the tag list.
            _, _, raw_tags = first_match.partition(":")
            pieces = (piece.strip() for piece in raw_tags.split(","))
            return [piece.title() for piece in pieces if piece]
    return []


# Debug helper (left disabled): print the raw repr of the first few summaries.
#for i, val in lyrics_df["gpt_summary"].head(5).items():
    #print(f"{i}: {repr(val)}")


# Add a "gpt_emotion_tags" column: for each row, the list returned by
# extract_emotion_tags for that row's "gpt_summary". This writes into
# lyrics_df in place.
# NOTE(review): the indexing loop earlier in this notebook posts every column
# of lyrics_df. If the tags are meant to come from this cell, it has to run
# before that loop; in top-to-bottom order it runs after it — confirm and
# reorder if so.
lyrics_df["gpt_emotion_tags"] = lyrics_df["gpt_summary"].apply(extract_emotion_tags)
# Disabled preview of the new column next to its source text.
#lyrics_df[["gpt_summary", "gpt_emotion_tags"]].head()


In [58]:
# Load fan comments data
comments_df = pd.read_csv("../data/fan_comments_gpt_enriched.csv")

# Delete old index if needed
requests.delete(f"{base_url}/{comments_index}", auth=auth)

# Recreate it with the correct mapping
mapping = {
    "mappings": {
        "properties": {
            "fan_comment": {"type": "text"},
            "track": {"type": "keyword"},
            "timestamp": {"type": "date"},
            "source": {"type": "keyword"},
            "gpt_sentiment": {"type": "text"},
            "gpt_sentiment_label": {"type": "keyword"},
            "gpt_sentiment_explanation": {"type": "text"}
        }
    }
}
requests.put(f"{base_url}/{comments_index}", json=mapping, auth=auth)


# Create the new index in Bonsai
comments_index_fixed = "bts_fan_comments_fixed"
r = requests.put(f"{base_url}/{comments_index_fixed}", json=mapping, auth=auth)
print(f"Index creation response: {r.status_code} — {r.text}")


# Set index name
#comments_index = "bts_fan_comments"

# Optional: create index
#r = requests.put(f"{base_url}/{comments_index}", auth=auth)
#print(f"Fan comments index creation status: {r.status_code}")


Index creation response: 400 — {"error":{"root_cause":[{"type":"resource_already_exists_exception","reason":"index [bts_fan_comments_fixed/UArSQe6QQWqbDCtaM57gag] already exists","index_uuid":"UArSQe6QQWqbDCtaM57gag","index":"bts_fan_comments_fixed"}],"type":"resource_already_exists_exception","reason":"index [bts_fan_comments_fixed/UArSQe6QQWqbDCtaM57gag] already exists","index_uuid":"UArSQe6QQWqbDCtaM57gag","index":"bts_fan_comments_fixed"},"status":400}


In [59]:
def split_sentiment(output):
    """Split a "Label - explanation" / "Label: explanation" string in two.

    Parameters
    ----------
    output : Any
        The raw sentiment text. It is converted with ``str()`` before
        matching, so non-string values (e.g. NaN) are accepted.

    Returns
    -------
    pandas.Series
        Two elements: ``[label, explanation]``. The label is stripped and
        title-cased, the explanation is stripped. When no separator is found
        the result is ``["Unknown", output]`` with ``output`` passed through
        unchanged.

    Notes
    -----
    Two patterns are tried in order:

    1. A colon, or a hyphen/en dash/em dash with whitespace on both sides.
       Hyphens inside the label are allowed, so "Self-Love - ..." keeps the
       label "Self-Love" instead of being cut at the first hyphen.
    2. The original lenient pattern (any colon/hyphen/dash, spaces optional),
       so inputs such as "Joy-explanation" still split as before.

    Both patterns use DOTALL so an explanation that spans several lines is
    kept whole rather than truncated at the first newline.
    """
    text = str(output)

    # Preferred form: ":" or a whitespace-delimited dash after the label.
    strict = r"^\s*([\w\s\-]+?)\s*(?::|\s[\-–—]\s)\s*(.+)"
    # Fallback: the original, more permissive separator rule.
    lenient = r"^\s*([\w\s]+)\s*[:\-–—]\s*(.+)"

    match = re.match(strict, text, flags=re.DOTALL) or re.match(
        lenient, text, flags=re.DOTALL
    )
    if match:
        label = match.group(1).strip().title()  # e.g., "Inspiration"
        explanation = match.group(2).strip()
        return pd.Series([label, explanation])
    return pd.Series(["Unknown", output])

# Apply to fan comment GPT output
# split_sentiment returns a two-element Series per row, so .apply() yields a
# two-column frame that is assigned to the two new columns in one step.
# This writes into comments_df in place.
comments_df[["gpt_sentiment_label", "gpt_sentiment_explanation"]] = (
    comments_df["gpt_sentiment"].apply(split_sentiment)
)

# Confirm: show the raw text next to the extracted label for the first rows.
print(comments_df[["gpt_sentiment", "gpt_sentiment_label"]].head())
# Disabled alternative: the same preview as a bare expression (rich display).
# comments_df[["gpt_sentiment", "gpt_sentiment_label"]].head()

                                       gpt_sentiment gpt_sentiment_label
0  Sadness - The comment indicates that the perso...             Sadness
1  Inspiration - The comment suggests that the co...         Inspiration
2  Inspiration - The comment expresses a feeling ...         Inspiration
3  Inspiration - The comment expresses a renewed ...         Inspiration
4  Inspiration - The comment suggests that the ly...         Inspiration


In [60]:
# Post every fan comment to the comments index, one document per row.
# The response is checked so a rejected document (e.g. a value that does not
# fit the index mapping) is reported instead of being silently dropped.
for i, row in comments_df.iterrows():
    doc = row.to_dict()
    print(f"Posting doc {i+1}: label = {doc['gpt_sentiment_label']}")
    r = requests.post(f"{base_url}/{comments_index_fixed}/_doc", json=doc, auth=auth)
    if not r.ok:
        # Keep going (best effort), but surface the server's reason.
        print(f"  -> failed with status {r.status_code}: {r.text}")



Posting doc 1: label = Sadness
Posting doc 2: label = Inspiration
Posting doc 3: label = Inspiration
Posting doc 4: label = Inspiration
Posting doc 5: label = Inspiration
Posting doc 6: label = Joy


In [61]:
# Check if the comments are in.
# Query the index the documents were actually posted to
# (comments_index_fixed); the previous code searched `comments_index`, a
# different variable that is not assigned anywhere in the active code.
r = requests.get(f"{base_url}/{comments_index_fixed}/_search", auth=auth)
# Turn an HTTP error (e.g. missing index) into a clear exception rather than
# a KeyError on the missing "hits" key below.
r.raise_for_status()
hits = r.json()["hits"]["hits"]
# A search without an explicit size returns at most the server's default page
# of hits, so this count is capped at that page size.
print(f"Retrieved {len(hits)} documents")
for hit in hits:
    print(hit["_source"]["gpt_sentiment_label"])


Retrieved 6 documents
Sadness
Inspiration
Inspiration
Inspiration
Inspiration
Joy
