Imports & base paths 

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse
import joblib
from sklearn.metrics.pairwise import cosine_similarity

# Optional but nice in notebooks
from IPython.display import display
import json

# Project locations. BASE_DIR is the resolved parent of the current working
# directory; the data and model folders are derived from it.
BASE_DIR = Path("..").resolve()
DATA_CLEAN = BASE_DIR.joinpath("data", "cleaned")
MODELS_DIR = BASE_DIR.joinpath("models")

print("BASE_DIR:      ", BASE_DIR)
for _label, _folder in (
    ("DATA_CLEAN:    ", DATA_CLEAN),
    ("MODELS_DIR:    ", MODELS_DIR),
):
    # Report each folder together with whether it is present on disk.
    print(_label, _folder, "exists:", _folder.exists())


BASE_DIR:       D:\Sheridan\First semester\Python\LyriSense
DATA_CLEAN:     D:\Sheridan\First semester\Python\LyriSense\data\cleaned exists: True
MODELS_DIR:     D:\Sheridan\First semester\Python\LyriSense\models exists: True


Load cleaned data + models

In [2]:
# Read the three cleaned CSV tables from DATA_CLEAN in one pass.
emotion_df, songs_df, songs_with_pred = (
    pd.read_csv(DATA_CLEAN / csv_name)
    for csv_name in (
        "emotion_clean.csv",
        "songs_clean.csv",
        "songs_with_predicted_emotions.csv",
    )
)

# Fitted artifacts, named consistently with the other notebooks.
# NOTE(review): joblib files are pickle-based; only load artifacts you trust.
tfidf_emotion = joblib.load(MODELS_DIR.joinpath("tfidf_emotion.joblib"))
logreg_emotion = joblib.load(MODELS_DIR.joinpath("logreg_emotion.joblib"))

# Last expression: (rows, cols) of each table, shown as the cell output.
tuple(frame.shape for frame in (emotion_df, songs_df, songs_with_pred))


((416796, 2), (611, 6), (611, 9))

Emotion mappings

In [3]:
# Emotion names listed in id order: the position in the tuple is the id.
EMOTION_ID_TO_NAME = dict(
    enumerate(("sadness", "joy", "love", "anger", "fear", "surprise"))
)

# Reverse lookup (name -> id), derived from the mapping above.
EMOTION_NAME_TO_ID = {
    name: emotion_id for emotion_id, name in EMOTION_ID_TO_NAME.items()
}

EMOTION_ID_TO_NAME


{0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}

Helper: predict emotion from text

In [4]:
def predict_emotion(text: str):
    """
    Predict the dominant emotion of a raw user text.

    The text is vectorized with ``tfidf_emotion`` and classified with
    ``logreg_emotion`` (the same models used in the modeling notebook).

    Args:
        text: Raw, non-empty user text.

    Returns:
        dict with:
            - "emotion_id": predicted emotion id (int)
            - "emotion": emotion name looked up in EMOTION_ID_TO_NAME
            - "confidence": probability of the predicted class (float)
            - "vector": TF-IDF vector of the text (for similarity search)

    Raises:
        ValueError: If ``text`` is not a string or is empty / whitespace only.
    """
    if not isinstance(text, str) or text.strip() == "":
        raise ValueError("Text must be a non-empty string.")

    X = tfidf_emotion.transform([text])
    probs = logreg_emotion.predict_proba(X)[0]

    # predict_proba columns are ordered like logreg_emotion.classes_, so the
    # argmax is a column position, not necessarily the label itself. Map it
    # through classes_ so the id stays correct even if the training labels
    # were not exactly 0..N-1.
    # NOTE(review): assumes the classifier was trained on integer emotion ids.
    best_col = int(np.argmax(probs))
    emotion_id = int(logreg_emotion.classes_[best_col])
    confidence = float(probs[best_col])
    emotion_name = EMOTION_ID_TO_NAME[emotion_id]

    return {
        "emotion_id": emotion_id,
        "emotion": emotion_name,
        "confidence": confidence,
        "vector": X,
    }


Quick sanity check: emotion predictions

In [5]:
# One short sentence per emotion we expect the classifier to recognise.
test_snippets = [
    "i feel so happy and grateful today",
    "i am terrified something bad will happen",
    "i am so angry about how they treated me",
    "i feel hopeless and exhausted",
]

for snippet in test_snippets:
    outcome = predict_emotion(snippet)
    label = outcome["emotion"]
    label_id = outcome["emotion_id"]
    conf = outcome["confidence"]
    # One line per snippet: text, predicted label, id and confidence.
    print(f"{snippet!r} -> {label} (id={label_id}, conf={conf:.3f})")


'i feel so happy and grateful today' -> joy (id=1, conf=0.959)
'i am terrified something bad will happen' -> fear (id=4, conf=0.983)
'i am so angry about how they treated me' -> anger (id=3, conf=0.978)
'i feel hopeless and exhausted' -> sadness (id=0, conf=1.000)


Build and save the song TF-IDF matrix

In [6]:
# Vectorize the lyrics of songs_with_pred (which already has predicted
# emotions) with the same TF-IDF model used for user text, so user queries
# and songs live in the same feature space. Missing lyrics become "".
song_texts = songs_with_pred["clean_lyrics"].fillna("")
song_vectors = tfidf_emotion.transform(song_texts)

# The recommenders index song_vectors and songs_with_pred by the same row
# positions, so the two must stay aligned.
assert song_vectors.shape[0] == len(songs_with_pred), (
    "song_vectors and songs_with_pred are out of sync"
)

# Save as a sparse matrix so the API can load it quickly
sparse.save_npz(MODELS_DIR / "song_tfidf_matrix.npz", song_vectors)

# Last expression so the (n_songs, n_features) shape is actually displayed.
song_vectors.shape


Main function: recommend from free text

In [7]:
def recommend_songs_from_text(
    user_text: str,
    top_k: int = 5,
    same_emotion_only: bool = True,
):
    """
    Recommend songs for a given user_text.

    Steps:
        1) Detect dominant emotion from user_text.
        2) Filter songs by that emotion (if same_emotion_only=True).
        3) Rank candidate songs by cosine similarity in TF-IDF space.

    Args:
        user_text: Free-form user mood text.
        top_k: Maximum number of songs to return. Must be non-negative.
        same_emotion_only: If True, restrict recommendations to songs whose
                           predicted emotion matches the user emotion.

    Returns:
        dict with:
            - user_emotion_id
            - user_emotion
            - user_confidence
            - recommendations: list of dicts with
              ["title", "artist", "album", "song_emotion",
               "song_emotion_conf", "similarity"].
              Empty when no candidate song exists for the detected emotion.

    Raises:
        ValueError: If top_k is negative, or if user_text is rejected by
                    predict_emotion (empty or not a string).
    """
    # A negative top_k would silently slice "all but the last k" below.
    # None is tolerated because slicing with None returns every candidate.
    if top_k is not None and top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    pred = predict_emotion(user_text)
    user_vec = pred["vector"]
    user_emotion_id = pred["emotion_id"]

    # Choose candidate songs
    if same_emotion_only:
        mask = songs_with_pred["pred_emotion_id"] == user_emotion_id
        idx = np.where(mask.values)[0]
        filtered_vectors = song_vectors[idx]
        filtered_songs = songs_with_pred.iloc[idx].reset_index(drop=True)
    else:
        filtered_vectors = song_vectors
        filtered_songs = songs_with_pred.reset_index(drop=True)

    recs = []
    # cosine_similarity rejects a matrix with zero rows, so an emotion with
    # no matching songs yields an empty recommendation list instead.
    if filtered_vectors.shape[0] > 0:
        # Cosine similarity between user vector and candidate songs
        sims = cosine_similarity(user_vec, filtered_vectors)[0]
        # Stable sort keeps tied songs in dataframe order, so results are
        # reproducible across runs.
        top_idx = np.argsort(-sims, kind="stable")[:top_k]

        for i in top_idx:
            row = filtered_songs.iloc[i]
            album = row.get("album", None)
            song_emotion = row.get("pred_emotion", None)
            recs.append({
                "title": row["title"],
                "artist": row["artist"],
                # Missing CSV cells are NaN, which json.dumps would emit as
                # the invalid token NaN; expose them as None (null) instead.
                "album": None if pd.isna(album) else album,
                "song_emotion": None if pd.isna(song_emotion) else song_emotion,
                "song_emotion_conf": float(row.get("pred_emotion_conf", 0.0)),
                "similarity": float(sims[i]),
            })

    return {
        "user_emotion_id": user_emotion_id,
        "user_emotion": pred["emotion"],
        "user_confidence": pred["confidence"],
        "recommendations": recs,
    }


Example: single query

In [8]:
# Run one free-text query through the full pipeline.
query = "I feel so anxious and scared about my future"
result = recommend_songs_from_text(query, top_k=5)

# Headline: detected emotion and the classifier's confidence in it.
detected_emotion = result["user_emotion"]
confidence_note = f"(conf={result['user_confidence']:.3f})"
print("Detected emotion:", detected_emotion, confidence_note)

# Last expression: the recommendations rendered as a table.
pd.DataFrame(result["recommendations"])


Detected emotion: fear (conf=0.996)


Unnamed: 0,title,artist,album,song_emotion,song_emotion_conf,similarity
0,Wild as Her,Corey Kent,Wild as Her,fear,0.621407,0.045985
1,10:35,Tiësto,10:35,fear,0.384485,0.03254
2,Renegade,JAY-Z,Curtain Call: The Hits (Deluxe Edition),fear,0.512791,0.028496
3,Fast Car,Luke Combs,Gettin' Old,fear,0.237228,0.024697
4,Not Afraid,Eminem,Recovery,fear,0.346295,0.021771


Test several typical user texts

In [9]:
# Typical user inputs; the trailing comment is the emotion we hope to detect.
test_texts = [
    "I feel so happy and grateful today",  # expected: joy
    "I am terrified something bad will happen",  # expected: fear
    "I am so angry about how they treated me",  # expected: anger
    "I feel hopeless and exhausted",  # expected: sadness
    "I am deeply in love and feel so connected",  # expected: love
    "I can't believe this just happened!",  # expected: surprise-ish
]

divider = "=" * 80
for text in test_texts:
    res = recommend_songs_from_text(text, top_k=3, same_emotion_only=True)

    # Separator, the input, then what the classifier detected.
    print(divider)
    print("USER TEXT:", text)
    emotion, emotion_id, conf = (
        res["user_emotion"],
        res["user_emotion_id"],
        res["user_confidence"],
    )
    print(f"Detected emotion: {emotion} (id={emotion_id}, conf={conf:.3f})")

    # Top recommendations for this text as a rich table.
    display(pd.DataFrame(res["recommendations"]))


USER TEXT: I feel so happy and grateful today
Detected emotion: joy (id=1, conf=0.959)


Unnamed: 0,title,artist,album,song_emotion,song_emotion_conf,similarity
0,Happy Together,The Turtles,Happy Together,joy,0.499857,0.232352
1,Ain't Nobody (Loves Me Better),Felix Jaehn,Ain't Nobody (Loves Me Better),joy,0.399015,0.04089
2,Never Forget You,MNEK,Never Forget You,joy,0.290802,0.023491


USER TEXT: I am terrified something bad will happen
Detected emotion: fear (id=4, conf=0.983)


Unnamed: 0,title,artist,album,song_emotion,song_emotion_conf,similarity
0,Middle Ground,Maroon 5,Middle Ground,fear,0.444407,0.028509
1,For What It's Worth,Buffalo Springfield,Buffalo Springfield,fear,0.295593,0.009408
2,The Nights,Avicii,The Nights,fear,0.262736,0.008755


USER TEXT: I am so angry about how they treated me
Detected emotion: anger (id=3, conf=0.978)


Unnamed: 0,title,artist,album,song_emotion,song_emotion_conf,similarity
0,Forgot About Dre,Dr. Dre,2001,anger,0.324911,0.072806
1,They Don't Care About Us,Michael Jackson,"HIStory - PAST, PRESENT AND FUTURE - BOOK I",anger,0.492296,0.069062
2,WAIT FOR U (feat. Drake & Tems),Future,I NEVER LIKED YOU,anger,0.287765,0.065108


USER TEXT: I feel hopeless and exhausted
Detected emotion: sadness (id=0, conf=1.000)


Unnamed: 0,title,artist,album,song_emotion,song_emotion_conf,similarity
0,Rock and A Hard Place,Bailey Zimmerman,Rock and A Hard Place,sadness,0.47182,0.042307
1,Last Last,Burna Boy,Last Last,sadness,0.284963,0.021289
2,How I'm Feeling Now,Lewis Capaldi,How I'm Feeling Now,sadness,0.324487,0.01448


USER TEXT: I am deeply in love and feel so connected
Detected emotion: joy (id=1, conf=0.405)


Unnamed: 0,title,artist,album,song_emotion,song_emotion_conf,similarity
0,"If You Ever (feat. Gabrielle) - Smoove Mix 7""",East 17,The Very Best Of East 17,joy,0.34325,0.141797
1,Peaches,Jack Black,The Super Mario Bros. Movie (Original Motion P...,joy,0.375416,0.084983
2,Real Love,Clean Bandit,Real Love,joy,0.336016,0.081794


USER TEXT: I can't believe this just happened!
Detected emotion: sadness (id=0, conf=0.360)


Unnamed: 0,title,artist,album,song_emotion,song_emotion_conf,similarity
0,Break a Broken Heart,Andrew Lambrou,Break a Broken Heart,sadness,0.320447,0.04538
1,Lost,Frank Ocean,channel ORANGE,sadness,0.927415,0.041498
2,Virtual Insanity - Remastered,Jamiroquai,Travelling Without Moving (Remastered),sadness,0.287919,0.041328


Helper: recommend directly by emotion (no free text)

In [10]:
def recommend_songs_by_emotion(
    emotion: int | str,
    top_k: int = 5,
):
    """
    Recommend songs given an emotion id or name (no free text).

    Songs whose predicted emotion matches are ranked by cosine similarity to
    the centroid (mean TF-IDF vector) of all songs with that emotion.

    Args:
        emotion: Emotion id (0-5) or name ("joy", "sadness", etc.).
                 Names are matched case-insensitively, ignoring outer spaces.
        top_k: Maximum number of songs to return. Must be non-negative.

    Returns:
        dict with:
            - emotion_id
            - emotion
            - recommendations: list of dicts as in recommend_songs_from_text.

    Raises:
        ValueError: If the emotion name/id is unknown, if no song carries
                    that emotion, or if top_k is negative.
    """
    # A negative top_k would silently slice "all but the last k" below.
    # None is tolerated because slicing with None returns every candidate.
    if top_k is not None and top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    # Convert emotion to a valid id
    if isinstance(emotion, str):
        emotion = emotion.lower().strip()
        if emotion not in EMOTION_NAME_TO_ID:
            raise ValueError(f"Unknown emotion name: {emotion}")
        emotion_id = EMOTION_NAME_TO_ID[emotion]
    else:
        emotion_id = int(emotion)
        if emotion_id not in EMOTION_ID_TO_NAME:
            raise ValueError(f"Unknown emotion id: {emotion_id}")

    # Filter songs with this emotion
    mask = songs_with_pred["pred_emotion_id"] == emotion_id
    idx = np.where(mask.values)[0]
    if len(idx) == 0:
        raise ValueError(f"No songs found with emotion_id={emotion_id}")

    filtered_vectors = song_vectors[idx]
    filtered_songs = songs_with_pred.iloc[idx].reset_index(drop=True)

    # Use centroid of all songs with this emotion as the "query".
    # The mean of a scipy sparse matrix is a np.matrix, which recent
    # scikit-learn versions reject with a TypeError; convert it to a plain
    # 2-D ndarray of shape (1, n_features) before computing similarities.
    centroid_vec = np.asarray(filtered_vectors.mean(axis=0)).reshape(1, -1)
    sims = cosine_similarity(centroid_vec, filtered_vectors)[0]
    # Stable sort keeps tied songs in dataframe order, so results are
    # reproducible across runs.
    top_idx = np.argsort(-sims, kind="stable")[:top_k]

    recs = []
    for i in top_idx:
        row = filtered_songs.iloc[i]
        album = row.get("album", None)
        song_emotion = row.get("pred_emotion", None)
        recs.append({
            "title": row["title"],
            "artist": row["artist"],
            # Missing CSV cells are NaN, which json.dumps would emit as the
            # invalid token NaN; expose them as None (null) instead.
            "album": None if pd.isna(album) else album,
            "song_emotion": None if pd.isna(song_emotion) else song_emotion,
            "song_emotion_conf": float(row.get("pred_emotion_conf", 0.0)),
            "similarity": float(sims[i]),
        })

    return {
        "emotion_id": emotion_id,
        "emotion": EMOTION_ID_TO_NAME[emotion_id],
        "recommendations": recs,
    }


In [11]:
# Top five songs closest to the centroid of the "fear" cluster, as a table.
fear_recommendations = recommend_songs_by_emotion("fear", top_k=5)["recommendations"]
display(pd.DataFrame(fear_recommendations))



Unnamed: 0,title,artist,album,song_emotion,song_emotion_conf,similarity
0,Not Afraid,Eminem,Recovery,fear,0.346295,0.581618
1,Renegade,JAY-Z,Curtain Call: The Hits (Deluxe Edition),fear,0.512791,0.498307
2,Riptide,Vance Joy,Dream Your Life Away,fear,0.363809,0.493406
3,Afraid To Feel,LF SYSTEM,Afraid To Feel,fear,0.409444,0.468393
4,TRUSTFALL,P!nk,TRUSTFALL,fear,0.284515,0.465349


Edge-case tests

In [12]:
# 1) Empty / invalid input
# Only ValueError is the documented failure for empty text. Catching a broad
# Exception would also report unrelated bugs (e.g. a NameError) as "expected".
try:
    recommend_songs_from_text("", top_k=3)
except ValueError as e:
    print("Empty text error (expected):", e)

# 2) Very long text
long_text = "I am sad " * 500
res_long = recommend_songs_from_text(long_text, top_k=3)
print("\nLong text detected emotion:", res_long["user_emotion"])
display(pd.DataFrame(res_long["recommendations"]))

# 3) Allow cross-emotion recommendations
cross_res = recommend_songs_from_text(
    "I feel so anxious and scared about my future",
    top_k=5,
    same_emotion_only=False,
)
print("\nCross-emotion mode (same_emotion_only=False)")
print("Detected emotion:", cross_res["user_emotion"])
display(pd.DataFrame(cross_res["recommendations"]))


Empty text error (expected): Text must be a non-empty string.

Long text detected emotion: sadness


Unnamed: 0,title,artist,album,song_emotion,song_emotion_conf,similarity
0,Soarele si Luna,Pasha Parfeni,Soarele si Luna,sadness,0.29496,0.074527
1,If It Makes You Happy,Sheryl Crow,Sheryl Crow,sadness,0.469237,0.054873
2,If We Ever Broke Up,Mae Stephens,If We Ever Broke Up,sadness,0.919973,0.039437



Cross-emotion mode (same_emotion_only=False)
Detected emotion: fear


Unnamed: 0,title,artist,album,song_emotion,song_emotion_conf,similarity
0,Drop The World,Lil Wayne,Rebirth,anger,0.24793,0.047052
1,Perfect,Ed Sheeran,÷ (Deluxe),love,0.625492,0.046893
2,Calm Down (with Selena Gomez),Rema,Rave & Roses Ultra,sadness,0.273178,0.046779
3,In My Bed - So So Def Mix,Dru Hill,Dru Hill,surprise,0.38704,0.045996
4,Wild as Her,Corey Kent,Wild as Her,fear,0.621407,0.045985


API-like JSON output example

In [13]:
# Show the payload the way an API endpoint would serialise it.
example_query = "I feel so anxious and scared about my future"
api_like_result = recommend_songs_from_text(example_query, top_k=5)

# ensure_ascii=False keeps non-ASCII characters readable in the output.
api_like_json = json.dumps(api_like_result, indent=2, ensure_ascii=False)
print(api_like_json)


{
  "user_emotion_id": 4,
  "user_emotion": "fear",
  "user_confidence": 0.9959027570490666,
  "recommendations": [
    {
      "title": "Wild as Her",
      "artist": "Corey Kent",
      "album": "Wild as Her",
      "song_emotion": "fear",
      "song_emotion_conf": 0.6214065055982034,
      "similarity": 0.045985453421076836
    },
    {
      "title": "10:35",
      "artist": "Tiësto",
      "album": "10:35",
      "song_emotion": "fear",
      "song_emotion_conf": 0.3844847813245405,
      "similarity": 0.03253972263410617
    },
    {
      "title": "Renegade",
      "artist": "JAY-Z",
      "album": "Curtain Call: The Hits (Deluxe Edition)",
      "song_emotion": "fear",
      "song_emotion_conf": 0.512790932893739,
      "similarity": 0.02849597506625931
    },
    {
      "title": "Fast Car",
      "artist": "Luke Combs",
      "album": "Gettin' Old",
      "song_emotion": "fear",
      "song_emotion_conf": 0.2372283916681233,
      "similarity": 0.024696749627646612
    },
  

Summary of artifacts used by the API

In [14]:
# Files the API needs at start-up, listed as (label, path) pairs.
api_artifacts = (
    ("TF-IDF model:      ", MODELS_DIR / "tfidf_emotion.joblib"),
    ("LogReg model:      ", MODELS_DIR / "logreg_emotion.joblib"),
    ("Song TF-IDF matrix:", MODELS_DIR / "song_tfidf_matrix.npz"),
    ("Songs + emotions:  ", DATA_CLEAN / "songs_with_predicted_emotions.csv"),
)

print("Artifacts for API:")
for artifact_label, artifact_path in api_artifacts:
    print(artifact_label, artifact_path)


Artifacts for API:
TF-IDF model:       D:\Sheridan\First semester\Python\LyriSense\models\tfidf_emotion.joblib
LogReg model:       D:\Sheridan\First semester\Python\LyriSense\models\logreg_emotion.joblib
Song TF-IDF matrix: D:\Sheridan\First semester\Python\LyriSense\models\song_tfidf_matrix.npz
Songs + emotions:   D:\Sheridan\First semester\Python\LyriSense\data\cleaned\songs_with_predicted_emotions.csv
