In [1]:
import pandas as pd
import re
import nltk
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords


In [2]:
df = pd.read_csv("Spotify Million Song Dataset_exported.csv")

# Keep only required columns
df = df[['artist', 'song', 'text']]
df.dropna(inplace=True)

print(df.head())


  artist                   song  \
0   ABBA  Ahe's My Kind Of Girl   
1   ABBA       Andante, Andante   
2   ABBA         As Good As New   
3   ABBA                   Bang   
4   ABBA       Bang-A-Boomerang   

                                                text  
0  Look at her face, it's a wonderful face  \nAnd...  
1  Take it easy with me, please  \nTouch me gentl...  
2  I'll never know why I had to go  \nWhy I had t...  
3  Making somebody happy is a question of give an...  
4  Making somebody happy is a question of give an...  


In [3]:
stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = text.lower()
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    words = text.split()
    words = [w for w in words if w not in stop_words]
    return " ".join(words)

df['clean_lyrics'] = df['text'].apply(preprocess)
df = df.drop_duplicates(subset='clean_lyrics')



In [4]:
tfidf = TfidfVectorizer(max_features=15000, ngram_range=(1, 2), min_df=2)

lyrics_vectors = tfidf.fit_transform(df['clean_lyrics'])


In [5]:
def predict_song(snippet, top_n=3):
    snippet = preprocess(snippet)
    snippet_vector = tfidf.transform([snippet])

    similarity = cosine_similarity(snippet_vector, lyrics_vectors)
    top_indices = similarity.argsort()[0][-top_n:][::-1]

    results = df.iloc[top_indices][['artist', 'song']]
    return results


In [6]:
query = "we were both young when i first saw you i close my eyes and the flashback starts"
predict_song(query)


Unnamed: 0,artist,song
35473,Harry Belafonte,Close Your Eyes
11586,Linda Ronstadt,Burns' Supper
49418,Queen Latifah,Close Your Eyes


In [None]:
def evaluate(sample_size=100):
    correct = 0
    sample = df.sample(sample_size)

    for _, row in sample.iterrows():
        snippet = " ".join(row['clean_lyrics'].split()[:25])
        result = predict_song(snippet, top_n=5)

        if row['song'] == result.iloc[0]['song']:
            correct += 1

    return f"Accuracy: {correct/sample_size:.2f}"

evaluate()

'Accuracy: 0.88'