In [None]:
!pip install nltk scikit-learn pandas



In [2]:
import pandas as pd

# Load the song dataset from CSV and preview the first rows.
df = pd.read_csv("spotify_songs.csv")  # adjust the filename as needed
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [4]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by the cleaning step (no-op if already present).
for corpus_name in ("stopwords", "wordnet"):
    nltk.download(corpus_name)

# Shared helpers for clean_text: a lemmatizer instance and the English
# stopword list held as a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def clean_text(text):
    """Normalize raw lyrics into a space-separated string of lemmatized tokens.

    Steps: lowercase, drop every character that is not a-z or whitespace,
    remove English stopwords, lemmatize what remains.

    Args:
        text: Raw lyrics. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned lyrics as a single string; ``""`` for missing or
        letter-free input.
    """
    # Missing values would otherwise be stringified to the literal word
    # "nan" and end up in the vocabulary as if it were a lyric.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Keep apostrophes for the moment: contraction stopwords (e.g. "don't")
    # are stored with their apostrophe and could not match once it is gone.
    text = re.sub(r"[^a-z'\s]", "", str(text).lower())

    words = []
    for token in text.split():
        # Trim quote marks wrapped around a word before the stopword check.
        token = token.strip("'")
        if not token or token in stop_words:
            continue
        # Now remove inner apostrophes so the output contains letters only,
        # and re-check, since the bare form may itself be a stopword.
        token = token.replace("'", "")
        if token not in stop_words:
            words.append(lemmatizer.lemmatize(token))
    return " ".join(words)

# Store a cleaned version of every song's lyrics in a new "clean_lyrics"
# column; the raw "text" column is left untouched.
df["clean_lyrics"] = df["text"].apply(clean_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, capped at 5000 features.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

# Learn the vocabulary from the cleaned lyrics and build the document-term
# matrix (one row per song) in a single pass.
X = vectorizer.fit_transform(df["clean_lyrics"])


In [8]:
# Names of the dataset columns, kept in one place so lookups do not repeat
# string literals.
# Song title.
SONG_COL = "song"
# Performing artist.
ARTIST_COL = "artist"
# Raw lyrics.
LYRICS_COL = "text"
# Site-relative path of the song's lyrics page.
LINK_COL = "link"


In [9]:
from sklearn.metrics.pairwise import cosine_similarity

def predict_song(lyrics_snippet):
    """Print the song whose lyrics are most similar to a lyric fragment.

    The snippet is cleaned with ``clean_text``, transformed with the fitted
    ``vectorizer`` and compared against every row of ``X`` by cosine
    similarity. The best-scoring row of ``df`` is reported.

    Args:
        lyrics_snippet: Free-text fragment of lyrics to look up.

    Returns:
        None. The match (or a "no match" notice) is printed.
    """
    snippet = clean_text(lyrics_snippet)
    snippet_vec = vectorizer.transform([snippet])

    # One similarity score per row of X, in a single-row array.
    similarity = cosine_similarity(snippet_vec, X)
    index = similarity.argmax()

    # If the snippet shares no vocabulary with the corpus every score is 0
    # and argmax falls back to row 0, which would report an unrelated song.
    if similarity[0, index] <= 0:
        print(" No matching song found.")
        return

    # Look the row up once instead of once per field.
    match = df.iloc[index]

    print(" Song name:", match[SONG_COL])
    print(" Artist:", match[ARTIST_COL])
    # NOTE(review): the base URL is hard-coded; confirm it is the site the
    # dataset's relative links actually belong to.
    print(" Song link:", "https://www.lyrics.com" + match[LINK_COL])


In [10]:
# Try the lookup with a short lyric fragment.
predict_song("hello darkness my old friend")


 Song name: Hello My Old Friend
 Artist: Electric Light Orchestra
 Song link: https://www.lyrics.com/e/electric+light+orchestra/hello+my+old+friend_20045536.html
