In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical


In [2]:
df = pd.read_csv("SongDataset.csv")
df.head()


Unnamed: 0,songid,song_name,lyric,artist
0,1,Tum Hi Ho,Hum Tere Bin Ab Reh Nehi Sakte\nTere Bina Kya ...,Arijit Singh
1,2,Chaiyya Chaiyya,Chal Chaiyya chaiyya chaiyya chaiyya\nChal cha...,Sukhwinder Singh
2,3,Kal Ho Naa Ho,Har ghadi badal rahi hai roop zindagi\nChaav h...,Sonu Nigam
3,4,Tujh Mein Rab Dikhta Hai,tu hi toh jannat meri\ntu hi mera junoon\ntu h...,Roop Kumar Rathod
4,5,Kabira,Haan re..\nKaisi teri khudgarzi\nNa dhoop chun...,Arijit Singh


In [3]:
df.isnull().sum()


songid       0
song_name    0
lyric        0
artist       0
dtype: int64

In [4]:
df = df.dropna()


In [5]:
def clean_lyrics(text):
    text = text.lower()
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'[^a-z ]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

df['clean_lyric'] = df['lyric'].apply(clean_lyrics)
df[['lyric', 'clean_lyric']].head()


Unnamed: 0,lyric,clean_lyric
0,Hum Tere Bin Ab Reh Nehi Sakte\nTere Bina Kya ...,hum tere bin ab reh nehi sakte tere bina kya w...
1,Chal Chaiyya chaiyya chaiyya chaiyya\nChal cha...,chal chaiyya chaiyya chaiyya chaiyya chal chai...
2,Har ghadi badal rahi hai roop zindagi\nChaav h...,har ghadi badal rahi hai roop zindagi chaav ha...
3,tu hi toh jannat meri\ntu hi mera junoon\ntu h...,tu hi toh jannat meri tu hi mera junoon tu hi ...
4,Haan re..\nKaisi teri khudgarzi\nNa dhoop chun...,haan re kaisi teri khudgarzi na dhoop chuney n...


In [6]:
df['label'] = df['song_name'] + " - " + df['artist']


In [7]:
#Encode Labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['label'])
y = to_categorical(y)


In [8]:
#TF-IDF Vectorization
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words='english'
)

X = tfidf.fit_transform(df['clean_lyric']).toarray()


In [9]:
#Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [10]:
#Build Neural Network Model (TensorFlow)
model = Sequential()

model.add(Dense(512, activation='relu', input_shape=(X.shape[1],)))
model.add(Dropout(0.3))

model.add(Dense(256, activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(y.shape[1], activation='softmax'))

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [11]:
#Train the Model
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_test, y_test)
)


Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.0000e+00 - loss: 2.3163 - val_accuracy: 0.0000e+00 - val_loss: 2.3460
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 119ms/step - accuracy: 0.3750 - loss: 2.2654 - val_accuracy: 0.0000e+00 - val_loss: 2.3609
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step - accuracy: 0.6250 - loss: 2.1877 - val_accuracy: 0.0000e+00 - val_loss: 2.3775
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step - accuracy: 0.6250 - loss: 2.1773 - val_accuracy: 0.0000e+00 - val_loss: 2.3969
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step - accuracy: 0.8750 - loss: 2.0926 - val_accuracy: 0.0000e+00 - val_loss: 2.4180
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step - accuracy: 0.8750 - loss: 2.0781 - val_accuracy: 0.0000e+00 - val_loss: 2.4422
Epoch 7/10
[1m

In [12]:
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - accuracy: 0.0000e+00 - loss: 2.5557
Test Accuracy: 0.0


In [13]:
def predict_song(lyrics_snippet):
    lyrics_snippet = clean_lyrics(lyrics_snippet)
    vector = tfidf.transform([lyrics_snippet]).toarray()
    
    prediction = model.predict(vector)
    predicted_index = np.argmax(prediction)
    
    return label_encoder.inverse_transform([predicted_index])[0]


In [15]:
predict_song("rang de to mohe garuya")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step


'Gerua - Arijit Singh'