In [None]:
!pip install kaggle -q

In [None]:
import os
import json
import zipfile

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from wordcloud import WordCloud
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Read the Kaggle API credentials; the context manager closes the file handle
with open('kaggle.json') as credentials_file:
    kaggle_config = json.load(credentials_file)

In [None]:
# Publish the Kaggle credentials as environment variables

for env_var, config_key in (('KAGGLE_USERNAME', 'username'), ('KAGGLE_KEY', 'key')):
    os.environ[env_var] = kaggle_config[config_key]

In [None]:
# Download the dataset archive with the Kaggle CLI
# (presumably authenticated via the KAGGLE_USERNAME / KAGGLE_KEY variables set earlier — confirm)
!kaggle datasets download notshrirang/spotify-million-song-dataset

In [None]:
# Unpack the downloaded archive into the current working directory

with zipfile.ZipFile('spotify-million-song-dataset.zip', mode='r') as archive:
    archive.extractall()

In [None]:
# Load the csv file into a dataframe.
# The archive is extracted into the current working directory, so a relative
# path is used here instead of an absolute one; the notebook then runs
# wherever it is started, not only when that directory happens to be /content.

df = pd.read_csv('spotify_millsongdata.csv')

In [None]:
# Preview the first rows to sanity-check the columns that were loaded
df.head()

In [None]:
# Dimensions of the dataframe as (rows, columns)
df.shape

In [None]:
# Summary of the columns: dtypes and non-null counts
df.info()

## EDA

In [None]:
# Top Artists: the ten artists with the most songs in the dataset

top_artists = df['artist'].value_counts().iloc[:10]
print("Top 10 Artists:", top_artists, sep="\n")

In [None]:
# Work on a sample of at most 10,000 songs. random_state pins the sample so
# every run sees the same songs; min() keeps the call valid when the frame has
# fewer than 10,000 rows.
df = df.sample(n=min(10000, len(df)), random_state=42)

# Drop the 'link' column and renumber the rows 0..n-1. errors='ignore' lets
# this cell be re-run after 'link' has already been removed.
df = df.drop(columns='link', errors='ignore').reset_index(drop=True)

In [None]:
# WordCloud for song lyrics: join every non-null lyric into one corpus string
# and let WordCloud build the image from it

all_lyrics = ' '.join(df['text'].dropna())
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate(all_lyrics)

In [None]:
# Show the word cloud as an image with the axes hidden
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('The most common words in lyrics')
plt.show()

## Data Preprocessing

In [None]:
 # Download the NLTK data used below: the 'stopwords' corpus backs the
 # stop-word list, and 'punkt' / 'punkt_tab' are presumably the tokenizer
 # models word_tokenize needs (which one depends on the NLTK version — confirm)

 nltk.download('punkt')
 nltk.download('punkt_tab')
 nltk.download('stopwords')

In [None]:
# English stop words held in a set, so the per-token membership test in
# preprocess_text is a hash lookup rather than a list scan
stop_words = set(stopwords.words('english'))

In [None]:
def preprocess_text(text):
  """Normalise raw lyrics into a space-separated string of content words.

  Args:
    text: Raw lyrics. Any non-string value (for example the float NaN pandas
      uses for a missing cell) is treated as empty lyrics.

  Returns:
    The lower-cased tokens of ``text`` with non-letter characters and stop
    words removed, joined by single spaces. Returns an empty string when
    nothing is left or when ``text`` is not a string.
  """
  # re.sub raises TypeError on non-strings, so missing lyrics are mapped to ""
  if not isinstance(text, str):
    return ""
  # Remove special characters and numbers
  # NOTE(review): apostrophes are stripped here, so contractions such as
  # "don't" become "dont" before the stop-word filter runs and may no longer
  # match the stop-word list — confirm whether that is intended.
  text = re.sub(r"[^a-zA-Z\s]", "", text)
  # Convert to lowercase
  text = text.lower()
  # Tokenize and remove the stopwords
  tokens = word_tokenize(text)
  tokens = [word for word in tokens if word not in stop_words]
  return " ".join(tokens)

In [None]:
 # Clean every lyric and store the result in a new 'cleaned_text' column
 df['cleaned_text'] = df['text'].map(preprocess_text)

In [None]:
# Turn the cleaned lyrics into TF-IDF vectors, capping the vocabulary at
# 5000 terms
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df.loc[:, 'cleaned_text'])

In [None]:
# Pairwise cosine similarity between all songs; with the second argument
# omitted, the matrix is compared against itself
cosine_sim = cosine_similarity(tfidf_matrix)

In [None]:
def recommend_songs(song_name, cosine_sim=cosine_sim, df=df, top_n=5):
  """Recommend the songs whose lyrics are most similar to a given song.

  Args:
    song_name: Title to look up; matched case-insensitively against
      ``df['song']``. When several rows share the title, the first is used.
    cosine_sim: Square similarity matrix in which row/column ``i`` refers to
      the ``i``-th row (by position) of ``df``.
    df: DataFrame with at least the 'artist' and 'song' columns.
    top_n: Maximum number of recommendations; values below 1 yield an empty
      result.

  Returns:
    A DataFrame holding the 'artist' and 'song' columns of the most similar
    songs, most similar first, or the string "Song not found in the dataset"
    when no title matches.
  """
  # Use row positions rather than index labels so the lookup stays aligned
  # with cosine_sim even when df does not carry a default 0..n-1 index.
  title_matches = df['song'].str.lower() == song_name.lower()
  positions = np.flatnonzero(title_matches.to_numpy(dtype=bool, na_value=False))
  if len(positions) == 0:
    return "Song not found in the dataset"
  idx = positions[0]

  # Rank all songs by similarity, highest first; the stable sort keeps ties
  # in dataset order.
  scores = np.asarray(cosine_sim[idx]).ravel()
  ranked = np.argsort(-scores, kind='stable')

  # Remove the query song explicitly instead of assuming it is ranked first:
  # a song with identical lyrics at a lower position ties with it and would
  # otherwise be dropped in its place.
  ranked = ranked[ranked != idx]
  song_indices = ranked[:max(top_n, 0)]

  # Return top and similar songs
  return df[['artist', 'song']].iloc[song_indices]

In [None]:
# Demo: list the songs whose lyrics are closest to 'Cain and Abel'
print("\nRecommendations for song 'Cain and Abel':")
recommendations = recommend_songs(song_name='Cain and Abel')
print(recommendations)