In [None]:
! pip install -q kaggle
from google.colab import files
files.upload()


In [None]:
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json



In [None]:
 ! kaggle datasets list


In [None]:
!kaggle datasets download -d notshrirang/spotify-million-song-dataset


Dataset URL: https://www.kaggle.com/datasets/notshrirang/spotify-million-song-dataset
License(s): CC0-1.0
Downloading spotify-million-song-dataset.zip to /content
 48% 10.0M/20.7M [00:00<00:00, 40.0MB/s]
100% 20.7M/20.7M [00:00<00:00, 63.8MB/s]


In [None]:
# Unzip the downloaded file
!unzip spotify-million-song-dataset.zip

# Load the dataset into a DataFrame (replace 'filename.csv' with the actual CSV name)
import pandas as pd
data = pd.read_csv("spotify_millsongdata.csv")


Archive:  spotify-million-song-dataset.zip
  inflating: spotify_millsongdata.csv  


In [None]:
pip install pandas scikit-learn nltk



In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import string

# Download necessary NLP tools
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

##LOADING AND PREPROCESSING THE DATA

In [None]:
# Load the dataset (update the path to your local file)
data = pd.read_csv('/content/spotify_millsongdata.csv')

# Function to preprocess text (lowercase, remove punctuation)
def preprocess_text(text):
    text = text.lower()
    text = "".join([char for char in text if char not in string.punctuation])
    return text

# Apply preprocessing to the lyrics (text column)
data['processed_text'] = data['text'].apply(preprocess_text)

##VECTORIZE THE LYRICS

In [None]:
# Vectorize the text using TF-IDF
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['processed_text'])

##COSINE SIMILARITY

In [None]:
# Reduce the dataset size for demonstration by selecting a subset of 2000 songs
subset_data = data.sample(n=2000, random_state=42).reset_index(drop=True)

# Re-vectorize the text for the subset using TF-IDF
subset_tfidf_matrix = tfidf.fit_transform(subset_data['processed_text'])

# Recompute cosine similarity for the subset
subset_cosine_sim = cosine_similarity(subset_tfidf_matrix, subset_tfidf_matrix)


##RECOMMENDATION FUNCTION

In [None]:
# Function to recommend songs based on cosine similarity
def recommend_songs(song_title, cosine_sim_matrix, data, num_recommendations=5):
    # Find the index of the song in the subset dataset
    indices = pd.Series(data.index, index=data['song']).drop_duplicates()

    if song_title not in indices:
        return f"Song '{song_title}' not found in the dataset."

    idx = indices[song_title]

    # Get the pairwise similarity scores for the song
    sim_scores = list(enumerate(cosine_sim_matrix[idx]))

    # Sort the songs based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the indices of the most similar songs (excluding the song itself)
    sim_scores = sim_scores[1:num_recommendations+1]

    # Get the song indices and titles
    song_indices = [i[0] for i in sim_scores]
    return data.iloc[song_indices][['artist', 'song']]


In [None]:
# Select a random song from the subset
sample_song = subset_data['song'].iloc[8]


In [None]:
# Choose a specific song title from the subset
sample_song = 'Dancing Queen'  # Replace with the actual title

In [None]:
print(f"Recommendations for '{sample_song}':")

Recommendations for 'Dancing Queen':


##TESTING

In [None]:
# Test the recommendation system with a sample song from the subset
sample_song = subset_data['song'].iloc[0]
recommendations = recommend_songs(sample_song, subset_cosine_sim, subset_data)

# Display the recommendations
print(f"Recommendations for '{sample_song}':")
print(recommendations)


Recommendations for 'Right Or Wrong':
                 artist                       song
430            Old 97's               St. Ignatius
1973  Dusty Springfield     I've Been Wrong Before
999          Air Supply        I Don't Believe You
1508      Grateful Dead  Man Smart - Woman Smarter
564      Lynyrd Skynyrd              One More Time


In [None]:
# Test the recommendation system with a sample song from the subset and get 10 recommendations
sample_song = subset_data['song'].iloc[0]  # Selecting the first song from the subset
recommendations = recommend_songs(sample_song, subset_cosine_sim, subset_data, num_recommendations=10)

# Display the recommendations
print(f"Recommendations for '{sample_song}':")
print(recommendations)

Recommendations for 'Right Or Wrong':
                 artist                            song
430            Old 97's                    St. Ignatius
1973  Dusty Springfield          I've Been Wrong Before
999          Air Supply             I Don't Believe You
1508      Grateful Dead       Man Smart - Woman Smarter
564      Lynyrd Skynyrd                   One More Time
866         John Legend                      Number One
71         Indigo Girls      Pushing The Needle Too Far
1403      Reba Mcentire  If I Had Any Sense Left At All
1688          Bob Seger                       Which Way
951          Zayn Malik                        It's You


In [None]:
# Install streamlit and pyngrok
!pip install streamlit
!pip install pyngrok


Collecting streamlit
  Downloading streamlit-1.39.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting watchdog<6,>=2.1.5 (from streamlit)
  Downloading watchdog-5.0.3-py3-none-manylinux2014_x86_64.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.9/41.9 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)
Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)
Downloading streamlit-1.39.0-py2.py3-none-any.whl (8.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [3

In [None]:
%%writefile streamlit_app.py

import streamlit as st
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string

# Preprocess text to remove punctuation and lowercase it
def preprocess_text(text):
    text = text.lower()
    text = "".join([char for char in text if char not in string.punctuation])
    return text

# Function to get song recommendations based on cosine similarity
def recommend_songs(song_title, cosine_sim_matrix, data, num_recommendations=5):
    indices = pd.Series(data.index, index=data['song']).drop_duplicates()
    if song_title not in indices:
        return f"Song '{song_title}' not found in the dataset."

    idx = indices[song_title]
    sim_scores = list(enumerate(cosine_sim_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_recommendations+1]  # Skip the first entry (itself)
    song_indices = [i[0] for i in sim_scores]
    return data.iloc[song_indices][['artist', 'song']]

# Load the dataset
@st.cache_data
def load_data():
    data = pd.read_csv("/content/spotify_millsongdata.csv")  # Ensure this file is uploaded
    data['processed_text'] = data['text'].apply(preprocess_text)  # Preprocess the text column
    return data

# Load the dataset once
data = load_data()

# Compute TF-IDF matrix
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['processed_text'])

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Streamlit app interface
st.title("Song Recommendation System")

# Song search input
song_title = st.text_input("Enter a song title for recommendations")

if st.button("Get Recommendations"):
    if song_title:
        recommendations = recommend_songs(song_title, cosine_sim, data)
        st.write(f"Recommendations for '{song_title}':")
        st.write(recommendations)
    else:
        st.write("Please enter a song title.")


Writing streamlit_app.py


In [None]:
!ngrok config add-authtoken 2n1MfV12T8mzWcGCBoSouFrYGuc_37Yd3feX8BM1Za8XaDvTG


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
pip install pyngrok



In [None]:
pip install ngrok

Collecting ngrok
  Downloading ngrok-1.4.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Downloading ngrok-1.4.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ngrok
Successfully installed ngrok-1.4.0


In [None]:
!pip install --upgrade pyngrok
import subprocess

# Run the Streamlit app in the background
streamlit_process = subprocess.Popen(['streamlit', 'run', 'streamlit_app.py'])

# Import ngrok here so it's accessible in this cell
from pyngrok import ngrok

# Create a public URL using ngrok
public_url = ngrok.connect(8501)  # Use port 8501
print(f"Streamlit app is live at: {public_url}")


Streamlit app is live at: NgrokTunnel: "https://d39c-34-90-151-238.ngrok-free.app" -> "http://localhost:8501"


In [None]:
!pip install --upgrade pyngrok








In [None]:
%%writefile streamlit_app.py
import streamlit as st
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string

def preprocess_text(text):
    text = text.lower()
    text = "".join([char for char in text if char not in string.punctuation])
    return text

def recommend_songs(song_title, cosine_sim_matrix, data, num_recommendations=5):
    indices = pd.Series(data.index, index=data['song']).drop_duplicates()
    if song_title not in indices:
        return f"Song '{song_title}' not found in the dataset."

    idx = indices[song_title]
    sim_scores = list(enumerate(cosine_sim_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_recommendations+1]  # Skip the first entry (itself)
    song_indices = [i[0] for i in sim_scores]
    return data.iloc[song_indices][['artist', 'song']]

@st.cache_data
def load_data():
    data = pd.read_csv("/content/spotify_millsongdata.csv")
    data['processed_text'] = data['text'].apply(preprocess_text)
    return data

data = load_data()
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['processed_text'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

st.title("Song Recommendation System")
song_title = st.text_input("Enter a song title for recommendations")
if st.button("Get Recommendations"):
    if song_title:
        recommendations = recommend_songs(song_title, cosine_sim, data)
        st.write(f"Recommendations for '{song_title}':")
        st.write(recommendations)
    else:
        st.write("Please enter a song title.")

# Step 3: Start Ngrok and Streamlit
from pyngrok import ngrok
import subprocess

# Stop any existing ngrok processes
ngrok.kill()



Overwriting streamlit_app.py


In [None]:
import subprocess

In [None]:
# Run the Streamlit app in the background
streamlit_process = subprocess.Popen(['streamlit', 'run', 'streamlit_app.py'])

# Create a public URL using ngrok
public_url = ngrok.connect(8501)  # Use port 8501
print(f"Streamlit app is live at: {public_url}")



Streamlit app is live at: NgrokTunnel: "https://ac9d-34-90-151-238.ngrok-free.app" -> "http://localhost:8501"


In [None]:
import streamlit as st
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Cache the dataset loading function
@st.cache
def load_data():
    data = pd.read_csv('spotify_millsongdata.csv')
    return data

# Cache the TF-IDF vectorizer and cosine similarity calculations
@st.cache
def process_data(data):
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(data['processed_text'])
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    return cosine_sim

st.title('Song Recommendation System')

# Load data
data = load_data()

# Display a subset of the data
st.write(data.head(10))

# Process data only when necessary (lazy loading)
if st.button('Calculate Similarity'):
    cosine_sim = process_data(data)
    st.write("Similarity Calculated!")

2024-10-10 14:38:18.197 
  command:

    streamlit run /usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2024-10-10 14:38:18.206 
`st.cache` is deprecated and will be removed soon. Please use one of Streamlit's new
caching commands, `st.cache_data` or `st.cache_resource`. More information
[in our docs](https://docs.streamlit.io/develop/concepts/architecture/caching).

**Note**: The behavior of `st.cache` was updated in Streamlit 1.36 to the new caching
logic used by `st.cache_data` and `st.cache_resource`. This might lead to some problems
or unexpected behavior in certain edge cases.

2024-10-10 14:38:18.211 No runtime found, using MemoryCacheStorageManager
2024-10-10 14:38:18.216 
`st.cache` is deprecated and will be removed soon. Please use one of Streamlit's new
caching commands, `st.cache_data` or `st.cache_resource`. More information
[in our docs](https://docs.streamlit.io/develop/concepts/architecture/caching).

**Note**: The behavior of `st.cache` was 

In [None]:
!python streamlit_app.py


2024-10-10 14:38:32.755 
  command:

    streamlit run streamlit_app.py [ARGUMENTS]
2024-10-10 14:38:32.755 No runtime found, using MemoryCacheStorageManager




Traceback (most recent call last):
  File "/content/streamlit_app.py", line 33, in <module>
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/pairwise.py", line 1687, in cosine_similarity
    K = safe_sparse_dot(X_normalized, Y_normalized.T, dense_output=dense_output)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/extmath.py", line 205, in safe_sparse_dot
    ret = a @ b
  File "/usr/local/lib/python3.10/dist-packages/scipy/sparse/_base.py", line 695, in __matmul__
    return self._matmul_dispatch(other)
  File "/usr/local/lib/python3.10/dist-packages/scipy/sparse/_base.py", line 606, in _matmul_dispatch
    return self._matmul_sparse(other)
  File "/usr/local/lib/python3.10/dist-packages/scipy/sparse/_compressed.py", line 520, in _matmul_sparse
    nnz = fn(

In [None]:
!streamlit run streamlit_app.py



Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://35.196.143.23:8501[0m
[0m
