In [1]:
!pip install streamlit
!pip install scikit-learn==1.5.0
!pip install plotly
!pip install pandas
!pip install numpy


Collecting streamlit
  Downloading streamlit-1.36.0-py2.py3-none-any.whl (8.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
Collecting watchdog<5,>=2.1.5 (from streamlit)
  Downloading watchdog-4.0.1-py3-none-manylinux2014_x86_64.whl (83 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.0/83.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downloading gitdb-4.

In [3]:
from google.colab import files
uploaded = files.upload()


Saving top_10000_1960-now.csv to top_10000_1960-now.csv


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import pickle


# Load the dataset
file_path = 'top_10000_1960-now.csv'
spotify_data = pd.read_csv(file_path)

# Drop unnecessary columns
columns_to_drop = ['Track Preview URL', 'Artist Genres', 'Album Genres', 'Copyrights']
spotify_data = spotify_data.drop(columns=columns_to_drop)

# Handle missing values
spotify_data = spotify_data.dropna(subset=['Track Name', 'Artist Name(s)', 'Album Name', 'Danceability', 'Energy', 'Key', 'Loudness', 'Mode', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo', 'Time Signature'])
spotify_data = spotify_data.fillna('Unknown')

# Convert categorical columns to numeric
spotify_data['Explicit'] = spotify_data['Explicit'].astype(int)

# Handle different date formats in 'Album Release Date'
def parse_date(date):
    try:
        return pd.to_datetime(date, format='%Y-%m-%d').year
    except:
        try:
            return pd.to_datetime(date, format='%Y').year
        except:
            return None

spotify_data['Album Release Year'] = spotify_data['Album Release Date'].apply(parse_date)
spotify_data = spotify_data.drop(columns=['Album Release Date'])
spotify_data = spotify_data.dropna(subset=['Album Release Year'])

# Create target variable 'Popular' based on the median popularity
median_popularity = spotify_data['Popularity'].median()
spotify_data['Popular'] = (spotify_data['Popularity'] >= median_popularity).astype(int)

# Select relevant features and target
features = ['Danceability', 'Energy', 'Key', 'Loudness', 'Mode', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo', 'Time Signature', 'Explicit', 'Album Release Year']
target = 'Popular'

X = spotify_data[features]
y = spotify_data[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train different models
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Machine': SVC(probability=True)
}

# Evaluate models
results = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)

    results[model_name] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc,
        'conf_matrix': conf_matrix
    }

    print(f'{model_name} Results:')
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1 Score: {f1:.4f}')
    print(f'ROC AUC Score: {roc_auc:.4f}')
    print(f'Confusion Matrix:\n{conf_matrix}\n')

# Select the best model (assuming Random Forest performed the best)
best_model = RandomForestClassifier()
best_model.fit(X_train, y_train)

# Save the best model and scaler
with open('best_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)


Logistic Regression Results:
Accuracy: 0.5575
Precision: 0.5319
Recall: 0.6373
F1 Score: 0.5799
ROC AUC Score: 0.5607
Confusion Matrix:
[[502 535]
 [346 608]]

Random Forest Results:
Accuracy: 0.5721
Precision: 0.5501
Recall: 0.5870
F1 Score: 0.5680
ROC AUC Score: 0.5727
Confusion Matrix:
[[579 458]
 [394 560]]

Support Vector Machine Results:
Accuracy: 0.5640
Precision: 0.5337
Recall: 0.7138
F1 Score: 0.6108
ROC AUC Score: 0.5700
Confusion Matrix:
[[442 595]
 [273 681]]



In [5]:
import streamlit as st
import pandas as pd
import pickle
import sklearn

# Ensure the correct version of scikit-learn is used
assert sklearn.__version__ == '1.5.0', "Scikit-learn version must be 1.5.0"

# Load the trained model and scaler
with open('best_model.pkl', 'rb') as file:
    model = pickle.load(file)

with open('scaler.pkl', 'rb') as file:
    scaler = pickle.load(file)

# Streamlit app title
st.title('Spotify Song Popularity Prediction')

# Input fields for features
danceability = st.number_input('Danceability', min_value=0.0, max_value=1.0, step=0.01)
energy = st.number_input('Energy', min_value=0.0, max_value=1.0, step=0.01)
key = st.number_input('Key', min_value=0, max_value=11, step=1)
loudness = st.number_input('Loudness', min_value=-60.0, max_value=0.0, step=0.1)
mode = st.number_input('Mode', min_value=0, max_value=1, step=1)
speechiness = st.number_input('Speechiness', min_value=0.0, max_value=1.0, step=0.01)
acousticness = st.number_input('Acousticness', min_value=0.0, max_value=1.0, step=0.01)
instrumentalness = st.number_input('Instrumentalness', min_value=0.0, max_value=1.0, step=0.01)
liveness = st.number_input('Liveness', min_value=0.0, max_value=1.0, step=0.01)
valence = st.number_input('Valence', min_value=0.0, max_value=1.0, step=0.01)
tempo = st.number_input('Tempo', min_value=0.0, max_value=250.0, step=0.1)
time_signature = st.number_input('Time Signature', min_value=1, max_value=5, step=1)
explicit = st.number_input('Explicit', min_value=0, max_value=1, step=1)
album_release_year = st.number_input('Album Release Year', min_value=1960, max_value=2024, step=1)

# Collect features into a DataFrame
features = pd.DataFrame({
    'Danceability': [danceability],
    'Energy': [energy],
    'Key': [key],
    'Loudness': [loudness],
    'Mode': [mode],
    'Speechiness': [speechiness],
    'Acousticness': [acousticness],
    'Instrumentalness': [instrumentalness],
    'Liveness': [liveness],
    'Valence': [valence],
    'Tempo': [tempo],
    'Time Signature': [time_signature],
    'Explicit': [explicit],
    'Album Release Year': [album_release_year]
})

# Normalize the features
features_scaled = scaler.transform(features)

# Predict song popularity
if st.button('Predict Popularity'):
    prediction = model.predict(features_scaled)
    result = 'Popular' if prediction[0] == 1 else 'Not Popular'
    st.write(f'The song is predicted to be: {result}')


2024-06-25 09:55:18.046 
  command:

    streamlit run /usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2024-06-25 09:55:18.049 Session state does not function when running a script without `streamlit run`


In [11]:

# Define the Streamlit app code
streamlit_code = """
import streamlit as st
import pickle
import numpy as np
import pandas as pd
import plotly.graph_objects as go  # type: ignore

# Title and description
st.write(\"\"\"
# Song Popularity Prediction App
## This app can predict a song's popularity from 1960 - 2024!
\"\"\")


st.sidebar.header('User Input Parameters')

# Get user inputs
def user_input_features():
    acousticness = st.sidebar.slider('acousticness', 0.0, 1.0, 0.23)
    danceability = st.sidebar.slider('danceability', 0.0, 1.0, 0.7)
    energy = st.sidebar.slider('energy', 0.0, 1.0, 0.5)
    explicit = st.sidebar.slider('explicit', 0, 1, 1)
    instrumentalness = st.sidebar.slider('instrumentalness', 0.0, 1.0, 0.0)
    key = st.sidebar.slider('key', 0, 11, 3)
    liveness = st.sidebar.slider('liveness', 0.0, 1.0, 0.5)
    loudness = st.sidebar.slider('loudness', -60.0, 0.0, -4.0)
    mode = st.sidebar.slider('mode', 0, 1, 0)
    speechiness = st.sidebar.slider('speechiness', 0.0, 1.0, 0.03)
    tempo = st.sidebar.slider('tempo', 0.0, 250.0, 92.0)
    valence = st.sidebar.slider('valence', 0.0, 1.0, 0.7)
    album_release_year = st.sidebar.slider('year', 1960, 2024, 2015)
    data = {
        'Danceability': danceability,
        'Energy': energy,
        'Key': key,
        'Loudness': loudness,
        'Mode': mode,
        'Speechiness': speechiness,
        'Acousticness': acousticness,
        'Instrumentalness': instrumentalness,
        'Liveness': liveness,
        'Valence': valence,
        'Tempo': tempo,
        'Time Signature': 4,  # Assuming a default time signature
        'Explicit': explicit,
        'Album Release Year': album_release_year
    }
    features = pd.DataFrame(data, index=[0])
    return features

df = user_input_features()

# Show user inputs
st.subheader('User Input parameters')
st.write(df)

# Create Plotly plot
columns = ['Danceability', 'Energy', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness']
df_song_char = df.filter(items=columns)
y = df_song_char.values.tolist()[0]

fig = go.Figure(data=go.Bar(x=columns, y=y), layout_title_text='Audio Features from User Input')
st.plotly_chart(fig, use_container_width=True)

# Load the trained model and scaler
with open('best_model.pkl', 'rb') as file:
    model = pickle.load(file)

with open('scaler.pkl', 'rb') as file:
    scaler = pickle.load(file)

# Normalize the features
features_scaled = scaler.transform(df)

# Predict song popularity
prediction = model.predict(features_scaled)

st.subheader('Predicted Song Popularity')
result = 'Popular' if prediction[0] == 1 else 'Not Popular'
st.write(f'The song is predicted to be: {result}')
"""

# Save the Streamlit app code to a file
with open('web-app.py', 'w') as file:
    file.write(streamlit_code)

files.download('web-app.py')



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [9]:
 #Download the files
from google.colab import files
files.download('best_model.pkl')
files.download('scaler.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>