In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import pickle

# Load your dataset
data = pd.read_csv('Spotify Most Streamed Songs.csv')

# Preprocessing - Handle missing values, encode categorical variables, etc.
data.fillna(0, inplace=True)

# Convert categorical columns to numeric (Example: 'key' and 'mode')
data = pd.get_dummies(data, columns=['key', 'mode'])

# Convert 'streams' to numeric values, invalid parsing will become NaN
data['streams'] = pd.to_numeric(data['streams'], errors='coerce')

# Drop rows with NaN values in 'streams' column
data = data.dropna(subset=['streams'])

# Define the target variable (Here we assume 'streams' > threshold is a hit)
data['hit'] = (data['streams'] > 1000000).astype(int)  # Threshold can be adjusted

# Remove commas and convert to numeric
data['in_deezer_playlists'] = data['in_deezer_playlists'].replace({',': ''}, regex=True)
data['in_deezer_playlists'] = pd.to_numeric(data['in_deezer_playlists'], errors='coerce')  # Convert to float, with errors set to NaN

data['in_shazam_charts'] = data['in_shazam_charts'].replace({',': ''}, regex=True)
data['in_shazam_charts'] = pd.to_numeric(data['in_shazam_charts'], errors='coerce')  # Convert to float, with errors set to NaN


# Select the features and target
features = data.drop(columns=['track_name', 'artist(s)_name', 'hit', 'cover_url'])
target = data['hit']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a model (Random Forest in this case)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Print the feature names used for training the model
print(X_train.columns.tolist())

# Evaluate the model (optional)
accuracy = model.score(X_test_scaled, y_test)
print(f'Model accuracy: {accuracy}')

# Save the trained model to a pickle file
with open('hit_song_predictor_model.pkl', 'wb') as file:
    pickle.dump(model, file)

# Optionally, save the scaler for later use
with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)


['artist_count', 'released_year', 'released_month', 'released_day', 'in_spotify_playlists', 'in_spotify_charts', 'streams', 'in_apple_playlists', 'in_apple_charts', 'in_deezer_playlists', 'in_deezer_charts', 'in_shazam_charts', 'bpm', 'danceability_%', 'valence_%', 'energy_%', 'acousticness_%', 'instrumentalness_%', 'liveness_%', 'speechiness_%', 'key_0', 'key_A', 'key_A#', 'key_B', 'key_C#', 'key_D', 'key_D#', 'key_E', 'key_F', 'key_F#', 'key_G', 'key_G#', 'mode_Major', 'mode_Minor']
Model accuracy: 1.0
