In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

In [2]:
# Set random seed for reproducibility
np.random.seed(42)

In [3]:
# Parameters for dataset generation
num_users = 1000  # Number of unique users
num_songs = 500   # Number of unique songs
num_records = 10000  # Total number of user-song interaction records

In [4]:
# Generate random user IDs and song IDs
user_ids = np.random.randint(1, num_users+1, num_records)
song_ids = np.random.randint(1, num_songs+1, num_records)

In [5]:
# Generate random play counts (1 to 10)
play_counts = np.random.randint(1, 11, num_records)

In [6]:
# Generate random timestamps for when the song was last played
# Let's assume the current date is October 10, 2023
current_date = datetime(2023, 10, 10)

In [7]:

# Random last play time within the last 60 days
last_play_times = [current_date - timedelta(days=random.randint(1, 60)) for _ in range(num_records)]


In [8]:
# Generate random song features (tempo, energy)
tempos = np.random.randint(60, 180, num_records)  # Tempo ranges from 60 to 180 BPM
energies = np.random.uniform(0.1, 1.0, num_records)  # Energy ranges from 0.1 to 1.0

In [9]:
# Generate replay labels (1 for replay within a month, 0 otherwise)
# We'll assume songs with higher play counts and recent playtimes are more likely to be replayed
labels = [
    1 if play_counts[i] > 3 and (current_date - last_play_times[i]).days < 30 else 0
    for i in range(num_records)
]

In [10]:
# Create a DataFrame to hold the generated dataset
df = pd.DataFrame({
    'user_id': user_ids,
    'song_id': song_ids,
    'play_count': play_counts,
    'last_play_time': last_play_times,
    'tempo': tempos,
    'energy': energies,
    'label': labels
})

In [12]:
# Show the first few rows of the dataset
df.head()

Unnamed: 0,user_id,song_id,play_count,last_play_time,tempo,energy,label
0,103,442,4,2023-09-11,136,0.529093,1
1,436,279,5,2023-08-31,159,0.805545,0
2,861,251,5,2023-09-29,168,0.937152,1
3,271,310,8,2023-09-25,127,0.759065,1
4,107,208,3,2023-09-11,94,0.798485,0


In [13]:
# Optionally, save to a CSV file
df.to_csv('synthetic_music_dataset.csv', index=False)