# Spotify Listening History Data Processing

This script processes JSON files containing Spotify listening history data. It performs the following operations:

1. Loads multiple JSON files from the `raw_data` directory
2. Extracts key features from timestamps (date, hour, day of week)
3. Converts milliseconds played to minutes
4. Selects the columns relevant for analysis and returns them in a fixed order

Key features extracted:
- Temporal data (timestamp, date, hour, day of week)
- Track information (song name, artist, album)
- Listening behavior (minutes played, shuffle status, skipped status)
- Platform and playback context (reason for start/end)

The processed DataFrame provides a structured foundation for analyzing listening patterns and music preferences.

In [2]:
import json
import pandas as pd
from pathlib import Path
from datetime import datetime
import logging

# Configure the root logger so that records at INFO level and above are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module, used for error reporting below.
logger = logging.getLogger(__name__)

def process_spotify_data(raw_data_path: str) -> pd.DataFrame:
    """Process Spotify JSON files into a single DataFrame.

    Every ``*.json`` file directly inside *raw_data_path* is loaded and the
    records of all files are concatenated. Files are read in sorted path
    order so the resulting row order is reproducible regardless of how the
    file system happens to enumerate the directory.

    Args:
        raw_data_path: Directory containing the JSON files. Each file must
            hold a JSON array of record objects.

    Returns:
        A DataFrame with the parsed ``ts`` timestamp, the derived ``date``,
        ``hour``, ``day_of_week`` and ``minutes_played`` columns, and the
        track, artist, album, shuffle, skipped, platform and
        start/end-reason columns, in that fixed order.

    Raises:
        FileNotFoundError: If *raw_data_path* does not exist.
        ValueError: If the directory contains no ``*.json`` files, or a
            file's top-level JSON value is not an array.
        KeyError: If the records lack ``ts``, ``ms_played`` or any of the
            selected output columns.
    """
    try:
        data_path = Path(raw_data_path)

        if not data_path.exists():
            raise FileNotFoundError(f"Directory not found: {raw_data_path}")

        # glob() yields paths in arbitrary order; sort for a stable row order.
        json_files = sorted(data_path.glob('*.json'))
        if not json_files:
            raise ValueError(f"No JSON files found in {raw_data_path}")

        all_records = []
        for json_file in json_files:
            # Keep the file open only for as long as parsing needs it.
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # extend() on a dict would silently add its keys as bogus
            # "records"; reject anything that is not a list up front.
            if not isinstance(data, list):
                raise ValueError(
                    f"Expected a JSON array of records in {json_file}, "
                    f"got {type(data).__name__}"
                )
            all_records.extend(data)

        df = pd.DataFrame(all_records)

        # Process timestamps and add time-based features
        df['ts'] = pd.to_datetime(df['ts'])
        df['date'] = df['ts'].dt.date
        df['hour'] = df['ts'].dt.hour
        df['day_of_week'] = df['ts'].dt.day_name()
        # 60,000 ms per minute
        df['minutes_played'] = df['ms_played'] / 60000

        columns = [
            'ts', 'date', 'hour', 'day_of_week',
            'master_metadata_track_name', 'master_metadata_album_artist_name',
            'master_metadata_album_album_name', 'minutes_played',
            'shuffle', 'skipped', 'platform', 'reason_start', 'reason_end',
        ]

        return df[columns]

    except Exception as e:
        # Log once at this boundary, then let the caller see the original error.
        logger.error("Error processing Spotify data: %s", e)
        raise

# Usage in notebook: build the DataFrame once, report its size, and preview it.
raw_data_path = 'raw_data'  # Adjust this path to your data location
df = process_spotify_data(raw_data_path)
# Print the row count with thousands separators.
print(f"Processed {len(df):,} records")
# Last expression of the cell, so the notebook renders the first five rows.
df.head()

Processed 120,402 records


Unnamed: 0,ts,date,hour,day_of_week,master_metadata_track_name,master_metadata_album_artist_name,master_metadata_album_album_name,minutes_played,shuffle,skipped,platform,reason_start,reason_end
0,2019-08-04 08:28:37+00:00,2019-08-04,8,Sunday,Gelsin Öpsün Kalbimi,Güliz Ayla,Parla,3.366033,False,False,"iOS 12.3.1 (iPhone9,4)",clickrow,trackdone
1,2019-08-04 08:28:47+00:00,2019-08-04,8,Sunday,Yalan,EDIS,An,0.14035,False,False,"iOS 12.3.1 (iPhone9,4)",trackdone,fwdbtn
2,2019-08-04 08:28:57+00:00,2019-08-04,8,Sunday,Daha Mutlu Olamam,mor ve ötesi,Gül Kendine,0.159433,False,False,"iOS 12.3.1 (iPhone9,4)",fwdbtn,fwdbtn
3,2019-08-04 08:33:26+00:00,2019-08-04,8,Sunday,Öp,Tarkan,Adımı Kalbine Yaz,3.012,False,False,"iOS 12.3.1 (iPhone9,4)",fwdbtn,fwdbtn
4,2019-08-04 08:34:47+00:00,2019-08-04,8,Sunday,Hoş Gör Sen,Ajda Pekkan,The Best of Ajda,1.345583,False,False,"iOS 12.3.1 (iPhone9,4)",fwdbtn,fwdbtn
