<a href="https://colab.research.google.com/github/ezragershman/spotify-listening-data/blob/main/notebooks/data_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
import os
# Change to the cloned repo directory (optional)
%cd '/content/spotify-listening-data'
for dirname, _, filenames in os.walk('/content/spotify-listening-data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/content/spotify-listening-data
/content/spotify-listening-data/README.md
/content/spotify-listening-data/data/raw/Streaming_History_Audio_2013-2024.json
/content/spotify-listening-data/data/processed/spotify_data.csv
/content/spotify-listening-data/.git/config
/content/spotify-listening-data/.git/index
/content/spotify-listening-data/.git/HEAD
/content/spotify-listening-data/.git/packed-refs
/content/spotify-listening-data/.git/description
/content/spotify-listening-data/.git/hooks/post-update.sample
/content/spotify-listening-data/.git/hooks/update.sample
/content/spotify-listening-data/.git/hooks/commit-msg.sample
/content/spotify-listening-data/.git/hooks/pre-push.sample
/content/spotify-listening-data/.git/hooks/fsmonitor-watchman.sample
/content/spotify-listening-data/.git/hooks/push-to-checkout.sample
/content/spotify-listening-data/.git/hooks/pre-rebase.sample
/content/spotify-listening-data/.git/hooks/applypatch-msg.sample
/content/spotify-listening-data/.git/hooks/prepare-com

# **SPOTIFY WEB API HELPERS: ACCESS TOKEN & TRACK SEARCH**


In [None]:
import requests
from base64 import b64encode
import os
from google.colab import userdata

# Step 1: Read the Spotify API credentials from Colab's user-data store
# (google.colab.userdata) under the keys 'spotify_client_id' and
# 'spotify_client_sec'. They are looked up by key rather than hard-coded in
# the notebook.
CLIENT_ID = userdata.get('spotify_client_id')
CLIENT_SECRET = userdata.get('spotify_client_sec')

# Step 2: Get Access Token from Spotify API using Client Credentials Flow
def get_access_token(client_id, client_secret, timeout=10):
    """Request an access token from Spotify using the Client Credentials flow.

    Args:
        client_id: Spotify application client ID.
        client_secret: Spotify application client secret.
        timeout: Seconds to wait for the token endpoint before giving up.

    Returns:
        The access token string on a 200 response, otherwise None. Every
        failure (network error, timeout, non-200 status) is reported with
        print() instead of raising, so callers only need to check for None.
    """
    auth_header = b64encode(f"{client_id}:{client_secret}".encode('utf-8')).decode('utf-8')
    headers = {
        'Authorization': f'Basic {auth_header}',
        'Content-Type': 'application/x-www-form-urlencoded'
    }
    data = {'grant_type': 'client_credentials'}

    try:
        # requests has no default timeout; without one a stalled connection
        # would block the notebook cell indefinitely.
        response = requests.post('https://accounts.spotify.com/api/token',
                                 headers=headers, data=data, timeout=timeout)
    except requests.RequestException as exc:
        print("Failed to get access token")
        print(exc)
        return None

    if response.status_code == 200:
        return response.json()['access_token']

    print("Failed to get access token")
    # Error bodies are not guaranteed to be JSON (e.g. a proxy's HTML error
    # page), so fall back to the raw text rather than raising while reporting.
    try:
        print(response.json())
    except ValueError:
        print(response.text)
    return None

# Step 3: Search for a track by name and artist
def search_track(track_name, artist_name, access_token, timeout=10):
    """Look up a track on Spotify by title and artist and return its first match.

    Args:
        track_name: Track title to search for.
        artist_name: Artist name to search for.
        access_token: Bearer token sent in the Authorization header.
        timeout: Seconds to wait for the search endpoint before giving up.

    Returns:
        A dict with the keys 'Track Name', 'Artist', 'Album', 'Release Date',
        'Duration (ms)', 'Popularity' and 'URI' describing the first result,
        or None when nothing matched or the request failed. Failures are
        reported with print() instead of raising.
    """
    headers = {'Authorization': f'Bearer {access_token}'}
    query = f"track:{track_name} artist:{artist_name}"
    params = {'q': query, 'type': 'track', 'limit': 1}

    try:
        # requests has no default timeout; without one a stalled connection
        # would block the notebook cell indefinitely.
        response = requests.get('https://api.spotify.com/v1/search',
                                headers=headers, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print("Failed to search for track")
        print(exc)
        return None

    if response.status_code != 200:
        print("Failed to search for track")
        # Error bodies are not guaranteed to be JSON, so fall back to the raw
        # text rather than raising while reporting the failure.
        try:
            print(response.json())
        except ValueError:
            print(response.text)
        return None

    items = response.json()['tracks']['items']
    if not items:
        print("No track found")
        return None

    track = items[0]  # limit=1, so this is the single best match
    return {
        'Track Name': track['name'],
        'Artist': track['artists'][0]['name'],
        'Album': track['album']['name'],
        'Release Date': track['album']['release_date'],
        'Duration (ms)': track['duration_ms'],
        'Popularity': track['popularity'],
        'URI': track['uri']
    }


In [None]:
# Step 4: Example usage - search for "Never Gonna Give You Up" by Rick Astley
# get_access_token returns None on failure, so the search only runs when a
# token was actually obtained; likewise the result is printed only when
# search_track returned a match rather than None.
access_token = get_access_token(CLIENT_ID, CLIENT_SECRET)
if access_token:
    track_info = search_track("Never Gonna Give You Up", "Rick Astley", access_token)
    if track_info:
        print(track_info)

{'Track Name': 'Never Gonna Give You Up', 'Artist': 'Rick Astley', 'Album': 'Whenever You Need Somebody', 'Release Date': '1987-11-12', 'Duration (ms)': 213573, 'Popularity': 78, 'URI': 'spotify:track:4PTG3Z6ehGkBFwjybzWkR8'}


# **MAIN FUNCTION TO UPDATE & SAVE TO DATABASE**
Set `json_file_path` to the path of the streaming-history JSON file you want to process.

In [7]:
# NOTE: json_to_csv is defined in the cell that follows this one in the
# notebook. On a fresh kernel, run that defining cell first; otherwise this
# call raises NameError.
json_file_path = "/content/spotify-listening-data/data/raw/Streaming_History_Audio_2013-2024.json"
json_to_csv(json_file_path)

Data successfully updated and saved to /content/spotify-listening-data/data/processed/spotify_data.csv


In [6]:
import hashlib
import os
import uuid

import pandas as pd
import requests
from ip2geotools.databases.noncommercial import DbIpCity


# Function to get location details from IP address
def get_location_from_ip(ip_address):
    """Resolve an IP address to a location record via DbIpCity.

    Returns a dict with the keys 'ip', 'city', 'region', 'country',
    'latitude' and 'longitude'. If anything goes wrong during the lookup, a
    dict holding a single 'error' message is returned instead of raising.
    """
    geo_fields = ("city", "region", "country", "latitude", "longitude")
    try:
        # Look the address up with the "free" API key
        lookup = DbIpCity.get(ip_address, api_key="free")

        # Copy the looked-up attributes into a plain dict, leading with the IP
        record = {"ip": ip_address}
        for field in geo_fields:
            record[field] = getattr(lookup, field)
        return record
    except Exception as exc:
        return {"error": f"An error occurred: {str(exc)}"}


# Source-export field -> processed-CSV column. Fields that keep their name
# (platform, ms_played, episode_name, episode_show_name, shuffle, skipped)
# need no entry here.
_COLUMN_RENAMES = {
    'ts': 'end_time_UTC',
    'conn_country': 'connection_country',
    'ip_addr_decrypted': 'ip_address',
    'master_metadata_track_name': 'track_name',
    'master_metadata_album_artist_name': 'album_artist_name',
    'master_metadata_album_album_name': 'album_name',
    'spotify_track_uri': 'track_uri',
    'spotify_episode_uri': 'episode_uri',
    'reason_start': 'start_reason',
    'reason_end': 'end_reason',
}

# Data columns kept in the CSV, in output order ('unique_id' is prepended).
_DATA_COLUMNS = [
    'end_time_UTC', 'platform', 'ms_played', 'connection_country',
    'ip_address', 'track_name', 'album_artist_name', 'album_name',
    'track_uri', 'episode_name', 'episode_show_name', 'episode_uri',
    'start_reason', 'end_reason', 'shuffle', 'skipped',
    # 'country', 'region', 'city', 'latitude', 'longitude'
]


def _normalise_for_id(value):
    """Render one cell as text for fingerprinting; missing values become ''."""
    if value is None or value is pd.NA:
        return ""
    if isinstance(value, float):
        if value != value:  # NaN is the only float that differs from itself
            return ""
        if value.is_integer():
            # A column that is int in one file may load as float in another
            # (when it contains nulls); make 1000 and 1000.0 hash the same.
            return str(int(value))
    return str(value)


def _deterministic_ids(frame):
    """Build one '<end_time_UTC>_<8 hex chars>' id per row from the row's content."""
    ids = []
    rows = frame[_DATA_COLUMNS].itertuples(index=False, name=None)
    for end_time, row in zip(frame['end_time_UTC'], rows):
        payload = "\x1f".join(_normalise_for_id(cell) for cell in row)
        digest = hashlib.sha256(payload.encode('utf-8')).hexdigest()[:8]
        ids.append(f"{end_time}_{digest}")
    return ids


def json_to_csv(json_file_path, csv_output_path='/content/spotify-listening-data/data/processed/spotify_data.csv'):
    """Merge a streaming-history JSON file into the processed CSV.

    The JSON is loaded, its columns are renamed to the CSV schema, and each
    row gets a 'unique_id' of the form '<end_time_UTC>_<8 hex chars>'. The id
    is a hash of the row's own values, so processing the same record again
    yields the same id and the duplicate is dropped. (Random ids, as used
    previously, never matched and every re-run appended the whole history
    again.) Rows already stored with an old random id are not recognised and
    will be added once more under their new deterministic id.

    Args:
        json_file_path: Path of the streaming-history JSON file to import.
        csv_output_path: CSV to create or update. Its parent directory is
            created when missing; a bare filename writes to the current
            working directory.

    Raises:
        KeyError: If the JSON lacks one of the expected source fields.
    """
    # Load the new JSON data and rename columns for consistency
    history = pd.read_json(json_file_path).rename(columns=_COLUMN_RENAMES)

    # IP geolocation enrichment is currently disabled:
    # # Get the location data and normalize it into separate columns
    # location_data = history['ip_address'].apply(lambda ip: get_location_from_ip(ip))
    # # Normalize the location data into separate columns
    # location_df = json_normalize(location_data)
    # # Merge the location data with the original DataFrame
    # history = pd.concat([history, location_df], axis=1)

    # Keep only the relevant columns and prepend the content-derived id
    new_data = history[_DATA_COLUMNS].copy()
    new_data.insert(0, 'unique_id', _deterministic_ids(new_data))

    # If the output CSV already exists, existing rows take precedence
    if os.path.exists(csv_output_path):
        existing_data = pd.read_csv(csv_output_path)
        combined = pd.concat([existing_data, new_data])
    else:
        combined = new_data
    merged_data = combined.drop_duplicates(subset=['unique_id'], keep='first')

    # Ensure the output directory exists. dirname is '' for a bare filename,
    # and os.makedirs('') raises FileNotFoundError, so skip it in that case.
    output_dir = os.path.dirname(csv_output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save updated data to CSV in the specified path
    merged_data.to_csv(csv_output_path, index=False)

    print(f"Data successfully updated and saved to {csv_output_path}")