# Imports

#### TODO: package functions
#### TODO: clean up imports

In [8]:
# IMPORTS 
# TODO: move most functions into utility modules, along with their required imports

import os, pickle, re, requests
import pandas as pd
from bs4 import BeautifulSoup

import spotipy
import spotipy.util as util
import spotipy.oauth2 as oauth2
from dotenv import load_dotenv

import functools

import matplotlib.pyplot as plt
import seaborn as sns
import librosa.display
import IPython.display as ipd

from src.obtain.spotify_metadata import generate_token

import random
import librosa
import numpy as np

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

In [4]:
from pymongo import MongoClient
from pprint import pprint

# Functions

In [5]:
# FUNCTIONS TO GET GENRES AND PLAYLIST LINKS FROM EVERY-NOISE-AT-ONCE

def load_or_make(creator):
    """
    Decorator that caches the result of `creator` in a pickle file.

    The decorated function takes one extra, leading positional argument,
    `filepath`. If a file exists at `filepath`, its unpickled contents are
    returned and `creator` is not called. Otherwise `creator(*args, **kwargs)`
    is called, its result is pickled to `filepath`, and the result is returned.

    Inputs:
    - creator: function that builds the data when it is not already pickled

    Outputs:
    - cached_creator: wrapper with signature (filepath, *args, **kwargs);
      `filepath` is consumed by the wrapper and is not forwarded to `creator`
    """
    @functools.wraps(creator)
    def cached_creator(filepath, *args, **kwargs):
        if os.path.isfile(filepath):
            # NOTE: unpickling can execute arbitrary code; only point this at
            # cache files that this project wrote itself.
            with open(filepath, 'rb') as pkl:
                return pickle.load(pkl)

        # Create the cache directory before doing the (possibly slow) work,
        # so a missing directory cannot throw the freshly built result away.
        parent_dir = os.path.dirname(filepath)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        item = creator(*args, **kwargs)
        with open(filepath, 'wb') as pkl:
            pickle.dump(item, pkl)
        return item
    return cached_creator

@load_or_make
def scrape_all_links(domain, path, target_pattern):
    """
    Collect every link on one page whose href matches a regex.

    Because of the `load_or_make` decorator, callers pass the cache file path
    as an extra first argument, ahead of the parameters listed here.

    Inputs:
    - domain: domain of the website to scrape
    - path: path of the page to fetch from `domain`
    - target_pattern: regex selecting which hrefs to keep

    Outputs:
    - target_urls: list of matching links, each prefixed with http://domain/

    Raises:
    - ConnectionError: if the page does not respond with HTTP status 200
    """
    page_url = '/'.join(['http:/', domain, path])
    page = requests.get(page_url)
    if page.status_code != 200:
        raise ConnectionError(f"Failed to connect to {page_url}.")

    href_regex = re.compile(target_pattern)
    anchors = BeautifulSoup(page.text, "lxml").find_all('a', attrs={'href': href_regex})

    # Build the absolute form of each matching href on the same domain.
    target_urls = []
    for anchor in anchors:
        target_urls.append('/'.join(['http:/', domain, anchor['href']]))
    return target_urls

@load_or_make
def scrape_links_from_each_page(urls, target_pattern, labeler=(lambda x:x)):
    """
    Loops over a list of urls and collects the links on each page that match a target pattern.

    Because of the `load_or_make` decorator, callers pass the cache file path
    as an extra first argument, ahead of the parameters listed here.

    Inputs:
    - urls: the list of urls to scrape links from
    - target_pattern: regex that specifies the types of links you want to collect
    - labeler: function that parses a url and returns a label for that page
      (defaults to using the url itself as the label)

    Outputs:
    - links: a dictionary with key/value pairs {url_label:[scraped_links]};
      if two urls produce the same label, the later page's links overwrite
      the earlier ones

    Raises:
    - ConnectionError: if any page does not respond with HTTP status 200
    """
    # The pattern is the same for every page, so compile it once up front.
    target_regex = re.compile(target_pattern)
    links = {}

    for url in urls:
        response = requests.get(url)
        label = labeler(url)

        if response.status_code != 200:
            raise ConnectionError(f"Failed to connect to {url}.")

        soup = BeautifulSoup(response.text, "lxml")
        links[label] = [x['href'] for x in soup.find_all('a', {'href':target_regex})]

    return links

In [6]:
# FUNCTIONS TO GET PLAYLIST METADATA FROM SPOTIFY

import os
import pandas as pd

def get_tags(track):
    '''
    Flatten a Spotify track object into a dictionary of tags.

    `track` is the nested dictionary for a single track, e.g. the value
    stored under the 'track' key of an entry in a playlist's
    tracks['items'] list.

    Only the first listed artist is kept, the duration is converted from
    milliseconds to whole seconds, and a missing ISRC becomes ''. The
    'cover_art_url' key is present only when the album has at least one
    image, in which case the first image's url is used.
    '''
    tags = dict(
        id=track['id'],
        album=track['album']['name'],
        track=track['track_number'],
        title=track['name'],
        artist=track['artists'][0]['name'],
        duration=int(track['duration_ms']/1000),
        preview_mp3=track['preview_url'],
        is_explicit=track['explicit'],
        isrc_number=track['external_ids'].get('isrc', ''),
        release_date=track['album']['release_date'],
    )
    images = track['album']['images']
    if images:
        tags['cover_art_url'] = images[0]['url']
    return tags

def build_metadata_df(tracks, client):
    """
    Build a DataFrame of tags plus audio features for one page of playlist tracks.

    Inputs:
    - tracks: one page of a playlist's tracks; entries of tracks['items']
      whose 'track' value is empty are skipped
    - client: object exposing `audio_features(list_of_track_ids)`
      (in this file, a spotipy.Spotify instance)

    Outputs:
    - metadata_df: one row per track that has both tags and audio features,
      joined on the track 'id'. An empty DataFrame is returned when the page
      has no usable tracks or no audio features come back.
    """
    metadata = [get_tags(item['track']) for item in tracks['items'] if item['track']]
    metadata_df = pd.DataFrame(metadata)
    if metadata_df.empty:
        # Nothing to look up; without this, metadata_df['id'] raises KeyError.
        return metadata_df

    # Tracks without an id cannot be looked up, so leave them out of the request.
    track_ids = metadata_df['id'].dropna().tolist()
    if not track_ids:
        return metadata_df.iloc[0:0]

    # add more features from the tracks' audio features JSON
    features = client.audio_features(track_ids) or []
    # The response may hold None for tracks that have no audio features;
    # drop those so the DataFrame constructor only sees dictionaries.
    features = [feature for feature in features if feature]
    if not features:
        return metadata_df.iloc[0:0]

    features_df = pd.DataFrame(features)
    # Name the join key explicitly rather than relying on whichever columns
    # the two frames happen to share.
    metadata_df = pd.merge(metadata_df, features_df, on='id')

    return metadata_df

def download_playlist_metadata(user, pid, pname, client):
    """
    Download metadata for every track in a playlist, following pagination.

    Inputs:
    - user: owner of the playlist
    - pid: playlist id
    - pname: playlist name (accepted but not used by this function)
    - client: object exposing `user_playlist` and `next`
      (in this file, a spotipy.Spotify instance)

    Outputs:
    - metadata: DataFrame concatenating the per-page results of
      build_metadata_df, with a fresh 0..n-1 index
    """
    page = client.user_playlist(user, pid, fields="tracks,next")['tracks']
    frames = [build_metadata_df(page, client)]

    # Keep requesting pages until the current one reports no successor.
    while page['next']:
        page = client.next(page)
        frames.append(build_metadata_df(page, client))

    return pd.concat(frames).reset_index(drop=True)

def parse_sos_pid(playlists):
    """
    Return the playlist id of the first 'thesoundsofspotify' link.

    The id is taken to be the text after the last '/' of the link.
    Raises IndexError when no link in `playlists` contains
    'thesoundsofspotify'.
    """
    sos_ids = []
    for link in playlists:
        if 'thesoundsofspotify' in link:
            sos_ids.append(link.rsplit('/', 1)[-1])
    return sos_ids[0]

def download_all_genres_metadata(genre_playlists, client):
    """
    Download and save playlist metadata for every genre not yet on disk.

    Inputs:
    - genre_playlists: dictionary mapping genre -> list of playlist links
    - client: client object passed through to download_playlist_metadata

    For each genre, the metadata of its 'thesoundsofspotify' playlist is
    written as a tab-separated file at
    data/genre_metadata/{genre}_metadata.tsv. Genres whose file already
    exists are skipped.
    """
    for genre, playlist_links in genre_playlists.items():
        filepath = f'data/genre_metadata/{genre}_metadata.tsv'
        if not os.path.isfile(filepath):
            metadata = download_playlist_metadata(
                user='thesoundsofspotify',
                pid=parse_sos_pid(playlist_links),
                pname=genre,
                client=client,
            )
            metadata.to_csv(filepath, sep='\t', index=False)

In [12]:
def run_data_pipeline(token):
    """
    Run the scrape-and-download pipeline end to end.

    - scrape genre page urls from everynoise.com/engenremap.html,
        cached as a list in data/everynoise/everynoise_genre_urls.pkl

    - scrape genre playlist urls from each genre page on everynoise.com,
        cached as a dictionary in data/everynoise/thesoundsofspotify_playlist_urls.pkl

    - download playlist metadata for each genre's playlist from Spotify
        (via download_all_genres_metadata), saved as TSV files in
        data/genre_metadata/[genre]_metadata.tsv

    Inputs:
    - token: auth token passed to spotipy.Spotify(auth=token)

    TODO: download audio_analysis files for each song in a list of playlists
        (not necessarily all playlists because there are 100s of 1000s in the full set);
        this step is not implemented here.
    TODO: include a progress indicator?
    """
    # The first positional argument is the cache file consumed by the
    # load_or_make decorator; the keyword arguments go to the scraper itself.
    genre_urls = scrape_all_links(
        'data/everynoise/everynoise_genre_urls.pkl',
        domain='everynoise.com',
        # scrape_all_links names this parameter `path`; passing `index`
        # raised TypeError whenever the cache file did not exist yet.
        path='engenremap.html',
        target_pattern='engenremap-[a-z]*')

    genre_playlists = scrape_links_from_each_page(
        'data/everynoise/thesoundsofspotify_playlist_urls.pkl',
        urls=genre_urls,
        # Label each page by the text between the last '-' and the first
        # following '.' of the url's final path component.
        labeler=(lambda url: url.split('/')[-1].split('-')[-1].split('.')[0]),
        target_pattern='open.spotify.com')

    sp = spotipy.Spotify(auth=token)

    download_all_genres_metadata(genre_playlists, sp)
       

In [8]:
# Obtain a token for this user and run the full pipeline; inside
# run_data_pipeline the token is handed to spotipy.Spotify(auth=token).
token=generate_token(username='djconxn')
run_data_pipeline(token)

# Output locations, as hard-coded in run_data_pipeline and download_all_genres_metadata:
# - all genre urls from Every Noise: a list in 'data/everynoise/everynoise_genre_urls.pkl'
# - all genres' playlist links: a dictionary in 'data/everynoise/thesoundsofspotify_playlist_urls.pkl'
# - each genre's playlist metadata: 'data/genre_metadata/{genre}_metadata.tsv'
# NOTE(review): earlier notes here pointed at 'data/raw/...' and 'data/interim/genre_metadata/...';
# confirm which directory layout is intended.

In [39]:
# Reload the pickled genre -> playlist-links dictionary.
# NOTE(review): run_data_pipeline caches this dictionary at
# 'data/everynoise/thesoundsofspotify_playlist_urls.pkl', not under 'data/raw/';
# confirm that this path points at the intended file.
with open('data/raw/thesoundsofspotify_playlist_urls.pkl', 'rb') as urls:
    genre_playlists = pickle.load(urls)
    

# Create Mongo Database

#### TODO: Use this data schema instead.

Old Schema:
- `data/genre_metadata/{genre}_metadata.tsv` : DataFrame of Spotify metadata for each song in genre playlist.

New Schema:
- `data/genre_playlists/{genre}.txt` : list of song IDs (one per line) in each genre's playlist.
- `data/song_metadata/{song_id}.json` : JSON file containing that song's metadata.

~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ 

Each song's sample mp3 may be found in `data/mp3s/{song_id}.mp3`.

Each song's Spotify metadata may be found in `data/song_metadata/{song_id}.json`.

Note that not all mp3s have been downloaded.

There are over 500K songs across the 2900 Sounds Of Spotify genre playlists. These playlists are updated over time, so refreshing them still needs to be built into this pipeline.

In [None]:
# Convert the old schema (one metadata TSV per genre) into the new schema:
# one text file of song ids per genre and one JSON metadata file per song.
mp3s_path = "data/mp3s"
metadata_path = 'data/genre_metadata'
song_md_path = 'data/song_metadata'
genre_lists_path = 'data/genre_playlists'

metadata_files = os.listdir(metadata_path)

# Make sure the output directories exist before anything is written to them.
for out_dir in (genre_lists_path, song_md_path):
    os.makedirs(out_dir, exist_ok=True)

for md_file in metadata_files:

    try:
        meta_df = pd.read_csv(os.path.join(metadata_path, md_file), sep='\t')
    except (OSError, ValueError) as err:
        # Unreadable, empty or malformed files are reported and skipped.
        # pandas' parser errors and decoding errors are ValueError subclasses;
        # a bare `except:` would also have swallowed KeyboardInterrupt.
        print(md_file, repr(err))
        continue

    # Write playlist song id's in a text file (one id per line);
    # rows with a missing id are left out so the join cannot fail on NaN.
    genre_name = md_file.replace('_metadata.tsv', '')
    genre_list_path = os.path.join(genre_lists_path, genre_name) + '.txt'
    if not os.path.isfile(genre_list_path):
        with open(genre_list_path, 'w') as file:
            file.write('\n'.join(meta_df['id'].dropna().astype(str)))

    # Write each song's metadata to a json file, skipping songs already written
    for i in meta_df.index:
        row = meta_df.loc[i]
        if pd.isna(row['id']):
            # No id means no usable file name (it would become "nan.json").
            continue
        json_file = "{}.json".format(row['id'])
        json_path = os.path.join(song_md_path, json_file)
        if not os.path.isfile(json_path):
            row.to_json(json_path)
