In [1]:
from pathlib import Path
import pandas as pd
import pickle

In [2]:
# Raw data directory: 'data/raw' under the parent of the current working directory.
raw_path = Path.cwd().parent.joinpath('data/raw')

In [3]:
# File names of every weekly pickle in raw_path. Path.glob yields entries in
# arbitrary, filesystem-dependent order, so sort the names to make any
# order-sensitive processing of this list reproducible between runs.
dw_files = sorted(f.name for f in raw_path.glob('*discover-weekly.pkl'))

In [14]:
def _drop_dw_duplicates(df):
    '''Drop rows that match an earlier row in every column except 'popularity'.'''
    # popularity can change (future project?), so it is excluded from the
    # comparison; errors='ignore' keeps this working if the column is absent
    subset = list(df.columns.drop('popularity', errors='ignore'))
    return df.drop_duplicates(subset=subset)


def build_combined_dw_df(raw_path, dw_files):
    '''
    Build, or incrementally extend, the dataset of all discover weekly playlists.

    raw_path - directory (pathlib.Path) containing the weekly dw pickles and,
               once it has been built, the combined 'dw_combined.pkl'
    dw_files - iterable of weekly dw pickle file names, relative to raw_path

    Returns the combined DataFrame in which rows that are identical in every
    column other than 'popularity' are reduced to their first occurrence.

    When raw_path / 'dw_combined.pkl' does not exist, every file in dw_files
    is concatenated and the result is written to raw_path / 'dw_combined.pkl'.
    When it does exist, it is loaded and only the weekly files whose latest
    date is later than the latest date already present are appended; nothing
    is written in that case, so the caller decides whether to persist it.

    Dates are the first 10 characters of the 'time_added' strings and are
    compared as text (presumably an ISO 'YYYY-MM-DD' prefix - confirm).

    NOTE: pd.read_pickle can execute arbitrary code while loading; only use
    this on pickle files from a trusted source.
    '''
    combined_path = raw_path / 'dw_combined.pkl'

    if not combined_path.exists():
        # combine all existing discover weekly datasets
        df = pd.concat([pd.read_pickle(raw_path / f) for f in dw_files])
        df = _drop_dw_duplicates(df)
        # store next to the weekly files, which is where the check above looks
        df.to_pickle(combined_path)
        return df

    # add only new files: load existing dataset
    df = pd.read_pickle(combined_path)
    # latest date already present ('' sorts before any date string)
    existing_dates = df['time_added'].str[:10].dropna()
    latest_date = '' if existing_dates.empty else existing_dates.max()

    # if a file has a date greater than latest in existing dataset, add it
    new_frames = []
    for f in dw_files:
        temp_df = pd.read_pickle(raw_path / f)
        playlist_dates = temp_df['time_added'].str[:10].dropna()
        if playlist_dates.empty:
            continue  # nothing to compare, nothing to add
        # compare scalars: a file may hold more than one distinct date
        if playlist_dates.max() > latest_date:
            new_frames.append(temp_df)

    if new_frames:
        # single concat instead of growing the frame inside the loop
        df = pd.concat([df, *new_frames])

    return _drop_dw_duplicates(df)

In [19]:
# Build the combined discover weekly dataset, then store it as
# 'dw_combined.pkl' inside raw_path.
df = build_combined_dw_df(raw_path=raw_path, dw_files=dw_files)
df.to_pickle(raw_path.joinpath('dw_combined.pkl'))