In [1]:
import ast
import os
import tempfile

import boto3
import dask.dataframe as dd
import numpy as np
import pandas as pd

In [4]:
#Load some data
s3 = boto3.resource('s3', region_name='us-east-1')
bucket = s3.Bucket('song-feature-csvs')

#open the essentia file. This downloads the file locally to the instance
#so that it doesnt have to fit directly into memory
fname = 'essentia_combined_csv.csv'
if os.path.isfile(fname) == False:
    %sc
    !wget 'https://song-feature-csvs.s3.amazonaws.com/essentia_combined_csv.csv'

#Note I'm using dask here because this df is too damn big
df_essentia = dd.read_csv(fname, sample=1000000000, 
                          dtype={
                            'metadata_tags_contentgroup_0': object, 
                            'metadata_tags_discnumber_0':  object,
                            'metadata_tags_encodingtime_0': object,
                            'metadata_tags_engineer_0': object,
                            'metadata_tags_filetype_0': object,
                            'metadata_tags_taggingdate_0':object,
                            'metadata_tags_encoding_0': object, 
                            'metadata_tags_initialkey_0':object, 
                            'metadata_tags_isrc_0': object, 
                            'metadata_tags_label_0': object, 
                            'metadata_tags_lyricist_0': object,
                            'metadata_tags_albumartist_0': object,
                            'metadata_tags_composer_0': object,
                            'metadata_tags_copyright_0': object,
                            'metadata_tags_date_0': object,
                            'metadata_tags_date_1': object,
                            'metadata_tags_encodedby_0': object,
                            'metadata_tags_tracknumber_0': object,
                            'metadata_tags_artistwebpage_0': object,
                            'metadata_tags_audiosourcewebpage_0': object,
                            'metadata_tags_copyrighturl_0': object,
                            'metadata_tags_filewebpage_0': object,
                            'metadata_tags_originalartist_0': object,
                            'metadata_tags_originaldate_0': object,
                            'metadata_tags_originalfilename_0': object,
                            'metadata_tags_paymentwebpage_0': object,
                            'metadata_tags_publisherwebpage_0': object,
                            'metadata_tags_radiostationwebpage_0': object,
                            'metadata_tags_remixer_0': object,
                            'metadata_tags_conductor_0': object,
                            'metadata_tags_bpm_0': object,
                            'metadata_tags_genre_1': object,
                            'metadata_tags_musicip_puid_0': object,
                            'metadata_tags_albumartist_1': object,
                            'metadata_tags_albumartistsort_0': object,
                            'metadata_tags_albumsort_0': object,
                            'metadata_tags_artistsort_0': object,
                            'metadata_tags_date_2': object,
                            'metadata_tags_performer_0': object,
                            'metadata_tags_language_0': object,
                            'metadata_tags_musicbrainz_discid_0': object,
                            'metadata_tags_subtitle_0': object,
                            'metadata_tags_originalalbum_0': object,
                            'metadata_tags_originallyricist_0': object,
                            'metadata_tags_artist_1': object, 
                            'metadata_tags_media_0': object,
                            'metadata_tags_subtitle_0': object,
                            'metadata_tags_musicbrainz album release country_0': object,
                            'metadata_tags_musicbrainz album status_0': object,
                            'metadata_tags_musicbrainz album type_0': object,
                            'metadata_tags_musicbrainz_albumartistid_0': object,
                            'metadata_tags_musicbrainz_albumid_0': object,
                            'metadata_tags_musicbrainz_artistid_0': object,
                            'metadata_tags_musicbrainz_trackid_0': object,
                            'metadata_tags_owner_0': object,
                            'metadata_tags_titlesort_0': object,
                            'metadata_tags_releasedate_0': object,
                            'metadata_tags_tracktotal_0': object,
                            'metadata_tags_copyright_1': object,
                            'metadata_tags_genre_1': object,
                            'metadata_tags_discnumber_1': object,
                            'metadata_tags_discnumber_2': object,
                            'metadata_tags_discnumber_3': object,
                            'metadata_tags_radiostation_0': object,
                            'metadata_tags_radiostation_1': object,
                            'metadata_tags_asin_0': object,
                            'metadata_tags_totaldiscs_0': object,
                            'metadata_tags_mood_0': object,
                            'metadata_tags_license_0': object,
                            'metadata_tags_catalognumber_0': object,
                            'metadata_tags_script_0': object,
                            'metadata_tags_compilation_0': object,
                            'metadata_tags_tracknumber_1': object,
                            'metadata_tags_albumartist_1': object
                            }); 

In [5]:
#df_essentia.iloc[:, [7543]].columns

In [6]:
#for i in df_essentia.columns:
#    print(i)

In [7]:
fname = 'features.csv'
if os.path.isfile(fname) == False:
    %sc
    !wget 'https://song-feature-csvs.s3.amazonaws.com/features.csv'
    
preexisting_features = pd.read_csv(fname, header=[0,1], low_memory=True); 

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
fname = 'raw_tracks.csv'
if os.path.isfile(fname) == False:
    %sc
    !wget 'https://song-feature-csvs.s3.amazonaws.com/raw_tracks.csv' 
    
preexisting_tracks = pd.read_csv(fname); 

In [9]:
# Give all three tables a common join key, `song_id_prefix`.

# Essentia: the key is the first six characters of `song_id`.
df_essentia['song_id_prefix'] = df_essentia['song_id'].str[:6]

# Features: flatten the two-level header into 'level0_level1' names, drop the
# rows labelled 0 and 1, and rename the id column to the shared key name.
preexisting_features.columns = preexisting_features.columns.map('_'.join)
preexisting_features = (
    preexisting_features
    .drop([0, 1])
    .rename(columns={'feature_statistics': 'song_id_prefix'})
)

# Features / tracks: left-pad the ids with zeros to a width of six characters.
preexisting_features['song_id_prefix'] = (
    preexisting_features['song_id_prefix'].map('{:0>6}'.format)
)
preexisting_tracks['song_id_prefix'] = (
    preexisting_tracks['track_id'].map('{:0>6}'.format)
)

In [10]:
#Lets find nulls
#df_essentia.describe()

In [11]:
#preexisting_features.describe()

In [12]:
def _first_genre_title(raw_genres):
    """Return the title of the first genre listed in a `track_genres` cell.

    The cell is expected to hold the text of a Python literal: a list of dicts
    that each carry a 'genre_title' key. It is parsed with `ast.literal_eval`,
    which accepts literals only, so text coming from the CSV is never executed
    as code (the previous `eval` call would run it).

    Parameters:
        raw_genres: the raw cell value (string, NaN, ...).

    Returns:
        The first entry's 'genre_title', or `raw_genres` unchanged when the
        cell cannot be parsed or has no such entry (for example NaN or '[]').
    """
    try:
        return ast.literal_eval(str(raw_genres))[0]['genre_title']
    except (ValueError, SyntaxError, TypeError, IndexError, KeyError):
        # Same fallback as before, but only for the failures that parsing and
        # indexing can produce -- no bare `except` hiding unrelated errors.
        return raw_genres


# Add a `genre` column in one vectorised pass instead of assigning row by row
# through `iterrows()` / `.loc`.
preexisting_tracks['genre'] = preexisting_tracks['track_genres'].map(_first_genre_title)

In [13]:
# Inspect the distinct values of the new `genre` column; the recorded output
# includes NaN, i.e. some tracks carry no genre.
preexisting_tracks['genre'].unique()

array(['Hip-Hop', 'Pop', 'Experimental Pop', 'Loud-Rock', 'Avant-Garde',
       'Folk', 'Jazz', 'Punk', 'Post-Rock', 'Field Recordings', 'Lo-Fi',
       'Electronic', 'Rock', 'Metal', 'Noise', 'Post-Punk', 'Krautrock',
       'Blues', 'Jazz: Vocal', nan, 'Electroacoustic', 'Radio Art',
       'Reggae - Dub', 'Latin America', 'Disco', 'Experimental',
       'International', 'Drone', 'Psych-Folk', 'Free-Folk', 'Improv',
       'Spoken Weird', 'Noise-Rock', 'Psych-Rock', 'Audio Collage',
       'Singer-Songwriter', 'Electro-Punk', 'Indie-Rock',
       'Ambient Electronic', 'Industrial', 'No Wave', 'Classical',
       'Progressive', 'Free-Jazz', 'Garage', 'Interview', 'Poetry',
       'Bluegrass', 'Country', 'Balkan', 'Power-Pop', 'Hardcore',
       'Jazz: Out', 'Polka', 'African', 'French', 'Easy Listening',
       'Americana', 'Middle East', 'Old-Time / Historic',
       'Breakcore - Hard', 'Death-Metal', 'Sound Collage', 'Freak-Folk',
       'Spoken', 'Unclassifiable', 'British Folk', '

In [14]:
# Tracks: discard the listed columns, then every row without a genre, then
# every column that still contains a missing value.
preexisting_tracks = (
    preexisting_tracks
    .drop(
        [
            'track_id', 'artist_id', 'track_comments', 'track_disc_number',
            'track_favorites', 'track_instrumental', 'track_interest',
            'track_listens', 'track_number',
        ],
        axis='columns',
    )
    .dropna(subset=['genre'], axis='index')
    .dropna(axis='columns')
)

# Features: keep only the columns that have no missing values.
preexisting_features = preexisting_features.dropna(axis='columns')

In [15]:
# Sanity check: names of the track columns that still contain NaN.
preexisting_tracks.loc[:, preexisting_tracks.isna().any()].columns.tolist()

[]

In [16]:
# Wrap both pandas frames as single-partition dask frames so they can be
# merged with the dask-backed Essentia frame.
preexisting_features, preexisting_tracks = (
    dd.from_pandas(frame, npartitions=1)
    for frame in (preexisting_features, preexisting_tracks)
)

In [17]:
# Combine all three datasets on the shared `song_id_prefix` key. Both joins are
# inner joins, so only ids present in every table are kept. The key is passed
# once; it was previously listed twice in `on`.
result = dd.merge(df_essentia, preexisting_features, how='inner', on='song_id_prefix')
result = dd.merge(result, preexisting_tracks, how='inner', on='song_id_prefix')

In [18]:
# Split the merged columns by dtype. `genre` is the prediction target, so it is
# taken out of the categorical list (which is dropped from `result` later).
result_target = 'genre'
result_numerical = result.select_dtypes(include=[np.number]).columns.tolist()
result_categorical = result.select_dtypes(include=['O']).columns.tolist()
result_categorical.remove('genre')

In [19]:
#result_categorical

In [20]:
# Drop every categorical column except genre (genre was removed from
# `result_categorical` beforehand). A single drop call replaces the former
# one-call-per-column loop, so the dask task graph gains one layer, not one per
# dropped column.
result = result.drop(result_categorical, axis=1)

In [21]:
# Final column order: the remaining object-dtype columns first, then every
# numeric column recorded in `result_numerical`.
cols = [*result.select_dtypes(include=['O']).columns, *result_numerical]

In [30]:
# Write out the final dataset as one CSV file. It is not yet scaled and only
# includes the columns selected in `cols`.
# NOTE(review): the recorded output of this cell is a FileNotFoundError for
# '/tmp/final_dataset_new.csv/00.part', i.e. the write was attempted as
# per-partition files inside a directory of that name -- confirm that the
# installed dask version honours `single_file`.
# NOTE(review): the upload cell further down reads
# '/home/ec2-user/SageMaker/final_dataset.csv', not this path -- confirm which
# file is meant to be uploaded.
result[cols].to_csv('/tmp/final_dataset_new.csv', single_file = True)

FileNotFoundError: [Errno 2] No such file or directory: '/tmp/final_dataset_new.csv/00.part'

In [None]:
import botocore
import boto3
from boto3.s3.transfer import TransferConfig
from botocore.client import Config


#s3 multipart upload
def upload(source, dest, bucket_name):
    """Upload a local file to S3 with boto3's managed transfer.

    Files larger than the multipart threshold are sent as a multipart upload,
    using up to three worker threads.

    Parameters:
        source: path of the local file to upload.
        dest: object key to write inside the bucket.
        bucket_name: name of the destination S3 bucket.

    Raises:
        Whatever boto3 raises for the failed transfer. The error now propagates
        with its original type and traceback; it was previously re-raised as a
        plain `Exception`, so callers catching `Exception` still work.
    """
    # Sizes are in bytes. The former `1024*20` was 20 KiB, far below the 5 MiB
    # minimum S3 accepts for a multipart part; 20 MiB is the evident intent.
    mib = 1024 * 1024
    config = TransferConfig(multipart_threshold=20 * mib,
                            max_concurrency=3,
                            multipart_chunksize=20 * mib,
                            use_threads=True)

    s3 = boto3.client('s3')
    s3.upload_file(Filename=source, Bucket=bucket_name,
                   Key=dest, Config=config)

# Push the exported dataset to the 'song-feature-csvs' bucket under the key
# 'final_dataset.csv'.
# NOTE(review): the export cell writes '/tmp/final_dataset_new.csv', not this
# path -- confirm that this is the file that should be uploaded.
upload('/home/ec2-user/SageMaker/final_dataset.csv', 'final_dataset.csv', 'song-feature-csvs')        