Danielle Paes Barretto de Arruda Camara

**VERSION: 30-07-19 (review)**

Concatenate the .csv files containing track information: either all files of a single category, or all files of every category.

**Input**: 

* .csv files with track information (it is enough to provide the folder containing the files (`input_folder`) and the `category_id`, e.g. 'afro')

**Output**:

* .csv file with tracks information of all playlists in a category (e.g. tracks_audio_features_playlists_category_jazz_2019-06-02.csv)

# Import necessary libraries

In [1]:
import glob
import os
import time

import pandas as pd
TodaysDate = time.strftime("%Y-%m-%d")

# Defined Folders

In [2]:
# Folder containing the track .csv files that are read and concatenated.
input_folder = "./data/NEW_DATA/tracks/"
# Folder where the concatenated .csv files are written.
output_folder = "./data/NEW_DATA/tracks_playlists_category_concatenated/"

# Function to concatenate csv files per category

In [3]:
def concatenate_category_csv(input_folder,output_folder,filename, category_id =''):
    """Concatenate the .csv files of one category (or of every category) into one file.

    Every file in input_folder whose name matches '*<category_id>*.csv' is read,
    the rows are stacked, and the result is saved in output_folder as
    '<filename><category_id>_<YYYY-MM-DD>.csv' ('ALL' replaces the category id
    when none is given).

    input:
        input_folder: path of the folder containing the .csv files to concatenate
        output_folder: path of the folder that receives the concatenated .csv
            (created if it does not exist yet)
        filename: prefix of the name of the resulting .csv file
        category_id: category id (optional). When empty, every .csv file in
            input_folder is concatenated.

    output:
        DataFrame with the concatenated rows and a fresh 0..n-1 index

    raises:
        ValueError: if no .csv file matches the pattern

    NOTE(review): category_id is matched as a substring of the file name, so
    e.g. 'pop' also matches names containing 'kpop' - confirm the file naming
    scheme rules this out.
    """

    # Join with os.path so the folder works with or without a trailing separator.
    pattern = os.path.join(input_folder, '*' + category_id + '*.csv')

    # glob returns files in arbitrary order; sort so the row order is reproducible.
    csv_files = sorted(glob.glob(pattern))

    if not csv_files:
        # pd.concat would raise a ValueError with no hint of the cause;
        # raise the same exception type with the offending pattern instead.
        raise ValueError("No .csv files found matching " + repr(pattern))

    # ignore_index=True already yields a clean RangeIndex, no reset_index needed.
    df_concat = pd.concat(
        [pd.read_csv(csv_file) for csv_file in csv_files],
        axis=0,
        ignore_index=True,
    )

    # Saving the dataframe of concatenated dataframes
    label = category_id if category_id != '' else 'ALL'
    TodaysDate = time.strftime("%Y-%m-%d")
    file_name = filename + label + '_' + TodaysDate + ".csv"

    if output_folder:
        os.makedirs(output_folder, exist_ok=True)
    df_concat.to_csv(os.path.join(output_folder, file_name), index=False)

    print("files concatenated and saved!")

    return df_concat

# Test: Concatenating one chosen category

In [4]:
filename = "tracks_audio_features_playlists_category_"
category_id = 'afro'
df_concat_afro = concatenate_category_csv(input_folder,output_folder,filename,category_id)

files concatenated and saved!


In [5]:
df_concat_afro.head()

Unnamed: 0,category_id,playlist_name,playlist_id,track_id,track_uri,track_preview_url,track_name,track_popularity,artist_name,album_name,...,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
0,afro,Afro Party Anthems!,37i9dQZF1DWSjibQnF0bUk,1noPA8QfOmSEurS2PekBsp,spotify:track:1noPA8QfOmSEurS2PekBsp,https://p.scdn.co/mp3-preview/a14919775aa749fd...,Nwa Baby (Ashawo Remix),48,Flavour,Best Of Flavour,...,262347,0.973,0.373,1,0.113,0.074,1,0.0743,100.625,0.959
1,afro,Afro Party Anthems!,37i9dQZF1DWSjibQnF0bUk,5JEw4FrNlWmHxgBkw0jbEj,spotify:track:5JEw4FrNlWmHxgBkw0jbEj,https://p.scdn.co/mp3-preview/7f4efca8c472d405...,Gongo Aso,40,9ice,Gongo Aso,...,224307,0.965,9.8e-05,4,0.0983,-2.742,1,0.0409,110.03,0.902
2,afro,Afro Party Anthems!,37i9dQZF1DWSjibQnF0bUk,1KpBtWSI9dlv0RjtzvF1BD,spotify:track:1KpBtWSI9dlv0RjtzvF1BD,https://p.scdn.co/mp3-preview/66e9251c3be09d5b...,"Soco (feat. Wizkid, Ceeza Milli, Spotless & Te...",66,"Starboy,WizKid,Ceeza Milli,Spotless,Terri","Soco (feat. Wizkid, Ceeza Milli, Spotless & Te...",...,255608,0.644,0.00402,9,0.142,-3.284,0,0.083,108.003,0.881
3,afro,Afro Party Anthems!,37i9dQZF1DWSjibQnF0bUk,1qNz5rynw3I9LU8uQRETsO,spotify:track:1qNz5rynw3I9LU8uQRETsO,https://p.scdn.co/mp3-preview/c6b7d76dd65b5991...,Iskaba,59,"Wande Coal,DJ Tunez",Iskaba,...,224680,0.764,0.0174,0,0.0696,-7.944,0,0.053,125.026,0.899
4,afro,Afro Party Anthems!,37i9dQZF1DWSjibQnF0bUk,5fpoDuxvBBNy69mgzIMMrI,spotify:track:5fpoDuxvBBNy69mgzIMMrI,https://p.scdn.co/mp3-preview/a96b995808190ae3...,Lori Le,34,X Project,Turn It Up,...,243017,0.819,1.3e-05,11,0.0652,-4.66,0,0.107,140.009,0.893


In [6]:
len(df_concat_afro.playlist_name.unique())

77

# Concatenating all categories' data

In [7]:
filename = "tracks_audio_features_playlists_category_"
df_all = concatenate_category_csv(input_folder,output_folder,filename,category_id ='')

files concatenated and saved!


In [8]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114434 entries, 0 to 114433
Data columns (total 22 columns):
category_id          114434 non-null object
playlist_name        114434 non-null object
playlist_id          114434 non-null object
track_id             114434 non-null object
track_uri            114434 non-null object
track_preview_url    79655 non-null object
track_name           114416 non-null object
track_popularity     114434 non-null int64
artist_name          114420 non-null object
album_name           114416 non-null object
acousticness         114434 non-null float64
danceability         114434 non-null float64
duration_ms          114434 non-null int64
energy               114434 non-null float64
instrumentalness     114434 non-null float64
key                  114434 non-null int64
liveness             114434 non-null float64
loudness             114434 non-null float64
mode                 114434 non-null int64
speechiness          114434 non-null float64
tempo  

In [9]:
df_all.columns

Index(['category_id', 'playlist_name', 'playlist_id', 'track_id', 'track_uri',
       'track_preview_url', 'track_name', 'track_popularity', 'artist_name',
       'album_name', 'acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'valence'],
      dtype='object')

In [10]:
selected_columns = ['category_id', 'album_name', 'artist_name', 'playlist_name', 'playlist_id','track_id',
       'track_name','acousticness', 'danceability', 'energy', 'valence', 'tempo',
       'instrumentalness', 'key', 'mode','liveness', 'loudness', 'speechiness']

Number of unique values for each feature:

In [11]:
# Count the distinct values of every selected feature within each category.
# Built as one chain (no inplace mutation) so re-running the cell is idempotent.
df_info_categories = (
    df_all[selected_columns]
    .groupby("category_id")
    .nunique()
    # Some pandas versions return the grouping key as a data column as well
    # (the original cell dropped it explicitly); errors="ignore" makes the drop
    # a no-op when the column is absent.
    .drop(columns=["category_id"], errors="ignore")
    # Turn the group key back into a regular column, named 'category_id'.
    .reset_index()
)
df_info_categories

Unnamed: 0,category,album_name,artist_name,playlist_name,playlist_id,track_id,track_name,acousticness,danceability,energy,valence,tempo,instrumentalness,key,mode,liveness,loudness,speechiness
0,afro,2331,2029,77,77,3494,3360,1520,622,771,839,3184,1822,12,2,1013,2966,1005
1,arab,1363,1076,33,33,1576,1554,974,558,664,728,1496,798,12,2,712,1468,757
2,blues,834,617,15,15,911,875,677,441,583,541,901,738,12,2,480,885,482
3,chill,6162,4738,97,97,7401,6663,1805,912,1443,1390,6617,2577,12,2,1030,5954,950
4,classical,5202,5124,117,117,6538,6382,628,931,1781,1409,6279,1702,12,2,1012,5652,898
5,country,1452,857,39,39,2220,2067,1189,521,754,777,2141,980,12,2,717,1993,563
6,decades,2666,1821,48,48,3405,3127,1797,668,822,855,3304,1705,12,2,996,2969,774
7,desi,1870,2070,63,63,2786,2686,1314,654,751,857,2543,1402,12,2,924,2381,899
8,dinner,1119,998,20,20,1218,1204,747,557,761,693,1201,782,12,2,514,1182,550
9,dutch,944,592,38,38,1737,1624,1111,580,712,781,1691,897,12,2,686,1577,588


In [12]:
# Read back the "ALL" file to check what was saved. The name is rebuilt from
# `filename` and `TodaysDate` (the same prefix/date scheme used when saving)
# instead of a hard-coded date, so the cell still works when the notebook is
# re-run on another day.
# NOTE(review): to inspect an older snapshot, put its date here explicitly.
df_all_concat = pd.read_csv(output_folder + filename + "ALL_" + TodaysDate + ".csv")
df_all_concat.head()

Unnamed: 0,category_id,playlist_name,playlist_id,track_id,track_uri,track_preview_url,track_name,track_popularity,artist_name,album_name,...,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
0,afro,Afro Party Anthems!,37i9dQZF1DWSjibQnF0bUk,1noPA8QfOmSEurS2PekBsp,spotify:track:1noPA8QfOmSEurS2PekBsp,https://p.scdn.co/mp3-preview/a14919775aa749fd...,Nwa Baby (Ashawo Remix),48,Flavour,Best Of Flavour,...,262347,0.973,0.373,1,0.113,0.074,1,0.0743,100.625,0.959
1,afro,Afro Party Anthems!,37i9dQZF1DWSjibQnF0bUk,5JEw4FrNlWmHxgBkw0jbEj,spotify:track:5JEw4FrNlWmHxgBkw0jbEj,https://p.scdn.co/mp3-preview/7f4efca8c472d405...,Gongo Aso,40,9ice,Gongo Aso,...,224307,0.965,9.8e-05,4,0.0983,-2.742,1,0.0409,110.03,0.902
2,afro,Afro Party Anthems!,37i9dQZF1DWSjibQnF0bUk,1KpBtWSI9dlv0RjtzvF1BD,spotify:track:1KpBtWSI9dlv0RjtzvF1BD,https://p.scdn.co/mp3-preview/66e9251c3be09d5b...,"Soco (feat. Wizkid, Ceeza Milli, Spotless & Te...",66,"Starboy,WizKid,Ceeza Milli,Spotless,Terri","Soco (feat. Wizkid, Ceeza Milli, Spotless & Te...",...,255608,0.644,0.00402,9,0.142,-3.284,0,0.083,108.003,0.881
3,afro,Afro Party Anthems!,37i9dQZF1DWSjibQnF0bUk,1qNz5rynw3I9LU8uQRETsO,spotify:track:1qNz5rynw3I9LU8uQRETsO,https://p.scdn.co/mp3-preview/c6b7d76dd65b5991...,Iskaba,59,"Wande Coal,DJ Tunez",Iskaba,...,224680,0.764,0.0174,0,0.0696,-7.944,0,0.053,125.026,0.899
4,afro,Afro Party Anthems!,37i9dQZF1DWSjibQnF0bUk,5fpoDuxvBBNy69mgzIMMrI,spotify:track:5fpoDuxvBBNy69mgzIMMrI,https://p.scdn.co/mp3-preview/a96b995808190ae3...,Lori Le,34,X Project,Turn It Up,...,243017,0.819,1.3e-05,11,0.0652,-4.66,0,0.107,140.009,0.893


In [13]:
len(df_all_concat.playlist_name[df_all_concat.category_id=='afro'].unique())

77

## Generating one file per category     

In [14]:
list_categories = df_all_concat.category_id.unique().tolist()

In [15]:
list_categories

['afro',
 'arab',
 'blues',
 'chill',
 'classical',
 'country',
 'decades',
 'desi',
 'dinner',
 'dutch',
 'edm_dance',
 'focus',
 'funk',
 'gaming',
 'hiphop',
 'indie_alt',
 'jazz',
 'kids',
 'kpop',
 'latin',
 'metal',
 'mood',
 'party',
 'pop',
 'punk',
 'reggae',
 'rnb',
 'rock',
 'romance',
 'roots',
 'sessions',
 'sleep',
 'soul',
 'summer',
 'toplists',
 'travel',
 'workout']

In [16]:
len(list_categories)

37

In [17]:
def creating_csv_per_category(df_all, category_id, filename_prefix=None, out_dir=None):
    """Write the rows of one category to their own .csv file.

    input:
        df_all: DataFrame with a 'category_id' column
        category_id: value of 'category_id' whose rows are exported
        filename_prefix: prefix of the output file name (optional). Defaults to
            the notebook-level variable `filename`.
        out_dir: destination folder (optional, created if it does not exist).
            Defaults to the notebook-level variable `output_folder`.

    output:
        None. The file '<prefix><category_id>_<YYYY-MM-DD>.csv' is written to
        the destination folder.
    """
    # Fall back to the notebook globals only when no explicit value is given,
    # which keeps existing two-argument calls working unchanged.
    if filename_prefix is None:
        filename_prefix = filename
    if out_dir is None:
        out_dir = output_folder

    df_cat = df_all[df_all['category_id'] == category_id]

    TodaysDate = time.strftime("%Y-%m-%d")
    file_name = filename_prefix + category_id + '_' + TodaysDate + ".csv"

    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # os.path.join works whether or not out_dir ends with a separator.
    df_cat.to_csv(os.path.join(out_dir, file_name), index=False)

In [18]:
for category_id in list_categories:
    creating_csv_per_category(df_all, category_id)

In [19]:
# Read back the per-category file for 'pop' as a spot check. The name is rebuilt
# from `filename` and `TodaysDate` (the same prefix/date scheme used when saving)
# instead of a hard-coded date, so the cell still works when the notebook is
# re-run on another day.
df_test = pd.read_csv(output_folder + filename + "pop_" + TodaysDate + ".csv")
df_test.head()

Unnamed: 0,category_id,playlist_name,playlist_id,track_id,track_uri,track_preview_url,track_name,track_popularity,artist_name,album_name,...,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
0,pop,Girls Only,0cS6eQ6y8jki1Khi5F2TxG,6LsAAHotRLMOHfCsSfYCsz,spotify:track:6LsAAHotRLMOHfCsSfYCsz,,If I Can't Have You,95,Shawn Mendes,If I Can't Have You,...,190800,0.809,0.0,2,0.147,-4.198,1,0.0602,123.911,0.864
1,pop,Girls Only,0cS6eQ6y8jki1Khi5F2TxG,3M9Apu4OZfylLTFKvgEtKa,spotify:track:3M9Apu4OZfylLTFKvgEtKa,,Ritual,68,"Tiësto,Jonas Blue,Rita Ora",Ritual,...,198996,0.726,0.0,3,0.0844,-4.389,0,0.0552,114.996,0.767
2,pop,Girls Only,0cS6eQ6y8jki1Khi5F2TxG,5PYQUBXc7NYeI1obMKSJK0,spotify:track:5PYQUBXc7NYeI1obMKSJK0,,Never Really Over,74,Katy Perry,Never Really Over,...,223523,0.883,0.0,8,0.303,-4.736,1,0.0688,99.987,0.36
3,pop,Girls Only,0cS6eQ6y8jki1Khi5F2TxG,4tNXntkAzQ5A2dfYRYGIIQ,spotify:track:4tNXntkAzQ5A2dfYRYGIIQ,,Easier,85,5 Seconds of Summer,Easier,...,157493,0.433,0.0,2,0.0996,-5.533,0,0.147,87.997,0.614
4,pop,Girls Only,0cS6eQ6y8jki1Khi5F2TxG,3zR5cAShCz1ugKIWXGTjI3,spotify:track:3zR5cAShCz1ugKIWXGTjI3,,Parents,67,YUNGBLUD,Parents,...,172027,0.833,0.0,9,0.3,-4.287,0,0.0507,82.041,0.599


In [20]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3932 entries, 0 to 3931
Data columns (total 22 columns):
category_id          3932 non-null object
playlist_name        3932 non-null object
playlist_id          3932 non-null object
track_id             3932 non-null object
track_uri            3932 non-null object
track_preview_url    2507 non-null object
track_name           3932 non-null object
track_popularity     3932 non-null int64
artist_name          3932 non-null object
album_name           3932 non-null object
acousticness         3932 non-null float64
danceability         3932 non-null float64
duration_ms          3932 non-null int64
energy               3932 non-null float64
instrumentalness     3932 non-null float64
key                  3932 non-null int64
liveness             3932 non-null float64
loudness             3932 non-null float64
mode                 3932 non-null int64
speechiness          3932 non-null float64
tempo                3932 non-null float64
valence

In [21]:
len(df_test.playlist_id.unique())

43