Danielle Paes Barretto de Arruda Camara

**VERSION: 30-07-19 (last review)**

Generate dataframe containing all possible combinations of 2-track sequences considering tracks of a **playlist**. 

A 2-track sequence that occurs consecutively within the playlist receives is_good_sequence = 1; every other 2-track sequence receives is_good_sequence = 0.

# Importing Libraries

In [1]:
import pandas as pd
import glob

import time
# Current local date as "YYYY-MM-DD"; it is appended to the name of each output CSV file.
TodaysDate = time.strftime("%Y-%m-%d")

import itertools

# Folders

In [2]:
# Folder scanned for the input "*.csv" files; each file found is processed separately.
input_folder = "./data/PREVIOUS_DATA/tracks_playlists_category_concatenated/"
# Folder where the resulting "df_diff_category_<category_id>_<date>.csv" files are written.
output_folder = "./data/PREVIOUS_DATA/diff_tracks_050619/"

# Functions

## Creating a dataframe with the absolute difference of the audio features of 2 consecutive tracks, including the column is_good_sequence = 1.

In [3]:
def create_df_consecutive_diff(df):
    """ Build the positive examples: every pair of consecutive tracks of a playlist.

    Duplicated track_ids are removed first (the first occurrence is kept), so
    "consecutive" refers to the order of the de-duplicated track list.

    Input:
        df: dataframe containing track information. It must hold a 'track_id'
            column and the audio feature columns listed below.

    Output: dictionary with two entries
        'df_diff': dataframe with one row per pair of consecutive tracks. Columns:
            'current_track-next_track' ("<current track_id>_<next track_id>"),
            the absolute difference of each audio feature ('diff_acousticness',
            'diff_danceability', 'diff_energy', 'diff_valence', 'diff_tempo',
            'diff_instrumentalness', 'diff_key', 'diff_mode', 'diff_liveness',
            'diff_loudness', 'diff_speechiness') and 'is_good_sequence' (always 1).
        'two_track_sequence_list': list with the 'current_track-next_track' labels.

    """

    audio_features = ['acousticness', 'danceability', 'energy', 'valence', 'tempo',
                      'instrumentalness', 'key', 'mode', 'liveness', 'loudness',
                      'speechiness']

    # the output columns hold differences, not raw features, hence the 'diff_' prefix
    diff_names = {feature: 'diff_' + feature for feature in audio_features}

    # one row per track, indexed by track_id, in order of first appearance
    per_track = df[['track_id'] + audio_features].drop_duplicates(subset='track_id', keep='first')
    per_track = per_track.set_index('track_id')
    ordered_ids = per_track.index.tolist()

    # row i minus row i-1; the first row has no predecessor and is discarded
    step_diff = per_track.diff().abs().iloc[1:]
    step_diff = step_diff.rename(columns=diff_names).reset_index(drop=True)

    # label each row with the two tracks it compares
    current_ids = pd.Series(ordered_ids[:-1])
    next_ids = pd.Series(ordered_ids[1:])
    step_diff.insert(0, 'current_track-next_track', current_ids + '_' + next_ids)

    # every sequence built here occurs in the playlist
    step_diff['is_good_sequence'] = 1

    return {'df_diff': step_diff,
            'two_track_sequence_list': step_diff['current_track-next_track'].tolist()}


## Create a dataframe of all possible 2-track sequences built from the tracks of the playlist that do not occur consecutively in the playlist. Includes the column is_good_sequence = 0

In [4]:
def create_df_diff_not_in_playlist(df, category_id, list_playlist):
    """
    Create dataframe with all 2-track sequences that are NOT in the playlist with absolute difference of
    audio features including column is_good_sequence = 0 (2-track sequence not in playlist).

    Every ordered pair of two distinct tracks of df is a candidate; candidates whose
    "<current track_id>_<next track_id>" label is listed in list_playlist are discarded.
    Duplicated track_ids are removed first (the first occurrence is kept).

    Input:

        df: dataframe containing track information. It must hold a 'track_id' column and the
            audio feature columns 'acousticness', 'danceability', 'energy', 'valence', 'tempo',
            'instrumentalness', 'key', 'mode', 'liveness', 'loudness' and 'speechiness'.
        category_id: category id (e.g. 'afro'). Not used by the computation; kept so that
            existing calls keep working.
        list_playlist: collection of "<current track_id>_<next track_id>" labels of the 2-track
            sequences that occur in the playlist (e.g. the 'two_track_sequence_list' entry
            returned by create_df_consecutive_diff).

    Output: dataframe with one row per 2-track sequence which does not occur in the playlist.
            Columns: 'current_track-next_track', the absolute difference of each audio feature
            ('diff_acousticness', 'diff_danceability', 'diff_energy', 'diff_valence',
            'diff_tempo', 'diff_instrumentalness', 'diff_key', 'diff_mode', 'diff_liveness',
            'diff_loudness', 'diff_speechiness') and 'is_good_sequence' (always 0).
            When no sequence is left (e.g. fewer than 2 tracks) an empty dataframe with the
            same columns is returned.

    """

    feature_columns = ['acousticness', 'danceability', 'energy', 'valence', 'tempo',
                       'instrumentalness', 'key', 'mode', 'liveness', 'loudness',
                       'speechiness']

    # one row per track, indexed by track_id, in order of first appearance
    df_audio_features = df[['track_id'] + feature_columns].drop_duplicates(subset='track_id', keep='first')
    df_audio_features = df_audio_features.set_index('track_id')
    track_ids = df_audio_features.index.tolist()

    # a set gives constant-time membership tests; scanning a list for every one of the
    # n*(n-1) candidates made the filtering step cubic in the number of tracks
    sequences_in_playlist = set(list_playlist)

    # Walk all ordered pairs of distinct tracks and keep the ones absent from the playlist.
    # Row positions are recorded next to the label so the two track_ids never have to be
    # recovered by splitting the label (which would fail for ids containing '_').
    labels = []
    current_rows = []
    next_rows = []
    for (current_pos, current_id), (next_pos, next_id) in itertools.permutations(enumerate(track_ids), 2):
        label = current_id + '_' + next_id
        if label not in sequences_in_playlist:
            labels.append(label)
            current_rows.append(current_pos)
            next_rows.append(next_pos)

    # Absolute difference of the audio features for all kept pairs at once
    feature_values = df_audio_features.to_numpy()
    abs_diff = abs(feature_values[current_rows] - feature_values[next_rows])

    # the columns hold differences, not raw audio features, hence the 'diff_' prefix
    df_diff = pd.DataFrame(abs_diff, columns=['diff_' + name for name in feature_columns])
    df_diff.insert(0, 'current_track-next_track', labels)

    # Add is_good_sequence column (is_good_sequence = 0, since none of these sequences belongs to the playlist)
    df_diff['is_good_sequence'] = 0

    return df_diff

## Applying the functions, concatenating df_diff_in_playlist and df_diff_not_in_playlist, and saving the result as CSV

In [10]:
# df_test.head()

In [11]:
# Process every CSV file of the input folder: build the sequences that occur in the
# playlist (is_good_sequence = 1) and the ones that do not (is_good_sequence = 0),
# stack them and write one output CSV per input file.
csv_files = glob.glob(input_folder+'*.csv')

for csv_file in csv_files:
    df = pd.read_csv(csv_file)

    # category_id is the second-to-last "_"-separated token of the file name
    # (taken from the part right before the last ".")
    base_name = csv_file.split('/')[-1]
    stem = base_name.split('.')[-2]
    category_id = stem.split('_')[-2]
    print(category_id)  # progress: category started

    # sequences that occur in the playlist, plus their labels
    result = create_df_consecutive_diff(df)
    df_diff_in_playlist = result['df_diff']
    list_playlist = result['two_track_sequence_list']

    # every other sequence that can be built from the same tracks
    df_diff_not_in_playlist = create_df_diff_not_in_playlist(df,category_id, list_playlist)

    # the raw input is no longer needed
    del df

    # stack both groups; ignore_index already yields a fresh 0..n-1 index
    df_diff = pd.concat([df_diff_in_playlist,df_diff_not_in_playlist],axis=0,ignore_index=True)

    # save the result as csv
    file_name = f"df_diff_category_{category_id}_{TodaysDate}.csv"
    df_diff.to_csv(output_folder+file_name, index = False)

    del df_diff
    print(category_id)  # progress: category finished


rock
rock
romance
romance
rootspart1
rootspart1
rootspart2
rootspart2
rootspart3
rootspart3
sessions
sessions
sleep
sleep
soul
soul
toplists
toplists
travel
travel
workout
workout
