In [1]:
import pandas as pd
import numpy as np
import glob

import matplotlib.pyplot as plt
import seaborn as sns

import time
TodaysDate = time.strftime("%Y-%m-%d")

import pickle

In [2]:
def save_list(list_name,output_folder,file_name):
    """Pickle an object to "<output_folder><file_name>_<YYYY-MM-DD>.txt".

    Input:
        list_name: the object (typically a list) to be saved.
        output_folder: path of the output folder where the file is written.
            It is concatenated directly with the file name, so it must end
            with a path separator.
        file_name: base name of the output file; the current date and the
            ".txt" extension are appended to it.

    Output:
        The path of the file that was written. Despite the ".txt" extension
        the file is a pickle (use function 'retrieve_saved_list' to open it).
    """
    # Stamp the file with the date of the call instead of relying on a
    # module-level variable defined in another notebook cell.
    date_stamp = time.strftime("%Y-%m-%d")
    output_path = output_folder + file_name + "_" + date_stamp + ".txt"

    with open(output_path, "wb") as fp:   # Pickling
        pickle.dump(list_name, fp)

    # Returning the path lets callers reload the file without rebuilding the
    # dated file name by hand.
    return output_path

In [3]:
def retrieve_saved_list(input_folder, file_name):
    """Unpickle (retrieve) an object that was saved in a .txt using pickle.

    Input:
        input_folder: path of the folder holding the pickled .txt; it is
            concatenated directly with file_name.
        file_name: name of the .txt file, extension included.

    Output: the unpickled object (a list when the file was written by
        'save_list' from a list).

    Note: pickle can execute arbitrary code while loading, so only open
    files that come from a trusted source.
    """
    pickle_path = input_folder + file_name

    with open(pickle_path, "rb") as handle:   # Unpickling
        return pickle.load(handle)

In [4]:
def generate_list_csv_file_in_folder(file_name,folder,partial_name=''):
    """ Generate a sorted list of the csv files in a folder and pickle it in the same folder

    Input:
        file_name: name with which the list will be saved.
        folder: path where the .csv files are and where the list generated will be saved.
            It is concatenated directly with the search pattern, so it must end
            with a path separator.
        partial_name (optional argument): leading part of the file names, to restrict
            the list to the files whose name starts with it.

    Output:
        None. A .txt file is created using pickle (via 'save_list'), consisting of
        the path and name of the matching csv files.
    """
    # glob returns its matches in a file-system dependent order; sorting makes
    # the saved list deterministic, which matters when it is later paired by
    # position with other lists built from the same folder.
    files = sorted(glob.glob(folder + partial_name + '*.csv'))

    # save list using pickle
    save_list(files, folder, file_name)
    

In [5]:
def generate_list_df_from_csv_in_folder(file_name,folder,partial_name=''):
    """ Read the csv files of a folder into dataframes and pickle the list of dataframes in the same folder

    Input:
        file_name: name with which the list will be saved.
        folder: path where the .csv files are and where the list generated will be saved.
            It is concatenated directly with the search pattern, so it must end
            with a path separator.
        partial_name (optional argument): leading part of the file names, to restrict
            the list to the files whose name starts with it.

    Output:
        None. A .txt file is created using pickle (via 'save_list'), consisting of
        one dataframe per matching csv file, in sorted path order.
    """
    # glob returns its matches in a file-system dependent order; sorting keeps
    # the dataframes in a deterministic order (the sorted order of their paths).
    files = sorted(glob.glob(folder + partial_name + '*.csv'))

    # read every csv with pandas defaults
    list_df = [pd.read_csv(csv_path) for csv_path in files]

    # save list using pickle
    save_list(list_df, folder, file_name)
    

In [6]:
def create_df_label_info(list_paths,list_dfs,output_folder):
    """ Create dataframe with categories and fractions of label = 0 and label = 1

    Input:
        list_paths: list of csv paths, one per category. The category id is the
            second-to-last "_"-separated token of the file name, e.g.
            "...\\tracks_repeated_seq_category_afro_2019-02-21.csv" -> "afro".
        list_dfs: list of dataframes, in the same order as list_paths; each one
            must have a 'label' column.
        output_folder: folder where the summary csv is written. It is concatenated
            directly with the file name, so it must end with a path separator.

    Output:
        dataframe with the columns "category_id", "Percentage_once_sequences"
        (fraction of rows with label 0) and "Percentage_repeating_sequences"
        (fraction of rows with label 1), both rounded to 3 decimals. The same
        dataframe is also saved as "label_info_<YYYY-MM-DD>.csv" in output_folder.

    Raises:
        ValueError: if list_paths and list_dfs do not have the same length.
    """
    if len(list_paths) != len(list_dfs):
        raise ValueError(
            "list_paths and list_dfs must have the same length "
            "(got %d and %d)" % (len(list_paths), len(list_dfs))
        )

    # list with the name of the categories
    list_categories = [path.split('\\')[-1].split('_')[-2] for path in list_paths]

    # fraction of 0's in label (i.e. sequences that occur only once) and
    # fraction of 1's in label (i.e. repeating sequences)
    list_label_0 = []
    list_label_1 = []
    for df in list_dfs:
        # computed once per dataframe; .get() falls back to 0.0 when a label
        # class is absent instead of raising KeyError
        label_shares = df.label.value_counts(normalize = True)
        list_label_0.append(round(label_shares.get(0, 0.0), 3))
        list_label_1.append(round(label_shares.get(1, 0.0), 3))

    # Create a dataframe
    df_label_info = pd.DataFrame({"category_id": list_categories, "Percentage_once_sequences":list_label_0, "Percentage_repeating_sequences":list_label_1})

    # save it in csv, stamped with the date of the call
    date_stamp = time.strftime("%Y-%m-%d")
    filename = "label_info_" + date_stamp + ".csv"
    df_label_info.to_csv(output_folder + filename, index = False)

    return df_label_info

In [7]:
def plot_dist_categories_features(category_id, category_id_df, output_folder):
    """ Save a grid of density plots of the features of one category

    Input:
        category_id: name of the category; used in the figure title and in the
            name of the image file.
        category_id_df: dataframe of this category; it must have the columns
            listed in 'features' below.
        output_folder: folder where the image is written. It is concatenated
            directly with the file name, so it must end with a path separator.

    Output:
        None. The figure is saved as
        "density_plots_category_<category_id>_<TodaysDate>.png" in output_folder
        (TodaysDate is the module-level date string) and then closed.
    """
    # plotted in this order, filling the 3 x 5 grid row by row
    features = (
        'acousticness', 'danceability', 'energy', 'valence', 'tempo',
        'instrumentalness', 'key', 'liveness', 'loudness', 'speechiness',
        'label',
    )

    fig, axs = plt.subplots(3, 5, figsize = (15,8))
    fig.subplots_adjust(left=0.2, wspace=0.3, hspace = 0.4)

    # title
    fig.suptitle("Density plots for category "+ category_id)

    # NOTE(review): sns.distplot is reported as deprecated in recent seaborn
    # releases; kept here so the plots do not change - confirm the installed
    # version before upgrading.
    axes = axs.ravel()
    for feature, ax in zip(features, axes):
        sns.distplot(category_id_df[feature], ax=ax)

    # removing unnecessary subplots
    for ax in axes[len(features):]:
        ax.set_axis_off()

    # saving image in a file
    fig.savefig(output_folder + "density_plots_category_"+category_id+"_" + TodaysDate +".png")

    # close this figure explicitly so figures do not pile up when the function
    # is called in a loop
    plt.close(fig)

#     plt.show()

In [8]:
# Generate a list of paths of csv files in a folder
#
# Side effect only: the call below pickles the list of matching csv paths into
# <folder><file_name>_<date>.txt (see 'generate_list_csv_file_in_folder' and
# 'save_list').
# NOTE(review): 'folder' is an absolute path on the author's machine and is
# repeated in several cells - consider defining it once.

folder = "C:/Master/THESIS/Analysis_datasets/data/SpotifyCategories/processed_data/playlists_with_repeated_seqs_2_tracks/category_analysis/classification/"
# only files whose name starts with this prefix are listed
partial_name = "tracks_repeated_seq_category_"
file_name = 'list_category_with_repeated_sequences'
generate_list_csv_file_in_folder(file_name,folder,partial_name)

In [9]:
# Generate a list of dataframes in a folder
#
# Side effect only: every matching csv is read with pandas and the resulting
# list of dataframes is pickled into <folder><file_name>_<date>.txt (see
# 'generate_list_df_from_csv_in_folder' and 'save_list').

folder = "C:/Master/THESIS/Analysis_datasets/data/SpotifyCategories/processed_data/playlists_with_repeated_seqs_2_tracks/category_analysis/classification/"
# same prefix as for the list of paths, so both lists cover the same files
partial_name = "tracks_repeated_seq_category_"
file_name = 'list_dfs_category_with_repeated_sequences'
generate_list_df_from_csv_in_folder(file_name,folder,partial_name)

In [10]:
# Opening list of paths of files from categories with repeated sequences
#
# NOTE(review): the file name pins the 2019-02-22 snapshot, while 'save_list'
# suffixes the files it writes with the current date. On a later re-run this
# cell therefore loads the older file, not the one just written - confirm that
# this is intended.

folder = "C:/Master/THESIS/Analysis_datasets/data/SpotifyCategories/processed_data/playlists_with_repeated_seqs_2_tracks/category_analysis/classification/"
input_folder = folder
file_name = "list_category_with_repeated_sequences_2019-02-22.txt"
list_paths = retrieve_saved_list(input_folder, file_name)

In [11]:
# Preview the first three stored paths
list_paths[:3]

['C:/Master/THESIS/Analysis_datasets/data/SpotifyCategories/processed_data/playlists_with_repeated_seqs_2_tracks/category_analysis/classification\\tracks_repeated_seq_category_afro_2019-02-21.csv',
 'C:/Master/THESIS/Analysis_datasets/data/SpotifyCategories/processed_data/playlists_with_repeated_seqs_2_tracks/category_analysis/classification\\tracks_repeated_seq_category_arab_2019-02-21.csv',
 'C:/Master/THESIS/Analysis_datasets/data/SpotifyCategories/processed_data/playlists_with_repeated_seqs_2_tracks/category_analysis/classification\\tracks_repeated_seq_category_blues_2019-02-21.csv']

In [12]:
# Opening list with dataframes of files from categories with repeated sequences
#
# 'folder' is reused from the cell that loads the list of paths, so that cell
# must have been run first.
# NOTE(review): as for the list of paths, the file name pins the 2019-02-22
# snapshot - confirm that this is intended.

input_folder = folder
file_name = "list_dfs_category_with_repeated_sequences_2019-02-22.txt"
list_dfs = retrieve_saved_list(input_folder, file_name)

In [13]:
# Preview the first rows of the first dataframe in the list
list_dfs[0].head()

Unnamed: 0,current_track-next_track,acousticness,danceability,energy,valence,tempo,instrumentalness,key,liveness,loudness,speechiness,label
0,1noPA8QfOmSEurS2PekBsp_5JEw4FrNlWmHxgBkw0jbEj,0.124,0.057,0.008,0.057,9.405,0.372902,3.0,0.0147,2.816,0.0334,0
1,5JEw4FrNlWmHxgBkw0jbEj_1KpBtWSI9dlv0RjtzvF1BD,0.349,0.017,0.321,0.021,2.027,0.003922,5.0,0.0437,0.542,0.0421,0
2,1KpBtWSI9dlv0RjtzvF1BD_1qNz5rynw3I9LU8uQRETsO,0.5146,0.027,0.12,0.018,17.023,0.01338,9.0,0.0724,4.66,0.03,0
3,1qNz5rynw3I9LU8uQRETsO_5fpoDuxvBBNy69mgzIMMrI,0.07033,0.013,0.055,0.006,14.983,0.017387,11.0,0.0044,3.284,0.054,0
4,5fpoDuxvBBNy69mgzIMMrI_7dr9gDHQ4rsLLUWy0pzVpR,0.10093,0.02,0.049,0.106,14.866,1.4e-05,3.0,0.013,1.885,0.17,0


In [14]:
# Generate and save a dataframe showing the percentage of label = 0 and label = 1
#
# 'create_df_label_info' returns the summary and also writes it to
# <output_folder>label_info_<date>.csv. The values are fractions rounded to
# 3 decimals.

output_folder = "C:/Master/THESIS/Analysis_datasets/data/SpotifyCategories/processed_data/playlists_with_repeated_seqs_2_tracks/category_analysis/classification/"

df_label = create_df_label_info(list_paths,list_dfs,output_folder)

In [15]:
# Preview the first rows of the label summary
df_label.head()

Unnamed: 0,category_id,Percentage_once_sequences,Percentage_repeating_sequences
0,afro,0.997,0.003
1,arab,0.993,0.007
2,blues,0.998,0.002
3,chill,0.999,0.001
4,country,0.998,0.002


In [16]:
# TODO: still haven't found a good way to save a dataframe as an image; the commented-out attempt below uses pandas.plotting.table

# from pandas.plotting import table

# ax = plt.subplot(111, frame_on=False) # no visible frame
# ax.xaxis.set_visible(False)  # hide the x axis
# ax.yaxis.set_visible(False)  # hide the y axis

# table(ax, df_label)  # where df is your data frame

# plt.savefig('mytable.png')

In [17]:
# Preparing the inputs needed to plot the feature distributions of every
# category which has repeated sequences, so the plotting function can be
# applied to all of them in one loop

# folder where the images will be saved
output_folder = "C:/Master/THESIS/Analysis_datasets/data/SpotifyCategories/processed_data/playlists_with_repeated_seqs_2_tracks/category_analysis/classification/images/"

# category_id of each file: second-to-last "_"-separated token of the file name
list_categories = [csv_path.split('\\')[-1].split('_')[-2] for csv_path in list_paths]

# dictionary where (key, value) = (category_id, dataframe of this category_id);
# the pairing is by position, so list_paths and list_dfs must be in the same order
dic_categories = {
    category: category_frame
    for category, category_frame in zip(list_categories, list_dfs)
}

In [18]:
# Creating distribution plots of the features for all categories and saving the images
#
# One png per category is written to 'output_folder'; nothing is displayed
# inline because the plotting function closes each figure after saving it.

for category_id, category_df  in dic_categories.items():
    plot_dist_categories_features(category_id, category_df, output_folder)
