Danielle Paes Barretto

**version: 30/07/19 (last review)**


This notebook processes the 'Playlist level' .csv file and saves the processed data to a new .csv file.

Input:

* participants_playlists.csv


Output:

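* participants_playlists_processed_YYYY-MM-DD.csv (the suffix is the date on which the notebook is run, e.g. participants_playlists_processed_2019-07-04.csv)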


P.S.: "Processed" means that participants not used in the analysis (the rows for playlists 1, 2 and 3) are removed and the remaining playlists are renamed in a more meaningful way.


# Libraries

In [1]:
import pandas as pd
import time
# Run date in local time, formatted as YYYY-MM-DD; used as the suffix of the
# output file name.
TodaysDate = time.strftime("%Y-%m-%d", time.localtime())

# Folder

In [2]:
# Both folders sit under the same experiment data root.
_data_root = "./data/DATA_USER_EXPERIMENT/"

# Folder the input .csv is read from.
folder_data_generated = _data_root + "data_generated/"
# Folder the processed .csv is written to.
folder_data_processed = _data_root + "data_processed/"

# Loading data

In [3]:
# Read the playlist-level input file. The participant id and the two flag
# columns are loaded as 'category'.
#
# `on_bad_lines="warn"` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0 (where it raises a TypeError).
# Malformed rows are still skipped with a warning instead of aborting the load.
# Requires pandas >= 1.3.
df_playlists = pd.read_csv(
    folder_data_generated + "participants_playlists.csv",
    on_bad_lines="warn",
    dtype={
        'participant_id': 'category',
        'is_track_known': 'category',
        'is_participant_favorite': 'category',
    },
)
df_playlists.head()

Unnamed: 0,participant_id,playlist,avg_flow_score,is_track_known,is_participant_favorite
0,1,2,0.42,0,0
1,1,1,0.39,1,1
2,1,3,0.3,1,0
3,2,2,0.5,0,0
4,2,3,0.22,1,1


In [4]:
# Row count, column dtypes and non-null counts of the frame as loaded,
# before any rows are removed.
df_playlists.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132 entries, 0 to 131
Data columns (total 5 columns):
participant_id             132 non-null category
playlist                   132 non-null object
avg_flow_score             132 non-null float64
is_track_known             132 non-null category
is_participant_favorite    132 non-null category
dtypes: category(3), float64(1), object(1)
memory usage: 4.2+ KB


In [5]:
# Drop every row whose playlist is '1', '2' or '3' (category 'mood');
# rows for all other playlists are kept.
is_mood_playlist = df_playlists['playlist'].isin(['1', '2', '3'])
df_playlists = df_playlists[~is_mood_playlist]

In [6]:
# Preview the first rows that remain after the playlist filter.
df_playlists.head()

Unnamed: 0,participant_id,playlist,avg_flow_score,is_track_known,is_participant_favorite
6,3,4B,0.46,1,0
7,3,5B,0.49,1,1
8,3,6B,0.58,1,0
9,4,4B,0.6,0,0
10,4,6B,0.47,1,0


In [7]:
# renaming playlist in a more meaningful way
#
# The result of `replace` is bound back to the frame instead of calling
# `df_playlists['playlist'].replace(..., inplace=True)`. An in-place call on a
# column selection is chained assignment: pandas 2.2 warns about it, and under
# Copy-on-Write (the default from pandas 3.0) it leaves `df_playlists`
# unchanged.
#
# `replace` with a dict matches whole cell values, so the key '4' does not
# touch '4B'. Values that are not keys of the mapping are left as they are.
playlist_labels = {
    '4B': 'low_mood_low_tempo_not_extr',
    '5B': 'low_mood_high_tempo_not_extr',
    '6B': 'high_mood_low_tempo_not_extr',
    '4': 'low_mood_low_tempo_extr',
    '5': 'low_mood_high_tempo_extr',
    '6': 'high_mood_low_tempo_extr',
}
df_playlists = df_playlists.assign(
    playlist=df_playlists['playlist'].replace(playlist_labels)
)

In [8]:
# Preview the first rows to check the renamed playlist labels.
df_playlists.head()

Unnamed: 0,participant_id,playlist,avg_flow_score,is_track_known,is_participant_favorite
6,3,low_mood_low_tempo_not_extr,0.46,1,0
7,3,low_mood_high_tempo_not_extr,0.49,1,1
8,3,high_mood_low_tempo_not_extr,0.58,1,0
9,4,low_mood_low_tempo_not_extr,0.6,0,0
10,4,high_mood_low_tempo_not_extr,0.47,1,0


In [9]:
# Row count, column dtypes and non-null counts after filtering and renaming.
df_playlists.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 105 entries, 6 to 131
Data columns (total 5 columns):
participant_id             105 non-null category
playlist                   105 non-null object
avg_flow_score             105 non-null float64
is_track_known             105 non-null category
is_participant_favorite    105 non-null category
dtypes: category(3), float64(1), object(1)
memory usage: 4.4+ KB


In [10]:
# Write the processed frame to .csv. The file name carries the run date and
# the DataFrame index is not written.
output_name = "participants_playlists_processed_" + TodaysDate + ".csv"
df_playlists.to_csv(folder_data_processed + output_name, index=False)

# Create DataFrame with the name and data type of each feature

In [11]:
# Summary table: one row per column of the processed frame, with the column
# name next to its dtype.
list_features = list(df_playlists.columns)
feature_types = list(df_playlists.dtypes)

# Column order is fixed in the constructor: name first, dtype second.
df = pd.DataFrame(
    {"Data Type": feature_types, "Name Feature": list_features},
    columns=["Name Feature", "Data Type"],
)
df

Unnamed: 0,Name Feature,Data Type
0,participant_id,category
1,playlist,object
2,avg_flow_score,float64
3,is_track_known,category
4,is_participant_favorite,category
