In [1]:
import pandas as pd
import numpy as np
import zipfile
import warnings
# NOTE(review): called without a category or module filter, this suppresses
# every warning for the whole session, including pandas' SettingWithCopyWarning
# and FutureWarning. Consider narrowing it to the specific warnings to hide.
warnings.filterwarnings("ignore")

In [3]:
# Read the chart data from the zipped CSV.
# Both the archive and the member stream are opened through context managers
# so their file handles are released as soon as the frame has been parsed.
with zipfile.ZipFile('../notebooks/data/complete_100.csv.zip') as zf:
    with zf.open('complete_100.csv') as csv_file:
        complete = pd.read_csv(csv_file, index_col=0)

In [4]:
# Keep only the entries that reached the top 10. An explicit copy is taken so
# the column assignments below modify an independent frame instead of a slice
# of `complete` (which would trigger SettingWithCopyWarning).
top_ten = complete[complete['Week Position'] <= 10].copy()

# Replace each chart date by its decade, e.g. 1962 => 1960
top_ten['WeekID'] = (pd.to_datetime(top_ten['WeekID']).dt.year // 10) * 10

# Turn the string representation of the genre list into a Python list.
# Only the surrounding brackets are stripped, so any quote characters around
# a genre remain part of the resulting strings.
top_ten['spotify_genre'] = top_ten['spotify_genre'].apply(
    lambda raw: str(raw).strip('][').split(', ')
)

# One row per (chart entry, genre)
audio_genre = top_ten.explode("spotify_genre")

# Count the occurrences of each genre within each decade
df_genre = (
    audio_genre
    .groupby("WeekID")["spotify_genre"]
    .value_counts()
    .to_frame('counts')
)

In [5]:
# Reduce a composite genre label to a single main genre
def main_genre(x):
    """Return the first main-genre keyword contained in ``x``.

    The keywords are tried in a fixed priority order and the test is plain
    containment (``keyword in x``), so for a string such as "punk rock" the
    result is "rock" because "rock" is tried before "punk".

    Parameters
    ----------
    x : str
        Composite genre label to classify.

    Returns
    -------
    str or float
        The matching keyword, or ``np.nan`` when no keyword is contained in ``x``.
    """
    # Order matters: earlier keywords win over later ones.
    keywords_by_priority = (
        "pop", "rock", "rap", "jazz",
        "r&b", "hip hop", "house", "metal",
        "country", "folk", "soul", "disco",
        "blues", "punk", "funk", "electro",
    )
    for keyword in keywords_by_priority:
        if keyword in x:
            return keyword
    return np.nan
    

In [6]:
# Collapse every composite genre label to its main genre
# (labels that match no main genre become NaN)
audio_genre["spotify_genre"] = audio_genre["spotify_genre"].map(
    lambda genre: main_genre(str(genre))
)

In [7]:
# Keep only the columns of interest, drop rows with any missing value among
# them, then keep a single row per (song, main genre, decade) combination
df_main_genre = (
    audio_genre[[
        'WeekID', 'SongID',
        'danceability', 'energy', 'loudness', 'liveness',
        'acousticness', 'valence', 'tempo',
        'spotify_genre',
    ]]
    .dropna()
    .drop_duplicates(subset=['SongID', 'spotify_genre', 'WeekID'])
)


In [8]:
# Restrict to decades before 2020, then split out the two genres studied:
# rock and pop
df_main_genre = df_main_genre.loc[df_main_genre["WeekID"] < 2020]
df_rock = df_main_genre.loc[df_main_genre["spotify_genre"] == "rock"]
df_pop = df_main_genre.loc[df_main_genre["spotify_genre"] == "pop"]

In [9]:
# Compute per-decade statistics for the two genre dataframes

# Mean of every numeric feature per (decade, genre).
# numeric_only=True keeps non-numeric columns out of the aggregation
# (SongID is presumably a string identifier - verify); without it,
# pandas >= 2.0 raises a TypeError when such a column is present.
df_pop_stats = df_pop.groupby(['WeekID', 'spotify_genre']).mean(numeric_only=True)
df_rock_stats = df_rock.groupby(['WeekID', 'spotify_genre']).mean(numeric_only=True)

# Scaling bounds for loudness and tempo: the observed range of the decade
# means, widened by one unit on each side.
max_loud_pop = df_pop_stats.loudness.max() + 1
min_loud_pop = df_pop_stats.loudness.min() - 1

max_tempo_pop = df_pop_stats.tempo.max() + 1
min_tempo_pop = df_pop_stats.tempo.min() - 1

max_loud_rock = df_rock_stats.loudness.max() + 1
min_loud_rock = df_rock_stats.loudness.min() - 1

max_tempo_rock = df_rock_stats.tempo.max() + 1
min_tempo_rock = df_rock_stats.tempo.min() - 1

In [10]:
# Min-max scale loudness and tempo, the two features whose raw values are not
# already between 0 and 1. Each genre is scaled with its own bounds.
# NOTE(review): this overwrites the stats in place, so running the cell a
# second time rescales values that were already scaled.
df_pop_stats["loudness"] = (
    (df_pop_stats["loudness"] - min_loud_pop) / (max_loud_pop - min_loud_pop)
)
df_pop_stats["tempo"] = (
    (df_pop_stats["tempo"] - min_tempo_pop) / (max_tempo_pop - min_tempo_pop)
)

df_rock_stats["loudness"] = (
    (df_rock_stats["loudness"] - min_loud_rock) / (max_loud_rock - min_loud_rock)
)
df_rock_stats["tempo"] = (
    (df_rock_stats["tempo"] - min_tempo_rock) / (max_tempo_rock - min_tempo_rock)
)

In [11]:
# Build the final tables: one row per feature, one column per decade.
# `cols` maps the source column names to the labels used in the output; its
# key order is also the column order selected from the stats frames.
cols = {
    'WeekID': 'group',
    'danceability': 'Danceability',
    'energy': 'Energy',
    'acousticness': 'Acousticness',
    'loudness': 'Loudness',
    'valence': 'Valence',
    'tempo': 'Tempo',
}

df_pop_final = (
    df_pop_stats
    .reset_index()[list(cols)]
    .rename(columns=cols)
    .set_index('group')
    .T
)
df_rock_final = (
    df_rock_stats
    .reset_index()[list(cols)]
    .rename(columns=cols)
    .set_index('group')
    .T
)


In [12]:
# Display the rock table: one row per audio feature, one column per decade
df_rock_final

group,1950,1960,1970,1980,1990,2000,2010
Danceability,0.556418,0.537768,0.56351,0.611659,0.57598,0.56426,0.59588
Energy,0.52179,0.562469,0.541313,0.657444,0.640547,0.79126,0.70588
Acousticness,0.668853,0.454922,0.355189,0.216617,0.200636,0.069351,0.122007
Loudness,0.202504,0.276201,0.127725,0.31078,0.418988,0.872275,0.798447
Valence,0.698254,0.692633,0.621395,0.635907,0.533805,0.525868,0.55274
Tempo,0.127273,0.222862,0.077315,0.387053,0.135602,0.922685,0.477529


In [13]:
# Display the pop table: one row per audio feature, one column per decade
df_pop_final

group,1950,1960,1970,1980,1990,2000,2010
Danceability,0.554985,0.539358,0.565145,0.649319,0.647526,0.683037,0.671939
Energy,0.480898,0.525183,0.548674,0.664241,0.635411,0.700878,0.675083
Acousticness,0.656894,0.500841,0.380917,0.208776,0.215376,0.133899,0.148162
Loudness,0.173386,0.243953,0.145844,0.283975,0.468555,0.854156,0.845394
Valence,0.672692,0.681525,0.652694,0.675296,0.571392,0.591266,0.528508
Tempo,0.116623,0.603072,0.427778,0.883377,0.422711,0.459182,0.754848


In [14]:
# Write both final tables to CSV files in the notebooks data directory
for genre_table, csv_path in (
    (df_pop_final, '../notebooks/data/lollipop_pop.csv'),
    (df_rock_final, '../notebooks/data/lollipop_rock.csv'),
):
    genre_table.to_csv(csv_path)