# Pop atoms
Let's find the musical atoms that make up the most popular hits of the last few decades.

## Import functions and libraries

In [4]:
from functions import *
from os.path import exists

## Get data

We want to compare the audio features of the most popular songs from before 1962 (the pre-Beatles era) with a more general sample of popular songs from the 1960s to the present.

### Popular before The Beatles

In [None]:
# Get all the Billboard data.
# Billboard collects the most popular songs weekly since 1958. This is the data we will use for this project.
# The API may fail, so to avoid having to repeat queries, the data is saved continuously into a CSV ("df_ranks_weekly.csv").

# Set to True to query the chart data again; with False this cell does nothing.
recalculate = False

if recalculate:

    # Test for the very same file that is handed to create_weekly_ranks_df, so the
    # existence check and the path actually used can never disagree.
    # NOTE(review): presumably create_weekly_ranks_df treats csv_path as its
    # checkpoint file and "" as "no previous data" -- confirm in functions.py.
    weekly_csv_path = "df_ranks_weekly.csv"
    temp_csv_path = weekly_csv_path if exists(weekly_csv_path) else ""

    df_ranks_weekly = create_weekly_ranks_df("1958-01-01", "1961-12-31", csv_path=temp_csv_path)

In [None]:
# Get peak values.
# Some songs are listed more than once, as they come up several times in the charts for different weeks.
# We only keep one entry per song and compute its peak position, the maximum number of weeks on the chart,
# and the date on which the peak position was reached.

# Only recompute when the cached result is missing.
if not exists("df_ranks_peaks.csv"):

    df_ranks_weekly = pd.read_csv("df_ranks_weekly.csv")

    df_ranks_peaks = calculate_ranks_peaks(df_ranks_weekly)

    # Save under exactly the name the guard above checks; a doubled ".csv.csv"
    # extension would make the cache miss on every run.
    df_ranks_peaks.to_csv("df_ranks_peaks.csv", index=False)

In [None]:
## Get audio analysis data from Spotify.

# The guard must test the file this cell actually writes; otherwise the
# condition is always true and the Spotify queries are repeated on every run.
if not exists("df_audio_analysis.csv"):

    # Load the per-song peak data.
    df_ranks_peaks = pd.read_csv("df_ranks_peaks.csv")

    # Add audio analysis columns.
    audio_analysis_df = create_audio_analysis_df(df_ranks_peaks)

    # Cache the result so later runs can skip the queries.
    audio_analysis_df.to_csv("df_audio_analysis.csv", index=False)

### Popular of all time

In [8]:
# We asked GPT for a list of songs that represent the greatest hits from year to year since 1960.
# The result was stored in a csv file "df_songs_sample.csv"

df_songs_sample = pd.read_csv("df_songs_sample.csv")

# Melt the dataframe into long format: one column for the year (taken from the
# original column names) and one column holding the raw "artist - title" entry.
df_songs_sample = df_songs_sample.melt(var_name='peak_year', value_name='title')

# Split each entry once, at the first " - " only, so a title that itself
# contains " - " (e.g. "Song - Remastered") is kept whole instead of truncated.
artist_and_title = df_songs_sample["title"].str.split(" - ", n=1)
df_songs_sample["artist"] = artist_and_title.str[0]
df_songs_sample["title"] = artist_and_title.str[1]

In [9]:
# Query Spotify for the sampled songs, unless a cached result is already on disk.

sample_audio_csv = "df_audio_analysis_sample.csv"

if not exists(sample_audio_csv):
    # Build the audio analysis table for the sample and cache it as CSV.
    df_audio_analysis_sample = create_audio_analysis_df(df_songs_sample)
    df_audio_analysis_sample.to_csv(sample_audio_csv, index=False)

Unnamed: 0,peak_year,title,artist
0,1960,It's Now or Never,Elvis Presley
1,1960,Smoke Gets in Your Eyes,The Platters
2,1960,The Twist,Chubby Checker
3,1960,Theme from A Summer Place,Percy Faith
4,1960,Save the Last Dance for Me,The Drifters
...,...,...,...
320,2024,Good 4 U,Olivia Rodrigo
321,2024,Dance the Night,Dua Lipa
322,2024,Save Your Tears,The Weeknd
323,2024,SOS,SZA


## Clean data

In [None]:
# Read the cached audio analysis data from CSV.
# NOTE(review): the visible cells write this data as "df_audio_analysis.csv";
# nothing visible produces a plain "audio_analysis.csv" -- confirm no other
# step creates a file with that name.
audio_analysis_df = pd.read_csv("df_audio_analysis.csv")

## Explore data