In [1]:
import numpy as np
import pandas as pd

# Load the raw chart CSVs. The 2019 and 2010-2019 files are read with an
# explicit legacy encoding and use their first column as the index.
#
# The original used encoding='mbcs', which Python only provides on Windows
# (it resolves to the machine's ANSI code page) and which raises LookupError
# elsewhere. 'cp1252' names the Western European ANSI code page explicitly so
# the notebook also runs on Linux/macOS.
# NOTE(review): assumes the files were saved under cp1252; the accented names
# in the recorded output (e.g. 'Señorita') are consistent with a single-byte
# Western encoding -- confirm against the actual files.
LEGACY_CSV_ENCODING = 'cp1252'

top_2017 = pd.read_csv('../datasets/top2017.csv')
top_2018 = pd.read_csv('../datasets/top2018.csv')
top_2019 = pd.read_csv('../datasets/top2019.csv', encoding=LEGACY_CSV_ENCODING, index_col=0)
top_2010_2019 = pd.read_csv('../datasets/top2010-2019.csv', encoding=LEGACY_CSV_ENCODING, index_col=0)

In [2]:
# Summarise the categorical columns of the 2019 chart: for each column print
# the number of distinct values and either all of them or a short sample.
for df in [top_2019]:
    print("DF")
    for col in ["Track.Name", "Artist.Name", "Genre"]:
        print(col)
        print("Data Type: Categorical")
        # Compute the distinct values once and reuse them below.
        distinct = list(df[col].unique())
        unique = len(distinct)
        if unique > 7:
            # Too many to list in full: take the first three and last three
            # entries of unique(), sort those six together, and put an
            # ellipsis in the middle. This is a sample of the values, not the
            # two ends of the fully sorted list.
            s = sorted(distinct[:3] + distinct[-3:])
            s.insert(3, "…")
        else:
            s = sorted(distinct)

        # join() places separators by position; the previous identity check
        # against the last element (`val is s[-1]`) depended on object
        # identity rather than on where the value sits in the list.
        listing = ", ".join(str(val) for val in s)
        print(f"There are {unique} unique values: {{{listing}}}\n")

DF
Track.Name
Data Type: Categorical
There are 50 unique values: {Call You Mine, China, Cross Me (feat. Chance the Rapper & PnB Rock), …, Happier, Señorita, boyfriend (with Social House)}

Artist.Name
Data Type: Categorical
There are 38 unique values: {Anuel AA, Ariana Grande, Marshmello, …, Nicky Jam, ROSALÍA, Shawn Mendes}

Genre
Data Type: Categorical
There are 21 unique values: {boy band, brostep, canadian pop, …, dance pop, r&b en espanol, reggaeton flow}



In [3]:
# Summarise every remaining (numeric) column of the 2019 chart: print its
# min/max range and list the values lying more than 1.5 * IQR outside the
# first or third quartile.
for df in [top_2019]:
    print("DF")
    for col in df.columns:
        # The three text columns are handled by the categorical summary.
        if col in ["Track.Name", "Artist.Name", "Genre"]:
            continue
        print(col)
        q1 = df[col].quantile(.25)
        q3 = df[col].quantile(.75)
        outlier_threshold = 1.5 * (q3 - q1)
        is_outlier = (df[col] < q1 - outlier_threshold) | (df[col] > q3 + outlier_threshold)
        outliers = list(df[col][is_outlier])
        mini = df[col].min()
        maxi = df[col].max()
        c = len(outliers)
        print("Data Type: Quantitative")
        print(f"Float: contains [{mini}, {maxi}] inclusive, there {'is' if c == 1 else 'are'} {c} outlier{'s' if c != 1 else ''}",end="")
        if c == 0:
            # Finish the summary line and leave a blank line after it.
            print("\n")
            continue
        # Build the list with join(): the previous `val is outliers[-1]` test
        # compared object identity, so any earlier outlier that happened to be
        # the same object as the last one (CPython caches small ints) lost its
        # ", " separator and ran into the next value.
        print(": {" + ", ".join(str(val) for val in outliers) + "}\n")

DF
Beats.Per.Minute
Data Type: Quantitative
Float: contains [85, 190] inclusive, there are 0 outliers

Energy
Data Type: Quantitative
Float: contains [32, 88] inclusive, there are 0 outliers

Danceability
Data Type: Quantitative
Float: contains [29, 90] inclusive, there are 2 outliers: {40, 29}

Loudness..dB..
Data Type: Quantitative
Float: contains [-11, -2] inclusive, there are 3 outliers: {-11, -11, -11}

Liveness
Data Type: Quantitative
Float: contains [5, 58] inclusive, there are 6 outliers: {44, 36, 32, 36, 58, 41}

Valence.
Data Type: Quantitative
Float: contains [10, 95] inclusive, there are 0 outliers

Length.
Data Type: Quantitative
Float: contains [115, 309] inclusive, there are 4 outliers: {302, 288, 115, 309}

Acousticness..
Data Type: Quantitative
Float: contains [1, 75] inclusive, there is 1 outlier: {75}

Speechiness.
Data Type: Quantitative
Float: contains [3, 46] inclusive, there are 6 outliers: {46, 38, 31, 32, 33, 34}

Popularity
Data Type: Quantitative
Float: conta

In [4]:
# Column mapping for the 2019 chart: raw CSV header -> shared column name.
# Headers that already match the shared name are listed too, so the mapping
# documents every column it covers.
rename_2019 = {
    # identifying / descriptive columns
    "Track.Name": "Name",
    "Artist.Name": "Artists",
    "Genre": "Genre",
    # audio features
    "Beats.Per.Minute": "Tempo",
    "Energy": "Energy",
    "Danceability": "Danceability",
    "Loudness..dB..": "Loudness",
    "Liveness": "Liveness",
    "Valence.": "Valence",
    "Length.": "Duration",
    "Acousticness..": "Acousticness",
    "Speechiness.": "Speechiness",
    "Popularity": "Popularity",
}

# Column mapping for the 2010-2019 chart: abbreviated CSV header -> shared
# column name used by the other yearly frames.
rename_2010_2019 = {
    "title":'Name',
    'artist':'Artists',
    'top genre':'Genre',
    'year':'Year',
    'bpm':'Tempo',
    'nrgy':'Energy',
    'dnce':'Danceability',
    'dB':'Loudness',
    # Liveness was the one shared feature with no entry here, although both
    # other mappings in this notebook rename it to 'Liveness'.
    # NOTE(review): assumes the CSV header is 'live' -- confirm against the
    # file. DataFrame.rename ignores labels that are not present by default,
    # so this entry is a no-op if the column is absent.
    'live':'Liveness',
    'val':'Valence',
    'dur':'Duration',
    'acous':'Acousticness',
    'spch':'Speechiness',
    'pop':'Popularity'
}

# Column mapping shared by the 2017 and 2018 charts: lowercase CSV header ->
# shared column name.
rename_2017_2018 = {
    # identifying columns
    "id": "ID",
    "name": "Name",
    "artists": "Artists",
    # audio features
    "danceability": "Danceability",
    "energy": "Energy",
    "key": "Key",
    "loudness": "Loudness",
    "mode": "Mode",
    "speechiness": "Speechiness",
    "acousticness": "Acousticness",
    "instrumentalness": "Instrumentalness",
    "liveness": "Liveness",
    "valence": "Valence",
    "tempo": "Tempo",
    # NOTE(review): the header suggests milliseconds, while the 2019
    # 'Length.' column (also renamed to 'Duration') ranges 115-309 in the
    # recorded output -- units may need aligning before frames are combined.
    "duration_ms": "Duration",
    "time_signature": "Time Signature",
}

# Apply each column mapping to its frame. The rename is done in place, so the
# existing variables keep referring to the (now renamed) DataFrames.
for frame, column_map in (
    (top_2017, rename_2017_2018),
    (top_2018, rename_2017_2018),
    (top_2019, rename_2019),
    (top_2010_2019, rename_2010_2019),
):
    frame.rename(columns=column_map, inplace=True)