### Cleaning and Processing Data

In [2]:
# Importing required libraries
import pandas as pd
import os
import datetime
import math
import numpy as np

# Paths of the two input CSV files: the main track data and the per-artist genres
data = 'data/data.csv'
genres = 'data/data_w_genres.csv'

# Loading each file into its own DataFrame
mus_data, genres_data = pd.read_csv(data), pd.read_csv(genres)

In [5]:
# Cleaning the string values of the genres and artists columns:
# strip quotes, brackets and parentheses (plus '*' for artists).
# regex=True is passed explicitly because pandas >= 2.0 defaults
# Series.str.replace to regex=False, which would treat the character class
# as literal text and silently leave the columns unchanged.
genres_data['genres'] = genres_data['genres'].str.replace(r"[\"\])([']", '', regex=True)
genres_data['artists'] = genres_data['artists'].str.replace(r"[\"\])(*[']", '', regex=True)
# Literal '$' -> 'S'; regex=False so '$' is never read as an end-of-string anchor
genres_data['artists'] = genres_data['artists'].str.replace('$', 'S', regex=False)

In [6]:
# Restricting the genres frame to the two columns used downstream
genres_data = genres_data.loc[:, ['artists', 'genres']]

In [7]:
# Cleaning the string values of the main data's artists column:
# strip quotes, brackets, parentheses and '*'.
# regex=True is passed explicitly because pandas >= 2.0 defaults
# Series.str.replace to regex=False, which would make this pattern a literal
# (never-matching) string and leave the column unchanged.
mus_data['artists'] = mus_data['artists'].str.replace(r"[\"\])(*[']", '', regex=True)
# Literal '$' -> 'S'. regex=False is required: as a regular expression a bare
# '$' is the end-of-string anchor and would append 'S' to every value instead.
mus_data['artists'] = mus_data['artists'].str.replace('$', 'S', regex=False)

In [9]:
# Adding a decade column so the data can be grouped in 10-year intervals:
# divide the year by 10, round down, then multiply back by 10
mus_data['decade'] = np.floor(mus_data['year'].div(10)).mul(10)

In [10]:
# Casting the decade column to integer (np.floor leaves it as float)
mus_data = mus_data.assign(decade=mus_data['decade'].astype(int))

In [11]:
# Building an "artist - song" key column, used for accurate per-decade song counts
mus_data['artist_song'] = mus_data['artists'].add(' - ').add(mus_data['name'])

In [19]:
# Left-joining the genres onto the main data by artist name,
# so every row of the main data is kept even when no genre entry matches
merged_data = pd.merge(mus_data, genres_data, how='left', on='artists')

In [20]:
# Replacing NaN genres left by the merge with empty strings.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that chained in-place call
# emits a FutureWarning in pandas 2.x and does not update merged_data at all
# once Copy-on-Write is enabled (the default from pandas 3.0).
merged_data['genres'] = merged_data['genres'].fillna('')

In [21]:
# Putting the columns into a more readable order
column_order = [
    'decade', 'year', 'artists', 'name', 'genres', 'duration_ms',
    'artist_song', 'release_date', 'valence', 'acousticness',
    'danceability', 'energy', 'explicit', 'instrumentalness', 'key',
    'liveness', 'loudness', 'mode', 'popularity', 'speechiness',
    'tempo', 'id',
]
merged_data = merged_data[column_order]

In [22]:
# Ordering rows from longest to shortest track so that non-music items
# (very long recordings) surface at the top before they are dropped
merged_data = merged_data.sort_values(by='duration_ms', ascending=False)

In [23]:
# Cleaning merged data by dropping audio books: every row lasting
# 1,000,000 ms or more, plus every row credited to one of the artists below
audio_book_artists = [
    'Georgette Heyer, Irina Salkow',
    'Трумен Капоте',
    'Seweryn Goszczyński',
    'H.P. Lovecraft',
    'Эрих Мария Ремарк',
    'Эрнест Хемингуэй',
]
is_too_long = merged_data['duration_ms'] >= 1000000
is_audio_book = merged_data['artists'].isin(audio_book_artists)
# Single in-place drop of all flagged index labels
merged_data.drop(index=merged_data.index[is_too_long | is_audio_book], inplace=True)

In [24]:
# Writing the cleaned master dataset to disk (UTF-8, without the index column)
merged_data.to_csv(path_or_buf='data/clean_data.csv', index=False, encoding='utf-8')

### Dividing data into decades to find the top 20 artists and songs of each decade

In [25]:
# 1920s decade: select the decade's rows, save them to CSV,
# then display the 20 artists with the most rows
in_1920s = merged_data['decade'].eq(1920)
mus_1920s = merged_data.loc[in_1920s]
mus_1920s.to_csv('data/mus_1920s.csv', index=False, encoding='utf-8')
artist_counts_1920s = mus_1920s['artists'].value_counts()
artist_counts_1920s.nlargest(20)

Francisco Canaro                         686
Ignacio Corsini                          447
Francisco Canaro, Charlo                 241
Frédéric Chopin, Vladimir Horowitz       160
Louis Armstrong & His Hot Five            98
Robert Schumann, Vladimir Horowitz        78
Fats Waller                               61
Louis Armstrong                           49
Alexander Scriabin, Vladimir Horowitz     48
Frédéric Chopin, Arthur Rubinstein        48
George Olsen                              47
Ted Weems & His Orchestra                 47
Louis Armstrong & His Hot Seven           43
Jimmie Rodgers                            41
Leroy Carr                                40
Francisco Canaro, Agustín Irusta          39
Iván Rolón                                32
Frédéric Chopin, Alfred Cortot            31
Tommy Dorsey                              31
Maurice Chevalier                         31
Name: artists, dtype: int64

In [26]:
# 20 most frequent "artist - song" entries of the 1920s
song_counts_1920s = mus_1920s['artist_song'].value_counts()
song_counts_1920s.nlargest(20)

Frédéric Chopin, Vladimir Horowitz - Ballade No. 1 in G Minor, Op. 23               8
Frédéric Chopin, Vladimir Horowitz - Mazurka in C-Sharp Minor, Op. 30, No. 4        7
Frédéric Chopin, Vladimir Horowitz - Barcarolle, Op. 60                             6
Frédéric Chopin, Vladimir Horowitz - Scherzo No. 1 in B Minor, Op. 20               6
Louis Armstrong & His Hot Seven - Potato Head Blues                                 6
Frédéric Chopin, Vladimir Horowitz - Polonaise-Fantaisie in A-Flat Major, Op. 61    5
Louis Armstrong & His Hot Five - Jazz Lips                                          4
Louis Armstrong & His Hot Five - Heebie Jeebies                                     4
Frédéric Chopin, Vladimir Horowitz - Polonaise in A-Flat Major, Op. 53 "Heroic"     4
Louis Armstrong & His Hot Seven - Weary Blues                                       4
Louis Armstrong & His Savoy Ballroom Five - Tight Like This                         4
Frédéric Chopin, Vladimir Horowitz - Étude in C-Sharp 

In [27]:
# 1930s decade: select the decade's rows, save them to CSV,
# then display the 20 artists with the most rows
in_1930s = merged_data['decade'].eq(1930)
mus_1930s = merged_data.loc[in_1930s]
mus_1930s.to_csv('data/mus_1930s.csv', index=False, encoding='utf-8')
artist_counts_1930s = mus_1930s['artists'].value_counts()
artist_counts_1930s.nlargest(20)

Billie Holiday                          220
Francisco Canaro, Charlo                212
Sinclair Lewis, Frank Arnold            204
Francisco Canaro                        203
Francisco Canaro, Ernesto Fama          194
Billie Holiday, Teddy Wilson            191
Francisco Canaro, Roberto Maida         180
Ignacio Corsini                         159
Lead Belly                              141
Umm Kulthum                             116
Franz Joseph Haydn, Pro Arte Quartet    113
Ernst H. Gombrich, Christoph Waltz      109
Count Basie                              99
Zofia Dromlewiczowa                      96
Carmen Miranda                           83
Benny Goodman                            81
Roy Fox                                  75
Frédéric Chopin, Robert Lortat           69
Roza Eskenazi                            66
Robert Johnson                           64
Name: artists, dtype: int64

In [28]:
# 20 most frequent "artist - song" entries of the 1930s
song_counts_1930s = mus_1930s['artist_song'].value_counts()
song_counts_1930s.nlargest(20)

Count Basie - One O'Clock Jump                                                               5
Count Basie - Oh, Lady Be Good                                                               4
Billie Holiday, Count Basie - Swing, Brother, Swing - Live                                   4
Count Basie - Jumpin' At The Woodside - Live                                                 4
Count Basies Kansas City Seven - Lester Leaps In                                             4
Billie Holiday - The Man I Love                                                              4
Billie Holiday, Teddy Wilson - I Must Have That Man (with Teddy Wilson & His Orchestra)      4
Benny Goodman - Let's Dance                                                                  4
Robert Johnson - Traveling Riverside Blues                                                   3
Fred Astaire - They All Laughed                                                              3
Billie Holiday - Laughing at Life - Take 1        

In [29]:
# 1940s decade: select the decade's rows, save them to CSV,
# then display the 20 artists with the most rows
in_1940s = merged_data['decade'].eq(1940)
mus_1940s = merged_data.loc[in_1940s]
mus_1940s.to_csv('data/mus_1940s.csv', index=False, encoding='utf-8')
artist_counts_1940s = mus_1940s['artists'].value_counts()
artist_counts_1940s.nlargest(20)

Orchestra Studio 7                                                                                                                           238
Lead Belly                                                                                                                                   188
Frédéric Chopin, Arthur Rubinstein                                                                                                           151
Igor Stravinsky, Columbia Symphony Orchestra                                                                                                 135
Shamshad Begum                                                                                                                               118
Giuseppe Verdi, Arturo Toscanini                                                                                                             107
Lata Mangeshkar                                                                                                                   

In [30]:
# 20 most frequent "artist - song" entries of the 1940s
song_counts_1940s = mus_1940s['artist_song'].value_counts()
song_counts_1940s.nlargest(20)

Modest Mussorgsky, William Kapell - Pictures at an Exhibition: Promenade                                    8
Lead Belly - John Henry                                                                                     6
Mina - Stringimi forte i polsi                                                                              5
Igor Stravinsky, New York Philharmonic - Ode - Elegiacal Chant in 3 Parts for Orchestra: I. Eulogy          4
Igor Stravinsky, Joseph Szigeti - Russian Maiden's Song                                                     4
Gustav Mahler, Bruno Walter, New York Philharmonic - Symphony No. 5 in C-Sharp Minor: V. Rondo-Finale       4
Johann Sebastian Bach, Claudio Arrau - Goldberg Variations, BWV 988: Aria                                   4
Igor Stravinsky, Woody Herman Orchestra - Ebony Concerto: III. Moderato - Con moto - Moderato - Vivo        4
Igor Stravinsky, New York Philharmonic - Ode - Elegiacal Chant in 3 Parts for Orchestra: II. Eclogue        4
Gustav Mah

In [32]:
# 1950s decade: select the decade's rows, save them to CSV,
# then display the 20 artists with the most rows

mus_1950s = merged_data[merged_data['decade']==1950]
# Write to the 1950s file; the previous target 'data/mus_1940s.csv' overwrote
# the 1940s export with 1950s rows and never produced a 1950s file.
mus_1950s.to_csv('data/mus_1950s.csv', encoding='utf-8', index=False)
mus_1950s['artists'].value_counts().nlargest(20)

Ella Fitzgerald                       268
Miles Davis                           260
Dean Martin                           253
Oscar Peterson                        205
Lata Mangeshkar                       202
Unspecified                           194
Frank Sinatra                         181
Johann Sebastian Bach, Glenn Gould    179
Billie Holiday                        167
Javier Solís                          136
Thelonious Monk                       133
Johnny Cash                           123
Duke Ellington                        115
Elvis Presley                         109
Nat King Cole                         107
Sarah Vaughan                         107
Jackie Gleason                        106
Chet Baker                            103
Charlie Parker                         98
Lefty Frizzell                         93
Name: artists, dtype: int64

In [33]:
# 20 most frequent "artist - song" entries of the 1950s
song_counts_1950s = mus_1950s['artist_song'].value_counts()
song_counts_1950s.nlargest(20)

Miles Davis - Two Bass Hit                                        5
Miles Davis - 'Round Midnight                                     5
Thelonious Monk - Ruby, My Dear                                   4
Miles Davis - So What                                             4
Oscar Peterson - Blue Moon                                        4
Marty Robbins - El Paso                                           4
Thelonious Monk - Reflections                                     4
Giuseppe Verdi, Arturo Toscanini - Overture                       4
Otis Rush - I Can't Quit You Baby                                 4
Charles Mingus - Pussy Cat Dues                                   3
Miles Davis Quintet - I Could Write A Book                        3
Miles Davis Quintet - Oleo - Edited Without False Start           3
Javier Solís - Esclavo y Amo                                      3
Gerry Mulligan, Paul Desmond Quartet - Blues In Time              3
Miles Davis Quintet - Tune Up/When Lights Are Lo

In [34]:
# 1960s decade: select the decade's rows, save them to CSV,
# then display the 20 artists with the most rows
in_1960s = merged_data['decade'].eq(1960)
mus_1960s = merged_data.loc[in_1960s]
mus_1960s.to_csv('data/mus_1960s.csv', index=False, encoding='utf-8')
artist_counts_1960s = mus_1960s['artists'].value_counts()
artist_counts_1960s.nlargest(20)

The Beach Boys        392
Frank Sinatra         320
The Beatles           275
Bob Dylan             266
Johnny Cash           246
Elvis Presley         239
The Rolling Stones    236
Sam Cooke             173
Nina Simone           169
The Kinks             157
Dean Martin           157
John Coltrane         144
Bill Evans Trio       141
The Who               128
Otis Redding          124
The Byrds             111
The Moody Blues       101
The Monkees            98
Aretha Franklin        97
Miles Davis            92
Name: artists, dtype: int64

In [35]:
# 20 most frequent "artist - song" entries of the 1960s
song_counts_1960s = mus_1960s['artist_song'].value_counts()
song_counts_1960s.nlargest(20)

John Coltrane - A Love Supreme, Pt. III - Pursuance                           4
Johnny Cash - Hey Porter                                                      4
Sam Cooke - (Ain't That) Good News                                            4
Tammy Wynette - D-I-V-O-R-C-E                                                 4
John Coltrane - A Love Supreme, Pt. II - Resolution                           4
John Coltrane - A Love Supreme, Pt. I – Acknowledgement                       4
Santana - Soul Sacrifice                                                      3
The Byrds - My Back Pages                                                     3
The Animals - I'm Crying                                                      3
Bill Evans Trio - All Of You - Live At The Village Vanguard, 1961 / Take 2    3
Johnny Cash - Give My Love to Rose                                            3
Mongo Santamaria - Watermelon Man                                             3
The Animals - Don't Let Me Be Misunderst

In [36]:
# 1970s decade: select the decade's rows, save them to CSV,
# then display the 20 artists with the most rows
in_1970s = merged_data['decade'].eq(1970)
mus_1970s = merged_data.loc[in_1970s]
mus_1970s.to_csv('data/mus_1970s.csv', index=False, encoding='utf-8')
artist_counts_1970s = mus_1970s['artists'].value_counts()
artist_counts_1970s.nlargest(20)

Fleetwood Mac               252
Queen                       236
Led Zeppelin                204
The Rolling Stones          195
Elton John                  188
Bob Marley & The Wailers    182
Bob Dylan                   173
Marvin Gaye                 142
Willie Nelson               132
Elvis Presley               124
KISS                        121
David Bowie                 114
Wings                       113
Waylon Jennings             112
The Who                     112
Ramones                     106
Vicente Fernández           103
Van Morrison                101
Stevie Wonder               101
Genesis                      99
Name: artists, dtype: int64

In [37]:
# 20 most frequent "artist - song" entries of the 1970s
song_counts_1970s = mus_1970s['artist_song'].value_counts()
song_counts_1970s.nlargest(20)

The Rolling Stones - Bitch - 2009 Mix                                                              4
The Rolling Stones - Brown Sugar - 2009 Mix                                                        4
Eagles - Wasted Time - 2013 Remaster                                                               4
The Charlie Daniels Band - The Devil Went Down to Georgia                                          4
Charlie Rich - Behind Closed Doors                                                                 4
The Rolling Stones - Can't You Hear Me Knocking - 2009 Mix                                         4
The Rolling Stones - Wild Horses - 2009 Mix                                                        4
The Rolling Stones - Dead Flowers - 2009 Mix                                                       4
The Velvet Underground - Rock and Roll - Full Length Version; 2015 Remaster                        3
Queen - Seven Seas Of Rhye - Remastered 2011                                               

In [38]:
# 1980s decade: select the decade's rows, save them to CSV,
# then display the 20 artists with the most rows
in_1980s = merged_data['decade'].eq(1980)
mus_1980s = merged_data.loc[in_1980s]
mus_1980s.to_csv('data/mus_1980s.csv', index=False, encoding='utf-8')
artist_counts_1980s = mus_1980s['artists'].value_counts()
artist_counts_1980s.nlargest(20)

U2                    155
The Cure              131
Queen                 126
Metallica             123
The Smiths            117
Talking Heads         111
R.E.M.                105
Bruce Springsteen     100
Depeche Mode           98
Prince                 89
Billy Joel             83
Fleetwood Mac          82
George Strait          82
Journey                81
Def Leppard            80
Alabama                80
Stevie Ray Vaughan     78
Iron Maiden            77
Rush                   72
Mötley Crüe            71
Name: artists, dtype: int64

In [39]:
# 20 most frequent "artist - song" entries of the 1980s
song_counts_1980s = mus_1980s['artist_song'].value_counts()
song_counts_1980s.nlargest(20)

Rod Stewart - Young Turks                              4
Guns N Roses - You're Crazy                            4
Tears For Fears - Everybody Wants To Rule The World    4
Tears For Fears - Shout                                4
Steely Dan - FM                                        3
Steely Dan - Babylon Sisters                           3
Cyndi Lauper - Girls Just Want to Have Fun             3
Whodini - One Love                                     3
Faith No More - We Care a Lot                          3
R.E.M. - Radio Free Europe                             3
Wham! - Wham Rap! (Enjoy What You Do?)                 3
Echo & the Bunnymen - The Killing Moon                 3
Julio Iglesias - Hey                                   3
Foreigner - Urgent                                     3
New Order - Bizarre Love Triangle                      3
Ricky Van Shelton - Life Turned Her That Way           3
Tears For Fears - Head Over Heels / Broken             3
Gloria Estefan - Don't Wanna Lo

In [40]:
# 1990s decade: select the decade's rows, save them to CSV,
# then display the 20 artists with the most rows
in_1990s = merged_data['decade'].eq(1990)
mus_1990s = merged_data.loc[in_1990s]
mus_1990s.to_csv('data/mus_1990s.csv', index=False, encoding='utf-8')
artist_counts_1990s = mus_1990s['artists'].value_counts()
artist_counts_1990s.nlargest(20)

Nirvana                     119
Sublime                     104
Joan Sebastian               90
Metallica                    89
Green Day                    80
2Pac                         79
Red Hot Chili Peppers        75
The Smashing Pumpkins        68
Alice In Chains              67
Los Tigres Del Norte         67
Mariah Carey                 66
Pearl Jam                    65
Luis Miguel                  65
A Tribe Called Quest         64
Los Temerarios               63
The Notorious B.I.G.         62
Rage Against The Machine     60
Nine Inch Nails              59
Korn                         54
Antonio Aguilar              54
Name: artists, dtype: int64

In [41]:
# 20 most frequent "artist - song" entries of the 1990s
song_counts_1990s = mus_1990s['artist_song'].value_counts()
song_counts_1990s.nlargest(20)

Nirvana - Something In The Way                         5
Nirvana - Come As You Are                              5
Nirvana - Lithium                                      5
Nirvana - Polly                                        5
LL Cool J - Mama Said Knock You Out                    4
Nirvana - Smells Like Teen Spirit                      4
Nirvana - Dumb                                         4
Nirvana - Breed                                        4
Nirvana - On A Plain                                   4
Nirvana - All Apologies                                4
Nirvana - In Bloom - Nevermind Version                 4
Nirvana - Territorial Pissings                         4
Nirvana - Drain You                                    4
Nirvana - Tourette's                                   3
Alan Jackson - Livin' On Love                          3
Cake - The Distance                                    3
The Notorious B.I.G. - Who Shot Ya? - 2005 Remaster    3
Crash Test Dummies - Mmm Mmm Mm

In [42]:
# 2000s decade: select the decade's rows, save them to CSV,
# then display the 20 artists with the most rows
in_2000s = merged_data['decade'].eq(2000)
mus_2000s = merged_data.loc[in_2000s]
mus_2000s.to_csv('data/mus_2000s.csv', index=False, encoding='utf-8')
artist_counts_2000s = mus_2000s['artists'].value_counts()
artist_counts_2000s.nlargest(20)

Eminem                   96
Linkin Park              72
John Mayer               70
George Strait            67
Slipknot                 66
Red Hot Chili Peppers    63
John Williams            62
Fall Out Boy             62
Jack Johnson             59
System Of A Down         57
Taylor Swift             57
Coldplay                 52
Kenny Chesney            52
50 Cent                  51
Radiohead                51
JAY-Z                    49
Breaking Benjamin        49
Disturbed                48
Jay Chou                 48
Green Day                46
Name: artists, dtype: int64

In [43]:
# 20 most frequent "artist - song" entries of the 2000s
song_counts_2000s = mus_2000s['artist_song'].value_counts()
song_counts_2000s.nlargest(20)

Eminem - The Real Slim Shady                      5
Akon - Lonely                                     5
Akon - Locked Up                                  4
Eminem - Mockingbird                              4
Eminem - Without Me                               4
Wu-Tang Clan - Gravel Pit                         4
Ice Cube - It Was A Good Day                      3
Papa Roach - Getting Away With Murder             3
Plain White Ts - Hey There Delilah                3
Toby Keith, Willie Nelson - Beer For My Horses    3
Eminem - Like Toy Soldiers                        3
Sia - Breathe Me                                  3
Valentín Elizalde - A Mis Enemigos                3
Eminem, Dr. Dre, 50 Cent - Crack A Bottle         3
JAY-Z, Eminem - Renegade                          3
Nickelback - Photograph                           3
Eminem, Dido - Stan                               3
B2K, Diddy - Bump, Bump, Bump (feat. P. Diddy)    3
Red Hot Chili Peppers - By the Way                3
Taylor Swift

In [44]:
# 2010s decade: select the decade's rows, save them to CSV,
# then display the 20 artists with the most rows
in_2010s = merged_data['decade'].eq(2010)
mus_2010s = merged_data.loc[in_2010s]
mus_2010s.to_csv('data/mus_2010s.csv', index=False, encoding='utf-8')
artist_counts_2010s = mus_2010s['artists'].value_counts()
artist_counts_2010s.nlargest(20)

Taylor Swift        134
Drake               109
BTS                 108
Mac Miller          101
One Direction        95
Lana Del Rey         93
The Weeknd           77
Ariana Grande        62
J. Cole              57
Kendrick Lamar       57
Ed Sheeran           54
NF                   52
SuicideBoyS          49
Imagine Dragons      48
Lil Uzi Vert         47
Michael Bublé        47
Glee Cast            45
Kevin Gates          45
Kanye West           45
Childish Gambino     44
Name: artists, dtype: int64

In [45]:
# 20 most frequent "artist - song" entries of the 2010s
song_counts_2010s = mus_2010s['artist_song'].value_counts()
song_counts_2010s.nlargest(20)

Ellie Goulding - Lights - Single Version                  4
Lana Del Rey - Summertime Sadness                         4
Katy Perry - Last Friday Night (T.G.I.F.)                 4
Lana Del Rey - Born To Die                                4
Rihanna - Diamonds                                        4
Lana Del Rey - Radio                                      4
Lana Del Rey - National Anthem                            4
Lana Del Rey - Video Games - Remastered                   4
JAY-Z, Kanye West - Ni**as In Paris                       4
Katy Perry - Firework                                     3
Taylor Swift - 22                                         3
The Weeknd - Wicked Games                                 3
Rihanna - S&M                                             3
Taylor Swift - We Are Never Ever Getting Back Together    3
Taylor Swift - Blank Space                                3
Drake - Started From the Bottom                           3
PARTYNEXTDOOR, Drake - Come and See Me (

In [46]:
# 2020s decade: select the decade's rows, save them to CSV,
# then display the 20 artists with the most rows
in_2020s = merged_data['decade'].eq(2020)
mus_2020s = merged_data.loc[in_2020s]
mus_2020s.to_csv('data/mus_2020s.csv', index=False, encoding='utf-8')
artist_counts_2020s = mus_2020s['artists'].value_counts()
artist_counts_2020s.nlargest(20)

Future, Lil Uzi Vert          75
YoungBoy Never Broke Again    32
J Balvin                      26
NAV                           25
BTS                           24
The Kid LAROI                 23
KAROL G                       22
Lil Uzi Vert                  18
Machine Gun Kelly             16
Sam Smith                     16
Joji                          16
Juice WRLD                    16
Taylor Swift                  16
Kodak Black                   16
The Weeknd                    15
Chris Stapleton               15
Halsey                        15
Daddy Yankee                  14
2 Chainz                      14
Ariana Grande                 14
Name: artists, dtype: int64

In [47]:
# 20 most frequent "artist - song" entries of the 2020s
song_counts_2020s = mus_2020s['artist_song'].value_counts()
song_counts_2020s.nlargest(20)

KAROL G - A Ella                                  11
KAROL G, Ozuna - Hello                             7
Sebastian Yatra - Traicionera                      7
Daddy Yankee - Limbo                               6
KAROL G - Casi Nada                                5
Future, Lil Uzi Vert - Bought A Bad Bitch          4
Sebastian Yatra, Wisin, Nacho - Alguien Robo       4
Lil Uzi Vert - Lullaby                             4
J Balvin - Ay Vamos                                4
KAROL G, Bad Bunny - Ahora Me Llama                4
Future, Lil Uzi Vert - Drankin N Smokin            4
Future, Lil Uzi Vert - Plastic                     4
Future - Rockstar Chainz                           4
Sam Smith - Diamonds                               4
Future, Lil Uzi Vert - I Don’t Wanna Break Up      4
Future, Lil Uzi Vert - Stripes Like Burberry       4
Future, Lil Uzi Vert - Million Dollar Play         4
Future, Lil Uzi Vert - She Never Been To Pluto     4
Future, Lil Uzi Vert - Bankroll               