### Cleaning and Processing Data

In [1]:
# Importing required libraries
import pandas as pd
import os
import datetime
import math
import numpy as np

# Load the two source tables: the per-track data and the artist/genres table
data = 'data/data.csv'
genres = 'data/data_w_genres.csv'
mus_data, genres_data = (pd.read_csv(csv_path) for csv_path in (data, genres))

In [2]:
# Cleaning genres column's str values.
# Every pattern below is a regex character class, so regex=True is passed
# explicitly: pandas >= 2.0 defaults to regex=False, which would look for the
# pattern text literally and silently leave the data unchanged.

# Strip quotes, square brackets and parentheses from the genres strings
genres_data['genres'] = genres_data['genres'].str.replace(r"[\"\])([']", '', regex=True)
# Same clean-up for artist names, additionally removing '*'
genres_data['artists'] = genres_data['artists'].str.replace(r"[\"\])(*[']", '', regex=True)
# Replace every '$' in an artist name with the letter 'S'
genres_data['artists'] = genres_data['artists'].str.replace(r"[$]", 'S', regex=True)

In [3]:
# Narrow the genres table to the two columns used downstream: the join key and the genres
genres_data = genres_data.loc[:, ['artists', 'genres']]

In [4]:
# Cleaning main data column's str values.
# The first pattern is a regex character class (quotes, brackets, parentheses, '*'),
# so regex=True is explicit: pandas >= 2.0 defaults to regex=False and would
# otherwise search for the pattern text literally and clean nothing.
mus_data['artists'] = mus_data['artists'].str.replace(r"[\"\])(*[']", '', regex=True)
# '$' is meant literally here ('$' -> 'S'); regex=False guarantees it is never
# interpreted as the end-of-string anchor, which would append 'S' to every name.
mus_data['artists'] = mus_data['artists'].str.replace('$', 'S', regex=False)

In [5]:
# Bucket every track into a 10-year interval by flooring its year to a multiple of ten
mus_data['decade'] = np.floor(mus_data['year'].div(10)).mul(10)

In [6]:
# Cast the decade column to Python's int dtype (np.floor leaves it as float)
mus_data = mus_data.assign(decade=mus_data['decade'].astype(int))

In [7]:
# Build a combined "artists - name" key, used to count songs accurately per decade
mus_data['artist_song'] = mus_data['artists'].add(' - ').add(mus_data['name'])

In [8]:
# Attach genres to every track; the left join keeps tracks whose artists have no genres row
merged_data = pd.merge(mus_data, genres_data, how='left', on='artists')

In [9]:
# Replacing NaN values with empty values after merging.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# the column selection: that chained in-place form warns on pandas 2.x and does
# not modify merged_data when Copy-on-Write is enabled.
merged_data['genres'] = merged_data['genres'].fillna('')

In [10]:
# Put the identifying columns first and the audio features after them, for readability
merged_data = merged_data.loc[:, [
    'decade', 'year', 'artists', 'name', 'genres', 'duration_ms',
    'artist_song', 'release_date', 'valence', 'acousticness',
    'danceability', 'energy', 'explicit', 'instrumentalness', 'key',
    'liveness', 'loudness', 'mode', 'popularity', 'speechiness',
    'tempo', 'id',
]]

In [11]:
# Longest tracks first, so non-music items are easy to spot before they are dropped
merged_data = merged_data.sort_values(by='duration_ms', ascending=False)

In [12]:
# Cleaning merged data, dropping audio books.
# One mask and one non-inplace drop replace seven sequential in-place drops;
# the rows removed are the same.

# Artist values (exact match) whose entries are audiobooks rather than music.
# NOTE(review): the 1930s top-10 output still lists 'Sinclair Lewis, Frank Arnold',
# which looks like another audiobook entry - confirm whether it belongs here.
audiobook_artists = [
    'Georgette Heyer, Irina Salkow',
    'Трумен Капоте',
    'Seweryn Goszczyński',
    'H.P. Lovecraft',
    'Эрих Мария Ремарк',
    'Эрнест Хемингуэй',
]

# Items lasting 1,000,000 ms (about 16.7 minutes) or more are dropped as well.
# Rows with a missing duration compare False and are therefore kept.
too_long = merged_data['duration_ms'] >= 1000000
is_audiobook = merged_data['artists'].isin(audiobook_artists)

# Drop by index label so the result is a regular new frame, not a boolean-mask slice
merged_data = merged_data.drop(index=merged_data.index[too_long | is_audiobook])

In [13]:
# Reduce release_date to its year; errors='coerce' turns values that fail to parse
# into NaT, which then yields a missing year.
merged_data['release_date'] = pd.to_datetime(merged_data['release_date'], errors='coerce').dt.year

In [14]:
# Persist the cleaned master table as UTF-8 CSV, without the index column
merged_data.to_csv(path_or_buf='data/clean_data_all.csv', index=False, encoding='utf-8')

### Dividing data into decades to find out top-10 artists and songs

In [15]:
# 1920s decade

# Keep only the rows whose decade value is 1920, then show the subset's shape
mus_1920s = merged_data.loc[merged_data['decade'] == 1920]
mus_1920s.shape

(3948, 22)

In [16]:
# The ten artists values with the most rows in the 1920s subset
a20 = mus_1920s['artists'].value_counts().sort_values(ascending=False).iloc[:10]
# Left-join from those ten names so only their rows remain, grouped in ranking order
mus_1920s = pd.merge(pd.DataFrame({'artists': a20.index}), mus_1920s, how='left', on='artists')
# mus_1920s.to_csv('static/data/mus_1920s.csv', encoding='utf-8', index=False)
mus_1920s.shape

(1916, 22)

In [17]:
# Row counts per artists value in the 1920s top-10 subset (ten largest)
mus_1920s['artists'].value_counts().nlargest(10)

Francisco Canaro                         686
Ignacio Corsini                          447
Francisco Canaro, Charlo                 241
Frédéric Chopin, Vladimir Horowitz       160
Louis Armstrong & His Hot Five            98
Robert Schumann, Vladimir Horowitz        78
Fats Waller                               61
Louis Armstrong                           49
Frédéric Chopin, Arthur Rubinstein        48
Alexander Scriabin, Vladimir Horowitz     48
Name: artists, dtype: int64

In [18]:
# top10_artist20s = mus_1920s[['artists', 'decade']].copy()
# top10_artist20s['songs_released'] = top10_artist20s.groupby(['artists'])['artists'].transform('count')
# top10_artist20s = top10_artist20s.drop_duplicates(subset=['artists'], keep='first')
# top10_artist20s.reset_index(drop=True, inplace=True)
# top10_artist20s.to_csv('static/data/top10_artist20s.csv', encoding='utf-8', index=False)
# top10_artist20s.head(15)

In [19]:
# 1930s decade

# Keep only the rows whose decade value is 1930, then show the subset's shape
mus_1930s = merged_data.loc[merged_data['decade'] == 1930]
mus_1930s.shape

(8745, 22)

In [20]:
# The ten artists values with the most rows in the 1930s subset
a30 = mus_1930s['artists'].value_counts().sort_values(ascending=False).iloc[:10]
# Left-join from those ten names so only their rows remain, grouped in ranking order
mus_1930s = pd.merge(pd.DataFrame({'artists': a30.index}), mus_1930s, how='left', on='artists')
# mus_1930s.to_csv('static/data/mus_1930s.csv', encoding='utf-8', index=False)
mus_1930s.shape

(1820, 22)

In [21]:
# Row counts per artists value in the 1930s top-10 subset (ten largest)
mus_1930s['artists'].value_counts().nlargest(10)

Billie Holiday                     220
Francisco Canaro, Charlo           212
Sinclair Lewis, Frank Arnold       204
Francisco Canaro                   203
Francisco Canaro, Ernesto Fama     194
Billie Holiday, Teddy Wilson       191
Francisco Canaro, Roberto Maida    180
Ignacio Corsini                    159
Lead Belly                         141
Umm Kulthum                        116
Name: artists, dtype: int64

In [22]:
# 1940s decade

# Keep only the rows whose decade value is 1940, then show the subset's shape
mus_1940s = merged_data.loc[merged_data['decade'] == 1940]
mus_1940s.shape

(14183, 22)

In [23]:
# The ten artists values with the most rows in the 1940s subset
a40 = mus_1940s['artists'].value_counts().sort_values(ascending=False).iloc[:10]
# Left-join from those ten names so only their rows remain, grouped in ranking order
mus_1940s = pd.merge(pd.DataFrame({'artists': a40.index}), mus_1940s, how='left', on='artists')
# mus_1940s.to_csv('static/data/mus_1940s.csv', encoding='utf-8', index=False)
mus_1940s.shape

(1284, 22)

In [24]:
# Row counts per artists value in the 1940s top-10 subset (ten largest)
mus_1940s['artists'].value_counts().nlargest(10)

Orchestra Studio 7                              238
Lead Belly                                      188
Frédéric Chopin, Arthur Rubinstein              151
Igor Stravinsky, Columbia Symphony Orchestra    135
Shamshad Begum                                  118
Giuseppe Verdi, Arturo Toscanini                107
Lata Mangeshkar                                  95
Amirbai Karnataki                                94
Mina                                             80
Rita Ampatzi                                     78
Name: artists, dtype: int64

In [25]:
# 1950s decade

# Keep only the rows whose decade value is 1950, then show the subset's shape
mus_1950s = merged_data.loc[merged_data['decade'] == 1950]
mus_1950s.shape

(19626, 22)

In [26]:
# The ten artists values with the most rows in the 1950s subset
a50 = mus_1950s['artists'].value_counts().sort_values(ascending=False).iloc[:10]
# Left-join from those ten names so only their rows remain, grouped in ranking order
mus_1950s = pd.merge(pd.DataFrame({'artists': a50.index}), mus_1950s, how='left', on='artists')
# mus_1950s.to_csv('static/data/mus_1950s.csv', encoding='utf-8', index=False)
mus_1950s.shape

(2045, 22)

In [27]:
# Row counts per artists value in the 1950s top-10 subset (ten largest)
mus_1950s['artists'].value_counts().nlargest(10)

Ella Fitzgerald                       268
Miles Davis                           260
Dean Martin                           253
Oscar Peterson                        205
Lata Mangeshkar                       202
Unspecified                           194
Frank Sinatra                         181
Johann Sebastian Bach, Glenn Gould    179
Billie Holiday                        167
Javier Solís                          136
Name: artists, dtype: int64

In [28]:
# 1960s decade

# Keep only the rows whose decade value is 1960, then show the subset's shape
mus_1960s = merged_data.loc[merged_data['decade'] == 1960]
mus_1960s.shape

(19473, 22)

In [29]:
# The ten artists values with the most rows in the 1960s subset
a60 = mus_1960s['artists'].value_counts().sort_values(ascending=False).iloc[:10]
# Left-join from those ten names so only their rows remain, grouped in ranking order
mus_1960s = pd.merge(pd.DataFrame({'artists': a60.index}), mus_1960s, how='left', on='artists')
# mus_1960s.to_csv('static/data/mus_1960s.csv', encoding='utf-8', index=False)
mus_1960s.shape

(2473, 22)

In [30]:
# Row counts per artists value in the 1960s top-10 subset (ten largest)
mus_1960s['artists'].value_counts().nlargest(10)

The Beach Boys        392
Frank Sinatra         320
The Beatles           275
Bob Dylan             266
Johnny Cash           246
Elvis Presley         239
The Rolling Stones    236
Sam Cooke             173
Nina Simone           169
The Kinks             157
Name: artists, dtype: int64

In [31]:
# 1970s decade

# Keep only the rows whose decade value is 1970, then show the subset's shape
mus_1970s = merged_data.loc[merged_data['decade'] == 1970]
mus_1970s.shape

(19914, 22)

In [32]:
# The ten artists values with the most rows in the 1970s subset
a70 = mus_1970s['artists'].value_counts().sort_values(ascending=False).iloc[:10]
# Left-join from those ten names so only their rows remain, grouped in ranking order
mus_1970s = pd.merge(pd.DataFrame({'artists': a70.index}), mus_1970s, how='left', on='artists')
# mus_1970s.to_csv('static/data/mus_1970s.csv', encoding='utf-8', index=False)
mus_1970s.shape

(1828, 22)

In [33]:
# Row counts per artists value in the 1970s top-10 subset (ten largest)
mus_1970s['artists'].value_counts().nlargest(10)

Fleetwood Mac               252
Queen                       236
Led Zeppelin                204
The Rolling Stones          195
Elton John                  188
Bob Marley & The Wailers    182
Bob Dylan                   173
Marvin Gaye                 142
Willie Nelson               132
Elvis Presley               124
Name: artists, dtype: int64

In [34]:
# 1980s decade

# Keep only the rows whose decade value is 1980, then show the subset's shape
mus_1980s = merged_data.loc[merged_data['decade'] == 1980]
mus_1980s.shape

(19825, 22)

In [35]:
# The ten artists values with the most rows in the 1980s subset
a80 = mus_1980s['artists'].value_counts().sort_values(ascending=False).iloc[:10]
# Left-join from those ten names so only their rows remain, grouped in ranking order
mus_1980s = pd.merge(pd.DataFrame({'artists': a80.index}), mus_1980s, how='left', on='artists')
# mus_1980s.to_csv('static/data/mus_1980s.csv', encoding='utf-8', index=False)
mus_1980s.shape

(1155, 22)

In [36]:
# Row counts per artists value in the 1980s top-10 subset (ten largest)
mus_1980s['artists'].value_counts().nlargest(10)

U2                   155
The Cure             131
Queen                126
Metallica            123
The Smiths           117
Talking Heads        111
R.E.M.               105
Bruce Springsteen    100
Depeche Mode          98
Prince                89
Name: artists, dtype: int64

In [37]:
# 1990s decade

# Keep only the rows whose decade value is 1990, then show the subset's shape
mus_1990s = merged_data.loc[merged_data['decade'] == 1990]
mus_1990s.shape

(19889, 22)

In [38]:
# The ten artists values with the most rows in the 1990s subset
a90 = mus_1990s['artists'].value_counts().sort_values(ascending=False).iloc[:10]
# Left-join from those ten names so only their rows remain, grouped in ranking order
mus_1990s = pd.merge(pd.DataFrame({'artists': a90.index}), mus_1990s, how='left', on='artists')
# mus_1990s.to_csv('static/data/mus_1990s.csv', encoding='utf-8', index=False)
mus_1990s.shape

(838, 22)

In [39]:
# Row counts per artists value in the 1990s top-10 subset (ten largest)
mus_1990s['artists'].value_counts().nlargest(10)

Nirvana                  119
Sublime                  104
Joan Sebastian            90
Metallica                 89
Green Day                 80
2Pac                      79
Red Hot Chili Peppers     75
The Smashing Pumpkins     68
Los Tigres Del Norte      67
Alice In Chains           67
Name: artists, dtype: int64

In [40]:
# 2000s decade

# Keep only the rows whose decade value is 2000, then show the subset's shape
mus_2000s = merged_data.loc[merged_data['decade'] == 2000]
mus_2000s.shape

(19632, 22)

In [41]:
# The ten artists values with the most rows in the 2000s subset
b00 = mus_2000s['artists'].value_counts().sort_values(ascending=False).iloc[:10]
# Left-join from those ten names so only their rows remain, grouped in ranking order
mus_2000s = pd.merge(pd.DataFrame({'artists': b00.index}), mus_2000s, how='left', on='artists')
# mus_2000s.to_csv('static/data/mus_2000s.csv', encoding='utf-8', index=False)
mus_2000s.shape

(674, 22)

In [42]:
# Row counts per artists value in the 2000s top-10 subset (ten largest)
mus_2000s['artists'].value_counts().nlargest(10)

Eminem                   96
Linkin Park              72
John Mayer               70
George Strait            67
Slipknot                 66
Red Hot Chili Peppers    63
Fall Out Boy             62
John Williams            62
Jack Johnson             59
Taylor Swift             57
Name: artists, dtype: int64

In [43]:
# 2010s decade

# Keep only the rows whose decade value is 2010, then show the subset's shape
mus_2010s = merged_data.loc[merged_data['decade'] == 2010]
mus_2010s.shape

(19765, 22)

In [44]:
# The ten artists values with the most rows in the 2010s subset
b10 = mus_2010s['artists'].value_counts().sort_values(ascending=False).iloc[:10]
# Left-join from those ten names so only their rows remain, grouped in ranking order
mus_2010s = pd.merge(pd.DataFrame({'artists': b10.index}), mus_2010s, how='left', on='artists')
# mus_2010s.to_csv('static/data/mus_2010s.csv', encoding='utf-8', index=False)
mus_2010s.shape

(893, 22)

In [45]:
# Row counts per artists value in the 2010s top-10 subset (ten largest)
mus_2010s['artists'].value_counts().nlargest(10)

Taylor Swift      134
Drake             109
BTS               108
Mac Miller        101
One Direction      95
Lana Del Rey       93
The Weeknd         77
Ariana Grande      62
Kendrick Lamar     57
J. Cole            57
Name: artists, dtype: int64

In [46]:
# 2020s decade

# Keep only the rows whose decade value is 2020, then show the subset's shape
mus_2020s = merged_data.loc[merged_data['decade'] == 2020]
mus_2020s.shape

(2030, 22)

In [47]:
# The ten artists values with the most rows in the 2020s subset
b20 = mus_2020s['artists'].value_counts().sort_values(ascending=False).iloc[:10]
# Left-join from those ten names so only their rows remain, grouped in ranking order
mus_2020s = pd.merge(pd.DataFrame({'artists': b20.index}), mus_2020s, how='left', on='artists')
# mus_2020s.to_csv('static/data/mus_2020s.csv', encoding='utf-8', index=False)
mus_2020s.shape

(277, 22)

In [48]:
# Row counts per artists value in the 2020s top-10 subset (ten largest)
mus_2020s['artists'].value_counts().nlargest(10)

Future, Lil Uzi Vert          75
YoungBoy Never Broke Again    32
J Balvin                      26
NAV                           25
BTS                           24
The Kid LAROI                 23
KAROL G                       22
Lil Uzi Vert                  18
Sam Smith                     16
Machine Gun Kelly             16
Name: artists, dtype: int64

In [49]:
## All Decades Data Together

# Stack the per-decade top-10 subsets in chronological order.
# pd.concat is used because DataFrame.append was removed in pandas 2.0; with its
# defaults it keeps each frame's own index labels, exactly as append did.
all_mus = pd.concat([
    mus_1920s, mus_1930s, mus_1940s, mus_1950s, mus_1960s, mus_1970s,
    mus_1980s, mus_1990s, mus_2000s, mus_2010s, mus_2020s,
])
# Save the combined table as UTF-8 CSV without the index column
all_mus.to_csv('data/clean_data_decade.csv', encoding='utf-8', index=False)

In [50]:
# (rows, columns) of the combined per-decade top-10 table
all_mus.shape

(15203, 22)