### Cleaning and Processing Data

In [1]:
# Importing required libraries
import pandas as pd
import os
import datetime
import math
import numpy as np

# Locations of the two raw CSV inputs (main data and data with genres)
data = 'data/data.csv'
genres = 'data/data_w_genres.csv'
# Load both files into DataFrames in one pass
mus_data, genres_data = (pd.read_csv(csv_path) for csv_path in (data, genres))

In [2]:
# Cleaning genres column's str values.
# The first two patterns are regex character classes, so regex=True is passed explicitly:
# pandas >= 2.0 treats the pattern as a literal string by default, which would
# silently leave the quotes/brackets/parentheses in place.
genres_data['genres'] = genres_data['genres'].str.replace(r"[\"\])([']", '', regex=True)
genres_data['artists'] = genres_data['artists'].str.replace(r"[\"\])(*[']", '', regex=True)
# '$' is replaced as a plain character, so no regex is needed here
genres_data['artists'] = genres_data['artists'].str.replace('$', 'S', regex=False)

In [3]:
# Restrict the genres frame to the two columns used later: artists and genres
genres_data = genres_data.loc[:, ['artists', 'genres']]

In [4]:
# Cleaning main data column's str values.
# The pattern is a regex character class, so regex=True is passed explicitly:
# pandas >= 2.0 treats the pattern as a literal string by default, which would
# silently leave the quotes/brackets/parentheses in place.
mus_data['artists'] = mus_data['artists'].str.replace(r"[\"\])(*[']", '', regex=True)
# Replace the literal '$' character. regex=False makes this explicit: as a regex,
# "$" is the end-of-string anchor and would append 'S' instead of replacing '$'.
mus_data['artists'] = mus_data['artists'].str.replace('$', 'S', regex=False)

In [5]:
# Derive the decade of each row: divide the year by 10, round down, scale back by 10
mus_data['decade'] = np.floor(mus_data['year'].div(10)).mul(10)

In [6]:
# Cast the decade column to an integer dtype (it is a float after np.floor)
mus_data = mus_data.assign(decade=mus_data['decade'].astype(int))

In [7]:
# Build an "artists - name" key per row; it is used later to de-duplicate songs
mus_data['artist_song'] = mus_data['artists'].add(' - ').add(mus_data['name'])

In [8]:
# Left-join the genres onto the main data by artist, keeping every main-data row
# NOTE(review): if genres_data holds repeated 'artists' values, matching rows are duplicated — confirm
merged_data = pd.merge(mus_data, genres_data, how='left', on='artists')
# merged_data.shape

In [9]:
# Replacing NaN values with empty values after merging.
# The result is assigned back instead of calling fillna(..., inplace=True) on the
# column selection: that chained in-place form is deprecated and does not update
# the parent frame when pandas Copy-on-Write is active.
merged_data['genres'] = merged_data['genres'].fillna('')
# merged_data.shape

In [10]:
# Column layout of the cleaned frame: identifying fields first, audio features after
column_order = [
    'decade', 'year', 'artists', 'name', 'genres', 'duration_ms',
    'artist_song', 'release_date', 'valence', 'acousticness',
    'danceability', 'energy', 'explicit', 'instrumentalness', 'key',
    'liveness', 'loudness', 'mode', 'popularity', 'speechiness',
    'tempo', 'id',
]
merged_data = merged_data[column_order]

In [11]:
# Order rows from longest to shortest duration so the longest (non-music) items come first
merged_data = merged_data.sort_values(by='duration_ms', ascending=False)

In [12]:
# Cleaning merged data, dropping audio books:
# every row lasting 1,000,000 ms or more, plus every row of the listed artists entries
audiobook_artists = [
    'Georgette Heyer, Irina Salkow',
    'Трумен Капоте',
    'Seweryn Goszczyński',
    'H.P. Lovecraft',
    'Эрих Мария Ремарк',
    'Эрнест Хемингуэй',
]
is_too_long = merged_data['duration_ms'] >= 1000000
is_audiobook_artist = merged_data['artists'].isin(audiobook_artists)
# Remove the matching rows from the frame in place, by index label
merged_data.drop(merged_data.index[is_too_long | is_audiobook_artist], inplace=True)

In [13]:
# Parse release_date as datetimes (values that cannot be parsed become NaT),
# then keep only the year component in the column
parsed_release = pd.to_datetime(merged_data['release_date'], errors='coerce')
merged_data['release_date'] = parsed_release.dt.year

In [14]:
# Keep only the first row for each artist_song value, then write the
# master clean data to a CSV file and show the resulting shape
merged_data = merged_data.drop_duplicates(subset=['artist_song'], keep='first')
merged_data.to_csv('data/clean_data_all.csv', index=False, encoding='utf-8')
merged_data.shape

(154093, 22)

### Dividing data into decades to find out top-10 artists

In [15]:
# 1920s decade

# Rows of the cleaned data whose decade value is 1920
mus_1920s = merged_data.loc[merged_data['decade'].eq(1920)]
mus_1920s.shape
# mus_1920s['artists'].value_counts().nlargest(10)

(3478, 22)

In [16]:
# Ten artists entries with the most rows in this decade, highest count first
a20 = (
    mus_1920s['artists']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
)
# Left-join from the top-ten list so only those artists' rows remain
mus_1920s = (
    pd.DataFrame({'artists': a20.index})
    .merge(mus_1920s, on='artists', how='left')
)
# mus_1920s.to_csv('static/data/mus_1920s.csv', encoding='utf-8', index=False)
mus_1920s.shape

(1678, 22)

In [17]:
# Show the ten largest per-artists row counts in the filtered 1920s frame
mus_1920s['artists'].value_counts().nlargest(10)

Francisco Canaro                      666
Ignacio Corsini                       405
Francisco Canaro, Charlo              239
Frédéric Chopin, Vladimir Horowitz     82
Fats Waller                            61
Ted Weems & His Orchestra              47
Louis Armstrong & His Hot Five         47
George Olsen                           46
Frédéric Chopin, Arthur Rubinstein     45
Leroy Carr                             40
Name: artists, dtype: int64

In [18]:
# top10_artist20s = mus_1920s[['artists', 'decade']].copy()
# top10_artist20s['songs_released'] = top10_artist20s.groupby(['artists'])['artists'].transform('count')
# top10_artist20s = top10_artist20s.drop_duplicates(subset=['artists'], keep='first')
# top10_artist20s.reset_index(drop=True, inplace=True)
# top10_artist20s.to_csv('static/data/top10_artist20s.csv', encoding='utf-8', index=False)
# top10_artist20s.head(15)

In [19]:
# 1930s decade

# Rows of the cleaned data whose decade value is 1930
mus_1930s = merged_data.loc[merged_data['decade'].eq(1930)]
mus_1930s.shape

(8106, 22)

In [20]:
# Ten artists entries with the most rows in this decade, highest count first
a30 = (
    mus_1930s['artists']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
)
# Left-join from the top-ten list so only those artists' rows remain
mus_1930s = (
    pd.DataFrame({'artists': a30.index})
    .merge(mus_1930s, on='artists', how='left')
)
# mus_1930s.to_csv('static/data/mus_1930s.csv', encoding='utf-8', index=False)
mus_1930s.shape

(1572, 22)

In [21]:
# Show the ten largest per-artists row counts in the filtered 1930s frame
mus_1930s['artists'].value_counts().nlargest(10)

Francisco Canaro, Charlo                210
Sinclair Lewis, Frank Arnold            204
Francisco Canaro                        197
Francisco Canaro, Ernesto Fama          194
Francisco Canaro, Roberto Maida         180
Ignacio Corsini                         138
Umm Kulthum                             115
Franz Joseph Haydn, Pro Arte Quartet    113
Lead Belly                              112
Ernst H. Gombrich, Christoph Waltz      109
Name: artists, dtype: int64

In [22]:
# 1940s decade

# Rows of the cleaned data whose decade value is 1940
mus_1940s = merged_data.loc[merged_data['decade'].eq(1940)]
mus_1940s.shape

(13037, 22)

In [23]:
# Ten artists entries with the most rows in this decade, highest count first
a40 = (
    mus_1940s['artists']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
)
# Left-join from the top-ten list so only those artists' rows remain
mus_1940s = (
    pd.DataFrame({'artists': a40.index})
    .merge(mus_1940s, on='artists', how='left')
)
# mus_1940s.to_csv('static/data/mus_1940s.csv', encoding='utf-8', index=False)
mus_1940s.shape

(1126, 22)

In [24]:
# Show the ten largest per-artists row counts in the filtered 1940s frame
mus_1940s['artists'].value_counts().nlargest(10)

Orchestra Studio 7                              221
Lead Belly                                      153
Frédéric Chopin, Arthur Rubinstein              120
Shamshad Begum                                  118
Giuseppe Verdi, Arturo Toscanini                107
Lata Mangeshkar                                  95
Amirbai Karnataki                                94
Francisco Canaro, Carlos Roldán                  75
Igor Stravinsky, Columbia Symphony Orchestra     73
Geeta Dutt                                       70
Name: artists, dtype: int64

In [25]:
# 1950s decade

# Rows of the cleaned data whose decade value is 1950
mus_1950s = merged_data.loc[merged_data['decade'].eq(1950)]
mus_1950s.shape

(18383, 22)

In [26]:
# Ten artists entries with the most rows in this decade, highest count first
a50 = (
    mus_1950s['artists']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
)
# Left-join from the top-ten list so only those artists' rows remain
mus_1950s = (
    pd.DataFrame({'artists': a50.index})
    .merge(mus_1950s, on='artists', how='left')
)
# mus_1950s.to_csv('static/data/mus_1950s.csv', encoding='utf-8', index=False)
mus_1950s.shape

(1812, 22)

In [27]:
# Show the ten largest per-artists row counts in the filtered 1950s frame
mus_1950s['artists'].value_counts().nlargest(10)

Ella Fitzgerald                       243
Dean Martin                           235
Miles Davis                           205
Lata Mangeshkar                       202
Unspecified                           189
Frank Sinatra                         179
Oscar Peterson                        158
Johann Sebastian Bach, Glenn Gould    148
Billie Holiday                        141
Duke Ellington                        112
Name: artists, dtype: int64

In [28]:
# 1960s decade

# Rows of the cleaned data whose decade value is 1960
mus_1960s = merged_data.loc[merged_data['decade'].eq(1960)]
mus_1960s.shape

(18111, 22)

In [29]:
# Ten artists entries with the most rows in this decade, highest count first
a60 = (
    mus_1960s['artists']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
)
# Left-join from the top-ten list so only those artists' rows remain
mus_1960s = (
    pd.DataFrame({'artists': a60.index})
    .merge(mus_1960s, on='artists', how='left')
)
# mus_1960s.to_csv('static/data/mus_1960s.csv', encoding='utf-8', index=False)
mus_1960s.shape

(2193, 22)

In [30]:
# Show the ten largest per-artists row counts in the filtered 1960s frame
mus_1960s['artists'].value_counts().nlargest(10)

The Beach Boys        356
Frank Sinatra         299
Bob Dylan             252
Elvis Presley         236
The Beatles           220
Johnny Cash           205
The Rolling Stones    171
Nina Simone           160
Sam Cooke             147
The Kinks             147
Name: artists, dtype: int64

In [31]:
# 1970s decade

# Rows of the cleaned data whose decade value is 1970
mus_1970s = merged_data.loc[merged_data['decade'].eq(1970)]
mus_1970s.shape

(18175, 22)

In [32]:
# Ten artists entries with the most rows in this decade, highest count first
a70 = (
    mus_1970s['artists']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
)
# Left-join from the top-ten list so only those artists' rows remain
mus_1970s = (
    pd.DataFrame({'artists': a70.index})
    .merge(mus_1970s, on='artists', how='left')
)
# mus_1970s.to_csv('static/data/mus_1970s.csv', encoding='utf-8', index=False)
mus_1970s.shape

(1419, 22)

In [33]:
# Show the ten largest per-artists row counts in the filtered 1970s frame
mus_1970s['artists'].value_counts().nlargest(10)

Queen                       181
Fleetwood Mac               173
Bob Dylan                   168
Led Zeppelin                154
Bob Marley & The Wailers    148
Elton John                  136
The Rolling Stones          122
David Bowie                 113
Elvis Presley               113
Marvin Gaye                 111
Name: artists, dtype: int64

In [34]:
# 1980s decade

# Rows of the cleaned data whose decade value is 1980
mus_1980s = merged_data.loc[merged_data['decade'].eq(1980)]
mus_1980s.shape

(18223, 22)

In [35]:
# Ten artists entries with the most rows in this decade, highest count first
a80 = (
    mus_1980s['artists']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
)
# Left-join from the top-ten list so only those artists' rows remain
mus_1980s = (
    pd.DataFrame({'artists': a80.index})
    .merge(mus_1980s, on='artists', how='left')
)
# mus_1980s.to_csv('static/data/mus_1980s.csv', encoding='utf-8', index=False)
mus_1980s.shape

(981, 22)

In [36]:
# Show the ten largest per-artists row counts in the filtered 1980s frame
mus_1980s['artists'].value_counts().nlargest(10)

Queen                115
U2                   111
Talking Heads        109
The Smiths           104
Bruce Springsteen    100
Depeche Mode          96
Metallica             89
The Cure              87
Prince                86
R.E.M.                84
Name: artists, dtype: int64

In [37]:
# 1990s decade

# Rows of the cleaned data whose decade value is 1990
mus_1990s = merged_data.loc[merged_data['decade'].eq(1990)]
mus_1990s.shape

(18320, 22)

In [38]:
# Ten artists entries with the most rows in this decade, highest count first
a90 = (
    mus_1990s['artists']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
)
# Left-join from the top-ten list so only those artists' rows remain
mus_1990s = (
    pd.DataFrame({'artists': a90.index})
    .merge(mus_1990s, on='artists', how='left')
)
# mus_1990s.to_csv('static/data/mus_1990s.csv', encoding='utf-8', index=False)
mus_1990s.shape

(731, 22)

In [39]:
# Show the ten largest per-artists row counts in the filtered 1990s frame
mus_1990s['artists'].value_counts().nlargest(10)

Metallica                89
Joan Sebastian           85
Sublime                  82
Green Day                78
2Pac                     68
The Smashing Pumpkins    68
Mariah Carey             66
Alice In Chains          65
Los Tigres Del Norte     65
Luis Miguel              65
Name: artists, dtype: int64

In [40]:
# 2000s decade

# Rows of the cleaned data whose decade value is 2000
mus_2000s = merged_data.loc[merged_data['decade'].eq(2000)]
mus_2000s.shape

(17874, 22)

In [41]:
# Ten artists entries with the most rows in this decade, highest count first
b00 = (
    mus_2000s['artists']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
)
# Left-join from the top-ten list so only those artists' rows remain
mus_2000s = (
    pd.DataFrame({'artists': b00.index})
    .merge(mus_2000s, on='artists', how='left')
)
# mus_2000s.to_csv('static/data/mus_2000s.csv', encoding='utf-8', index=False)
mus_2000s.shape

(562, 22)

In [42]:
# Show the ten largest per-artists row counts in the filtered 2000s frame
mus_2000s['artists'].value_counts().nlargest(10)

John Mayer               69
Eminem                   68
John Williams            61
Jack Johnson             58
Fall Out Boy             55
System Of A Down         55
Radiohead                51
Red Hot Chili Peppers    50
Jay Chou                 48
Breaking Benjamin        47
Name: artists, dtype: int64

In [43]:
# 2010s decade

# Rows of the cleaned data whose decade value is 2010
mus_2010s = merged_data.loc[merged_data['decade'].eq(2010)]
mus_2010s.shape

(18682, 22)

In [44]:
# Ten artists entries with the most rows in this decade, highest count first
b10 = (
    mus_2010s['artists']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
)
# Left-join from the top-ten list so only those artists' rows remain
mus_2010s = (
    pd.DataFrame({'artists': b10.index})
    .merge(mus_2010s, on='artists', how='left')
)
# mus_2010s.to_csv('static/data/mus_2010s.csv', encoding='utf-8', index=False)
mus_2010s.shape

(749, 22)

In [45]:
# Show the ten largest per-artists row counts in the filtered 2010s frame
mus_2010s['artists'].value_counts().nlargest(10)

One Direction    95
BTS              93
Drake            92
Taylor Swift     91
Mac Miller       88
Lana Del Rey     66
The Weeknd       63
Ariana Grande    57
J. Cole          55
SuicideBoyS      49
Name: artists, dtype: int64

In [46]:
# 2020s decade

# Rows of the cleaned data whose decade value is 2020
mus_2020s = merged_data.loc[merged_data['decade'].eq(2020)]
mus_2020s.shape

(1704, 22)

In [47]:
# Ten artists entries with the most rows in this decade, highest count first
b20 = (
    mus_2020s['artists']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
)
# Left-join from the top-ten list so only those artists' rows remain
mus_2020s = (
    pd.DataFrame({'artists': b20.index})
    .merge(mus_2020s, on='artists', how='left')
)
# mus_2020s.to_csv('static/data/mus_2020s.csv', encoding='utf-8', index=False)
mus_2020s.shape

(180, 22)

In [48]:
# Show the ten largest per-artists row counts in the filtered 2020s frame
mus_2020s['artists'].value_counts().nlargest(10)

YoungBoy Never Broke Again    32
Future, Lil Uzi Vert          23
BTS                           22
Juice WRLD                    16
Taylor Swift                  16
Lil Uzi Vert                  15
The Weeknd                    15
Chris Stapleton               14
The Kid LAROI                 14
J Balvin                      13
Name: artists, dtype: int64

In [49]:
## All Decades Data Together

# Stack the per-decade top-artist frames in chronological order.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# like append's default, it keeps each frame's own index labels.
all_mus = pd.concat([
    mus_1920s, mus_1930s, mus_1940s, mus_1950s, mus_1960s, mus_1970s,
    mus_1980s, mus_1990s, mus_2000s, mus_2010s, mus_2020s,
])
# Keep the first row per artist_song value across the combined frame
all_mus = all_mus.drop_duplicates(subset='artist_song', keep="first")
# Persist the combined per-decade data
all_mus.to_csv('data/clean_data_decade.csv', encoding='utf-8', index=False)

In [50]:
# Show the (rows, columns) size of the combined per-decade frame
all_mus.shape

(13003, 22)