In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings 
# Suppress every warning so cell output stays uncluttered.
# NOTE(review): a blanket 'ignore' also hides pandas warnings (e.g.
# SettingWithCopyWarning, FutureWarning) that can point at real problems;
# consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# Load the raw music sales revenue export into a DataFrame.
revenue_df = pd.read_csv(
    '../../../data/Old_Data/Music_Sales_Revenue_Data.csv'
)

### Cleaning Music Sales Revenue dataset

In this notebook, I cleaned the music sales revenue dataset. I removed the extra columns and renamed the ones I kept. I also converted some object (string) values to floats so that they can be used in analyses.

In [3]:
# Row and column counts of the raw data.
revenue_df.shape

(286, 16)

In [4]:
# Preview the first 20 rows of the raw data.
revenue_df.head(20)

Unnamed: 0,Year of Year Date,Adjusted for Inflation Notes,Adjusted for Inflation Title,Format,Metric,Year,Value (For Charting),Adjusted for Inflation Flag,Year Date,Format Value # (Billion),Format Value # (Million),Total Value # (Billion),Total Value # (Million),Total Value For Year,Value (Actual),Year (copy)
0,2000,,,CD,Value,2000,13214.5,,2000,$13.2B,,$14.3B,,$14323.7B,13214.5,2000
1,2001,,,CD,Value,2001,12909.4,,2001,$12.9B,,$13.7B,,$13746.2B,12909.4,2001
2,2002,,,CD,Value,2002,12044.1,,2002,$12.0B,,$12.6B,,$12615.8B,12044.1,2002
3,2003,,,CD,Value,2003,11232.9,,2003,$11.2B,,$11.9B,,$11854.4B,11232.9,2003
4,2004,,,CD,Value,2004,11446.5,,2004,$11.4B,,$12.3B,,$12345.1B,11446.5,2004
5,2005,,,CD,Value,2005,10520.2,,2005,$10.5B,,$12.3B,,$12289.9B,10520.2,2005
6,2006,,,CD,Value,2006,9372.6,,2006,$9.4B,,$11.8B,,$11759.5B,9372.6,2006
7,2007,,,CD,Value,2007,7452.3,,2007,$7.5B,,$10.7B,,$10650.9B,7452.3,2007
8,2008,,,CD,Value,2008,5471.3,,2008,$5.5B,,$8.8B,,$8776.8B,5471.3,2008
9,2009,,,CD,Value,2009,4318.8,,2009,$4.3B,,$7.8B,,$7831.0B,4318.8,2009


In [5]:
# List the raw column names.
revenue_df.columns

Index(['Year of Year Date', 'Adjusted for Inflation Notes',
       'Adjusted for Inflation Title', 'Format', 'Metric', 'Year',
       'Value (For Charting)', 'Adjusted for Inflation Flag', 'Year Date',
       'Format Value # (Billion)', 'Format Value # (Million)',
       'Total Value # (Billion)', 'Total Value # (Million)',
       'Total Value For Year', 'Value (Actual)', 'Year (copy)'],
      dtype='object')

In [6]:
# Rename the two value columns to names without spaces or special characters.
revenue_df2 = revenue_df.rename(
    columns={
        'Value (For Charting)': 'Revenue_Value',
        'Total Value # (Billion)': 'Total_Revenue_Year_Billions',
    }
)

In [7]:
# Confirm the two renamed columns appear under their new names.
revenue_df2.columns

Index(['Year of Year Date', 'Adjusted for Inflation Notes',
       'Adjusted for Inflation Title', 'Format', 'Metric', 'Year',
       'Revenue_Value', 'Adjusted for Inflation Flag', 'Year Date',
       'Format Value # (Billion)', 'Format Value # (Million)',
       'Total_Revenue_Year_Billions', 'Total Value # (Million)',
       'Total Value For Year', 'Value (Actual)', 'Year (copy)'],
      dtype='object')

In [8]:
# Columns retained for the cleaned dataset; all others are left out.
cols_to_keep = [
    'Format',
    'Metric',
    'Year',
    'Revenue_Value',
    'Total_Revenue_Year_Billions',
]

In [9]:
# Echo the list of columns to keep.
cols_to_keep

['Format', 'Metric', 'Year', 'Revenue_Value', 'Total_Revenue_Year_Billions']

In [10]:
# Keep only the selected columns. The explicit .copy() makes revenue_df3 an
# independent frame, so the column assignments in later cells modify it
# directly rather than a possible view of revenue_df2 (which is what triggers
# pandas' SettingWithCopyWarning).
revenue_df3 = revenue_df2[cols_to_keep].copy()

In [11]:
# Display the frame restricted to the kept columns.
revenue_df3

Unnamed: 0,Format,Metric,Year,Revenue_Value,Total_Revenue_Year_Billions
0,CD,Value,2000,13214.500000,$14.3B
1,CD,Value,2001,12909.400000,$13.7B
2,CD,Value,2002,12044.100000,$12.6B
3,CD,Value,2003,11232.900000,$11.9B
4,CD,Value,2004,11446.500000,$12.3B
...,...,...,...,...,...
281,Vinyl Single,Value,2015,5.752954,$6.7B
282,Vinyl Single,Value,2016,4.880680,$7.6B
283,Vinyl Single,Value,2017,6.078385,$8.8B
284,Vinyl Single,Value,2018,5.290337,$9.8B


In [12]:
# Inspect the total-revenue column; values are still strings such as '$14.3B'.
revenue_df3['Total_Revenue_Year_Billions']

0      $14.3B
1      $13.7B
2      $12.6B
3      $11.9B
4      $12.3B
        ...  
281     $6.7B
282     $7.6B
283     $8.8B
284     $9.8B
285    $11.1B
Name: Total_Revenue_Year_Billions, Length: 286, dtype: object

In [13]:
# Strip the leading dollar sign. regex=False forces a literal match: '$' is
# the end-of-string anchor in a regular expression, and the default value of
# `regex` differs between pandas versions.
revenue_df3['Total_Revenue_Year_Billions'] = (
    revenue_df3['Total_Revenue_Year_Billions'].str.replace('$', '', regex=False)
)

In [14]:
# Verify that the '$' prefix has been removed.
revenue_df3['Total_Revenue_Year_Billions']

0      14.3B
1      13.7B
2      12.6B
3      11.9B
4      12.3B
       ...  
281     6.7B
282     7.6B
283     8.8B
284     9.8B
285    11.1B
Name: Total_Revenue_Year_Billions, Length: 286, dtype: object

In [15]:
# Strip the 'B' (billions) suffix. regex=False states explicitly that this is
# a literal replacement, independent of the pandas version's default.
revenue_df3['Total_Revenue_Year_Billions'] = (
    revenue_df3['Total_Revenue_Year_Billions'].str.replace('B', '', regex=False)
)

In [16]:
# Verify that the 'B' suffix has been removed; the dtype is still object here.
revenue_df3['Total_Revenue_Year_Billions']

0      14.3
1      13.7
2      12.6
3      11.9
4      12.3
       ... 
281     6.7
282     7.6
283     8.8
284     9.8
285    11.1
Name: Total_Revenue_Year_Billions, Length: 286, dtype: object

In [17]:
# Cast the cleaned numeric strings to floats so the column supports arithmetic.
revenue_df3['Total_Revenue_Year_Billions'] = (
    revenue_df3['Total_Revenue_Year_Billions'].astype(float)
)

In [18]:
# Verify that the column dtype is now float64.
revenue_df3['Total_Revenue_Year_Billions']

0      14.3
1      13.7
2      12.6
3      11.9
4      12.3
       ... 
281     6.7
282     7.6
283     8.8
284     9.8
285    11.1
Name: Total_Revenue_Year_Billions, Length: 286, dtype: float64

## Notes: 

* Followed similar steps to the Music_Sales_Volume data: renamed columns to remove spaces and special characters, and removed unneeded columns.
* In this case, fewer columns were needed. Total Revenue in Millions was irrelevant because that column contained no data; everything was in the Billions column.
* Under 'Revenue_Value' is the revenue, in millions of dollars, for that specific format in that specific year (e.g. 13214.5 for CDs in 2000 corresponds to $13.2B).
* Under 'Total_Revenue_Year_Billions' is the total revenue of that year (including all formats), in billions of dollars.
    * Removed the dollar sign and the trailing 'B' from the values in this column, then converted them to floats.
    * I'm not sure if I'll actually use that specific column for analysis or for the story at the end, but just leaving the option open by turning them into usable numbers.

In [19]:
# Final look at the cleaned frame before writing it out.
revenue_df3

Unnamed: 0,Format,Metric,Year,Revenue_Value,Total_Revenue_Year_Billions
0,CD,Value,2000,13214.500000,14.3
1,CD,Value,2001,12909.400000,13.7
2,CD,Value,2002,12044.100000,12.6
3,CD,Value,2003,11232.900000,11.9
4,CD,Value,2004,11446.500000,12.3
...,...,...,...,...,...
281,Vinyl Single,Value,2015,5.752954,6.7
282,Vinyl Single,Value,2016,4.880680,7.6
283,Vinyl Single,Value,2017,6.078385,8.8
284,Vinyl Single,Value,2018,5.290337,9.8


In [20]:
# Save the cleaned frame in the same folder as the raw file, without the row index.
revenue_df3.to_csv(
    '../../../data/Old_Data/Music_Sales_Revenue_Clean.csv',
    index=False,
)