In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings 
# Suppress all warnings for the session: no category or module filter is given,
# so deprecation/future warnings raised by later cells will not be shown.
warnings.filterwarnings('ignore')

In [2]:
# Load the raw music sales export into a DataFrame (path is relative to this notebook).
music_sales_df = pd.read_csv(
    filepath_or_buffer='../../data/Usable_Data/Full_Music_Sales_Data_Raw.csv'
)

## Cleaning my new data. 

I mainly removed and renamed certain columns, and made sure some objects were turned into floats to be able to analyze them.

In [3]:
# A cell only echoes its final expression, so a bare `.shape` on an earlier
# line is evaluated and thrown away. Print it explicitly, then let the frame
# itself render as the cell output.
print(music_sales_df.shape)
music_sales_df

Unnamed: 0,Adjusted for Inflation Flag,Format,Format (copy),Metric,Year,Year Date,Format Value # (Billion),Format Value # (Million),Number of Records,Value (Actual),Value (Actual) (copy),Year (copy)
0,,CD,CD,Units,1973,1973,,,1,,,1973
1,,CD,CD,Units,1974,1974,,,1,,,1974
2,,CD,CD,Units,1975,1975,,,1,,,1975
3,,CD,CD,Units,1976,1976,,,1,,,1976
4,,CD,CD,Units,1977,1977,,,1,,,1977
...,...,...,...,...,...,...,...,...,...,...,...,...
2956,,Vinyl Single,Vinyl Single,Value (Adjusted),2015,2015,,$6.2M,1,6.205390,$6M,2015
2957,,Vinyl Single,Vinyl Single,Value (Adjusted),2016,2016,,$5.2M,1,5.198931,$5M,2016
2958,,Vinyl Single,Vinyl Single,Value (Adjusted),2017,2017,,$6.3M,1,6.339678,$6M,2017
2959,,Vinyl Single,Vinyl Single,Value (Adjusted),2018,2018,,$5.4M,1,5.386197,$5M,2018


In [4]:
# Show the column labels of the raw frame.
music_sales_df.columns

Index(['Adjusted for Inflation Flag', 'Format', 'Format (copy)', 'Metric',
       'Year', 'Year Date', 'Format Value # (Billion)',
       'Format Value # (Million)', 'Number of Records', 'Value (Actual)',
       'Value (Actual) (copy)', 'Year (copy)'],
      dtype='object')

In [5]:
# Column labels (as spelled in the raw file) that are used to subset the raw frame.
cols_to_keep = [
    'Format',
    'Metric',
    'Year',
    'Format Value # (Billion)',
    'Format Value # (Million)',
    'Value (Actual)',
]

In [6]:
# Echo the list to confirm its contents.
cols_to_keep

['Format',
 'Metric',
 'Year',
 'Format Value # (Billion)',
 'Format Value # (Million)',
 'Value (Actual)']

In [7]:
# Keep every row, but only the columns named in cols_to_keep.
music_sales_df2 = music_sales_df.loc[:, cols_to_keep]

In [8]:
# Display the column-subset frame.
music_sales_df2

Unnamed: 0,Format,Metric,Year,Format Value # (Billion),Format Value # (Million),Value (Actual)
0,CD,Units,1973,,,
1,CD,Units,1974,,,
2,CD,Units,1975,,,
3,CD,Units,1976,,,
4,CD,Units,1977,,,
...,...,...,...,...,...,...
2956,Vinyl Single,Value (Adjusted),2015,,$6.2M,6.205390
2957,Vinyl Single,Value (Adjusted),2016,,$5.2M,5.198931
2958,Vinyl Single,Value (Adjusted),2017,,$6.3M,6.339678
2959,Vinyl Single,Value (Adjusted),2018,,$5.4M,5.386197


#### First I removed the unnecessary columns: some were copies of each other, and others were not relevant to my analysis.

In [9]:
# Replace the labels containing spaces, '#' and parentheses with underscore-separated names.
music_sales_df3 = music_sales_df2.rename(
    columns={
        'Format Value # (Billion)': 'Format_Value_Num_Billions',
        'Format Value # (Million)': 'Format_Value_Num_Millions',
        'Value (Actual)': 'Actual_Value',
    }
)

In [10]:
# Spot-check ten randomly drawn rows; no random_state is passed, so the rows differ on each run.
music_sales_df3.sample(10)

Unnamed: 0,Format,Metric,Year,Format_Value_Num_Billions,Format_Value_Num_Millions,Actual_Value
2923,Vinyl Single,Value (Adjusted),1982,,$749.8M,749.75058
386,Music Video (Physical),Units,1983,,,
1953,CD,Value (Adjusted),1999,$19.7B,,19667.327786
2888,Synchronization,Value (Adjusted),1994,,,
1103,Download Album,Value,1982,,,
700,Download Music Video,Units,2015,,$3.2M,3.223325
541,Download Single,Units,1997,,,
1547,Other Digital,Value,2016,,$17.1M,17.141663
1453,On-Demand Streaming (Ad-Supported),Value,2016,,$489.4M,489.377608
548,Download Single,Units,2004,,$139.4M,139.4


#### Next I renamed the columns that had special characters and spaces.

In [11]:
# Peek at 15 random values of the billions column (unseeded, so the draw changes per run).
music_sales_df3['Format_Value_Num_Billions'].sample(15)

1574      NaN
1134    $1.2B
2188      NaN
2372      NaN
2246      NaN
2822      NaN
2692      NaN
746       NaN
2785      NaN
107       NaN
269       NaN
2097      NaN
1573      NaN
770       NaN
1832      NaN
Name: Format_Value_Num_Billions, dtype: object

#### Unsure if I will use these specific columns, but I am removing special characters and turning the objects into floats to make them usable numbers just in case.

In [12]:
# Remove the dollar sign from the billions strings. '$' is the regex
# end-of-string anchor, so pass regex=False to force a literal match instead
# of relying on the pandas-version-dependent default of `regex`.
music_sales_df3['Format_Value_Num_Billions'] = music_sales_df3['Format_Value_Num_Billions'].str.replace('$', '', regex=False)

In [13]:
# Strip the 'B' suffix from the billions strings, then cast the column to float.
music_sales_df3['Format_Value_Num_Billions'] = (
    music_sales_df3['Format_Value_Num_Billions']
    .str.replace('B', '')
    .astype(float)
)

In [14]:
# Peek at 15 random values of the billions column (unseeded, so the draw changes per run).
music_sales_df3['Format_Value_Num_Billions'].sample(15)

2068   NaN
2670   NaN
932    NaN
531    NaN
2284   NaN
590    NaN
2494   NaN
2689   NaN
2890   NaN
2554   NaN
72     NaN
2190   NaN
0      NaN
364    NaN
2079   NaN
Name: Format_Value_Num_Billions, dtype: float64

In [15]:
# Peek at 15 random values of the millions column (unseeded, so the draw changes per run).
music_sales_df3['Format_Value_Num_Millions'].sample(15)

2736        NaN
765         NaN
174       $0.00
856         NaN
1468        NaN
2123        NaN
2566        NaN
1903        NaN
823         NaN
2275        NaN
1449    $170.9M
236     $204.0M
2158      $3.0M
231      $14.8M
993         NaN
Name: Format_Value_Num_Millions, dtype: object

In [16]:
# Remove the dollar sign from the millions strings. '$' is the regex
# end-of-string anchor, so pass regex=False to force a literal match instead
# of relying on the pandas-version-dependent default of `regex`.
music_sales_df3['Format_Value_Num_Millions'] = music_sales_df3['Format_Value_Num_Millions'].str.replace('$', '', regex=False)

In [17]:
# Strip the 'M' suffix from the millions strings, then cast the column to float.
music_sales_df3['Format_Value_Num_Millions'] = (
    music_sales_df3['Format_Value_Num_Millions']
    .str.replace('M', '')
    .astype(float)
)

In [18]:
# Display the frame after the billions/millions columns were converted to floats.
music_sales_df3

Unnamed: 0,Format,Metric,Year,Format_Value_Num_Billions,Format_Value_Num_Millions,Actual_Value
0,CD,Units,1973,,,
1,CD,Units,1974,,,
2,CD,Units,1975,,,
3,CD,Units,1976,,,
4,CD,Units,1977,,,
...,...,...,...,...,...,...
2956,Vinyl Single,Value (Adjusted),2015,,6.2,6.205390
2957,Vinyl Single,Value (Adjusted),2016,,5.2,5.198931
2958,Vinyl Single,Value (Adjusted),2017,,6.3,6.339678
2959,Vinyl Single,Value (Adjusted),2018,,5.4,5.386197


In [19]:
# Write the cleaned frame to disk, leaving the row index out of the file.
music_sales_df3.to_csv(
    path_or_buf='../../data/Usable_Data/Full_Music_Sales_Data_Clean.csv',
    index=False,
)

#### Saved the clean data to my data folder. However, this is not the data I want to _use_, as I still have to add a map that separates the formats into "Modes".