In [589]:
import pandas as pd
import sklearn.model_selection as skmod
import datetime as dt

folder = 'Data/'
song_df = pd.read_csv(folder+'ml-03-data-processing-songs-dataset.csv')

song_df.shape

(1994, 16)

In [590]:
song_df.info()
song_df[song_df['Month'].isna()]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1994 entries, 0 to 1993
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Index                   1994 non-null   int64  
 1   Title                   1991 non-null   object 
 2   Artist                  1986 non-null   object 
 3   Top Genre               1986 non-null   object 
 4   Year                    1987 non-null   float64
 5   Month                   1994 non-null   object 
 6   Beats Per Minute (BPM)  1985 non-null   float64
 7   Energy                  1986 non-null   float64
 8   Danceability            970 non-null    float64
 9   Loudness (dB)           1987 non-null   float64
 10  Liveness                1986 non-null   float64
 11  Valence                 960 non-null    float64
 12  Length (Duration)       1985 non-null   object 
 13  Acousticness            1986 non-null   float64
 14  Speechiness             1990 non-null   

Unnamed: 0,Index,Title,Artist,Top Genre,Year,Month,Beats Per Minute (BPM),Energy,Danceability,Loudness (dB),Liveness,Valence,Length (Duration),Acousticness,Speechiness,Popularity


In [591]:
song_df[song_df['Artist'].isna()]



Unnamed: 0,Index,Title,Artist,Top Genre,Year,Month,Beats Per Minute (BPM),Energy,Danceability,Loudness (dB),Liveness,Valence,Length (Duration),Acousticness,Speechiness,Popularity
664,665,I Just Can't Help Believin',,adult standards,,7,,,,,9.0,,274.0,,5.0,
775,776,Nothing Breaks Like a Heart (feat. Miley Cyrus),,,2018.0,3,114.0,79.0,,-6.0,,,,,,
886,887,Starman - 2012 Remaster,,,,2,,,,,54.0,,254.0,17.0,3.0,74.0
1109,1110,Fire,,dance pop,1978.0,6,,,,,,,,20.0,3.0,56.0
1331,1332,,,,1985.0,11,,52.0,,-11.0,,,,7.0,,60.0
1553,1554,In Bloom - Nevermind Version,,alternative rock,1991.0,1,,,,-5.0,21.0,,,,,
1775,1776,Freak On a Leash,,,,12,,,,-6.0,,,256.0,,5.0,73.0
1887,1888,Respect,,,,9,,,,-5.0,5.0,,,16.0,4.0,73.0


In [592]:
# removing all rows with NaN title, artist because they also have more than 50% values missing
song_df = song_df[song_df['Title'].notnull()]
song_df = song_df[song_df['Artist'].notnull()]
song_df = song_df[song_df['Top Genre'].notnull()]

# drop danceability and valence columns because of missing values
song_df.drop(labels=['Danceability','Valence'],axis=1,inplace=True)


# also deleting Come on Eileen entry because it makes resulting data much cleaner
song_df = song_df[song_df['Title'] != 'Come On Eileen']
song_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1981 entries, 0 to 1993
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Index                   1981 non-null   int64  
 1   Title                   1981 non-null   object 
 2   Artist                  1981 non-null   object 
 3   Top Genre               1981 non-null   object 
 4   Year                    1981 non-null   float64
 5   Month                   1981 non-null   object 
 6   Beats Per Minute (BPM)  1981 non-null   float64
 7   Energy                  1981 non-null   float64
 8   Loudness (dB)           1981 non-null   float64
 9   Liveness                1981 non-null   float64
 10  Length (Duration)       1981 non-null   object 
 11  Acousticness            1981 non-null   float64
 12  Speechiness             1981 non-null   float64
 13  Popularity              1981 non-null   float64
dtypes: float64(8), int64(1), object(5)
memor

In [593]:
# write a function to convert months to numbers
def convertMonth(month):
    # check if already a number
    if month.isnumeric():
        # make int
        return int(month)
    # if not, convert
    else:
        if month == 'Jan':
            return 1
        elif month == 'Feb':
            return 2
        elif month == 'Mar':
            return 3
        elif month == 'Apr':
            return 4
        elif month == 'May':
            return 5
        elif month =='Jun':
            return 6
        elif month == 'Jul':
            return 7
        elif month == 'Aug':
            return 8
        elif month == 'Sep':
            return 9
        elif month == 'Oct':
            return 10
        elif month == 'Nov':
            return 11
        elif month == 'Dec':
            return 12
        else:
            return None

# write function to convert duration to int
def convertDuration(duration):
    if duration.isnumeric():
        return int(duration)
    else:
        duration = duration[0] + duration[2:]
        print(duration)
        return int(duration)


# get months to same format, convert year 92 to 1992
song_df.Month = song_df.Month.apply(convertMonth)
song_df['Year'] = song_df.Year.replace([92.],1992.)
song_df['Length (Duration)'] = song_df['Length (Duration)'].apply(convertDuration)



1412
1121
1367
1292


In [594]:
# now that we are pretty clean, get descriptive statistics for all columns
colnames = song_df.columns[4:]
for col in colnames:
    std = song_df[col].std()
    mean = song_df[col].mean()
    outliers = song_df[(song_df[col] > mean + 3*std) | (song_df[col] < mean - 3*std)]
    print(col)
    print(f'Mean = {mean}')
    print(outliers[['Title',col]])
    print('============')



Year
Mean = 1993.0085815244827
Empty DataFrame
Columns: [Title, Year]
Index: []
Month
Mean = 6.521453811206461
Empty DataFrame
Columns: [Title, Month]
Index: []
Beats Per Minute (BPM)
Mean = 120.23876829883898
                                                Title  Beats Per Minute (BPM)
133                                               Zij                    18.0
284        Lady Jane - (Original Single Mono Version)                   305.0
1901  Lucy In The Sky With Diamonds - Remastered 2009                    15.0
1977                          Appleknockers Flophouse                   297.0
Energy
Mean = 59.641090358404846
Empty DataFrame
Columns: [Title, Energy]
Index: []
Loudness (dB)
Mean = -9.018172640080767
                                                  Title  Loudness (dB)
527                                          I See Fire          -21.0
690                    Het het nog nooit zo donker west          -20.0
878                                Famous Blue Raincoat        

In [595]:
# convert month and year into datetime

song_df['Day'] = 1
song_df['Full Date'] = pd.to_datetime(song_df[['Year','Month','Day']])
today = dt.date.today()
song_df['Age'] = None


for idx,date in enumerate(song_df['Full Date']):
    age = (dt.datetime.today() - date).days
    song_df['Age'].iloc[idx] = age

song_df['Age']
song_df.drop(['Full Date','Day','Month','Year'],axis=1,inplace=True)
song_df.info()
song_df.columns
# age is now calculated and other date columns dropped

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1981 entries, 0 to 1993
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Index                   1981 non-null   int64  
 1   Title                   1981 non-null   object 
 2   Artist                  1981 non-null   object 
 3   Top Genre               1981 non-null   object 
 4   Beats Per Minute (BPM)  1981 non-null   float64
 5   Energy                  1981 non-null   float64
 6   Loudness (dB)           1981 non-null   float64
 7   Liveness                1981 non-null   float64
 8   Length (Duration)       1981 non-null   int64  
 9   Acousticness            1981 non-null   float64
 10  Speechiness             1981 non-null   float64
 11  Popularity              1981 non-null   float64
 12  Age                     1981 non-null   object 
dtypes: float64(7), int64(2), object(4)
memory usage: 185.7+ KB


Index(['Index', 'Title', 'Artist', 'Top Genre', 'Beats Per Minute (BPM)',
       'Energy', 'Loudness (dB)', 'Liveness', 'Length (Duration)',
       'Acousticness', 'Speechiness', 'Popularity', 'Age'],
      dtype='object')

In [596]:
# now run get dummies on top genre
song_df = pd.get_dummies(song_df,columns=['Top Genre'])
song_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1981 entries, 0 to 1993
Columns: 161 entries, Index to Top Genre_yacht rock
dtypes: float64(7), int64(2), object(3), uint8(149)
memory usage: 466.2+ KB


END OF PROCESSING/CLEANING EXERCISE: BEGIN MODEL TRAINING EXERCISE

In [597]:
X = song_df
y = song_df['Popularity']
X.drop('Popularity',axis=1,inplace=True)

X_train, X_test, y_train, y_test = skmod.train_test_split(X,y,test_size=0.3,random_state=26)

X_train.describe()

Unnamed: 0,Index,Beats Per Minute (BPM),Energy,Loudness (dB),Liveness,Length (Duration),Acousticness,Speechiness,Top Genre_acid jazz,Top Genre_acoustic pop,...,Top Genre_reggae,Top Genre_reggae fusion,Top Genre_rock-and-roll,Top Genre_scottish singer-songwriter,Top Genre_soft rock,Top Genre_stomp and holler,Top Genre_streektaal,Top Genre_trance,Top Genre_uk pop,Top Genre_yacht rock
count,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,...,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0
mean,985.055556,119.993506,59.756854,-8.953824,19.134921,260.805195,29.189755,5.018038,0.000722,0.000722,...,0.003608,0.002165,0.000722,0.001443,0.0,0.000722,0.0,0.000722,0.000722,0.000722
std,576.889223,28.36514,22.050992,3.632688,17.009994,86.237634,28.977884,4.536777,0.026861,0.026861,...,0.059976,0.046491,0.026861,0.037973,0.0,0.026861,0.0,0.026861,0.026861,0.026861
min,1.0,37.0,3.0,-27.0,2.0,93.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,484.25,99.0,43.0,-11.0,9.0,212.0,4.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,978.5,119.0,61.5,-8.0,12.0,245.0,18.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1475.75,136.0,77.0,-6.0,23.0,289.0,50.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1994.0,305.0,100.0,-2.0,99.0,1367.0,99.0,55.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0


END OF MODEL TRAINING EXERCISE