# ETL Project
###
### Team 6 Project Members:  Nisha Saphota, Daniel Eddie, Jon Simpson, Rob Gauer
### Date Due:  Tuesday June 9, 2020

In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

## Store CSV into DataFrame

In [2]:
# Create dataframe from Kaggle dataset
kaggle_csv_file = "./Resources/spotify_60k_kaggle_master_song.csv"
album_data_df = pd.read_csv(kaggle_csv_file)

# display the contents of the data frame 
album_data_df.head()

Unnamed: 0,Release Date,Artist,Album,Track ID,Track,Popularity,Danceability,Energy,Key,Key Val,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature
0,2013,The Devil Wears Prada,0.345833333,56PpndWxF2IGNQeBATYEEl,Rumors,31,0.455,0.972,A,9,-3.025,1,0.187,0.000193,0.000229,0.387,0.102,133.034,4
1,2011,BeyoncÃ©,4.0,1uXbwHHfgsXcUKfSZw5ZJ0,Run the World (Girls),73,0.733,0.899,C,0,-4.237,1,0.143,0.00496,4.7e-05,0.372,0.76,127.086,4
2,1998,Lenny Kravitz,5.0,6vUoqsJ0uVBgSKaUAUEQYC,Live,39,0.632,0.926,G,7,-5.954,1,0.127,0.0089,0.466,0.367,0.777,168.284,4
3,1998,Lenny Kravitz,5.0,0KF7XWr4IxZsmD1DnSkDwh,Supersoulfighter,30,0.747,0.868,B,11,-6.36,0,0.0613,0.00247,0.174,0.494,0.485,110.95,4
4,2000,Lenny Kravitz,5.0,2zee8Zcesqwnnwliw2Jy8M,I Belong To You,57,0.69,0.664,F,5,-7.715,0,0.0542,0.000447,0.0343,0.073,0.63,87.287,4


In [3]:
# Create dataframe from Zenodo dataset
zenodo_csv_file = "./Resources/spotify_zenodo_output_data.csv"
album_reviews_all_columns_df = pd.read_csv(zenodo_csv_file)

# display the contents of the data frame 
album_reviews_all_columns_df.head()

Unnamed: 0,artist,album,reviewauthor,score,releaseyear,reviewdate,recordlabel,genre,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,Studio 1,Studio 1,Andy Battaglia,8.5,2009.0,February 18 2009,Studio,Electronic,0.511917,0.499667,5.25,-5.626583,0.031983,0.724917,0.024493,0.165367,0.555083,101.395167
1,John Fahey,The Great Santa Barbara Oil Slick,Mark Richardson,8.2,2005.0,February 13 2005,Water,Folk/Country,0.369765,0.325412,4.470588,-19.153824,0.148624,0.647053,0.559133,0.527782,0.179465,107.622647
2,Reigning Sound,Too Much Guitar,Stephen M. Deusner,8.3,2004.0,August 19 2004,In the Red,Electronic,0.253943,0.912857,4.428571,-1.0895,0.0555,0.000253,0.751214,0.199071,0.552786,133.8955
3,The Red Thread,After the Last,Chris Dahlen,7.3,2003.0,July 17 2003,Badman,Rock,0.4254,0.433474,5.7,-12.871,0.02826,0.310325,0.224137,0.12515,0.4514,104.3542
4,Mac Miller,Swimming,Evan Rytlewski,7.5,2018.0,August 3 2018,Warner Bros.,Rap,0.624846,0.438154,4.153846,-9.456077,0.170246,0.652462,0.012819,0.121131,0.281138,122.121308


## Select columns from DataFrame

In [4]:
# Revise dataframe from Zenodo dataset to only include the data columns of 'album' and 'score'.
album_reviews_data_df=album_reviews_all_columns_df[['album','score']]

# display the contents of the data frame 
album_reviews_data_df.head()

Unnamed: 0,album,score
0,Studio 1,8.5
1,The Great Santa Barbara Oil Slick,8.2
2,Too Much Guitar,8.3
3,After the Last,7.3
4,Swimming,7.5


## Clean DataFrame

In [5]:
# Review data counts
album_data_df.count()

Release Date        61044
Artist              61044
Album               61044
Track ID            61044
Track               61044
Popularity          61044
Danceability        61044
Energy              61044
Key                 61044
Key Val             61044
Loudness            61044
Mode                61044
Speechiness         61044
Acousticness        61044
Instrumentalness    61044
Liveness            61044
Valence             61044
Tempo               61044
Time Signature      61044
dtype: int64

In [6]:
# Review data counts
album_reviews_data_df.count()

album    18403
score    18403
dtype: int64

In [7]:
# Collecting a list of all columns within the DataFrame
album_data_df.columns

#for col in album_data_df:
#    print(col)

Index(['Release Date', 'Artist', 'Album', 'Track ID', 'Track', 'Popularity',
       'Danceability', 'Energy', 'Key', 'Key Val', 'Loudness', 'Mode',
       'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness',
       'Valence', 'Tempo', 'Time Signature'],
      dtype='object')

In [8]:
# Collecting a list of all columns within the DataFrame
album_reviews_data_df.columns
#for columns in album_reviews_data_df:
#    print(columns)

Index(['album', 'score'], dtype='object')

In [9]:
# Rename columns in dataframe 
album_reviews_data_df=album_reviews_data_df.rename(columns={"artist":"Artist","album":"Album","reviewauthor":"Review_Author","score":"Score","releaseyear":"Release_Year","reviewdate":"Review_Date","recordlabel":"Record_Label","genre":"Genre","danceability":"Danceability","energy":"Energy","key":"Key","loudness":"Loudness","speechiness":"Speechiness","acousticness":"Acousticness","instrumentalness":"Instrumentalness","liveness":"Liveness","valence":"Valence","tempo":"Tempo"})

# display the contents of the data frame 
album_reviews_data_df.head()

Unnamed: 0,Album,Score
0,Studio 1,8.5
1,The Great Santa Barbara Oil Slick,8.2
2,Too Much Guitar,8.3
3,After the Last,7.3
4,Swimming,7.5


In [10]:
# Display data information
album_reviews_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18403 entries, 0 to 18402
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Album   18403 non-null  object 
 1   Score   18403 non-null  float64
dtypes: float64(1), object(1)
memory usage: 287.7+ KB


## Merge the two DataFrames into a single dataset

In [11]:
# Create the Final Dataframe - presentation 
# Combine the two datasets (DataFrames) into a single dataset (DataFrame)
merge_datasets_df=pd.merge(album_reviews_data_df,album_data_df,on="Album")

# display the contents of the data frame 
merge_datasets_df.head()

Unnamed: 0,Album,Score,Release Date,Artist,Track ID,Track,Popularity,Danceability,Energy,Key,Key Val,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature
0,Swimming,7.5,2018,Mac Miller,39NDBdU5Xkm5pCFGa5kZtI,Ladders,73,0.802,0.463,G#,8,-8.379,1,0.162,0.236,0.00531,0.105,0.291,103.961,4
1,Swimming,7.5,2018,Mac Miller,01z2fBGB8Hl3Jd3zXe4IXR,Come Back to Earth,72,0.272,0.238,D,2,-12.148,0,0.0349,0.89,0.00925,0.275,0.119,83.507,4
2,Swimming,7.5,2018,Mac Miller,5p7GiBZNL1afJJDUrOA6C8,Hurt Feelings,71,0.69,0.462,G#,8,-8.054,1,0.13,0.343,0.00218,0.113,0.325,157.655,4
3,Swimming,7.5,2018,Mac Miller,2dgrYdgguVZKeCsrVb9XEs,What's the Use?,70,0.759,0.492,C#,1,-10.338,0,0.12,0.736,0.00989,0.107,0.561,104.974,4
4,Swimming,7.5,2018,Mac Miller,0Xcy81PsJCOO6mhLZaQyQ4,Perfecto,64,0.599,0.317,C#,1,-11.212,1,0.174,0.748,0.0,0.111,0.203,148.09,4


# Review/clean the combined DataFrame

In [12]:
# Clean the dataframe of all NAN values (remove data rows with NAN values from dataset)
merge_datasets_df.dropna()

Unnamed: 0,Album,Score,Release Date,Artist,Track ID,Track,Popularity,Danceability,Energy,Key,Key Val,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature
0,Swimming,7.5,2018,Mac Miller,39NDBdU5Xkm5pCFGa5kZtI,Ladders,73,0.802,0.463,G#,8,-8.379,1,0.1620,0.2360,0.005310,0.1050,0.2910,103.961,4
1,Swimming,7.5,2018,Mac Miller,01z2fBGB8Hl3Jd3zXe4IXR,Come Back to Earth,72,0.272,0.238,D,2,-12.148,0,0.0349,0.8900,0.009250,0.2750,0.1190,83.507,4
2,Swimming,7.5,2018,Mac Miller,5p7GiBZNL1afJJDUrOA6C8,Hurt Feelings,71,0.690,0.462,G#,8,-8.054,1,0.1300,0.3430,0.002180,0.1130,0.3250,157.655,4
3,Swimming,7.5,2018,Mac Miller,2dgrYdgguVZKeCsrVb9XEs,What's the Use?,70,0.759,0.492,C#,1,-10.338,0,0.1200,0.7360,0.009890,0.1070,0.5610,104.974,4
4,Swimming,7.5,2018,Mac Miller,0Xcy81PsJCOO6mhLZaQyQ4,Perfecto,64,0.599,0.317,C#,1,-11.212,1,0.1740,0.7480,0.000000,0.1110,0.2030,148.090,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10924,Destroyer,7.0,1976,KISS,3sb0xLwjjEjKaS7a80cWbq,Sweet Pain,31,0.422,0.877,B,11,-7.748,1,0.0934,0.0441,0.000000,0.4140,0.5010,131.640,4
10925,Destroyer,7.0,1976,KISS,3qyHjQ1zTLvPeA99CP3K9N,Shout It Out Loud,49,0.571,0.953,D#,3,-5.002,1,0.0568,0.1240,0.000006,0.3500,0.6820,135.599,4
10926,Destroyer,7.0,1976,KISS,1cPXOzyeZauSAsT1yy7aBp,Beth,60,0.387,0.238,C,0,-13.496,1,0.0339,0.8780,0.000715,0.1940,0.2790,115.354,4
10927,Destroyer,7.0,1976,KISS,6KbSviXtYJuPG3FbTvJZSI,Do You Love Me,44,0.457,0.915,D#,3,-5.876,1,0.0808,0.0623,0.000003,0.0778,0.4460,126.695,4


In [13]:
# verify counts - clean the data
merge_datasets_df.count()

Album               10929
Score               10929
Release Date        10929
Artist              10929
Track ID            10929
Track               10929
Popularity          10929
Danceability        10929
Energy              10929
Key                 10929
Key Val             10929
Loudness            10929
Mode                10929
Speechiness         10929
Acousticness        10929
Instrumentalness    10929
Liveness            10929
Valence             10929
Tempo               10929
Time Signature      10929
dtype: int64

In [14]:
# Interrogate dataset
len(merge_datasets_df["Track ID"].unique())

6978

In [15]:
# Interrogate dataset
len(merge_datasets_df["Artist"].unique())

607

In [16]:
# Interrogate dataset
len(merge_datasets_df["Album"].unique())

748

In [17]:
# Interrogate dataset
len(merge_datasets_df["Track"].unique())

6737

In [18]:
# Interrogate dataset
merge_datasets_df.duplicated()

0        False
1        False
2        False
3        False
4        False
         ...  
10924    False
10925    False
10926    False
10927    False
10928    False
Length: 10929, dtype: bool

In [19]:
# Interrogate dataset
merge_datasets_df.loc[merge_datasets_df.duplicated("Track ID")== True]

Unnamed: 0,Album,Score,Release Date,Artist,Track ID,Track,Popularity,Danceability,Energy,Key,Key Val,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature
13,Swimming,7.6,2018,Mac Miller,39NDBdU5Xkm5pCFGa5kZtI,Ladders,73,0.802,0.463,G#,8,-8.379,1,0.1620,0.23600,0.005310,0.1050,0.291,103.961,4
14,Swimming,7.6,2018,Mac Miller,01z2fBGB8Hl3Jd3zXe4IXR,Come Back to Earth,72,0.272,0.238,D,2,-12.148,0,0.0349,0.89000,0.009250,0.2750,0.119,83.507,4
15,Swimming,7.6,2018,Mac Miller,5p7GiBZNL1afJJDUrOA6C8,Hurt Feelings,71,0.690,0.462,G#,8,-8.054,1,0.1300,0.34300,0.002180,0.1130,0.325,157.655,4
16,Swimming,7.6,2018,Mac Miller,2dgrYdgguVZKeCsrVb9XEs,What's the Use?,70,0.759,0.492,C#,1,-10.338,0,0.1200,0.73600,0.009890,0.1070,0.561,104.974,4
17,Swimming,7.6,2018,Mac Miller,0Xcy81PsJCOO6mhLZaQyQ4,Perfecto,64,0.599,0.317,C#,1,-11.212,1,0.1740,0.74800,0.000000,0.1110,0.203,148.090,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10437,Wrecking Ball,5.9,1995,Emmylou Harris,6RDv00YIrMNMolgqqdk0pc,Black Hawk - Alternate Version,18,0.519,0.303,A#,10,-12.038,1,0.0307,0.78800,0.000016,0.0874,0.379,95.255,4
10438,Wrecking Ball,5.9,1995,Emmylou Harris,1mjeZETkm2fkRnHNAKrivA,May This Be Love - Acoustic,17,0.457,0.185,A,9,-10.076,1,0.0462,0.22400,0.002720,0.1280,0.298,178.458,3
10439,Wrecking Ball,5.9,1995,Emmylou Harris,4myfVuO4VG3nRsOrdjmkNn,Goin' Back to Harlan,33,0.633,0.420,C#,1,-10.833,1,0.0705,0.02610,0.008360,0.1180,0.604,79.253,4
10440,Wrecking Ball,5.9,1995,Emmylou Harris,4fW3vlPsJ1pV4SWLHBs2tS,Where Will I Be? - Alternate Version - Incomplete,17,0.428,0.518,C#,1,-10.863,0,0.0612,0.33600,0.000002,0.2570,0.334,97.737,4


In [34]:
# Interrogate dataset

# Sort by multiple columns
merge_datasets_df.sort_values(by=['Album','Artist','Track','Track ID','Score'], inplace=True)

# display the contents of the data frame 
merge_datasets_df

# groupby_album_merge_datasets_df=merge_datasets_df.groupby(['Album'])
# groupby_album_merge_datasets_df.head()

Unnamed: 0,Album,Score,Release Date,Artist,Track ID,Track,Popularity,Danceability,Energy,Key,Key Val,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature
6926,(What's the Story) Morning Glory?,8.9,1995,Oasis,42qhBnmhRdye0AkG6fguab,Acquiesce (Live At Earls Court),23,0.311,0.801,C,0,-3.409,1,0.0602,0.000051,0.055400,0.595,0.379,121.340,4
6913,(What's the Story) Morning Glory?,8.9,1995,Oasis,5mhZxnrzzXVNJiy24hLeNj,Acquiesce - Remastered,39,0.208,0.904,G,7,-4.330,1,0.0810,0.012000,0.001160,0.337,0.194,112.125,4
6923,(What's the Story) Morning Glory?,8.9,1995,Oasis,7ytT7YUeNBKYdqaRzBE8wc,Bonehead's Bank Holiday - Remastered,29,0.367,0.792,G,7,-5.038,1,0.0969,0.308000,0.000000,0.403,0.533,166.134,4
6936,(What's the Story) Morning Glory?,8.9,1995,Oasis,0GhjO8IyrfczKpn2iEme5K,Boneheadâ€™s Bank Holiday (Demo),22,0.340,0.639,G,7,-7.518,1,0.0306,0.784000,0.002390,0.173,0.657,176.768,4
6938,(What's the Story) Morning Glory?,8.9,1995,Oasis,61OmtwKId37JLGmShjRXlj,Cast No Shadow (Live At Maine Road),23,0.305,0.822,C,0,-5.951,1,0.0396,0.005150,0.000000,0.373,0.323,96.281,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7836,"thank u, next",7.9,2019,Ariana Grande,1TEL6MlSSVLSdhOSddidlJ,needy,74,0.647,0.309,G,7,-7.948,0,0.0366,0.780000,0.000007,0.202,0.195,87.045,4
7833,"thank u, next",7.9,2019,Ariana Grande,3e9HZxeyfWwjeyPAMmWSSQ,"thank u, next",85,0.717,0.653,C#,1,-5.634,1,0.0658,0.229000,0.000000,0.101,0.412,106.966,4
6196,untitled unmastered.,8.6,2016,Kendrick Lamar,3I0FBDc1c1BLNtXWKVjmFg,u,60,0.667,0.508,F,5,-10.337,1,0.4510,0.341000,0.000000,0.644,0.093,89.436,3
4934,xx,8.7,2009,The xx,0DAsxISzun85PbsqAfIzeC,Intro,63,0.617,0.778,A,9,-8.871,0,0.0270,0.459000,0.925000,0.128,0.152,100.363,4


In [35]:
# Interrogate dataset

# Sort by multiple columns
merge_datasets_df.sort_values(by=['Artist','Album','Track','Track ID','Score'], inplace=True)

# display the contents of the data frame 
merge_datasets_df

Unnamed: 0,Album,Score,Release Date,Artist,Track ID,Track,Popularity,Danceability,Energy,Key,Key Val,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature
1935,Greatest Hits,5.8,2005,*NSYNC,72otaqywVqwyXaCjk75JKm,(God Must Have Spent) A Little More Time On Yo...,58,0.468,0.535,A#,10,-8.264,1,0.0436,0.440,0.000000,0.1010,0.294,168.001,4
459,Greatest Hits,6.1,2005,*NSYNC,72otaqywVqwyXaCjk75JKm,(God Must Have Spent) A Little More Time On Yo...,58,0.468,0.535,A#,10,-8.264,1,0.0436,0.440,0.000000,0.1010,0.294,168.001,4
828,Greatest Hits,6.4,2005,*NSYNC,72otaqywVqwyXaCjk75JKm,(God Must Have Spent) A Little More Time On Yo...,58,0.468,0.535,A#,10,-8.264,1,0.0436,0.440,0.000000,0.1010,0.294,168.001,4
1197,Greatest Hits,7.0,2005,*NSYNC,72otaqywVqwyXaCjk75JKm,(God Must Have Spent) A Little More Time On Yo...,58,0.468,0.535,A#,10,-8.264,1,0.0436,0.440,0.000000,0.1010,0.294,168.001,4
1566,Greatest Hits,7.6,2005,*NSYNC,72otaqywVqwyXaCjk75JKm,(God Must Have Spent) A Little More Time On Yo...,58,0.468,0.535,A#,10,-8.264,1,0.0436,0.440,0.000000,0.1010,0.294,168.001,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9342,Watershed,7.5,2008,k.d. lang,6dRIT3U4JsnZ6bp2yaTuVR,Once in a While,12,0.489,0.320,A#,10,-11.756,1,0.0349,0.610,0.000054,0.0976,0.140,132.833,4
9348,Watershed,7.5,2008,k.d. lang,66ckLbNWRlwVj566tjfCYZ,Shadow and the Frame,11,0.368,0.147,F#,6,-15.767,0,0.0308,0.871,0.000896,0.1000,0.111,110.902,4
9345,Watershed,7.5,2008,k.d. lang,6CHkj5XsUY2wNrUxgrs0vi,Sunday,12,0.339,0.450,D#,3,-11.646,1,0.0318,0.523,0.115000,0.1070,0.171,143.852,3
9343,Watershed,7.5,2008,k.d. lang,3cFCWGQbUjM60aOoB3TdYQ,Thread,12,0.511,0.402,F,5,-8.381,1,0.0277,0.369,0.000009,0.1630,0.184,93.648,4


In [36]:
# Interrogate dataset

# Sort by multiple columns
merge_datasets_df.sort_values(by=['Track','Track ID','Artist','Album','Score'], inplace=True)

# display the contents of the data frame 
merge_datasets_df

Unnamed: 0,Album,Score,Release Date,Artist,Track ID,Track,Popularity,Danceability,Energy,Key,Key Val,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature
9200,Absolute Garbage,5.3,1995,Garbage,0P6USuYzHP8GdAyNKLkTZi,#1 Crush,45,0.635,0.647,G,7,-7.055,0,0.0235,0.000256,0.00132,0.358,0.4640,94.196,4
4873,Double Nickels on the Dime,9.5,1984,Minutemen,60nriFcKARUpnYksJkDqJR,#1 Hit Song,34,0.576,0.820,E,4,-9.387,1,0.1130,0.014400,0.73900,0.200,0.4450,91.208,4
3243,The Ultimate Collection,8.0,1997,Luciano Pavarotti,6APIeDnJBcgEFOWv5plRhD,'O sole mio,52,0.265,0.270,A,9,-13.995,1,0.0371,0.948000,0.12600,0.148,0.2780,73.309,3
3172,The Ultimate Collection,9.6,1997,Luciano Pavarotti,6APIeDnJBcgEFOWv5plRhD,'O sole mio,52,0.265,0.270,A,9,-13.995,1,0.0371,0.948000,0.12600,0.148,0.2780,73.309,3
8245,Kaleidoscope,3.8,2006,Sonny Stitt,2ykZ83DT9W2OFJrAj6rOtp,'S Wonderful,0,0.482,0.380,D,2,-12.091,0,0.0447,0.982000,0.77700,0.230,0.6700,89.371,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5661,Collapse,6.8,2013,Dangerkids,1vfXmcA7ZhpBNjIKyrFcc7,we're all in danger,30,0.407,0.974,C#,1,-2.549,0,0.0863,0.001240,0.00000,0.342,0.4540,175.156,4
5650,Collapse,7.4,2013,Dangerkids,1vfXmcA7ZhpBNjIKyrFcc7,we're all in danger,30,0.407,0.974,C#,1,-2.549,0,0.0863,0.001240,0.00000,0.342,0.4540,175.156,4
5663,Collapse,6.8,2013,Dangerkids,1BNJmdPhNojCQCk1rpKbg2,where the sky breaks,22,0.138,0.188,D#,3,-14.243,0,0.0381,0.112000,0.97900,0.109,0.0397,141.729,4
5652,Collapse,7.4,2013,Dangerkids,1BNJmdPhNojCQCk1rpKbg2,where the sky breaks,22,0.138,0.188,D#,3,-14.243,0,0.0381,0.112000,0.97900,0.109,0.0397,141.729,4


In [37]:
# Interrogate dataset

# Sort by multiple columns
merge_datasets_df.sort_values(by=['Track ID','Track','Artist','Album','Score'], inplace=True)

# display the contents of the data frame 
merge_datasets_df

Unnamed: 0,Album,Score,Release Date,Artist,Track ID,Track,Popularity,Danceability,Energy,Key,Key Val,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature
5028,Gold,7.0,1978,Jefferson Starship,000ytv4XKDWJHyBa64jlaj,Love Too Good,28,0.552,0.507,D,2,-14.198,0,0.0250,0.24100,0.148000,0.120,0.692,91.875,4
5082,Gold,7.2,1978,Jefferson Starship,000ytv4XKDWJHyBa64jlaj,Love Too Good,28,0.552,0.507,D,2,-14.198,0,0.0250,0.24100,0.148000,0.120,0.692,91.875,4
5136,Gold,9.3,1978,Jefferson Starship,000ytv4XKDWJHyBa64jlaj,Love Too Good,28,0.552,0.507,D,2,-14.198,0,0.0250,0.24100,0.148000,0.120,0.692,91.875,4
1860,Greatest Hits,5.8,1993,Clint Black,002eRBobl0nTnJTdnjCRBU,No Time To Kill,31,0.584,0.670,F,5,-9.329,1,0.0297,0.10300,0.000000,0.302,0.805,92.959,4
384,Greatest Hits,6.1,1993,Clint Black,002eRBobl0nTnJTdnjCRBU,No Time To Kill,31,0.584,0.670,F,5,-9.329,1,0.0297,0.10300,0.000000,0.302,0.805,92.959,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7674,Flowers,8.0,2001,Echo & the Bunnymen,7zoTaebBtXFW8NsusFnukW,Supermellow Man,12,0.515,0.740,G,7,-8.548,1,0.0246,0.00114,0.000041,0.664,0.611,89.991,4
5755,Elastica,8.5,1995,Elastica,7zpkb07R0SAKFugvMBZhKb,Hold Me Now,19,0.705,0.807,C,0,-6.958,1,0.0349,0.02780,0.000000,0.164,0.904,108.123,4
4536,Lights Out,6.9,2005,Lil Wayne,7zuYF8rZ6TTyaJx9DSL1g3,Skit,35,0.644,0.232,F,5,-13.846,1,0.9220,0.32500,0.000000,0.530,0.517,166.041,1
4517,Lights Out,7.0,2005,Lil Wayne,7zuYF8rZ6TTyaJx9DSL1g3,Skit,35,0.644,0.232,F,5,-13.846,1,0.9220,0.32500,0.000000,0.530,0.517,166.041,1


In [20]:
# verify counts - validate we now have a clean dataset
merge_datasets_df.count()

Album               10929
Score               10929
Release Date        10929
Artist              10929
Track ID            10929
Track               10929
Popularity          10929
Danceability        10929
Energy              10929
Key                 10929
Key Val             10929
Loudness            10929
Mode                10929
Speechiness         10929
Acousticness        10929
Instrumentalness    10929
Liveness            10929
Valence             10929
Tempo               10929
Time Signature      10929
dtype: int64

# Create final DataFrame for database creation

In [26]:
# Create new clean datafrome
album_reviews_df=merge_datasets_df

# display the contents of the data frame 
album_reviews_df.head()

Unnamed: 0,Album,Score,Release Date,Artist,Track ID,Track,Popularity,Danceability,Energy,Key,Key Val,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature
0,Swimming,7.5,2018,Mac Miller,39NDBdU5Xkm5pCFGa5kZtI,Ladders,73,0.802,0.463,G#,8,-8.379,1,0.162,0.236,0.00531,0.105,0.291,103.961,4
1,Swimming,7.5,2018,Mac Miller,01z2fBGB8Hl3Jd3zXe4IXR,Come Back to Earth,72,0.272,0.238,D,2,-12.148,0,0.0349,0.89,0.00925,0.275,0.119,83.507,4
2,Swimming,7.5,2018,Mac Miller,5p7GiBZNL1afJJDUrOA6C8,Hurt Feelings,71,0.69,0.462,G#,8,-8.054,1,0.13,0.343,0.00218,0.113,0.325,157.655,4
3,Swimming,7.5,2018,Mac Miller,2dgrYdgguVZKeCsrVb9XEs,What's the Use?,70,0.759,0.492,C#,1,-10.338,0,0.12,0.736,0.00989,0.107,0.561,104.974,4
4,Swimming,7.5,2018,Mac Miller,0Xcy81PsJCOO6mhLZaQyQ4,Perfecto,64,0.599,0.317,C#,1,-11.212,1,0.174,0.748,0.0,0.111,0.203,148.09,4


In [24]:
# Display data information for review and setup of database
album_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10929 entries, 0 to 10928
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Album             10929 non-null  object 
 1   Score             10929 non-null  float64
 2   Release Date      10929 non-null  int64  
 3   Artist            10929 non-null  object 
 4   Track ID          10929 non-null  object 
 5   Track             10929 non-null  object 
 6   Popularity        10929 non-null  int64  
 7   Danceability      10929 non-null  float64
 8   Energy            10929 non-null  float64
 9   Key               10929 non-null  object 
 10  Key Val           10929 non-null  int64  
 11  Loudness          10929 non-null  float64
 12  Mode              10929 non-null  int64  
 13  Speechiness       10929 non-null  float64
 14  Acousticness      10929 non-null  float64
 15  Instrumentalness  10929 non-null  float64
 16  Liveness          10929 non-null  float6

In [29]:
# Display data types and review
album_reviews_df.dtypes

Album                object
Score               float64
Release Date          int64
Artist               object
Track ID             object
Track                object
Popularity            int64
Danceability        float64
Energy              float64
Key                  object
Key Val               int64
Loudness            float64
Mode                  int64
Speechiness         float64
Acousticness        float64
Instrumentalness    float64
Liveness            float64
Valence             float64
Tempo               float64
Time Signature        int64
dtype: object

In [28]:
#Save remade data frame to a new csv file: /Output_Data/album_reviews_dataframe.csv
album_reviews_df.to_csv('Output_Data/album_reviews_dataframe.csv', encoding="utf-8", index="true",header="true")

## Connect to local database 

In [None]:
rds_connection_string = "<insert user name>:<insert password>@localhost:5432/album_db"
engine = create_engine(f'postgresql://{rds_connection_string}')

In [None]:
engine.table_names()