In [12]:
# import packages
import pandas as pd
import numpy as np
import unicodedata
from pymongo import MongoClient
from sklearn.preprocessing import MultiLabelBinarizer
import ast

In [13]:
# set up client
client = MongoClient('mongodb://cradduhj:fsbbigdata@mongodb.fsb.miamioh.edu:27017', authSource="admin")

# input username
db = client['cradduhj']

In [14]:
# specify collection
ARTISTS = db.ARTISTS

# read all documents from the collection
cursor = ARTISTS.find()  # returns a cursor of all documents

# convert to a pandas DataFrame
artists = pd.DataFrame(list(cursor))

# optional: remove the MongoDB _id field if not needed
if '_id' in artists.columns:
    artists = artists.drop(columns=['_id'])

# inspect the DataFrame
print(artists.shape)
print(artists.columns)
# artists.head()

(4322, 9)
Index(['name', 'type', 'country', 'begin_date', 'end_date', 'genres',
       'country_wikidata', 'bio', 'artist'],
      dtype='object')


In [15]:
# specify collection
LYRICS = db.LYRICS

# read all documents from the collection
cursor = LYRICS.find()  # returns a cursor of all documents

# convert to a pandas DataFrame
lyrics = pd.DataFrame(list(cursor))

# optional: remove the MongoDB _id field if not needed
if '_id' in lyrics.columns:
    lyrics = lyrics.drop(columns=['_id'])

# inspect the DataFrame
print(lyrics.shape)
print(lyrics.columns)
# lyrics.head()

(13873, 4)
Index(['title', 'artist', 'chart_year', 'lyrics'], dtype='object')


In [16]:
# specify collection
YOUTUBE = db.YOUTUBE

# read all documents from the collection
cursor = YOUTUBE.find()  # returns a cursor of all documents

# convert to a pandas DataFrame
youtube = pd.DataFrame(list(cursor))

# optional: remove the MongoDB _id field if not needed
if '_id' in youtube.columns:
    youtube = youtube.drop(columns=['_id'])

# inspect the DataFrame
print(youtube.shape)
print(youtube.columns)
# youtube.head()

(13876, 7)
Index(['artist', 'title', 'video_id', 'video_title', 'view_count',
       'like_count', 'comment_count'],
      dtype='object')


In [17]:
# specify collection
SPOTIFY = db.SPOTIFY

# read all documents from the collection
cursor = SPOTIFY.find()  # returns a cursor of all documents

# convert to a pandas DataFrame
spotify = pd.DataFrame(list(cursor))

# optional: remove the MongoDB _id field if not needed
if '_id' in spotify.columns:
    spotify = spotify.drop(columns=['_id'])

# inspect the DataFrame
print(spotify.shape)
print(spotify.columns)
# spotify.head()

(6578, 14)
Index(['track_id', 'title', 'artist', 'acousticness', 'danceability',
       'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness',
       'loudness', 'speechiness', 'tempo', 'valence'],
      dtype='object')


In [18]:
def clean_text_keep_punct(series):
    # convert to string, lowercase, strip whitespace
    series = series.astype(str).str.lower().str.strip()
    
    # normalize unicode accents (é -> e)
    series = series.apply(lambda x: unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode('utf-8'))
    # collapse multiple spaces
    series = series.apply(lambda x: ' '.join(x.split()))
    return series

# clean both DataFrames
for df in [spotify, youtube, artists, lyrics]:
    if 'title' in df.columns:
        df['title'] = clean_text_keep_punct(df['title'])
    if 'artist' in df.columns:
        df['artist'] = clean_text_keep_punct(df['artist'])

In [19]:
# merge Spotify and YouTube data on title and artist
final_df = pd.merge(spotify, youtube, on=['title', 'artist'], how='left')

# list of columns to drop
cols_to_drop = ['track_id','video_id', 'video_title']

# drop unwanted columns
final_df.drop(columns=cols_to_drop, inplace=True)

print(final_df.shape)
print(final_df.columns)
print(final_df.isna().sum())
final_df.head()

(6578, 16)
Index(['title', 'artist', 'acousticness', 'danceability', 'duration_ms',
       'energy', 'instrumentalness', 'key', 'liveness', 'loudness',
       'speechiness', 'tempo', 'valence', 'view_count', 'like_count',
       'comment_count'],
      dtype='object')
title                 0
artist                0
acousticness          0
danceability          0
duration_ms           0
energy                0
instrumentalness      0
key                   0
liveness              0
loudness              0
speechiness           0
tempo                 0
valence               0
view_count           12
like_count           26
comment_count       127
dtype: int64


Unnamed: 0,title,artist,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,speechiness,tempo,valence,view_count,like_count,comment_count
0,...baby one more time,britney spears,0.202,0.759,211067.0,0.699,0.000131,0.0,0.443,-5.745,0.0307,92.96,0.907,1118930000.0,6138863.0,220000.0
1,doo wop (that thing),lauryn hill,0.0393,0.535,320267.0,0.505,0.0,2.0,0.0923,-8.926,0.245,99.935,0.495,220560700.0,1416591.0,28000.0
2,have you ever?,brandy,0.542,0.698,273440.0,0.533,0.0,2.0,0.333,-6.246,0.0437,134.001,0.275,87564090.0,603346.0,15000.0
3,love like this,faith evans,0.00364,0.767,275707.0,0.551,0.0,0.0,0.0451,-7.328,0.0616,100.904,0.796,10499470.0,138294.0,3100.0
4,this kiss,faith hill,0.175,0.398,194307.0,0.804,0.0,11.0,0.181,-5.559,0.0451,186.752,0.709,21090600.0,124231.0,5100.0


In [20]:
# drop any duplicates of artists
artists = artists.drop_duplicates(subset=['artist'])

# merge Spotify and YouTube data on title and artist
final_df = pd.merge(final_df, artists, on=['artist'], how='left')

# list of columns to drop
cols_to_drop = ['name','end_date', 'bio', 'country_wikidata']

# drop unwanted columns
final_df.drop(columns=cols_to_drop, inplace=True)

print(final_df.shape)
print(final_df.columns)
print(final_df.isna().sum())
final_df.head()

(6578, 20)
Index(['title', 'artist', 'acousticness', 'danceability', 'duration_ms',
       'energy', 'instrumentalness', 'key', 'liveness', 'loudness',
       'speechiness', 'tempo', 'valence', 'view_count', 'like_count',
       'comment_count', 'type', 'country', 'begin_date', 'genres'],
      dtype='object')
title                 0
artist                0
acousticness          0
danceability          0
duration_ms           0
energy                0
instrumentalness      0
key                   0
liveness              0
loudness              0
speechiness           0
tempo                 0
valence               0
view_count           12
like_count           26
comment_count       127
type                 12
country             270
begin_date          132
genres                0
dtype: int64


Unnamed: 0,title,artist,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,speechiness,tempo,valence,view_count,like_count,comment_count,type,country,begin_date,genres
0,...baby one more time,britney spears,0.202,0.759,211067.0,0.699,0.000131,0.0,0.443,-5.745,0.0307,92.96,0.907,1118930000.0,6138863.0,220000.0,Person,US,1981-12-02,"['dance-pop', 'synth-pop', 'teen pop', 'electr..."
1,doo wop (that thing),lauryn hill,0.0393,0.535,320267.0,0.505,0.0,2.0,0.0923,-8.926,0.245,99.935,0.495,220560700.0,1416591.0,28000.0,Person,US,1975-05-26,"['reggae', 'soul', 'hip-hop']"
2,have you ever?,brandy,0.542,0.698,273440.0,0.533,0.0,2.0,0.333,-6.246,0.0437,134.001,0.275,87564090.0,603346.0,15000.0,Person,US,1979-02-11,"['popular music', 'Spaghetti Western', 'contem..."
3,love like this,faith evans,0.00364,0.767,275707.0,0.551,0.0,0.0,0.0451,-7.328,0.0616,100.904,0.796,10499470.0,138294.0,3100.0,Person,US,1973-06-10,"['rhythm and blues', 'hip-hop', 'soul', 'conte..."
4,this kiss,faith hill,0.175,0.398,194307.0,0.804,0.0,11.0,0.181,-5.559,0.0451,186.752,0.709,21090600.0,124231.0,5100.0,Person,US,1967-09-21,['country music']


In [21]:
# merge Spotify and YouTube data on title and artist
final_df = pd.merge(final_df, lyrics, on=['title', 'artist'], how='left')

print(final_df.shape)
print(final_df.columns)
print(final_df.isna().sum())
print(final_df.describe())
final_df.head()

(6578, 22)
Index(['title', 'artist', 'acousticness', 'danceability', 'duration_ms',
       'energy', 'instrumentalness', 'key', 'liveness', 'loudness',
       'speechiness', 'tempo', 'valence', 'view_count', 'like_count',
       'comment_count', 'type', 'country', 'begin_date', 'genres',
       'chart_year', 'lyrics'],
      dtype='object')
title                 0
artist                0
acousticness          0
danceability          0
duration_ms           0
energy                0
instrumentalness      0
key                   0
liveness              0
loudness              0
speechiness           0
tempo                 0
valence               0
view_count           12
like_count           26
comment_count       127
type                 12
country             270
begin_date          132
genres                0
chart_year            3
lyrics                3
dtype: int64
       acousticness  danceability    duration_ms       energy  \
count   6578.000000   6578.000000    6578.000000  6

Unnamed: 0,title,artist,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,...,valence,view_count,like_count,comment_count,type,country,begin_date,genres,chart_year,lyrics
0,...baby one more time,britney spears,0.202,0.759,211067.0,0.699,0.000131,0.0,0.443,-5.745,...,0.907,1118930000.0,6138863.0,220000.0,Person,US,1981-12-02,"['dance-pop', 'synth-pop', 'teen pop', 'electr...",1999.0,"Oh, baby, baby\nOh, baby, baby\n\nOh, baby, ba..."
1,doo wop (that thing),lauryn hill,0.0393,0.535,320267.0,0.505,0.0,2.0,0.0923,-8.926,...,0.495,220560700.0,1416591.0,28000.0,Person,US,1975-05-26,"['reggae', 'soul', 'hip-hop']",1999.0,"Yo, 'member back on the bouley when cats used ..."
2,have you ever?,brandy,0.542,0.698,273440.0,0.533,0.0,2.0,0.333,-6.246,...,0.275,87564090.0,603346.0,15000.0,Person,US,1979-02-11,"['popular music', 'Spaghetti Western', 'contem...",1999.0,Have you ever loved somebody so much it makes ...
3,love like this,faith evans,0.00364,0.767,275707.0,0.551,0.0,0.0,0.0451,-7.328,...,0.796,10499470.0,138294.0,3100.0,Person,US,1973-06-10,"['rhythm and blues', 'hip-hop', 'soul', 'conte...",1999.0,I never knew there was a love like this before...
4,this kiss,faith hill,0.175,0.398,194307.0,0.804,0.0,11.0,0.181,-5.559,...,0.709,21090600.0,124231.0,5100.0,Person,US,1967-09-21,['country music'],1999.0,I don't want another heartbreak\nI don't need ...


In [22]:
# convert key to factor
final_df['key'] = final_df['key'].astype('Int64').astype('category')

# perform median imputation
final_df['view_count'] = final_df['view_count'].fillna(final_df['view_count'].median())
final_df['like_count'] = final_df['like_count'].fillna(final_df['like_count'].median())

# perform zero imputation
final_df['comment_count'] = final_df['comment_count'].fillna(0)

# perform mode imputation 
mode_type = final_df['type'].mode()[0]
final_df['type'] = final_df['type'].fillna(mode_type)

# convert type to string
final_df['type'] = final_df['type'].astype(str)

# conditional collapsing of values of type
def recode_type(row):
    if row['artist'] == 'Hannah Montana':
        return 'Person'
    elif row['type'] in ['Character', 'Orchestra', 'Other']:
        return 'Group'
    else:
        return row['type']

final_df['type'] = final_df.apply(recode_type, axis=1)

# convert type to category
final_df['type'] = final_df['type'].astype('category')

# collapse country into US, CA, GB, and Other
def group_country(x):
    if pd.isna(x):
        return "Other"
    x = str(x).upper()
    if x in ['US']:
        return 'US'
    elif x in ['GB']:
        return 'GB'
    elif x in ['CA']:
        return 'CA'
    else:
        return 'Other'

# convert country_grouped to factor
final_df['country_grouped'] = final_df['country'].apply(group_country).astype('category')

# convert type and county_grouped to dummies
final_df = pd.get_dummies(
    final_df,
    columns=['type', 'country_grouped', 'key'],
    prefix=['type', 'country', 'key'],
    drop_first=False,
    dtype=int
)

# convert duration_ms to minutes
final_df['duration_min'] = final_df['duration_ms'] / 60000

# convert chart_year and duration_ms to integers
final_df['chart_year'] = pd.to_numeric(final_df['chart_year'], errors='coerce').astype('Int64')
final_df['duration_ms'] = pd.to_numeric(final_df['duration_ms'], errors='coerce').astype('Int64')


# pull year out of begin_date and set as integer
final_df['begin_year'] = final_df['begin_date'].astype(str).str[:4]
final_df['begin_year'] = pd.to_numeric(final_df['begin_year'], errors='coerce').astype('Int64')

# impute missing values of chart and begin years as average difference between chart and begin years
final_df.loc[final_df['chart_year'].isna(), 'chart_year'] = (final_df.loc[final_df['chart_year'].isna(), 'begin_year'] + (final_df['chart_year'] - final_df['begin_year']).mean()).round().astype(int)
final_df.loc[final_df['begin_year'].isna(), 'begin_year'] = (final_df.loc[final_df['begin_year'].isna(), 'chart_year'] + (final_df['begin_year'] - final_df['chart_year']).mean()).round().astype(int)

# drop columns
final_df.drop(columns=['country', 'begin_date', 'duration_ms'], inplace=True)

# remove "\n" from lyrics
final_df['lyrics'] = final_df['lyrics'].astype(str).str.replace('\n', ' ', regex=False)

# fix inaccruate values of chart_year
final_df.loc[final_df['chart_year'] == 2029, 'chart_year'] = 2022

In [23]:
print(final_df.shape)
print(final_df.columns)
print(final_df.dtypes)
print(final_df.describe())
final_df.head()

(6578, 37)
Index(['title', 'artist', 'acousticness', 'danceability', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo',
       'valence', 'view_count', 'like_count', 'comment_count', 'genres',
       'chart_year', 'lyrics', 'type_Group', 'type_Person', 'country_CA',
       'country_GB', 'country_Other', 'country_US', 'key_0', 'key_1', 'key_2',
       'key_3', 'key_4', 'key_5', 'key_6', 'key_7', 'key_8', 'key_9', 'key_10',
       'key_11', 'duration_min', 'begin_year'],
      dtype='object')
title                object
artist               object
acousticness        float64
danceability        float64
energy              float64
instrumentalness    float64
liveness            float64
loudness            float64
speechiness         float64
tempo               float64
valence             float64
view_count          float64
like_count          float64
comment_count       float64
genres               object
chart_year            Int64
lyrics               o

Unnamed: 0,title,artist,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,...,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11,duration_min,begin_year
0,...baby one more time,britney spears,0.202,0.759,0.699,0.000131,0.443,-5.745,0.0307,92.96,...,0,0,0,0,0,0,0,0,3.517783,1981
1,doo wop (that thing),lauryn hill,0.0393,0.535,0.505,0.0,0.0923,-8.926,0.245,99.935,...,0,0,0,0,0,0,0,0,5.337783,1975
2,have you ever?,brandy,0.542,0.698,0.533,0.0,0.333,-6.246,0.0437,134.001,...,0,0,0,0,0,0,0,0,4.557333,1979
3,love like this,faith evans,0.00364,0.767,0.551,0.0,0.0451,-7.328,0.0616,100.904,...,0,0,0,0,0,0,0,0,4.595117,1973
4,this kiss,faith hill,0.175,0.398,0.804,0.0,0.181,-5.559,0.0451,186.752,...,0,0,0,0,0,0,0,1,3.23845,1967


In [24]:
# final_df['genres'] = final_df['genres'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# # Flatten all genres that contain "pop"
# pop_genres = set(
#     g
#     for genres_list in final_df['genres'] if isinstance(genres_list, list)
#     for g in genres_list
#     if 'pop' in g.lower()
# )

# # Print the result
# print(pop_genres)

In [25]:
# Broad genre mapping
genre_groups = {
    'pop': ['dream pop', 'Europop', 'cantopop', 'folk-pop', 'Iranian pop', 'Britpop', 'African popular music', 'hyperpop', 'pop music', 'psychedelic pop', 'K-pop', 'baroque pop', 'indie pop', 'country pop', 'teen pop', 'synth-pop', 'chamber pop', 'sophisti-pop', 'alternative pop', 'traditional pop', 'pop urbaine', 'city pop', 'J-pop', 'dance-pop', 'mandopop', 'disco-pop', 'pop-punk', 'Latin pop', 'electropop', 'power pop', 'popular music', 'art pop', 'pop rock', 'jangle pop', 'emo pop', 'French pop', 'Gothic pop', 'bitpop', 'progressive pop', 'Nederpop', 'bedroom pop', 'pop soul', 'pop rap', 'Indian pop', 'jazz pop', 'futurepop', 'avant-pop', 'operatic pop'],
    'rock': ['folk punk', 'pop-punk', 'punk rap', 'post-punk', 'garage punk', 'hardcore punk', 'skate punk', 'dance-punk', 'punk blues', 'proto-punk', 'anarcho-punk', 'ska punk', 'art punk', 'cyberpunk', 'punk rock', 'post-punk revival', 'boogie rock', 'rockabilly', 'hard rock', 'electronic rock', 'dark rock', 'heartland rock', 'geek rock', 'garage rock', 'funk rock', 'blues rock', 'acid rock', 'sleaze rock', 'punk rock', 'symphonic rock', 'Chinese rock', 'cello rock', 'Celtic rock', 'country rock', 'piano rock', 'glam rock', 'shock rock', 'space rock', 'comedy rock', 'rap rock', 'arena rock', 'experimental rock', 'yacht rock', 'rocksteady', 'pub rock', 'modern rock', 'industrial rock', 'post-rock', 'soft rock', 'Chicano rock', 'roots rock', 'reggae rock', 'progressive rock', 'dance-rock', 'art rock', 'Christian rock', 'classic rock', 'Latin rock', 'college rock', 'raga rock', 'jazz rock', 'folk rock', 'indie rock', 'rock and roll', 'French rock', 'lovers rock', 'Southern rock', 'pop rock', 'Christian alternative rock', 'instrumental rock', 'rock music', 'alternative rock', 'gothic rock', 'psychedelic rock', 'adult-oriented rock', 'Krautrock'],
    'hip-hop': ['trap metal', 'EDM trap music', 'Latin trap', 'trap music', 'hip-hop', 'rap', 'gangsta rap', 'trap','East Coast hip-hop', 'political hip-hop', 'conscious hip-hop', 'hip-hop', 'hip-hop soul', 'golden age hip-hop', 'hardcore hip-hop', 'industrial hip-hop', 'Christian hip-hop', 'experimental hip-hop', 'Turkish hip-hop', 'Korean hip-hop', 'old-school hip-hop', 'German hip-hop', 'lofi hip-hop', 'Midwest hip-hop', 'Latin hip-hop', 'underground hip-hop', 'alternative hip-hop', 'West Coast hip-hop', 'comedy hip-hop', 'Southern hip-hop', 'Atlanta hip-hop', 'indie hip-hop', 'Chicago hip-hop', 'mafioso rap', 'mumble rap', 'punk rap', 'Michigan rap', 'emo rap', 'frat rap', 'UK rap', 'cloud rap', 'Memphis rap', 'jazz rap', 'country rap', 'pop rap', 'SoundCloud rap', 'dirty rap', 'progressive rap', 'gangsta rap', 'psychedelic rap'],
    'r&b': ['country blues', 'British blues', 'rhythm and blues', 'punk blues', 'blues rock', 'blues', 'Latin R&B', 'Smooth R&B', 'contemporary R&B', 'French contemporary R&B', 'alternative R&B', 'progressive soul', 'blue-eyed soul', 'psychedelic soul', 'soul', 'Southern soul', 'pop soul', 'chipmunk soul', 'neo soul', 'British soul', 'hip-hop soul'],
    'country': ['country blues', 'country rock', 'alternative country', 'country music', 'country rap', 'bro-country', 'neotraditional country', 'country pop'],
    'jazz': ['free jazz', 'smooth jazz', 'vocal jazz', 'acid jazz', 'jazz-funk', 'avant-garde jazz', 'jazz rock', 'jazz rap', 'jazz fusion', 'jazz', 'jazz pop'],
    'electronic': ['techno-thriller', 'techno', 'electronic music', 'electronic literature', 'electronic rock', 'electronicore', 'electronica', 'electronic dance music', 'EDM trap music', 'house music', 'Dutch house', 'big room house', 'leftfield house', 'future house', 'deep house', 'funky house', 'tech house', 'tropical house', 'electro house', 'progressive house', 'hip house', 'French house'],
    'reggae': ['reggae rock', 'roots reggae', 'reggae', 'reggae fusion', 'reggaeton', 'dance-rock', 'alternative dance', 'dance music', 'dancehall', 'dance-punk', 'dance-pop', 'dance', 'intelligent dance music', 'Italo dance', 'Eurodance', 'electronic dance music'],
    'metal': ['traditional heavy metal', 'sludge metal', 'alternative metal', 'neo-classical metal', 'rap metal', 'avant-garde metal', 'gothic metal', 'funk metal', 'groove metal', 'nu metal', 'industrial metal', 'melodic death metal', 'speed metal', 'thrash metal', 'black metal', 'Christian metal', 'power metal', 'progressive metal', 'metalcore', 'symphonic metal', 'heavy metal', 'death metal', 'trap metal', 'glam metal', 'new wave of British heavy metal', 'doom metal'],
    'folk': ['folktronica', 'folk punk', 'neofolk', 'folk music', 'indie folk', 'traditional folk music', 'American folk music', 'folk-pop', 'folk rock', 'anti-folk', 'contemporary folk music']
}

# Function to map genres
def map_to_group(genres):
    mapped = set()

    if pd.isna(genres):
        return []

    # Convert string list → real list
    if isinstance(genres, str):
        try:
            genres = ast.literal_eval(genres)
        except:
            return []

    # Normalize
    genres = [g.lower().strip() for g in genres]

    for broad, subs in genre_groups.items():
        subs = [s.lower() for s in subs]
        
        # Partial match (safer than exact)
        if any(sub in g for g in genres for sub in subs):
            mapped.add(broad)

    return list(mapped)

# apply mapping
grouped_genres = final_df['genres'].apply(map_to_group)

# create indicator columns
mlb = MultiLabelBinarizer(classes=list(genre_groups.keys()))
genre_indicators = pd.DataFrame(mlb.fit_transform(grouped_genres),
                                columns=mlb.classes_,
                                index=final_df.index)

# overwrite any existing genre columns safely
final_df[genre_indicators.columns] = genre_indicators

final_df.drop(columns='genres', inplace=True)

final_df.head()

Unnamed: 0,title,artist,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,...,pop,rock,hip-hop,r&b,country,jazz,electronic,reggae,metal,folk
0,...baby one more time,britney spears,0.202,0.759,0.699,0.000131,0.443,-5.745,0.0307,92.96,...,1,0,0,0,0,0,1,1,0,0
1,doo wop (that thing),lauryn hill,0.0393,0.535,0.505,0.0,0.0923,-8.926,0.245,99.935,...,0,0,1,1,0,0,0,1,0,0
2,have you ever?,brandy,0.542,0.698,0.533,0.0,0.333,-6.246,0.0437,134.001,...,1,0,0,1,0,0,0,0,0,0
3,love like this,faith evans,0.00364,0.767,0.551,0.0,0.0451,-7.328,0.0616,100.904,...,0,0,1,1,0,0,0,0,0,0
4,this kiss,faith hill,0.175,0.398,0.804,0.0,0.181,-5.559,0.0451,186.752,...,0,0,0,0,1,0,0,0,0,0


In [26]:
print(final_df.shape)
print(final_df.columns)
print(final_df.isna().sum())
print(final_df.dtypes)
print(final_df.describe())
final_df.head()

(6578, 46)
Index(['title', 'artist', 'acousticness', 'danceability', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo',
       'valence', 'view_count', 'like_count', 'comment_count', 'chart_year',
       'lyrics', 'type_Group', 'type_Person', 'country_CA', 'country_GB',
       'country_Other', 'country_US', 'key_0', 'key_1', 'key_2', 'key_3',
       'key_4', 'key_5', 'key_6', 'key_7', 'key_8', 'key_9', 'key_10',
       'key_11', 'duration_min', 'begin_year', 'pop', 'rock', 'hip-hop', 'r&b',
       'country', 'jazz', 'electronic', 'reggae', 'metal', 'folk'],
      dtype='object')
title               0
artist              0
acousticness        0
danceability        0
energy              0
instrumentalness    0
liveness            0
loudness            0
speechiness         0
tempo               0
valence             0
view_count          0
like_count          0
comment_count       0
chart_year          0
lyrics              0
type_Group          0
type_P

Unnamed: 0,title,artist,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,...,pop,rock,hip-hop,r&b,country,jazz,electronic,reggae,metal,folk
0,...baby one more time,britney spears,0.202,0.759,0.699,0.000131,0.443,-5.745,0.0307,92.96,...,1,0,0,0,0,0,1,1,0,0
1,doo wop (that thing),lauryn hill,0.0393,0.535,0.505,0.0,0.0923,-8.926,0.245,99.935,...,0,0,1,1,0,0,0,1,0,0
2,have you ever?,brandy,0.542,0.698,0.533,0.0,0.333,-6.246,0.0437,134.001,...,1,0,0,1,0,0,0,0,0,0
3,love like this,faith evans,0.00364,0.767,0.551,0.0,0.0451,-7.328,0.0616,100.904,...,0,0,1,1,0,0,0,0,0,0
4,this kiss,faith hill,0.175,0.398,0.804,0.0,0.181,-5.559,0.0451,186.752,...,0,0,0,0,1,0,0,0,0,0


In [28]:
# save final cleaned dataset
final_df.to_csv("final_dataset.csv", index = False)