In [16]:
# Importing necessary packages 
import pandas as pd
import numpy as np
import os


# Reading in the dataset

In [9]:
# Load the pre-processed songs table from the relative data directory
raw_data_path = '../data/all_songs_data_processed.csv'
df = pd.read_csv(raw_data_path)

# Keeping only the necessary columns

In [10]:
# Keep only the eleven columns listed below, in this order
columns_to_keep = [
    'Year', 'Artist', 'Song Title', 'Rank', 'Lyrics',
    'Nouns', 'Verbs', 'Adverbs', 'Corpus',
    'Word Counts', 'Unique Word Counts',
]
df = df[columns_to_keep]

# Preview the first rows of the trimmed frame
df.head()

Unnamed: 0,Year,Artist,Song Title,Rank,Lyrics,Nouns,Verbs,Adverbs,Corpus,Word Counts,Unique Word Counts
0,1959.0,Johnny Horton,The Battle Of New Orleans,1,In 1814 we took a little trip Along with Colo...,trip bacon bean town gun comin while river of'...,take take take catch fire keep be be fire begi...,along as ago once more so as ago once more the...,take little trip Colonel Jackson mighty Missis...,435,155
1,1959.0,Bobby Darin,Mack The Knife,2,"Oh the shark, babe Has such teeth, dear And he...",shark babe tooth jackknife babe sight shark to...,have show have keep know bite billow spread be...,pearly just so never never now just just down ...,oh shark babe tooth dear show pearly white jac...,224,145
2,1959.0,Lloyd Price,Personality,3,Over and over I tried to prove my love to you ...,love friend fool fool personality personality ...,try prove do say get get do cause get get do s...,over over over over over over so over now over...,try prove love friend say fool fool cause get ...,215,55
3,1959.0,Frankie Avalon,Venus,4,"Hey, Venus! Oh, Venus! Venus, if you will Ple...",girl girl kiss arm girl charm girl sunlight ha...,send thrill want make take place ask promise g...,surely too always as long surely too always as...,hey Venus oh Venus Venus send little girl thri...,166,74
4,1959.0,Paul Anka,Lonely Boy,5,I'm just a lonely boy Lonely and blue I'm all ...,boy nothing moment love night day boy nothing ...,do get think want love love kiss hold like hea...,just all just all just so hard just all,lonely boy lonely blue get think want love yes...,140,68


In [11]:
# Show the (rows, columns) dimensions of df as a baseline before any rows are filtered out
df.shape

(6292, 11)

# Handling Instrumental Songs and Missing Values 

In [12]:
# Remove instrumental tracks, i.e. rows whose 'Lyrics' value is exactly one of the
# markers below. Rows with a missing 'Lyrics' value pass through this step (NaN is
# not in the list) and are dropped by the dropna call that follows.
instrumental_markers = ['Instrumental', '(Instrumental)']
df = df[~df['Lyrics'].isin(instrumental_markers)]       # 6279 rows remain after this step

# Drop rows with a missing 'Lyrics' or 'Corpus' value. Reassigning the result
# (instead of inplace=True) keeps the cell a plain transformation, and .copy()
# gives an independent frame for the column edits made in later cells.
df = df.dropna(subset=['Lyrics', 'Corpus']).copy()      # 6255 rows remain after this step

df.shape

(6255, 11)

# Lowercase Conversion

In [13]:
# Normalise the 'Nouns', 'Verbs', 'Adverbs', and 'Corpus' text columns to lowercase

columns_to_lower = ['Nouns','Verbs','Adverbs','Corpus']
# Lowercase each listed column via the .str accessor and write the results back
df[columns_to_lower] = df[columns_to_lower].apply(lambda column: column.str.lower())

df.head()


Unnamed: 0,Year,Artist,Song Title,Rank,Lyrics,Nouns,Verbs,Adverbs,Corpus,Word Counts,Unique Word Counts
0,1959.0,Johnny Horton,The Battle Of New Orleans,1,In 1814 we took a little trip Along with Colo...,trip bacon bean town gun comin while river of'...,take take take catch fire keep be be fire begi...,along as ago once more so as ago once more the...,take little trip colonel jackson mighty missis...,435,155
1,1959.0,Bobby Darin,Mack The Knife,2,"Oh the shark, babe Has such teeth, dear And he...",shark babe tooth jackknife babe sight shark to...,have show have keep know bite billow spread be...,pearly just so never never now just just down ...,oh shark babe tooth dear show pearly white jac...,224,145
2,1959.0,Lloyd Price,Personality,3,Over and over I tried to prove my love to you ...,love friend fool fool personality personality ...,try prove do say get get do cause get get do s...,over over over over over over so over now over...,try prove love friend say fool fool cause get ...,215,55
3,1959.0,Frankie Avalon,Venus,4,"Hey, Venus! Oh, Venus! Venus, if you will Ple...",girl girl kiss arm girl charm girl sunlight ha...,send thrill want make take place ask promise g...,surely too always as long surely too always as...,hey venus oh venus venus send little girl thri...,166,74
4,1959.0,Paul Anka,Lonely Boy,5,I'm just a lonely boy Lonely and blue I'm all ...,boy nothing moment love night day boy nothing ...,do get think want love love kiss hold like hea...,just all just all just so hard just all,lonely boy lonely blue get think want love yes...,140,68


# Datatype Conversion and Check

In [14]:
# Cast 'Year' to an integer dtype (earlier previews display it as a float, e.g. 1959.0)
df['Year'] = df['Year'].astype(int)

# Cast every text column to pandas' dedicated 'string' dtype, one column at a time
text_columns = ['Artist', 'Song Title', 'Lyrics', 'Nouns', 'Verbs', 'Adverbs', 'Corpus']
for text_column in text_columns:
    df[text_column] = df[text_column].astype('string')

# Summarise dtypes and non-null counts to confirm the conversions
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6255 entries, 0 to 6291
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Year                6255 non-null   int64 
 1   Artist              6255 non-null   string
 2   Song Title          6255 non-null   string
 3   Rank                6255 non-null   int64 
 4   Lyrics              6255 non-null   string
 5   Nouns               6246 non-null   string
 6   Verbs               6243 non-null   string
 7   Adverbs             6196 non-null   string
 8   Corpus              6255 non-null   string
 9   Word Counts         6255 non-null   int64 
 10  Unique Word Counts  6255 non-null   int64 
dtypes: int64(4), string(7)
memory usage: 586.4 KB


In [15]:
# Preview the first rows to confirm the dtype conversions (e.g. 'Year' now displays as an integer)
df.head()

Unnamed: 0,Year,Artist,Song Title,Rank,Lyrics,Nouns,Verbs,Adverbs,Corpus,Word Counts,Unique Word Counts
0,1959,Johnny Horton,The Battle Of New Orleans,1,In 1814 we took a little trip Along with Colo...,trip bacon bean town gun comin while river of'...,take take take catch fire keep be be fire begi...,along as ago once more so as ago once more the...,take little trip colonel jackson mighty missis...,435,155
1,1959,Bobby Darin,Mack The Knife,2,"Oh the shark, babe Has such teeth, dear And he...",shark babe tooth jackknife babe sight shark to...,have show have keep know bite billow spread be...,pearly just so never never now just just down ...,oh shark babe tooth dear show pearly white jac...,224,145
2,1959,Lloyd Price,Personality,3,Over and over I tried to prove my love to you ...,love friend fool fool personality personality ...,try prove do say get get do cause get get do s...,over over over over over over so over now over...,try prove love friend say fool fool cause get ...,215,55
3,1959,Frankie Avalon,Venus,4,"Hey, Venus! Oh, Venus! Venus, if you will Ple...",girl girl kiss arm girl charm girl sunlight ha...,send thrill want make take place ask promise g...,surely too always as long surely too always as...,hey venus oh venus venus send little girl thri...,166,74
4,1959,Paul Anka,Lonely Boy,5,I'm just a lonely boy Lonely and blue I'm all ...,boy nothing moment love night day boy nothing ...,do get think want love love kiss hold like hea...,just all just all just so hard just all,lonely boy lonely blue get think want love yes...,140,68


# Saving and Exporting CLEAN Data

In [None]:
# Destination for the cleaned dataset, built from the folder and file name below
folder_name = "../data"
file_name = "clean_processed.csv"

full_path = os.path.join(folder_name, file_name)

# Explicit switch instead of a commented-out call: the export stays disabled by
# default, so running the notebook never overwrites an existing file. Set this to
# True to write the cleaned frame to full_path.
SAVE_CLEAN_DATA = False
if SAVE_CLEAN_DATA:
    # index=False leaves the DataFrame index out of the written CSV
    df.to_csv(full_path, index=False)