#### Data Processing 

This notebook reads in the raw datasets, drops and renames columns, and merges the lyric and artist data together. We also retrieve Country and Heavy Metal lyrics, which we split into train, validation, and test sets for our models to share. 

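This notebook can be run as-is to produce and save a CSV with all of our cleaned data as well as the genre-specific train/validation/test CSVs. It expects the raw `lyrics-data.csv` and `artists-data.csv` files to be present in the `data/` directory, and the tokenizer data required by `nltk.word_tokenize` must already be downloaded.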

In [None]:
import pandas as pd
import utils 
from sklearn.model_selection import train_test_split
import nltk

#### Format Lyric Data

In [None]:
# load the raw lyrics dataset from disk
lyrics_path = 'data/lyrics-data.csv'
lyrics_df = pd.read_csv(lyrics_path)

# show the raw (rows, columns) shape, then display the first five rows
print(lyrics_df.shape)
lyrics_df.head()

In [None]:
def clean_artist_name(name: str) -> str:
    """
    Normalizes an artist name so that both datasets spell it the same way,
    since this column is the key the datasets are merged on.
    The name is lower-cased, every hyphen becomes a space, and every slash is removed,
    e.g. '/firstname-lastname/' becomes 'firstname lastname'.
    """
    # single translation pass: '-' maps to a space, '/' is deleted
    separators = str.maketrans({'-': ' ', '/': None})
    return name.lower().translate(separators)

In [None]:
# give the columns descriptive names and discard the song link
column_names = {'SName': 'song_name', 'Lyric': 'lyrics', 'ALink': 'artist'}
lyrics_df = lyrics_df.rename(columns=column_names).drop(columns=['SLink'])

# remove every row that has a missing value in any remaining column
lyrics_df.dropna(inplace=True)

# normalize the artist column, which is later used as the merge key
lyrics_df['artist'] = lyrics_df['artist'].apply(clean_artist_name)

# restrict the data to songs whose language is English
is_english = lyrics_df['language'] == 'en'
lyrics_df = lyrics_df[is_english]

# report the cleaned shape, then display the first five rows
print(lyrics_df.shape)
lyrics_df.head()

#### Format Artist Data

In [None]:
# load the raw artist dataset from disk
artists_path = 'data/artists-data.csv'
artist_df = pd.read_csv(artists_path)

# display the first five raw rows
artist_df.head()

In [None]:
# lowercase the column names we keep and discard the columns we do not use
artist_df = artist_df.rename(columns={'Artist': 'artist', 'Genres': 'genres'})
artist_df = artist_df.drop(columns=['Popularity', 'Link', 'Songs'])

# remove every row that has a missing value in any remaining column
artist_df.dropna(inplace=True)

# normalize the artist column so it matches the format used in the lyric data
artist_df['artist'] = artist_df['artist'].apply(clean_artist_name)

# report the cleaned shape, then display the first five rows
print(artist_df.shape)
artist_df.head()

#### Merge Lyric and Artist Datasets

In [None]:
# merge datasets: the inner join keeps only songs whose cleaned artist name
# also appears in the artist data
df = pd.merge(lyrics_df, artist_df, on='artist', how='inner')
df.dropna(inplace=True)

# turn the ';'-separated genres string into a list of genre names.
# each name is stripped so that a value such as "Pop; Country" becomes
# ['Pop', 'Country'] rather than ['Pop', ' Country'] -- exact-match lookups
# like `"Country" in genres` would otherwise miss any genre that is not listed first
df['genres'] = df['genres'].apply(
    lambda genres: [genre.strip() for genre in genres.split(';')]
)
df.reset_index(drop=True, inplace=True)

# print out info on merged dataset
print(df.shape)
df.head(5)

In [None]:
# persist the merged, cleaned dataset to disk without the row index
clean_data_path = 'data/clean_data.csv'
df.to_csv(clean_data_path, index=False)

#### Create separate train, validation, and test datasets to be shared across models 

In [None]:
def get_lyrics_in_genre(df: pd.DataFrame, genre: str) -> list:
    """
    Returns the lyrics of songs in df with the given genre.

    Args:
        df (pandas DataFrame): dataframe of artist and lyric data with a
            'genres' column (a list of genre names per song) and a 'lyrics' column
        genre (str): a music genre found in df

    Returns:
        A list of song lyrics, where each string is a single song.
        The list is empty when no song has the given genre.
    """
    wanted = genre.strip()

    def has_genre(genres) -> bool:
        # a plain string cell keeps the substring semantics of `genre in genres`
        if isinstance(genres, str):
            return genre in genres
        # compare stripped names so a padded entry such as ' Country' still matches
        return any(isinstance(name, str) and name.strip() == wanted for name in genres)

    # a plain list mask (rather than a boolean Series from apply) also works on an empty frame
    mask = [has_genre(genres) for genres in df['genres']]
    return df.loc[mask, 'lyrics'].tolist()

In [None]:
# gather the lyrics of every song in each of the two genres we work with
country_songs = get_lyrics_in_genre(df, "Country")
metal_songs = get_lyrics_in_genre(df, "Heavy Metal")

# report how many songs each genre contributes
print("Number of Country Songs:", len(country_songs))
print("Number of Heavy Metal Songs:", len(metal_songs))

In [None]:
# shuffle the songs and divide them 80/10/10 into train/val/test sets.
# the split is made on whole songs, before they are broken into lines, so that no two sets
# contain lyrics from the same song (otherwise repetitive lyrics could expose
# validation or test data to the model during training)
SPLIT_SEED = 42        # shared random_state so the splits are reproducible
TRAIN_FRACTION = .8    # share of all songs used for training
VAL_FRACTION = .5      # share of the held-out songs used for validation (the rest is test)

# country: 80% train, then the remaining 20% is halved into val and test
country_train, country_other = train_test_split(
    country_songs, train_size=TRAIN_FRACTION, random_state=SPLIT_SEED
)
country_val, country_test = train_test_split(
    country_other, train_size=VAL_FRACTION, random_state=SPLIT_SEED
)
print("Country song splits:", len(country_train), len(country_val), len(country_test))

# heavy metal: same procedure
metal_train, metal_other = train_test_split(
    metal_songs, train_size=TRAIN_FRACTION, random_state=SPLIT_SEED
)
metal_val, metal_test = train_test_split(
    metal_other, train_size=VAL_FRACTION, random_state=SPLIT_SEED
)
print("Heavy Metal song splits:", len(metal_train), len(metal_val), len(metal_test))

In [None]:
# break every song of each split into its individual lines
country_train_lines, country_val_lines, country_test_lines = (
    utils.split_songs_into_lines(songs)
    for songs in (country_train, country_val, country_test)
)
print("Country line counts:", len(country_train_lines), len(country_val_lines), len(country_test_lines))

metal_train_lines, metal_val_lines, metal_test_lines = (
    utils.split_songs_into_lines(songs)
    for songs in (metal_train, metal_val, metal_test)
)
print("Heavy Metal line counts:", len(metal_train_lines), len(metal_val_lines), len(metal_test_lines))

In [None]:
# both genres should contribute the same number of samples to each split,
# so every split is capped at the line count of the smaller genre
train_line_count, val_line_count, test_line_count = (
    min(len(country_lines), len(metal_lines))
    for country_lines, metal_lines in (
        (country_train_lines, metal_train_lines),
        (country_val_lines, metal_val_lines),
        (country_test_lines, metal_test_lines),
    )
)

# keep only the first N lines of each split
country_train_lines, metal_train_lines = country_train_lines[:train_line_count], metal_train_lines[:train_line_count]
country_val_lines, metal_val_lines = country_val_lines[:val_line_count], metal_val_lines[:val_line_count]
country_test_lines, metal_test_lines = country_test_lines[:test_line_count], metal_test_lines[:test_line_count]

# report the balanced sizes
print("Country line counts:", len(country_train_lines), len(country_val_lines), len(country_test_lines))
print("Heavy Metal line counts:", len(metal_train_lines), len(metal_val_lines), len(metal_test_lines))

# show one training line per genre
print()
print("Country line example:", country_train_lines[0])
print("Heavy Metal line example:", metal_train_lines[0])

In [None]:
# for validation and test data, limit each line to 10 tokens 
# (lines longer than this are likely due to inconsistent newline formatting and will inflate our perplexity)
# further, the gpt2 model sets all sequences to length 10, so we'd like to be more consistent between models 
# leave train data as-is to potentially give more text to train on 
def truncate_lines(lines: list, max_length: int=10) -> list:
    """
    Shortens every line to at most max_length tokens.

    Each line is tokenized with nltk.word_tokenize, cut to its first max_length
    tokens, and rebuilt by joining those tokens with single spaces.

    Args:
        lines (list): a list of strings representing individual lines in a song
        max_length (int): the number of tokens to keep from each line

    Returns:
        A new list with one truncated string per input line, in the same order
    """
    truncated = []
    for line in lines:
        tokens = nltk.word_tokenize(line)
        # slicing is safe for short lines: it simply keeps all of their tokens
        truncated.append(' '.join(tokens[:max_length]))
    return truncated

# shorten only the validation and test lines; the training lines are left as they are
country_val_lines, country_test_lines = map(truncate_lines, (country_val_lines, country_test_lines))
metal_val_lines, metal_test_lines = map(truncate_lines, (metal_val_lines, metal_test_lines))

In [None]:
# write each split to its own single-column CSV (no index, no header row)
# so that every model can load exactly the same data
splits_by_path = {
    'data/country_train.csv': country_train_lines,
    'data/country_val.csv': country_val_lines,
    'data/country_test.csv': country_test_lines,
    'data/metal_train.csv': metal_train_lines,
    'data/metal_val.csv': metal_val_lines,
    'data/metal_test.csv': metal_test_lines,
}
for csv_path, split_lines in splits_by_path.items():
    pd.Series(split_lines).to_csv(csv_path, index=False, header=False)