## Data cleaning...
*Load the most recent scraped music track list and clean its lyrics for NLP...*
    
#### Final Features ####
*The final dataset is intended to contain the following features for each song acquired...*
- song_name 
- song_artist
- song_lyrics
- date_added
- Spotify Acoustic Features (11 total, ~2 need normalizing)
    - energy
    - liveness
    - tempo
    - speechiness
    - acousticness
    - instrumentalness
    - time_signature
    - danceability
    - duration_ms
    - loudness
    - valence
- Utils for data_mapping (2)
    - utils_spotify_id
    - utils_genius_data

### NOTE: Will need to re-download all song lyrics

In [10]:
# libraries & tool imports...
import os
import re
import ast
import sys
import nltk
import json
import pprint
import requests

import numpy as np
import pandas as pd
from os import listdir
from os.path import isfile, join
from datetime import datetime
from spotify import spotifyApi
from text_miner import textMiner
from text_miner import geniusApi
from pandas.io.json import json_normalize  # Packages required for preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer  # for lemmatization
from nltk.corpus import stopwords

# Fetch the NLTK data packages used by the cleaning functions below. When a
# package is already installed, nltk only reports that it is up to date.
nltk.download("wordnet")  # presumably the data behind WordNetLemmatizer (see lemmatize)
nltk.download("stopwords")  # corpus read via nltk.corpus.stopwords in remove_stopwords
nltk.download("averaged_perceptron_tagger")  # presumably the model behind nltk.pos_tag

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dayoorigunwa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dayoorigunwa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/dayoorigunwa/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
# initializing helpers..
# One instance of each project-local class imported in the first cell
# (spotifyApi from `spotify`; textMiner and geniusApi from `text_miner`).
helper = spotifyApi()
txtMiner = textMiner()
genius = geniusApi()

# `tools` only feeds the disabled introspection snippet below; it is not
# referenced by any other cell visible in this notebook.
tools = [helper, txtMiner, genius]
# NOTE(review): if re-enabled, `{print(t)}` inside the f-string prints the object
# and then interpolates None; `{type(t).__name__}` is probably what was meant.
# for t in tools:
#     print(f"\n\nExisting Methods for: {print(t)} are: \n {[val for val in list(dir(t)) if str(val)[0] != '_']}")

In [3]:
# existing dataset imports...
# The data directory can be overridden through the MUSIC_MAPPING_DATA_DIR
# environment variable; the default keeps the original location.
# join(<dir>, "") guarantees a trailing separator, which later cells rely on
# when they build file names with `data_path + "<file name>"`.
data_path = join(
    os.environ.get(
        "MUSIC_MAPPING_DATA_DIR",
        "/Users/dayoorigunwa/code_base/music_mapping/data/",
    ),
    "",
)
allfiles = [f for f in listdir(data_path) if isfile(join(data_path, f))]

# Pick the snapshot whose file name sorts last.
# NOTE(review): assumes the names sort chronologically (e.g. a YYYY-MM-DD
# suffix) — confirm against the scraping notebook.
scraped_dfs = sorted(filename for filename in allfiles if "scraped_dataset" in filename)
if not scraped_dfs:
    # Fail with an explicit message instead of an IndexError on scraped_dfs[-1].
    raise FileNotFoundError(f"No 'scraped_dataset' file found in {data_path!r}")

scraped_df = pd.read_csv(join(data_path, scraped_dfs[-1]))
scraped_df.head()
# l_v1 = pd.read_csv(data_path + "cleaned_dataset.csv")
# l_v2 = pd.read_csv(data_path + "cleaned_dataset_v2.csv")

# print(
#     f"v1 NA count: {l_v1.lyrics.isna().sum()} of {len(l_v1.index)}\nv2 NA count: {l_v2.lyrics.isna().sum()} of {len(l_v2.index)}"
# )

Unnamed: 0.1,Unnamed: 0,name,artist,lyrics,date_added,energy,liveness,tempo,speechiness,acousticness,instrumentalness,time_signature,danceability,valence,duration_ms,loudness,utils_spotify_id,utils_genius_data
0,0,family ties (with Kendrick Lamar),Baby Keem,,2022-04-21T00:18:15Z,0.611,0.23,134.093,0.33,0.00588,0.0,4.0,0.711,0.144,252262.0,-5.453,3QFInJAm9eyaho5vBzxInN,
1,1,trademark usa,Baby Keem,[Part I] [Intro] I can't help but feel neglect...,2022-04-21T00:18:29Z,0.6,0.274,130.732,0.281,0.108,2e-06,4.0,0.613,0.067,270671.0,-5.621,6G9aDedv5hYaTgNYDuduqk,
2,2,VALENTINO,24kGoldn,"[Chorus] I don't want a valentine, I just wa...",2022-04-21T00:19:39Z,0.717,0.132,150.964,0.179,0.199,0.0,4.0,0.746,0.523,179133.0,-4.841,6piAUJJQFD8oHDUr0b7l7q,
3,3,Pepas,Farruko,"[Letra de ""Pepas""] [Refrán] No me importa lo q...",2022-04-21T00:41:47Z,0.766,0.128,130.001,0.0343,0.00776,7e-05,4.0,0.762,0.442,287120.0,-3.955,5fwSHlTEWpluwOM0Sxnh5k,
4,4,Stick (with JID & J. Cole feat. Kenny Mason & ...,Dreamville,,2022-04-21T00:42:09Z,0.857,0.668,118.574,0.292,0.266,0.0,4.0,0.671,0.597,309323.0,-5.435,1BzXvBpIFWJgu0P8P6xmP4,


In [4]:
# formatting cols...
# dropna() discards every row that has any missing value (e.g. songs whose
# lyrics were not scraped). "Unnamed: 0" is the index column written by an
# earlier to_csv call.
# Rebinding instead of inplace=True, errors="ignore" on the drop, and the
# isinstance guard below make this cell safe to re-run on an already
# formatted frame.
scraped_df = scraped_df.dropna().drop(columns="Unnamed: 0", errors="ignore")

# utils_genius_data comes back from the CSV as the string repr of a Python
# literal; parse it once and leave already-parsed values untouched.
scraped_df["utils_genius_data"] = scraped_df["utils_genius_data"].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)
scraped_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 627 entries, 33 to 1070
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   name               627 non-null    object 
 1   artist             627 non-null    object 
 2   lyrics             627 non-null    object 
 3   date_added         627 non-null    object 
 4   energy             627 non-null    float64
 5   liveness           627 non-null    float64
 6   tempo              627 non-null    float64
 7   speechiness        627 non-null    float64
 8   acousticness       627 non-null    float64
 9   instrumentalness   627 non-null    float64
 10  time_signature     627 non-null    float64
 11  danceability       627 non-null    float64
 12  valence            627 non-null    float64
 13  duration_ms        627 non-null    float64
 14  loudness           627 non-null    float64
 15  utils_spotify_id   627 non-null    object 
 16  utils_genius_data  627 n

#### Defining Lyric Cleaning Methods...

In [5]:
# Data Discovery Tools Continued: Text Cleaning...

# TOOLS FOR REMOVING SONG STRUCTURE FROM LYRICS...
# helper a
def get_bracket_idx(lyric_str):
    """
    This function takes in a lyric string and returns a dict mapping a
    running counter (0, 1, 2, ...) to the (start, end) indices of each
    "[...]" span, where start is the index of "[" and end is the index of
    the matching "]" (both inclusive).

    A "]" that has no unmatched "[" before it is ignored, so malformed
    lyrics no longer raise an error or reuse a stale start index. When
    several "[" appear before a "]", the most recent one is paired with it.
    """
    bracket_locs = dict()
    start_ind = None  # index of the most recent "[" still waiting for its "]"
    for i, char in enumerate(lyric_str):
        if char == "[":
            start_ind = i
        elif char == "]" and start_ind is not None:
            bracket_locs[len(bracket_locs)] = (start_ind, i)
            # Consume the opener so a stray second "]" cannot pair with it again.
            start_ind = None
    return bracket_locs


# helper b
def extract_brackets(row):
    """
    This function takes in a row holding 'flavor_text_idx' (a dict of
    (start, end) index pairs) and 'lyrics', and returns the list of
    substrings of the lyrics covered by each pair, end index included,
    in the dict's key order.
    """
    spans = row["flavor_text_idx"]
    text = row["lyrics"]
    return [text[spans[k][0] : spans[k][1] + 1] for k in list(spans.keys())]


# helper c
def scrub_brackets(row):
    """
    This function takes in a row holding 'flavor_text' (a list of strings)
    and 'lyrics', and returns the lyrics with every occurrence of each
    listed string removed.
    """
    tags = row["flavor_text"]
    cleaned = row["lyrics"]
    for tag in tags:
        cleaned = cleaned.replace(tag, "")
    return cleaned


# scrubbing Genius captions from lyrics with helpers a-c...
def scrub_song_structure(df, col):
    """
    This function takes in the raw webscraped lyric df and a string for
    column name, mutates the df in place, and returns it with three
    columns written:
      - flavor_text_idx: bracket index pairs found in df[col]
      - flavor_text: the bracketed substrings themselves
      - lyrics: the lyrics with those substrings removed

    Note: the row-level helpers read the text from the 'lyrics' column,
    so col is expected to be 'lyrics'.
    """
    df["flavor_text_idx"] = df[col].apply(get_bracket_idx)
    df["flavor_text"] = df.apply(extract_brackets, axis=1)
    df["lyrics"] = df.apply(scrub_brackets, axis=1)
    return df


# TOOLS FOR PREPPING TEXT DATA FOR NATURAL LANGUAGE PROCESSING...
#### Copied methods from: https://medium.com/swlh/nlp-sentiment-analysis-music-to-my-ears-fcf075eaea60
## Recall: Definitions of categories: https://medium.com/mlearning-ai/nlp-tokenization-stemming-lemmatization-and-part-of-speech-tagging-9088ac068768
def basic_clean(df, col):
    """
    This function takes in a df and a string for a column and
    returns the df with a new column named 'basic_clean' with the
    passed column text normalized: lowercased, folded to ASCII, and
    stripped of everything that is not a word character or whitespace.

    Accented letters are decomposed (NFKD) before the ASCII fold so the
    base letter survives ("refrán" -> "refran"); composing with NFKC and
    then dropping non-ASCII bytes deleted such letters outright. The
    ASCII fold runs before the punctuation regex, so the regex only ever
    sees ASCII text.

    The df is modified in place and also returned.
    """
    df["basic_clean"] = (
        df[col]
        .str.lower()
        .str.normalize("NFKD")
        .str.encode("ascii", "ignore")  # drops combining marks and other non-ASCII
        .str.decode("ascii")
        .str.replace(r"[^\w\s]", "", regex=True)
    )
    return df


def tokenize(df, col):
    """
    This function takes in a df and a string for a column and
    returns the same df with a new column named 'clean_tokes' holding,
    for each row, the list of tokens that nltk's ToktokTokenizer
    produces from the passed column's text.
    """
    toktok = nltk.tokenize.ToktokTokenizer()
    token_lists = df[col].apply(toktok.tokenize)
    df["clean_tokes"] = token_lists
    return df


def stem(df, col):
    """
    This function takes in a df and a string for a column name (a column
    of token lists) and returns the same df with a new column named
    'stemmed': each row's tokens Porter-stemmed and joined with spaces.
    """
    porter = nltk.porter.PorterStemmer()

    def _stem_tokens(tokens):
        # Stem every token of one row.
        return [porter.stem(token) for token in tokens]

    # Stem each row's tokens, then glue them back into one string per row
    df["stemmed"] = df[col].apply(_stem_tokens).str.join(" ")
    return df


def lemmatize(df, col):
    """
    This function takes in a df and a string for column name (a column
    of token lists) and returns the same df with a new column called
    'lemmatized': each row's tokens lemmatized with WordNet and joined
    with spaces.
    """
    wordnet = nltk.stem.WordNetLemmatizer()

    def _lemmatize_tokens(tokens):
        # Lemmatize every token of one row.
        return [wordnet.lemmatize(token) for token in tokens]

    # Lemmatize each row's tokens, then rebuild one string per row
    df["lemmatized"] = df[col].apply(_lemmatize_tokens).str.join(" ")
    return df


def remove_stopwords(df, col, stopword_list=None, extra_stopwords=("&#9;",)):
    """
    This function takes in a df and a string for column name and
    returns the df with a new column named 'clean_<col>' (e.g.
    'clean_lemmatized') holding the column's text with stopwords removed.

    stopword_list: optional iterable of stopwords; defaults to nltk's
        English stopword list.
    extra_stopwords: additional whole tokens to drop; defaults to the
        HTML tab entity "&#9;". A single string is treated as one token.

    Text is split on whitespace, filtered, and re-joined with single
    spaces. The df is modified in place and also returned.

    Note: earlier versions used list.extend("&#9;"), which added the four
    characters '&', '#', '9' and ';' as separate stopwords and therefore
    dropped stand-alone "9" tokens; only the whole entity is dropped now.
    """
    if stopword_list is None:
        stopword_list = nltk.corpus.stopwords.words("english")
    if isinstance(extra_stopwords, str):
        # Guard against the same character-splitting mistake for callers.
        extra_stopwords = (extra_stopwords,)

    # A set gives constant-time membership tests for every word of every row
    stopword_set = set(stopword_list) | set(extra_stopwords)

    # Split words in column
    words = df[col].str.split()

    # Keep only the words of each row that are not stopwords
    filtered_words = words.apply(
        lambda row: [word for word in row if word not in stopword_set]
    )

    # Create new column of words that have stopwords removed
    df["clean_" + col] = filtered_words.str.join(" ")

    return df


def tag_parts_of_speech(df, col):
    """
    This function takes in a df and a string for column name (a column
    of token lists) and returns the same df with a new column named
    'pos_tags' holding, for each row, the output of nltk.pos_tag on that
    row's tokens.
    """
    tagged = df[col].apply(nltk.pos_tag)
    df["pos_tags"] = tagged
    return df


def prep_nlp_data(df, col):
    """
    This function takes in the raw lyric df and a string for column name
    and returns the df with its original columns plus the columns added
    by each cleaning stage: flavor_text_idx, flavor_text, basic_clean,
    clean_tokes, stemmed, lemmatized, pos_tags and clean_lemmatized.
    The stages run in the order listed below, each on the output of the
    previous one.
    """
    return (
        # Remove song structure markers ("[...]") from the lyrics
        df.pipe(scrub_song_structure, col)
        # Normalize the lyric text
        .pipe(basic_clean, col)
        # Tokenize the normalized text
        .pipe(tokenize, "basic_clean")
        # Stem the tokens
        .pipe(stem, "clean_tokes")
        # Lemmatize the tokens
        .pipe(lemmatize, "clean_tokes")
        # Tag the tokens with their part of speech
        .pipe(tag_parts_of_speech, "clean_tokes")
        # Remove stopwords from the lemmatized text
        .pipe(remove_stopwords, "lemmatized")
    )

In [6]:
# New and improved data cleaning...

# Cleaning song text: adds the intermediate NLP columns and overwrites
# "lyrics" with the bracket-scrubbed text...
scraped_df = prep_nlp_data(scraped_df, "lyrics")

# Create a feature that gives us the length of each song, measured as the
# number of characters in its scrubbed lyrics...
scraped_df["song_length"] = scraped_df["lyrics"].str.len()

# Sanity check...
scraped_df.head()

Unnamed: 0,name,artist,lyrics,date_added,energy,liveness,tempo,speechiness,acousticness,instrumentalness,...,utils_genius_data,flavor_text_idx,flavor_text,basic_clean,clean_tokes,stemmed,lemmatized,pos_tags,clean_lemmatized,song_length
33,Then the Quiet Explosion,Hammock,I can’t feel you There’s no trace Lights will ...,2019-02-14T01:14:15Z,0.357,0.0827,95.663,0.0351,0.652,0.894,...,"{'annotation_count': 0, 'api_path': '/songs/64...",{},[],i cant feel you theres no trace lights will bu...,"[i, cant, feel, you, theres, no, trace, lights...",i cant feel you there no trace light will burn...,i cant feel you there no trace light will burn...,"[(i, NN), (cant, VBP), (feel, NN), (you, PRP),...",cant feel trace light burn blood clay falling ...,244
38,Two Thousand and Seventeen,Four Tet,,2019-04-09T15:36:30Z,0.469,0.0939,75.495,0.0296,0.327,0.161,...,"{'annotation_count': 1, 'api_path': '/songs/32...","{0: (0, 19)}",[[Non-Lyrical Vocals]],,[],,,[],,0
39,Immunity,Jon Hopkins,You've answered my prayer For a worthless diam...,2019-11-03T13:41:48Z,0.305,0.111,139.878,0.0364,0.892,0.942,...,"{'annotation_count': 1, 'api_path': '/songs/57...",{},[],youve answered my prayer for a worthless diamo...,"[youve, answered, my, prayer, for, a, worthles...",youv answer my prayer for a worthless diamond ...,youve answered my prayer for a worthless diamo...,"[(youve, RB), (answered, VBN), (my, PRP$), (pr...",youve answered prayer worthless diamond carbon...,386
52,Mumma Don't Tell,Leifur James,My mama don't tell I'm the same My mama don't ...,2020-10-04T19:20:37Z,0.283,0.11,108.028,0.0482,0.345,0.762,...,"{'annotation_count': 0, 'api_path': '/songs/62...",{},[],my mama dont tell im the same my mama dont tel...,"[my, mama, dont, tell, im, the, same, my, mama...",my mama dont tell im the same my mama dont tel...,my mama dont tell im the same my mama dont tel...,"[(my, PRP$), (mama, NN), (dont, NN), (tell, NN...",mama dont tell im mama dont tell fall line mam...,336
53,Quick Musical Doodles,Two Feet,You remember You remember my love You sold yo...,2020-10-04T19:23:11Z,0.349,0.374,169.773,0.27,0.241,0.685,...,"{'annotation_count': 0, 'api_path': '/songs/28...","{0: (0, 8), 1: (100, 108)}","[[Verse 1], [Verse 2]]",you remember you remember my love you sold yo...,"[you, remember, you, remember, my, love, you, ...",you rememb you rememb my love you sold your so...,you remember you remember my love you sold you...,"[(you, PRP), (remember, VBP), (you, PRP), (rem...",remember remember love sold soul sold soul dru...,183


In [7]:
# TODO: Do NOT drop zero-length songs here; instead, fix the web scraping for songs that are missing lyrics.

In [8]:
# Write the prepped frame next to the scraped data, stamped with today's date
# (YYYY-MM-DD). The frame's index is written as the first CSV column.
date_piece = datetime.today().strftime("%Y-%m-%d")
scraped_df.to_csv(f"{data_path}prepped_dataset_{date_piece}.csv")

In [9]:
# Counts NaN lyrics only: songs whose lyrics are an empty string after
# scrubbing (song_length == 0) are not NaN and are not counted here.
scraped_df.lyrics.isna().sum()

0