This notebook will perform some amount of wrangling, repeat all previous Natural Language Processing (NLP) preprocessing, and conduct feature engineering.  The feature engineering leverages work conducted in Step_5_Create_Stop_and_Unique_words and will be used to domain-specifc scoring (like sentiment) and expanded, domain-specific stopwords list.

This notebook will have a companion python script in the 'sample' folder of this Git repository.

In [1]:
import s3fs

import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import re
from tqdm.autonotebook import tqdm
tqdm.pandas(desc="progress-bar", leave=False)
import string

import spacy
from spacy.lang import punctuation
from spacy.lang.en import English
from spacy import displacy
nlp = spacy.load("en_core_web_lg")

import unicodedata  # might need to pip install unicodedate2 on aws sagemaker
import contractions
from contractions import contractions_dict ## pip installed this
from wordcloud import WordCloud, STOPWORDS #pip install
from textblob import TextBlob
!python -m textblob.download_corpora
from afinn import Afinn

import nltk
import nltk.corpus 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import ToktokTokenizer
from nltk.corpus import stopwords

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models import word2vec
import multiprocessing as mp

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
import warnings
warnings.filterwarnings('ignore')
cores = mp.cpu_count()

import warnings
from datetime import datetime

warnings.filterwarnings('ignore')

%matplotlib inline
sns.set(style='darkgrid',palette='Dark2',rc={'figure.figsize':(9,6),'figure.dpi':90})

punctuation = string.punctuation + '”' + '“' + '–' + '““' + "’’" + '”'
stopword = stopwords.words('english')
stopwords = set(STOPWORDS)
wordnet_lemmatizer = WordNetLemmatizer()

# Increase screen size.
#pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 100000)
pd.set_option('display.max_columns', 100000)
pd.set_option('display.width', 1000)

%matplotlib inline
sns.set(style='darkgrid',palette='Dark2', rc={'figure.figsize':(9,6), 'figure.dpi':100})

  from tqdm.autonotebook import tqdm


[nltk_data] Downloading package brown to /Users/Gretzky/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /Users/Gretzky/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/Gretzky/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/Gretzky/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to
[nltk_data]     /Users/Gretzky/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/Gretzky/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
Finished.


In [9]:
#File Admin Issues

import os
import io
import boto3

from dotenv import load_dotenv
load_dotenv(verbose=True)

def aws_session(region_name='us-east-1'):
    return boto3.session.Session(aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), #looks for any .env file
                                aws_secret_access_key=os.getenv('AWS_ACCESS_KEY_SECRET'), #Has to be in same directory
                                region_name=region_name) #from above

def make_bucket(name, acl): 
    session = aws_session()
    s3_resource = session.resource('s3')
    return s3_resource.create_bucket(Bucket=name, ACL=acl)

def upload_file_to_bucket(bucket_name, file_path):
    session = aws_session()
    s3_resource = session.resource('s3')
    file_dir, file_name = os.path.split(file_path)

    bucket = s3_resource.Bucket(bucket_name)
    bucket.upload_file(
      Filename=file_path,
      Key=file_name,
      ExtraArgs={'ACL': 'public-read'}
    )

    s3_url = f"https://{bucket_name}.s3.amazonaws.com/{file_name}"
    return s3_url

fs = s3fs.S3FileSystem(anon=False,key='####',secret='####'')

#g_df = pd.read_csv('s3://music-lyrics-chain/g_df')#entire dataset, index, song_name, lyrics, genre
#g_stop = pd.read_csv('s3://music-lyrics-chain/g_stopwords')#from 80% g_train dataset, domain specific stop words
hiphop = pd.read_csv('s3://music-lyrics-chain/uniquely_hiphop')# from 80% g_train dataset, uniquely hiphop
pop = pd.read_csv('s3://music-lyrics-chain/uniquely_pop')# from 80% g_train dataset, uniquely pop
rock = pd.read_csv('s3://music-lyrics-chain/uniquely_rock')# from 80% g_train dataset, uniquely rock

In [3]:
# With appreciation for the Fake News Way
def remove_special_characters(text): 
    """
    Removes special characters from the text document
    """
    # define the pattern to keep. You can check the regex using this url https://regexr.com/
    pat = r'[^a-zA-z0-9.,!?/:;\"\'\s]'
    return re.sub(pat, '', text)

def remove_extra_whitespace_tabs(text): 
    """
    Removes extra whitespaces and remove_extra_whitespace_tabs
    """
    #pattern = r'^\s+$|\s+$'
    pattern = r'^\s*|\s\s*'
    return re.sub(pattern, ' ', text).strip()

def remove_digits(text): 
    """
    Remove all digits from the text document
     take string input and return a clean text without numbers.
        Use regex to discard the numbers.
    """
    result = ''.join(i for i in text if not i.isdigit()).lower()
    return ' '.join(result.split())

def remove_newlines(text): 
    """
    Remove newline characters from the text document
    """
    return text.replace('\\n', ' ').replace('\\r', ' ').replace('\n', ' ').replace('\r', ' ').replace('\\', ' ')

#normalize to the NFKD (Normalization Form Compatibility Decomposition) form
#that present in the Unicode standard to remain compatible with other encodings
def remove_accented_chars(text): 
    """
    Removes accented characters from the test
    """
    new_text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return new_text

import contractions
#contractions.fix(g_df['lyrics'][10])

#expands contractions found in the text
def expand_contractions(text):
    expanded_text = contractions.fix(text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

# replace punctuation characters with spaces
def replace_punctuation(text):
    filters = string.punctuation + '”' + '“' + '–' + '!' + '?' + '.' + ',' #added !, ?, . , and comma
    translate_dict = dict((c, " ") for c in filters)   
    translate_map = str.maketrans(translate_dict)
    text = text.translate(translate_map)
    return text

# Remove stopwords and remove words with 2 or less characters
def stops_letters(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 2 and token not in stopword:
            result.append(token)
            
    return " ".join(result)

#Removes any word that starts with either http or https
def remove_urls(vTEXT):
    #vTEXT = re.sub('http://\S+|https://\S+', '', vTEXT,flags=re.MULTILINE)
    vTEXT = re.sub('http[s]?://\S+', '', vTEXT,flags=re.MULTILINE)
    return(vTEXT)

#Remove words that starts with www
def remove_www(vTEXT):
    vTEXT = re.sub('www\S+', '', vTEXT,flags=re.MULTILINE)
    return(vTEXT)

In [4]:
g_stop = g_stop.dropna(subset=['All'])
g_stop.info()#confirm this fixed a known issue

<class 'pandas.core.frame.DataFrame'>
Int64Index: 94804 entries, 0 to 94804
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  94804 non-null  int64 
 1   All         94804 non-null  object
 2   0           94804 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 2.9+ MB


In [5]:
g_df.drop(columns=['Unnamed: 0'], axis=1, inplace=True)

In [6]:
g_df.head()

Unnamed: 0,song_name,lyrics,genre
0,More Than This,I could feel at the time. There was no way of ...,Rock
1,Because The Night,"Take me now, baby, here as I am. Hold me close...",Rock
2,These Are Days,These are. These are days you'll remember. Nev...,Rock
3,A Campfire Song,"A lie to say, ""O my mountain has coal veins an...",Rock
4,Everyday Is Like Sunday,Trudging slowly over wet sand. Back to the ben...,Rock


In [8]:
#Standard NLP run through.
g_df['lyrics'] = g_df['lyrics'].apply(remove_urls)
g_df['lyrics'] = g_df['lyrics'].apply(remove_www)
g_df['lyrics'] = g_df['lyrics'].apply(remove_special_characters)
g_df['lyrics'] = g_df['lyrics'].apply(remove_extra_whitespace_tabs)
g_df['lyrics'] = g_df['lyrics'].apply(remove_digits)
g_df['lyrics'] = g_df['lyrics'].apply(remove_accented_chars)
g_df['lyrics'] = g_df['lyrics'].apply(expand_contractions)
g_df['lyrics'] = g_df['lyrics'].apply(replace_punctuation)

In [9]:
g_df.iloc[50]['lyrics']

'an augist day in the hills of spain   a pair of children emerged from a cave   the strangest sight there alone they stood   with skin of green and words no one had heard   the girl was stronger  the boy was weak   with her new mother she learned to speak   and wove a tale of a dying sun   they had left darkness  a dark world come undone   they travelled so far   believing they came from a star   she fell through life  through time  through parallel lives   the men of science  the men of fame   the men of letters tried to explain   was it parallel worlds or a twist of time  to make her think she would fallen from the sky   a whirlwind spun them all alone   took them from their twilight home   believing they came from a star  '

In [10]:
# word counts
g_df['full_word_count'] = g_df["lyrics"].apply(lambda x: len(str(x).split(" ")))

# Character counts
g_df['full_character_count'] = g_df["lyrics"].apply(lambda x: sum(len(word) for word in str(x).split(" ")))

#average word length
g_df['full_avg_word_length'] = g_df['full_character_count'] / g_df['full_word_count']

In [11]:
g_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86294 entries, 0 to 86293
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   song_name             86294 non-null  object 
 1   lyrics                86294 non-null  object 
 2   genre                 86294 non-null  object 
 3   full_word_count       86294 non-null  int64  
 4   full_character_count  86294 non-null  int64  
 5   full_avg_word_length  86294 non-null  float64
dtypes: float64(1), int64(2), object(3)
memory usage: 4.0+ MB


In [12]:
#Gensim stopword removal.  Creating a medium sized lyrics set.  I'll run a couple feature engineering
#functions on it.  Then create a smaller set with the domain specific stopwords list and compare the two.
g_df['med_lyrics'] =g_df['lyrics'].apply(stops_letters)

In [13]:
# word counts
g_df['med_word_count'] = g_df["med_lyrics"].apply(lambda x: len(str(x).split(" ")))

# Character counts
g_df['med_character_count'] = g_df["med_lyrics"].apply(lambda x: sum(len(word) for word in str(x).split(" ")))

#average word length
g_df['med_avg_word_length'] = g_df['med_character_count'] / g_df['med_word_count']

In [14]:
g_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86294 entries, 0 to 86293
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   song_name             86294 non-null  object 
 1   lyrics                86294 non-null  object 
 2   genre                 86294 non-null  object 
 3   full_word_count       86294 non-null  int64  
 4   full_character_count  86294 non-null  int64  
 5   full_avg_word_length  86294 non-null  float64
 6   med_lyrics            86294 non-null  object 
 7   med_word_count        86294 non-null  int64  
 8   med_character_count   86294 non-null  int64  
 9   med_avg_word_length   86294 non-null  float64
dtypes: float64(2), int64(4), object(4)
memory usage: 6.6+ MB


In [15]:
g_df.iloc[50]['med_lyrics']

'augist day hills spain pair children emerged cave strangest sight stood skin green words heard girl stronger boy weak new mother learned speak wove tale dying sun left darkness dark world come undone travelled far believing came star fell life time parallel lives men science men fame men letters tried explain parallel worlds twist time think fallen sky whirlwind spun took twilight home believing came star'

In [16]:
type(g_df.iloc[50]['med_lyrics'])

str

In [17]:
print("Current Time =", datetime.now())

Current Time = 2021-06-08 14:22:02.394337


In [18]:
#Feature engineering, Affinity score.

afinn = Afinn()

def get_affinity_scores(lyrics):
    scores = []
    count = 0
    for t in lyrics:
        if len(t) > 0:
            scores.append(afinn.score(t) / len(t))
        else:
            count += 1
            scores.append(0)
    return scores

new_affin = get_affinity_scores(g_df['med_lyrics'].tolist())

g_df['med_content_affin'] = new_affin

print("Current Time =", datetime.now())

Current Time = 2021-06-08 14:26:19.521217


In [19]:
print("Current Time =", datetime.now())

Current Time = 2021-06-08 14:32:28.254643


Something was broken in this.  The sent_score was always the same number 
and the labels were incorrect sometimes.  I fixed it with some changes however
the med_sent_score is cast as a list, an object.  Need it as a Float for ML pipeline.

Will fix later.

In [20]:
#Feature engineering, Sentiment score and label

def sentiment_check (text):
    polarity_score = TextBlob(text).sentiment.polarity
    if polarity_score < 0:
        return 'negative'
    elif polarity_score == 0:
        return 'neutral'
    else:
        return 'positive'
    
g_df['med_sent_label'] = g_df['med_lyrics'].apply(sentiment_check)

print("Label done. Current Time =", datetime.now())

def new_sent_ck (text):
    polarity_score = TextBlob(text).sentiment.polarity
    sent_score = []
    sent_score.append(polarity_score)
    return sent_score

g_df['med_sent_score'] = g_df['med_lyrics'].apply(new_sent_ck) 

print("Both med_sent tasks done. Current Time =", datetime.now())

Label done. Current Time = 2021-06-08 14:33:04.082967
Both med_sent tasks done. Current Time = 2021-06-08 14:33:37.066180


In [21]:
g_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86294 entries, 0 to 86293
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   song_name             86294 non-null  object 
 1   lyrics                86294 non-null  object 
 2   genre                 86294 non-null  object 
 3   full_word_count       86294 non-null  int64  
 4   full_character_count  86294 non-null  int64  
 5   full_avg_word_length  86294 non-null  float64
 6   med_lyrics            86294 non-null  object 
 7   med_word_count        86294 non-null  int64  
 8   med_character_count   86294 non-null  int64  
 9   med_avg_word_length   86294 non-null  float64
 10  med_content_affin     86294 non-null  float64
 11  med_sent_label        86294 non-null  object 
 12  med_sent_score        86294 non-null  object 
dtypes: float64(3), int64(4), object(6)
memory usage: 8.6+ MB


In [22]:
type(g_df['med_sent_score'])

pandas.core.series.Series

Need it as a Float...

In [23]:
type(g_df.iloc[50]['med_sent_score'])

list

In [24]:
g_df.iloc[50]['med_sent_score']

[-0.061079545454545456]

In [25]:
g_df.iloc[50]['med_sent_label']

'negative'

In [26]:
print("Current Time =", datetime.now())

Current Time = 2021-06-08 14:38:21.592758


In [27]:
#Feature engineering, giant string for a vectorizer, later.

import nltk
nltk.download('punkt')
nltk.download('wordnet')
  
def lemmatized_word(text):

    word_tokens = nltk.word_tokenize(text)
    lemmatized_word = [wordnet_lemmatizer.lemmatize(word) for word in word_tokens]
    return  " ".join(lemmatized_word) #combine the words into a giant string that vectorizer can accept

g_df['med_vector'] = g_df['med_lyrics'].apply(lemmatized_word)

print("Current Time =", datetime.now())

[nltk_data] Downloading package punkt to /Users/Gretzky/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/Gretzky/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Current Time = 2021-06-08 14:39:23.391426


In [29]:
g_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86294 entries, 0 to 86293
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   song_name             86294 non-null  object 
 1   lyrics                86294 non-null  object 
 2   genre                 86294 non-null  object 
 3   full_word_count       86294 non-null  int64  
 4   full_character_count  86294 non-null  int64  
 5   full_avg_word_length  86294 non-null  float64
 6   med_lyrics            86294 non-null  object 
 7   med_word_count        86294 non-null  int64  
 8   med_character_count   86294 non-null  int64  
 9   med_avg_word_length   86294 non-null  float64
 10  med_content_affin     86294 non-null  float64
 11  med_sent_label        86294 non-null  object 
 12  med_sent_score        86294 non-null  object 
 13  med_vector            86294 non-null  object 
dtypes: float64(3), int64(4), object(7)
memory usage: 9.2+ MB


In [30]:
g_df.iloc[50]['med_vector']

'augist day hill spain pair child emerged cave strangest sight stood skin green word heard girl stronger boy weak new mother learned speak wove tale dying sun left darkness dark world come undone travelled far believing came star fell life time parallel life men science men fame men letter tried explain parallel world twist time think fallen sky whirlwind spun took twilight home believing came star'

In [31]:
type(g_df.iloc[50]['med_vector'])

str

In [32]:
print("Current Time =", datetime.now())

Current Time = 2021-06-08 14:43:16.510021


In [33]:
#Feature engineering, create domain specific scores based on words unique to particulary genres.
def genre_count(text):
    result = 0
    text_tokenized = word_tokenize(text)
    for i in range(0, len(text_tokenized)):
        if text_tokenized[i] in stop_words:
            result += digit
        else:
            pass
    if result != 0:
        return result
    else:
        pass

#Set Rock! words...
stop_words = nltk.corpus.stopwords.words('english')

stop_words = []

rock2 = rock['Word'].to_dict()
rock3 = list(rock2.values())
digit = .01

stop_words.extend(rock3)
print(len(stop_words), 'Rock!')
print("Current Time =", datetime.now())

#Run genre_count with Rock!
g_df['med_rock_genre_count'] =g_df['med_lyrics'].apply(genre_count)

#Reset to Hip Hop...
stop_words = nltk.corpus.stopwords.words('english')

stop_words = []

hiphop2 = hiphop['Word'].to_dict()
hiphop3 = list(hiphop2.values())
digit = 100

stop_words.extend(hiphop3)
print(len(stop_words), 'Hip Hop')
print("Current Time =", datetime.now())

#Run genre_count with Hip Hop
g_df['med_hiphop_genre_count'] =g_df['med_lyrics'].apply(genre_count)

#Reset to Pop...
stop_words = nltk.corpus.stopwords.words('english')

stop_words = []

pop2 = pop['Word'].to_dict()
pop3 = list(pop2.values())
digit = 1

stop_words.extend(pop3)
print(len(stop_words), 'Pop')
print("Current Time =", datetime.now())

#Run genre_count with Hip Hop
g_df['med_pop_genre_count'] =g_df['med_lyrics'].apply(genre_count)

print("Current Time =", datetime.now())

23091 Rock!
Current Time = 2021-06-08 14:43:19.525702
29843 Hip Hop
Current Time = 2021-06-08 15:23:33.054847
13757 Pop
Current Time = 2021-06-08 16:28:44.141510
Current Time = 2021-06-08 16:50:21.369156


In [34]:
g_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86294 entries, 0 to 86293
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   song_name               86294 non-null  object 
 1   lyrics                  86294 non-null  object 
 2   genre                   86294 non-null  object 
 3   full_word_count         86294 non-null  int64  
 4   full_character_count    86294 non-null  int64  
 5   full_avg_word_length    86294 non-null  float64
 6   med_lyrics              86294 non-null  object 
 7   med_word_count          86294 non-null  int64  
 8   med_character_count     86294 non-null  int64  
 9   med_avg_word_length     86294 non-null  float64
 10  med_content_affin       86294 non-null  float64
 11  med_sent_label          86294 non-null  object 
 12  med_sent_score          86294 non-null  object 
 13  med_vector              86294 non-null  object 
 14  med_rock_genre_count    19381 non-null

In [35]:
type(g_df.iloc[50]['med_pop_genre_count'])

numpy.float64

In [37]:
g_df.loc[50]

song_name                                                    Green Children
lyrics                    an augist day in the hills of spain   a pair o...
genre                                                                  Rock
full_word_count                                                         177
full_character_count                                                    558
full_avg_word_length                                               3.152542
med_lyrics                augist day hills spain pair children emerged c...
med_word_count                                                           66
med_character_count                                                     343
med_avg_word_length                                                 5.19697
med_content_affin                                                 -0.012255
med_sent_label                                                     negative
med_sent_score                                      [-0.061079545454545456]
med_vector  

In [41]:
g_df.to_csv('g_df_final_halfway')

def upload_file_to_bucket(bucket_name, file_path):
    session = aws_session()
    s3_resource = session.resource('s3')
    file_dir, file_name = os.path.split(file_path)

    bucket = s3_resource.Bucket(bucket_name)
    bucket.upload_file(
      Filename=file_path,
      Key=file_name,
      ExtraArgs={'ACL': 'private'}
    )

    s3_url = f"https://{bucket_name}.s3.amazonaws.com/{file_name}"
    return s3_url



upload_file_to_bucket('final-music-flow','g_df_final_halfway')

'https://final-music-flow.s3.amazonaws.com/g_df_final_halfway'

Start the domain-specific round of smoothing and feature engineering.

Next step is to use the domain-specific stopwords list with NLTK stopwords function (and list of ~170 words).  Will run 

In [42]:
stop_words = nltk.corpus.stopwords.words('english')
g_stop2 = g_stop['All'].to_dict()
g_stop3 = list(g_stop2.values())
stop_words.extend(g_stop3)
len(stop_words)

94983

In [43]:
import nltk
import nltk.corpus 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words.extend(g_stop3)

def stops_word(text):
    result = []
    text_tokenized = word_tokenize(text)
    for i in range(0, len(text_tokenized)):
        if text_tokenized[i] not in stop_words:
            result.append(text_tokenized[i])
        else:
            pass
            
    return str(result).replace("'","")

In [44]:
print("Current Time =", datetime.now())

Current Time = 2021-06-08 18:03:44.846767


In [45]:
g_df['sml_lyrics'] =g_df['lyrics'].apply(stops_word)

print("Current Time =", datetime.now())

Current Time = 2021-06-09 06:48:29.850277


In [46]:
g_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86294 entries, 0 to 86293
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   song_name               86294 non-null  object 
 1   lyrics                  86294 non-null  object 
 2   genre                   86294 non-null  object 
 3   full_word_count         86294 non-null  int64  
 4   full_character_count    86294 non-null  int64  
 5   full_avg_word_length    86294 non-null  float64
 6   med_lyrics              86294 non-null  object 
 7   med_word_count          86294 non-null  int64  
 8   med_character_count     86294 non-null  int64  
 9   med_avg_word_length     86294 non-null  float64
 10  med_content_affin       86294 non-null  float64
 11  med_sent_label          86294 non-null  object 
 12  med_sent_score          86294 non-null  object 
 13  med_vector              86294 non-null  object 
 14  med_rock_genre_count    19381 non-null

In [47]:
type(g_df.iloc[50]['sml_lyrics'])

str

In [48]:
g_df.iloc[50]['sml_lyrics']

'[day, hills, spain, pair, children, cave, strangest, sight, alone, stood, skin, green, words, one, heard, girl, stronger, boy, weak, new, mother, learned, speak, tale, dying, sun, left, darkness, dark, world, come, undone, travelled, far, believing, came, star, fell, life, time, parallel, lives, men, science, men, fame, men, letters, tried, explain, parallel, worlds, twist, time, make, think, would, fallen, sky, whirlwind, spun, alone, took, twilight, home, believing, came, star]'

In [49]:
g_df['sml_lyrics']=g_df['sml_lyrics'].str.replace(',' ,'')# Fixes the srings with commas issue.
g_df['sml_lyrics']=g_df['sml_lyrics'].str.replace('[' ,'')
g_df['sml_lyrics']=g_df['sml_lyrics'].str.replace(']' ,'')

In [50]:
g_df.iloc[50]['sml_lyrics']

'day hills spain pair children cave strangest sight alone stood skin green words one heard girl stronger boy weak new mother learned speak tale dying sun left darkness dark world come undone travelled far believing came star fell life time parallel lives men science men fame men letters tried explain parallel worlds twist time make think would fallen sky whirlwind spun alone took twilight home believing came star'

In [54]:
# word counts
g_df['sml_word_count'] = g_df["sml_lyrics"].apply(lambda x: len(str(x).split(" ")))

# Character counts
g_df['sml_character_count'] = g_df["sml_lyrics"].apply(lambda x: sum(len(word) for word in str(x).split(" ")))

#average word length
g_df['sml_avg_word_length'] = g_df['sml_character_count'] / g_df['sml_word_count']

In [52]:
# Total words after domain-specific stopwords but before gensim stopwords.
g_df['sml_word_count'].sum()

11371013

In [53]:
#Gensim stopword removal.  Same as what was run on med_lyrics.  IOT limit differences between 
#sml_ and med_ portions of dataset to just domain-specific stopwords and scoring.

g_df['sml_lyrics'] = g_df['sml_lyrics'].apply(stops_letters)

In [55]:
g_df['sml_word_count'].sum()

8830376

In [56]:
g_df['med_word_count'].sum()

9358663

In [57]:
print("Current Time =", datetime.now())

Current Time = 2021-06-09 06:56:52.967291


In [58]:
#Feature engineering, Affinity score.

afinn = Afinn()

new_affin = get_affinity_scores(g_df['sml_lyrics'].tolist())

g_df['sml_content_affin'] = new_affin

print("Current Time =", datetime.now())

Current Time = 2021-06-09 07:00:50.880932


In [59]:
#Feature engineering, Sentiment score and label

""" Something was broken in this.  The sent_score was always the same number 
and the labels were incorrect sometimes.  I fixed it with some changes, however
the sml_sent_score is cast as a list, an object.  Need it as a Float.

Will fix later."""

print("Current Time =", datetime.now())

def sentiment_check (text):
    polarity_score = TextBlob(text).sentiment.polarity
    if polarity_score < 0:
        return 'negative'
    elif polarity_score == 0:
        return 'neutral'
    else:
        return 'positive'
    
g_df['sml_sent_label'] = g_df['sml_lyrics'].apply(sentiment_check)

print("Label done. Current Time =", datetime.now())

def new_sent_ck (text):
    polarity_score = TextBlob(text).sentiment.polarity
    sent_score = []
    sent_score.append(polarity_score)
    return sent_score

g_df['sml_sent_score'] = g_df['sml_lyrics'].apply(new_sent_ck) 

print("Both sml_sent tasks done. Current Time =", datetime.now())

Current Time = 2021-06-09 07:01:52.115977
Label done. Current Time = 2021-06-09 07:02:24.116838
Both sml_sent tasks done. Current Time = 2021-06-09 07:02:55.707841


In [60]:
#Feature engineering, giant string for a vectorizer, later.

print("Current Time =", datetime.now())

g_df['sml_vector'] = g_df['sml_lyrics'].apply(lemmatized_word)

print("Current Time =", datetime.now())

Current Time = 2021-06-09 07:03:18.487704
Current Time = 2021-06-09 07:04:12.593434


In [61]:
g_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86294 entries, 0 to 86293
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   song_name               86294 non-null  object 
 1   lyrics                  86294 non-null  object 
 2   genre                   86294 non-null  object 
 3   full_word_count         86294 non-null  int64  
 4   full_character_count    86294 non-null  int64  
 5   full_avg_word_length    86294 non-null  float64
 6   med_lyrics              86294 non-null  object 
 7   med_word_count          86294 non-null  int64  
 8   med_character_count     86294 non-null  int64  
 9   med_avg_word_length     86294 non-null  float64
 10  med_content_affin       86294 non-null  float64
 11  med_sent_label          86294 non-null  object 
 12  med_sent_score          86294 non-null  object 
 13  med_vector              86294 non-null  object 
 14  med_rock_genre_count    19381 non-null

In [62]:
type(g_df.iloc[50]['sml_vector'])

str

In [63]:
g_df.iloc[50]['sml_vector']

'day hill spain pair child cave strangest sight stood skin green word heard girl stronger boy weak new mother learned speak tale dying sun left darkness dark world come undone travelled far believing came star fell life time parallel life men science men fame men letter tried explain parallel world twist time think fallen sky whirlwind spun took twilight home believing came star'

In [64]:
g_df.to_csv('g_df_final_most_the_way')

upload_file_to_bucket('final-music-flow','g_df_final_most_the_way')

'https://final-music-flow.s3.amazonaws.com/g_df_final_most_the_way'

In [4]:
g_df = pd.read_csv('s3://final-music-flow/g_df_final_most_the_way') #complete, final dataset - less the sml_genre_counts

In [16]:
g_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86290 entries, 0 to 86293
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              86290 non-null  int64  
 1   song_name               86290 non-null  object 
 2   lyrics                  86290 non-null  object 
 3   genre                   86290 non-null  object 
 4   full_word_count         86290 non-null  int64  
 5   full_character_count    86290 non-null  int64  
 6   full_avg_word_length    86290 non-null  float64
 7   med_lyrics              86290 non-null  object 
 8   med_word_count          86290 non-null  int64  
 9   med_character_count     86290 non-null  int64  
 10  med_avg_word_length     86290 non-null  float64
 11  med_content_affin       86290 non-null  float64
 12  med_sent_label          86290 non-null  object 
 13  med_sent_score          86290 non-null  object 
 14  med_vector              86290 non-null

In [15]:
#Clean up NaN values, which will stop the genre_count function below.
g_df.dropna(axis=0, subset=['sml_lyrics'], inplace=True)

In [17]:
#Feature engineering, create domain specific scores based on words unique to particulary genres.

def genre_count(text):
    result = 0
    text_tokenized = word_tokenize(text)
    for i in range(0, len(text_tokenized)):
        if text_tokenized[i] in stop_words:
            result += digit
        else:
            pass
    if result != 0:
        return result
    else:
        pass

#Set Rock! words...
stop_words = nltk.corpus.stopwords.words('english')

stop_words = []

rock2 = rock['Word'].to_dict()
rock3 = list(rock2.values())
digit = .01

stop_words.extend(rock3)
print(len(stop_words), 'Rock!')
print("Current Time =", datetime.now())

#Run genre_count with Rock!
g_df['sml_rock_genre_count'] =g_df['sml_lyrics'].apply(genre_count)

#Reset to Hip Hop...
stop_words = nltk.corpus.stopwords.words('english')

stop_words = []

hiphop2 = hiphop['Word'].to_dict()
hiphop3 = list(hiphop2.values())
digit = 100

stop_words.extend(hiphop3)
print(len(stop_words), 'Hip Hop')
print("Current Time =", datetime.now())

#Run genre_count with Hip Hop
g_df['sml_hiphop_genre_count'] =g_df['sml_lyrics'].apply(genre_count)

#Reset to Pop...
stop_words = nltk.corpus.stopwords.words('english')

stop_words = []

pop2 = pop['Word'].to_dict()
pop3 = list(pop2.values())
digit = 1

stop_words.extend(pop3)
print(len(stop_words), 'Pop')
print("Current Time =", datetime.now())

#Run genre_count with Hip Hop
g_df['sml_pop_genre_count'] =g_df['sml_lyrics'].apply(genre_count)

print("Current Time =", datetime.now())

23091 Rock!
Current Time = 2021-06-09 16:17:51.449676
29843 Hip Hop
Current Time = 2021-06-09 16:51:20.023943
13757 Pop
Current Time = 2021-06-09 17:38:52.220396
Current Time = 2021-06-09 17:59:23.716548


In [28]:
g_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86290 entries, 0 to 86293
Data columns (total 29 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              86290 non-null  int64  
 1   song_name               86290 non-null  object 
 2   lyrics                  86290 non-null  object 
 3   genre                   86290 non-null  object 
 4   full_word_count         86290 non-null  int64  
 5   full_character_count    86290 non-null  int64  
 6   full_avg_word_length    86290 non-null  float64
 7   med_lyrics              86290 non-null  object 
 8   med_word_count          86290 non-null  int64  
 9   med_character_count     86290 non-null  int64  
 10  med_avg_word_length     86290 non-null  float64
 11  med_content_affin       86290 non-null  float64
 12  med_sent_label          86290 non-null  object 
 13  med_sent_score          86290 non-null  object 
 14  med_vector              86290 non-null

In [20]:
g_df.to_csv('g_df_final_almost_dun')

def upload_file_to_bucket(bucket_name, file_path):
    session = aws_session()
    s3_resource = session.resource('s3')
    file_dir, file_name = os.path.split(file_path)

    bucket = s3_resource.Bucket(bucket_name)
    bucket.upload_file(
      Filename=file_path,
      Key=file_name,
      ExtraArgs={'ACL': 'private'}
    )

    s3_url = f"https://{bucket_name}.s3.amazonaws.com/{file_name}"
    return s3_url

upload_file_to_bucket('final-music-flow','g_df_final_almost_dun')

'https://final-music-flow.s3.amazonaws.com/g_df_final_almost_dun'

In [23]:
g_df['med_rock_genre_count'] = g_df['med_rock_genre_count'].fillna(0)

In [25]:
g_df['med_hiphop_genre_count'] = g_df['med_hiphop_genre_count'].fillna(0)
g_df['med_pop_genre_count'] = g_df['med_pop_genre_count'].fillna(0)

In [27]:
g_df['sml_rock_genre_count'] = g_df['sml_rock_genre_count'].fillna(0)
g_df['sml_hiphop_genre_count'] = g_df['sml_hiphop_genre_count'].fillna(0)
g_df['sml_pop_genre_count'] = g_df['sml_pop_genre_count'].fillna(0)

In [29]:
g_df['med_genre_count'] = g_df['med_rock_genre_count']+g_df['med_hiphop_genre_count']+g_df['med_pop_genre_count']

In [30]:
g_df['sml_genre_count'] = g_df['sml_rock_genre_count']+g_df['sml_hiphop_genre_count']+g_df['sml_pop_genre_count']

In [31]:
g_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86290 entries, 0 to 86293
Data columns (total 31 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              86290 non-null  int64  
 1   song_name               86290 non-null  object 
 2   lyrics                  86290 non-null  object 
 3   genre                   86290 non-null  object 
 4   full_word_count         86290 non-null  int64  
 5   full_character_count    86290 non-null  int64  
 6   full_avg_word_length    86290 non-null  float64
 7   med_lyrics              86290 non-null  object 
 8   med_word_count          86290 non-null  int64  
 9   med_character_count     86290 non-null  int64  
 10  med_avg_word_length     86290 non-null  float64
 11  med_content_affin       86290 non-null  float64
 12  med_sent_label          86290 non-null  object 
 13  med_sent_score          86290 non-null  object 
 14  med_vector              86290 non-null

In [32]:
g_df.describe(include='all')

Unnamed: 0.1,Unnamed: 0,song_name,lyrics,genre,full_word_count,full_character_count,full_avg_word_length,med_lyrics,med_word_count,med_character_count,med_avg_word_length,med_content_affin,med_sent_label,med_sent_score,med_vector,med_rock_genre_count,med_hiphop_genre_count,med_pop_genre_count,sml_lyrics,sml_word_count,sml_character_count,sml_avg_word_length,sml_content_affin,sml_sent_label,sml_sent_score,sml_vector,sml_rock_genre_count,sml_hiphop_genre_count,sml_pop_genre_count,med_genre_count,sml_genre_count
count,86290.0,86290,86290,86290,86290.0,86290.0,86290.0,86290,86290.0,86290.0,86290.0,86290.0,86290,86290,86290,86290.0,86290.0,86290.0,86290,86290.0,86290.0,86290.0,86290.0,86290,86290,86290,86290.0,86290.0,86290.0,86290.0,86290.0
unique,,66799,86203,3,,,,85378,,,,,3,53904,85355,,,,85286,,,,,3,53696,85264,,,,,
top,,Intro,so so you think you can tell heaven from hel...,Rock,,,,dreaming white christmas like ones know treeto...,,,,,positive,[0.0],merry little christmas let heart light trouble...,,,,feel like home feel like feel like young feel ...,,,,,positive,[0.0],merry little christmas let heart light trouble...,,,,,
freq,,50,3,47406,,,,4,,,,,58208,1447,4,,,,4,,,,,58255,1473,4,,,,,
mean,43146.60715,,,,355.093638,1067.958303,3.032924,,108.455858,552.463715,5.120146,0.01088,,,,0.006332,91.105574,0.358106,,102.333666,509.263924,5.011926,0.011444,,,,0.000238,5.672731,0.014162,91.470013,5.687131
std,24911.499451,,,,218.656149,651.998326,0.300625,,74.646193,379.487446,0.467359,0.046185,,,,0.021912,351.449939,3.012329,,68.819909,338.19807,0.429539,0.048637,,,,0.006379,110.012447,0.774038,351.424018,110.014459
min,0.0,,,,1.0,8.0,0.051852,,1.0,4.0,3.0,-0.493151,,,,0.0,0.0,0.0,,1.0,4.0,3.0,-0.503979,,,,0.0,0.0,0.0,0.0,0.0
25%,21573.25,,,,209.0,638.0,2.853833,,61.0,312.0,4.8125,-0.012953,,,,0.0,0.0,0.0,,58.0,291.0,4.733333,-0.013661,,,,0.0,0.0,0.0,0.0,0.0
50%,43146.5,,,,299.0,908.0,3.026846,,88.0,450.0,5.083333,0.008451,,,,0.0,0.0,0.0,,84.0,421.0,4.982759,0.009302,,,,0.0,0.0,0.0,0.0,0.0
75%,64720.75,,,,436.0,1303.0,3.206452,,129.0,661.0,5.384615,0.032754,,,,0.0,0.0,0.0,,123.0,615.0,5.25685,0.0348,,,,0.0,0.0,0.0,0.12,0.0


In [34]:
g_df.drop(columns=['Unnamed: 0'], axis=1, inplace=True)

In [35]:
g_df.to_csv('g_df_final_fix_obj')

upload_file_to_bucket('final-music-flow','g_df_final_fix_obj')

'https://final-music-flow.s3.amazonaws.com/g_df_final_fix_obj'