In [3]:
#general packages
import copy
import json
import os
import pickle
import random
import re
import string
from ast import literal_eval

#dataframe and data science packages
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

#nltk packages
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

#gensim packages
import gensim
from gensim import corpora
from gensim.corpora.dictionary import Dictionary
from gensim.models import CoherenceModel

#emotion detection packages
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import text2emotion as te

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/romitbarua/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/romitbarua/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/romitbarua/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Execute the helper notebooks so that the names they define are available in this kernel.
# NOTE(review): buildBoW, buildNormTab, vectorizeTab, buildFeatures and buildTypeTokenRatio are
# called below but not defined in this notebook -- presumably they come from these notebooks; confirm.
%run ChordFeaturizer.ipynb
%run LdaFeaturizer.ipynb
%run GeneralLyricsFeaturizer.ipynb

# Import the Cleaned DataFrame

## This DataFrame is used to generate all of the features below

In [3]:
# Regeneration flag: set to 1 to re-run GenerateCleanDF.ipynb (needed when new data has been added
# or GenerateCleanDF itself has changed); any other value skips regeneration.
regenerateDF = 0

In [4]:
# Rebuild the cleaned DataFrame only when the regenerateDF flag is exactly 1.
if regenerateDF == 1:
    print('Generating New DF:')
    # NOTE(review): presumably GenerateCleanDF.ipynb (re)writes Clean_DF.pickle -- confirm.
    %run GenerateCleanDF.ipynb
    print('New DF Generated')

In [5]:
# Load the cleaned DataFrame into the global `df` that the feature builders in this notebook read.
print('Importing Clean DF -> ', end='')
# NOTE(security): pickle.load can execute arbitrary code -- only load a Clean_DF.pickle you generated yourself.
# The context manager guarantees the file handle is closed, even if unpickling raises.
with open("Clean_DF.pickle", "rb") as pickle_in:
    df = pickle.load(pickle_in)

# NOTE(review): the 'index' column presumably holds track names left over from a reset_index() -- confirm.
df = df.rename(columns={'index':'track_name'})
print('Completed Import')

Importing Clean DF -> Completed Import


In [6]:
# Write the loaded DataFrame to Final_DF.csv (path is relative to the current working directory).
df.to_csv('Final_DF.csv')

# Build Bag of Words Features

In [6]:
def buildBowFeatures(BowType, Tfidf, runSVD):
    """Build bag-of-words train/test features from the global ``df``.

    Parameters
    ----------
    BowType : str
        'Lyrics' vectorizes the ``clean_lyrics`` column; 'Chords' vectorizes the
        ``clean_tabs`` column after replacing commas with spaces.
    Tfidf, runSVD
        Forwarded by keyword to ``buildBoW``. ``buildBoW`` is not defined in this
        notebook; per the signature noted in this cell it is
        ``buildBoW(lyrics_list, Tfidf = False, runSVD = 0, num_components = 100, train_perc=0.8)``.

    Returns
    -------
    tuple
        ``(vocab, trainX, testX)`` exactly as returned by ``buildBoW``.

    Raises
    ------
    ValueError
        If ``BowType`` is neither 'Lyrics' nor 'Chords'.
    """
    # Fail early with a clear message instead of an UnboundLocalError at the return statement.
    if BowType not in ('Lyrics', 'Chords'):
        raise ValueError("BowType must be 'Lyrics' or 'Chords', got {!r}".format(BowType))

    print('Building BoW Features -> ', end='')

    if BowType == 'Lyrics':
        documents = df.loc[:, 'clean_lyrics'].tolist()
    else:
        # Chords are comma separated; turn them into whitespace-separated tokens.
        documents = [song_chord.replace(',', ' ') for song_chord in df.clean_tabs.tolist()]

    # Keyword arguments keep Tfidf and runSVD from landing in each other's positional slot.
    vocab, trainX, testX = buildBoW(documents, Tfidf=Tfidf, runSVD=runSVD)
    print('BoW Features Built')

    return vocab, trainX, testX

# Build the Chord Progression Features

In [2]:
def buildChordProgressionFeatures(key_col, chordNormType, ngram_low, ngram_high, binary, runSVD, Tfidf, num_components, min_df):
    """Build chord-progression n-gram train/test features from the global ``df``.

    ``buildNormTab(df, key_col)`` returns three chord normalizations (full, partial,
    bare); ``chordNormType`` selects which one is passed to ``vectorizeTab``.
    Neither helper is defined in this notebook; per the signature noted in this cell,
    ``vectorizeTab(chords, ngram_low, ngram_high, binary, Tfidf = False, runSVD = 0,
    num_components = 100, train_perc = 0.8, min_df = 5)``.

    Parameters
    ----------
    key_col
        Forwarded to ``buildNormTab``.
    chordNormType : str
        One of 'Full', 'Partial' or 'Bare'.
    ngram_low, ngram_high, binary, runSVD, Tfidf, num_components, min_df
        Forwarded to ``vectorizeTab``.

    Returns
    -------
    tuple
        ``(vocab, trainX, testX)`` exactly as returned by ``vectorizeTab``.

    Raises
    ------
    ValueError
        If ``chordNormType`` is not one of the three supported values.
    """
    # Validate before doing any work; previously an unknown value surfaced as an UnboundLocalError.
    if chordNormType not in ('Full', 'Partial', 'Bare'):
        raise ValueError("chordNormType must be 'Full', 'Partial' or 'Bare', got {!r}".format(chordNormType))

    print('Building Chord Progression Features -> ', end='')

    full, partial, bare = buildNormTab(df, key_col)
    chords = {'Full': full, 'Partial': partial, 'Bare': bare}[chordNormType]

    # Tfidf and runSVD stay positional, in the same order as the original call.
    vocab, trainX, testX = vectorizeTab(chords, ngram_low, ngram_high, binary, Tfidf, runSVD, num_components = num_components, min_df = min_df)

    print('Chord Progression Features Built')

    return vocab, trainX, testX

# Build LDA Features

In [56]:
def buildLdaFeatures(num_topics = 50, train_perc = 0.8):
    """Build LDA topic features from the global ``df`` and split them into train/test.

    ``buildFeatures(df, num_topics)`` (defined outside this notebook) supplies the
    topics and the per-row feature sequence; the first ``int(len(df) * train_perc)``
    rows form the training matrix and the remainder the test matrix.

    Returns
    -------
    tuple
        ``(topics, trainX, testX)`` where both matrices are ``csr_matrix`` instances.
    """
    split_at = int(len(df) * train_perc)
    print('Building LDA Features -> ', end='')

    topics, doc_topic_rows = buildFeatures(df, num_topics)

    # Positional split: leading rows train, trailing rows test.
    train_rows = doc_topic_rows[:split_at]
    test_rows = doc_topic_rows[split_at:]
    trainX = csr_matrix(train_rows)
    testX = csr_matrix(test_rows)

    print('LDA Features Built')

    return topics, trainX, testX

# Build Emotion Features

In [57]:
def buildEmotionFeatures(emotionFeatureType, train_perc = 0.8):
    """Build emotion-score features from the global ``df`` and split them into train/test.

    'Vader' reads the ``vader_emotion`` column; any other value reads ``text2emotion``.
    Each cell's ``.values()`` becomes one feature row, in the mapping's own order.

    Returns
    -------
    tuple
        ``(trainX, testX)`` as ``csr_matrix`` instances, split at
        ``int(len(df) * train_perc)`` rows.
    """
    print('Building Emotion Features -> ', end='')

    split_at = int(len(df) * train_perc)

    # Only 'Vader' selects the Vader column; everything else falls through to text2emotion.
    source_column = 'vader_emotion' if emotionFeatureType == 'Vader' else 'text2emotion'

    score_rows = []
    for song in list(df[source_column]):
        score_rows.append(list(song.values()))

    trainX = csr_matrix(score_rows[:split_at])
    testX = csr_matrix(score_rows[split_at:])

    print('Emotion Features Built')
    return trainX, testX

# Build Text Complexity Feature

In [21]:
def buildTextComplexityFeatures(train_perc = 0.8):
    """Build text-complexity features from the global ``df`` and split them into train/test.

    ``buildTypeTokenRatio`` (defined outside this notebook) is applied to every
    ``clean_lyrics`` entry in row order; the first ``int(len(df) * train_perc)``
    results go to the training matrix and the rest to the test matrix.

    Returns
    -------
    tuple
        ``(trainX, testX)`` as ``csr_matrix`` instances.
    """
    print('Building Text Complexty Features -> ', end='')

    train_break = int(len(df)*train_perc)

    # Iterate by position, not by index label: `df.clean_lyrics[i]` breaks or misaligns
    # when the index is not 0..n-1, while the train/test split here is positional.
    text_complexity_feature = [buildTypeTokenRatio(lyrics) for lyrics in df['clean_lyrics'].tolist()]

    # NOTE(review): if buildTypeTokenRatio returns a scalar, csr_matrix on a flat list yields a
    # single 1 x n row, not an n x 1 column -- confirm this is what downstream code expects.
    trainX, testX = csr_matrix(text_complexity_feature[:train_break]), csr_matrix(text_complexity_feature[train_break:])

    print('Text Complexity Features Built')
    return trainX, testX
    

# Build Train & Test Prediction Series (y)

In [1]:
def buildY(predAttribute, randomY = False, train_perc = 0.8):
    """Return the train/test target lists for one column of the global ``df``.

    Parameters
    ----------
    predAttribute
        Column of ``df`` to predict.
    randomY : bool
        When truthy, the targets are shuffled in place with ``random.shuffle``
        before splitting.
    train_perc : float
        Fraction of rows (by position) assigned to the training list.

    Returns
    -------
    tuple
        ``(trainy, testy)`` as plain Python lists.
    """
    split_at = int(len(df) * train_perc)
    targets = df.loc[:, predAttribute].tolist()

    if randomY:
        random.shuffle(targets)

    return targets[:split_at], targets[split_at:]

# Check the Feature Covariance

In [10]:
"""vocab, vectorTrain, vectorTest = vectorizeTab(firstNote_full, 3, 3, True, 0.8)
#print(vectorTrain)
vectorTrain, vectorTest, vocab = cleanForDocFequency(vectorTrain, vectorTest, vocab, 1, 10)
print(vectorTrain.shape)
print(vectorTest.shape)    
print(len(vocab))
"""

'vocab, vectorTrain, vectorTest = vectorizeTab(firstNote_full, 3, 3, True, 0.8)\n#print(vectorTrain)\nvectorTrain, vectorTest, vocab = cleanForDocFequency(vectorTrain, vectorTest, vocab, 1, 10)\nprint(vectorTrain.shape)\nprint(vectorTest.shape)    \nprint(len(vocab))\n'