# 04 - Final Data Prep Part 2

## Data files needed to run this notebook:
- `df_total_cleaned.pkl.gz` (notebook 02)
- `lyrics_bert_vectors_total.pkl.gz` (notebook 03)

## Settings:
- set `COLAB = True` if you run this on Colab. Data can be placed in the root directory

In [2]:
# setup
import sys
import subprocess
import pkg_resources
from collections import Counter
import re
from numpy import log, mean, matmul


required = {'spacy', 'scikit-learn', 'numpy', 
            'pandas', 'torch', 'matplotlib',
            'transformers', 'allennlp==0.9.0'}
installed = {pkg.key for pkg in pkg_resources.working_set}
missing = required - installed

if missing:
    python = sys.executable
    subprocess.check_call([python, '-m', 'pip', 'install', *missing], stdout=subprocess.DEVNULL)
import spacy
import numpy as np
import pandas as pd

# SciKit Learn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.svm import SVC


# Spacy
from spacy.lang.en import English
en = English()

# !python -m spacy download en_core_web_md # includes GloVe Vectors
# !python -m spacy download en_core_web_sm
# !python -m spacy download en

# import en_core_web_sm
# import en_core_web_md


# PyTorch
import torch
# import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split


# File managment
import os
from os import listdir
from pathlib import Path
import pickle
import gzip

In [3]:
LOAD_DATA = False # read save data or regenerate data
SAVE_DATA = False # overwrite generated data? 

COLAB = False

In [4]:
if COLAB:
  # Google Colab
  path = "./"
  device = torch.device("cuda:0") # use GPU, change 
else:
  # Laptop
  path = "./data/"
  device = torch.device("cpu")
#   !pip install ipywidgets
#   !jupyter nbextension enable --py widgetsnbextension


In [5]:
genres = ["Rock", "Hip Hop", "Metal"]
# genres = ["Rock", "Hip Hop", "Metal"]

# Load Data

In [6]:
# Load data
file_name = "df_total_cleaned"
df_total_cleaned = pd.read_pickle(f'{path}{file_name}.pkl.gz')
df_total_cleaned

Unnamed: 0,SName,Lyric,Artist,Genre
0,More Than This,I could feel at the time. There was no way of ...,10000 Maniacs,Rock
1,Because The Night,"Take me now, baby, here as I am. Hold me close...",10000 Maniacs,Rock
2,These Are Days,These are. These are days you'll remember. Nev...,10000 Maniacs,Rock
3,A Campfire Song,"A lie to say, ""O my mountain has coal veins an...",10000 Maniacs,Rock
4,Everyday Is Like Sunday,Trudging slowly over wet sand. Back to the ben...,10000 Maniacs,Rock
...,...,...,...,...
155478,The New Dawn,Through the storm like the wind we ride Leavin...,ensiferum,Metal
155479,Victory Song,The plan of invasion an Evil deception Was mad...,ensiferum,Metal
155480,Lady In Black,originally by Uriah Heep She came to me one m...,ensiferum,Metal
155481,One More Magic Potion,Once when we were returning from a battle and ...,ensiferum,Metal


## Combine with BERT vectors

In [7]:
file_name = "lyrics_bert_vectors_total"
df_bert = pd.DataFrame(pd.read_pickle(f'{path}{file_name}.pkl.gz'))
df_bert

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.217896,-0.593750,0.455811,-0.447998,-0.802734,0.007118,0.272949,0.699219,-0.209595,-0.895996,...,0.191772,-0.121521,-0.271973,-0.139160,0.092102,0.530762,-0.535645,-0.178833,-0.014542,0.893066
1,-0.253418,0.034302,0.264893,-0.346436,-0.215942,0.130859,0.826172,0.392578,0.022354,-0.866699,...,0.299561,-0.802246,-0.322266,-0.077515,0.096436,0.434570,-0.032776,-0.457275,0.075989,0.207886
2,0.205078,0.183472,0.151123,-0.067505,-0.395020,0.097168,0.658203,0.778809,-0.466064,-1.068359,...,-0.119385,-0.666504,-0.226074,0.029739,0.395264,0.644531,-0.340576,-0.006119,0.203735,0.310303
3,-0.332764,0.265869,0.083008,-0.172241,-0.324951,0.158569,0.770508,0.537598,-0.128052,-0.583008,...,0.391602,-0.105042,-0.028946,-0.063293,-0.048950,0.453369,-0.013588,-0.079834,0.524902,0.351807
4,-0.181641,-0.309326,0.474609,-0.314209,-0.166382,-0.434082,0.667480,0.855469,-0.082275,-0.748047,...,0.348389,-0.264160,-0.317871,-0.597168,0.686035,0.274414,-0.168823,-0.144531,0.224731,0.798340
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155478,-0.039032,0.102905,0.117432,0.104980,-0.610352,-0.161377,0.614746,0.468262,-0.131348,-0.739746,...,0.072083,-0.441650,-0.036102,-0.124695,0.332520,0.499512,-0.148560,-0.178101,0.112610,-0.257080
155479,-0.579590,0.541016,-0.306885,0.086121,-1.462891,0.405273,1.433594,0.609863,-0.283936,-0.157349,...,0.006924,-0.718750,0.127563,-0.323242,0.212036,0.557617,0.067261,-0.646484,0.237915,-0.065796
155480,-0.097046,-0.382080,-0.370361,0.367188,-0.879883,0.195190,0.914062,0.380127,-0.024200,-0.707031,...,0.417480,-1.181641,0.124268,0.272217,0.704102,0.727051,-0.068298,-0.228271,-0.011215,-0.425781
155481,-0.732910,0.011490,0.063232,0.125244,-1.290039,0.443359,0.866211,0.615234,-0.226318,-0.495605,...,0.017197,-0.617188,-0.320557,-0.198242,0.032928,0.569824,0.001340,-0.693359,-0.083801,-0.187378


In [8]:
# Combine the two sets

In [9]:
df_total  = pd.concat([df_total_cleaned, df_bert], axis=1)
df_total

Unnamed: 0,SName,Lyric,Artist,Genre,0,1,2,3,4,5,...,758,759,760,761,762,763,764,765,766,767
0,More Than This,I could feel at the time. There was no way of ...,10000 Maniacs,Rock,-0.217896,-0.593750,0.455811,-0.447998,-0.802734,0.007118,...,0.191772,-0.121521,-0.271973,-0.139160,0.092102,0.530762,-0.535645,-0.178833,-0.014542,0.893066
1,Because The Night,"Take me now, baby, here as I am. Hold me close...",10000 Maniacs,Rock,-0.253418,0.034302,0.264893,-0.346436,-0.215942,0.130859,...,0.299561,-0.802246,-0.322266,-0.077515,0.096436,0.434570,-0.032776,-0.457275,0.075989,0.207886
2,These Are Days,These are. These are days you'll remember. Nev...,10000 Maniacs,Rock,0.205078,0.183472,0.151123,-0.067505,-0.395020,0.097168,...,-0.119385,-0.666504,-0.226074,0.029739,0.395264,0.644531,-0.340576,-0.006119,0.203735,0.310303
3,A Campfire Song,"A lie to say, ""O my mountain has coal veins an...",10000 Maniacs,Rock,-0.332764,0.265869,0.083008,-0.172241,-0.324951,0.158569,...,0.391602,-0.105042,-0.028946,-0.063293,-0.048950,0.453369,-0.013588,-0.079834,0.524902,0.351807
4,Everyday Is Like Sunday,Trudging slowly over wet sand. Back to the ben...,10000 Maniacs,Rock,-0.181641,-0.309326,0.474609,-0.314209,-0.166382,-0.434082,...,0.348389,-0.264160,-0.317871,-0.597168,0.686035,0.274414,-0.168823,-0.144531,0.224731,0.798340
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155478,The New Dawn,Through the storm like the wind we ride Leavin...,ensiferum,Metal,-0.039032,0.102905,0.117432,0.104980,-0.610352,-0.161377,...,0.072083,-0.441650,-0.036102,-0.124695,0.332520,0.499512,-0.148560,-0.178101,0.112610,-0.257080
155479,Victory Song,The plan of invasion an Evil deception Was mad...,ensiferum,Metal,-0.579590,0.541016,-0.306885,0.086121,-1.462891,0.405273,...,0.006924,-0.718750,0.127563,-0.323242,0.212036,0.557617,0.067261,-0.646484,0.237915,-0.065796
155480,Lady In Black,originally by Uriah Heep She came to me one m...,ensiferum,Metal,-0.097046,-0.382080,-0.370361,0.367188,-0.879883,0.195190,...,0.417480,-1.181641,0.124268,0.272217,0.704102,0.727051,-0.068298,-0.228271,-0.011215,-0.425781
155481,One More Magic Potion,Once when we were returning from a battle and ...,ensiferum,Metal,-0.732910,0.011490,0.063232,0.125244,-1.290039,0.443359,...,0.017197,-0.617188,-0.320557,-0.198242,0.032928,0.569824,0.001340,-0.693359,-0.083801,-0.187378


## Create Training and Test Set

Create a balance subselection of the data

In [10]:
def create_subset(df, n, genres):

  df_temp = pd.DataFrame(columns=list(df.columns))

  for g in genres:

    df_small = df.query(f"Genre == '{g}'")
    df_genre = df_small.sample(n=round(n/len(genres)), replace="False", random_state=42)
    df_temp = df_temp.append(df_genre)

  return df_temp
   

In [11]:
  
n = 12000

df_subset = create_subset(df_total, n, genres)
df_subset["Genre"].value_counts()


Rock       4000
Metal      4000
Hip Hop    4000
Name: Genre, dtype: int64

In [12]:
X=df_subset.drop("Genre", axis = 1)
y=df_subset["Genre"]

In [13]:

test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=0, shuffle = True, stratify = y)

In [14]:
X_train.shape

(8400, 771)

In [15]:
y_train.value_counts()

Rock       2800
Metal      2800
Hip Hop    2800
Name: Genre, dtype: int64

In [16]:
pickle.dump(X_train, gzip.open(f'{path}X_train.pkl.gz', 'wb'))
pickle.dump(X_test, gzip.open(f'{path}X_test.pkl.gz', 'wb'))
pickle.dump(y_train, gzip.open(f'{path}y_train.pkl.gz', 'wb'))
pickle.dump(y_test, gzip.open(f'{path}y_test.pkl.gz', 'wb'))

In [17]:
X_train

Unnamed: 0,SName,Lyric,Artist,0,1,2,3,4,5,6,...,758,759,760,761,762,763,764,765,766,767
19167,Back To Sleep,"I know it's late, I know it's late. And baby I...",Chris Brown,0.001492,-0.273682,0.150513,-0.246460,-0.050476,-0.072327,0.447510,...,0.408203,-1.025391,-0.107178,-0.021561,0.352051,0.488770,-0.528320,-0.062469,0.002211,0.423340
88089,The Best,The Best. Soulja Boy. Soulja! Soulja! Soulja! ...,Soulja Boy,0.037231,0.150635,0.496826,0.196777,-0.322998,-0.077271,0.991211,...,0.160156,-0.338623,0.003363,0.037933,0.612305,0.417236,-0.303955,-0.520508,0.493164,0.340820
41285,Just Askin',"Wassup, in your world?. And are you still cool...",Iggy Azalea,-0.041718,-0.246704,-0.162720,0.137085,-0.043579,-0.229004,0.499023,...,0.738770,-0.558594,-0.090698,-0.180176,0.313232,0.296143,-0.604004,-0.382080,0.578613,0.207520
12664,You Wear A Crown But You're No King,You'll never stop 'til you get what you want. ...,Blessthefall,-0.325684,0.143311,-0.226196,-0.002642,-0.749023,0.127930,0.955078,...,0.154175,-0.275879,-0.046143,-0.173706,0.242676,0.716797,-0.252197,-0.583008,0.321533,0.259033
52249,Kevin Gates,Workout. Tell. Workout. Tell. Gates. Gates. Ga...,Kevin Gates,0.061066,0.076416,0.814453,-0.269531,-0.344238,0.150757,0.812500,...,-0.408936,-0.527344,-0.401855,0.095398,0.742676,0.281250,-0.496582,-0.437256,0.364258,0.228882
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127879,When The Night Comes,Dark rider has shuttered all light in the sky ...,unsilentphenomenon,-0.169678,0.041229,0.236816,-0.042542,-0.390625,-0.249268,0.797852,...,-0.206543,-0.823730,-0.051056,-0.105835,0.450439,0.190796,-0.207031,-0.497314,0.233032,-0.636230
80491,Infatuation,. Early in the morning I can't sleep. I can't ...,Rod Stewart,-0.017868,0.076111,0.343506,-0.726562,-0.599609,0.000035,0.419678,...,0.453857,-0.279053,-0.021866,-0.483154,0.136719,0.191284,-0.330811,-0.092896,0.070190,0.769043
66750,I Won't Look Back,"Love, love isn’t always. Love, the way that we...",Needtobreathe,-0.179810,-0.208984,-0.071289,0.151611,-0.094727,-0.214478,0.788574,...,0.017578,-0.797852,0.142456,-0.049011,0.182617,0.599609,-0.137939,-0.099487,-0.040466,0.257568
133337,Fire,Let the fire enter you Let the anger start to ...,LACUNA COIL,-0.553223,0.139282,0.312256,0.208862,-0.581055,-0.247070,0.706543,...,-0.300049,-0.753906,-0.207153,-0.024521,0.374512,0.519043,0.003078,-0.461670,0.013908,-0.375977
