In [1]:
!pip install bert-tensorflow



In [2]:
import math
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf
import bert
from bert import run_classifier

  from ._conv import register_converters as _register_converters


In [3]:
def is_nan(string_value):
    """Tell whether ``string_value`` is a float NaN.

    Returns True only when the value is a ``float`` and ``math.isnan`` reports
    it as NaN; every non-float value (for example a regular string) yields False.
    """
    return isinstance(string_value, float) and math.isnan(string_value)

# Load the raw lyrics table.
lyrics = pd.read_csv('lyrics.csv')
print('Original Number of Records:',len(lyrics))

# Preprocessed records to remove
# a. strings containing less than 10 words
# b. non-english records using langdetect package
# See Preprocessor.ipynb
# Only the first line of drop_list.txt is parsed; it must contain
# comma-separated integers, which are used as row labels to drop.
with open("drop_list.txt") as f:
    content = f.readlines()

non_english = [int(x.strip()) for x in content[0].split(',')]

# Find all records whose lyrics are missing (NaN).
# Collect index *labels* (not enumerate positions) because DataFrame.drop
# works on labels; this is also vectorized instead of a per-row Python loop.
nans = lyrics.index[lyrics['lyrics'].isna()].tolist()

# Union of both removal lists, de-duplicated.
drop_set = set(non_english)
drop_set.update(nans)
drop_list = list(drop_set)
lyrics = lyrics.drop(drop_list,axis=0)
# Records without a usable genre label are removed as well.
lyrics = lyrics.drop(lyrics[lyrics.genre == 'Not Available'].index)

# NOTE: this count only covers drop_list (non-English/short + NaN lyrics);
# rows removed by the 'Not Available' genre filter above are not included.
print('Number of deleted records:',len(drop_list))
print('Number of Records after records deleted:',len(lyrics))

# Per-song line count: number of newline characters plus one.
lyrics['line_count'] = lyrics['lyrics'].map(lambda text: text.count('\n') + 1)

# Seeded 60/40 split; the 40% part is stored in lyrics_test.
lyrics_train, lyrics_test = train_test_split(
    lyrics,
    random_state=5,
    test_size=0.4,
)

# One training subset per genre, selected by exact match on the 'genre' column.
train_genre = lyrics_train['genre']
rock_train = lyrics_train[train_genre.eq('Rock')]
pop_train = lyrics_train[train_genre.eq('Pop')]
hiphop_train = lyrics_train[train_genre.eq('Hip-Hop')]
metal_train = lyrics_train[train_genre.eq('Metal')]
country_train = lyrics_train[train_genre.eq('Country')]
jazz_train = lyrics_train[train_genre.eq('Jazz')]
electronic_train = lyrics_train[train_genre.eq('Electronic')]
other_train = lyrics_train[train_genre.eq('Other')]
rnb_train = lyrics_train[train_genre.eq('R&B')]
indie_train = lyrics_train[train_genre.eq('Indie')]
folk_train = lyrics_train[train_genre.eq('Folk')]

# Disabled alternative: oversample every other genre (with replacement) up to the Rock class size to balance the skewed classes.
# train_data1 = pd.concat([train_df_fail, train_df_success.sample(len(train_df_fail.index),replace=True)], axis = 0)
#lyrics_train_oversample = pd.concat([rock_train, \
#                                  pop_train.sample(len(rock_train.index),replace=True), \
#                                  hiphop_train.sample(len(rock_train.index),replace=True), \
#                                  metal_train.sample(len(rock_train.index),replace=True), \
#                                  country_train.sample(len(rock_train.index),replace=True), \
#                                  jazz_train.sample(len(rock_train.index),replace=True), \
#                                  electronic_train.sample(len(rock_train.index),replace=True), \
#                                  other_train.sample(len(rock_train.index),replace=True), \
#                                  rnb_train.sample(len(rock_train.index),replace=True), \
#                                  indie_train.sample(len(rock_train.index),replace=True), \
#                                  folk_train.sample(len(rock_train.index),replace=True)], axis = 0)


# Undersample every genre down to the size of the Folk training subset so that
# each class contributes the same number of rows to the training set.
# A fixed seed keeps the sampled rows identical across kernel restarts; without
# it every run trains on a different subset even though the split is seeded.
UNDERSAMPLE_SEED = 5
n_per_genre = len(folk_train.index)

# Sampling is without replacement, so pandas raises ValueError if any genre
# has fewer than n_per_genre rows.
lyrics_train_undersample = pd.concat(
    [genre_rows.sample(n_per_genre, replace=False, random_state=UNDERSAMPLE_SEED)
     for genre_rows in (rock_train,
                        pop_train,
                        hiphop_train,
                        metal_train,
                        country_train,
                        jazz_train,
                        electronic_train,
                        other_train,
                        rnb_train,
                        indie_train)]
    + [folk_train],  # Folk sets the target size and is kept in full
    axis=0)

# Distinct genre labels, as returned by value_counts() on the cleaned data.
genres = lyrics['genre'].value_counts().index

# Split lyrics_test positionally: first half -> dev, second half -> test.
num_test = len(lyrics_test)
half = num_test // 2
held_out_features = lyrics_test.drop(columns=['genre'])
held_out_labels = lyrics_test['genre']
test_data, test_labels = held_out_features.iloc[half:], held_out_labels.iloc[half:]
dev_data, dev_labels = held_out_features.iloc[:half], held_out_labels.iloc[:half]
#train_data, train_labels = lyrics_train.drop(['genre'],axis=1), lyrics_train['genre']
#train_data, train_labels = lyrics_train_oversample.drop(['genre'],axis=1), lyrics_train_oversample['genre']
# The undersampled (class-balanced) frame is the active training set.
train_data = lyrics_train_undersample.drop(columns=['genre'])
train_labels = lyrics_train_undersample['genre']


print('training label shape:', train_labels.shape)
print('test label shape:', test_labels.shape)
print('dev label shape:', dev_labels.shape)
print('labels names:', genres)

Original Number of Records: 362237
Number of deleted records: 117033
Number of Records after records deleted: 223560
training label shape: (13464,)
test label shape: (44712,)
dev label shape: (44712,)
labels names: Index(['Rock', 'Pop', 'Hip-Hop', 'Metal', 'Country', 'Jazz', 'Electronic',
       'Other', 'R&B', 'Indie', 'Folk'],
      dtype='object')


In [4]:
def _make_input_examples(data, labels):
    """Build one BERT ``InputExample`` per row of ``data``.

    Args:
        data: DataFrame with a ``lyrics`` column holding the text of each song.
        labels: Sequence of labels in the same row order as ``data``.

    Returns:
        A pandas Series of ``InputExample`` objects sharing ``data``'s index.
    """
    examples = [
        bert.run_classifier.InputExample(guid=None,  # Globally unique ID for bookkeeping, unused in this example
                                         text_a=text,
                                         text_b=None,
                                         label=label)
        for text, label in zip(data['lyrics'], labels)
    ]
    return pd.Series(examples, index=data.index, dtype=object)


# Each example must carry a single lyric string and its own genre label.
# The previous lambda ignored its row argument and passed the entire lyrics
# column and the entire label Series into every example.
train_InputExamples = _make_input_examples(train_data, train_labels)

test_InputExamples = _make_input_examples(test_data, test_labels)

## Link to download the pre-trained BERT model used in the next cell
https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip

In [5]:
from bert import tokenization

# Files of the downloaded BERT model, given as paths relative to the notebook.
BERT_CONFIG = "uncased_L-12_H-768_A-12/bert_config.json"
BERT_INIT_CHKPNT = "uncased_L-12_H-768_A-12/bert_model.ckpt"
BERT_VOCAB = "uncased_L-12_H-768_A-12/vocab.txt"

# Check that the lower-casing flag agrees with the checkpoint being used,
# then build the tokenizer with the same flag.
tokenization.validate_case_matches_checkpoint(True, BERT_INIT_CHKPNT)
tokenizer = tokenization.FullTokenizer(
    vocab_file=BERT_VOCAB,
    do_lower_case=True,
)

In [6]:
# Number of entries in the vocabulary held by the tokenizer built from BERT_VOCAB.
len(tokenizer.vocab)

30522

In [7]:
# Sanity check: tokenize the first 78 characters of the second training lyric.
sample_lyric = train_data.lyrics.values[1]
tokenizer.tokenize(sample_lyric[:78])

['this',
 'is',
 'just',
 'an',
 'ordinary',
 'day',
 'wipe',
 'the',
 'ins',
 '##ec',
 '##urities',
 'away',
 'i',
 'can',
 'see',
 'that',
 'the',
 'dar']