# Making the model

In [6]:
import pandas as pd

import nltk
import csv

# Fetch the Punkt tokenizer models that nltk.word_tokenize (used in the cells below) relies on.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/davebudhram/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load the pre-processed lyrics table; the path is relative to the notebook's working directory.
df = pd.read_csv('processed_data.csv')
# NOTE(review): the preview shows an 'Unnamed: 0' column, which suggests the CSV was
# saved together with its index — consider index_col=0 after confirming nothing relies on it.
df.head()

Unnamed: 0,artist,song,link,text,genre,generic_genre
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA...","['europop', 'swedish pop']",pop
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen...","['europop', 'swedish pop']",pop
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...,"['europop', 'swedish pop']",pop
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...,"['europop', 'swedish pop']",pop
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...,"['europop', 'swedish pop']",pop


In [8]:
# Length of each training example built by my_tokenize_line: one genre tag
# followed by (N_GRAM - 1) consecutive word tokens.
N_GRAM = 4
# Boundary markers placed around every tokenized lyric line.
SENTENCE_BEGIN = "<s>"
SENTENCE_END = "</s>"

In [9]:
def tokenize_line(line: str, ngram: int, 
                   by_char: bool = True, 
                   space_char: str = ' ',
                   sentence_begin: str=SENTENCE_BEGIN, 
                   sentence_end: str=SENTENCE_END):
  """
  Split one string into tokens and pad it with sentence boundary markers.

  The padding on each side is (ngram - 1) markers, except that a unigram
  model (ngram == 1) still receives exactly one begin and one end marker.
  Args:
    line (str): text to tokenize
    ngram (int): order of the n-gram model the tokens are prepared for
    by_char (bool): when True (the default) every character becomes a token;
      when False the text is split with nltk's word tokenizer
    space_char (str): replacement for each space character; only applied
      when by_char is True
    sentence_begin (str): marker placed before the tokens
    sentence_end (str): marker placed after the tokens

  Returns:
    list of strings - the padded tokens of the line
  """
  if by_char:
    # character mode: swap spaces for the requested stand-in, then explode
    pieces = list(line.replace(' ', space_char))
  else:
    # word mode: delegate to nltk's word tokenizer
    pieces = nltk.word_tokenize(line)

  # a unigram model still gets a single marker on each side
  pad = 1 if ngram == 1 else ngram - 1
  return [sentence_begin] * pad + pieces + [sentence_end] * pad


In [71]:
def my_tokenize_line(line, genre, ngram=N_GRAM):
  """
  Turn a song's lyrics into genre-prefixed word n-grams.

  The lyrics are split on '\\r\\n'; each resulting line is word-tokenized,
  wrapped in one SENTENCE_BEGIN / SENTENCE_END pair, and cut into sliding
  windows of (ngram - 1) tokens. Every window is returned with the genre
  prepended, so each entry holds `ngram` items in total.
  Args:
    line (str): full lyrics of one song
    genre (str): genre label placed at the front of every n-gram
    ngram (int): total length of each returned entry (genre included)

  Returns:
    list of lists of strings - one [genre, token, ...] entry per window
  """
  window_size = ngram - 1
  return [
      [genre, *window]
      for lyric_line in line.split('\r\n')
      for window in nltk.ngrams(
          [SENTENCE_BEGIN, *nltk.word_tokenize(lyric_line), SENTENCE_END],
          window_size)
  ]

In [72]:
# Sanity check: show the genre-prefixed n-grams produced for the first song in the table.
print(my_tokenize_line(df['text'][0], df['generic_genre'][0]))


[['pop', '<s>', 'Look', 'at'], ['pop', 'Look', 'at', 'her'], ['pop', 'at', 'her', 'face'], ['pop', 'her', 'face', ','], ['pop', 'face', ',', 'it'], ['pop', ',', 'it', "'s"], ['pop', 'it', "'s", 'a'], ['pop', "'s", 'a', 'wonderful'], ['pop', 'a', 'wonderful', 'face'], ['pop', 'wonderful', 'face', '</s>'], ['pop', '<s>', 'And', 'it'], ['pop', 'And', 'it', 'means'], ['pop', 'it', 'means', 'something'], ['pop', 'means', 'something', 'special'], ['pop', 'something', 'special', 'to'], ['pop', 'special', 'to', 'me'], ['pop', 'to', 'me', '</s>'], ['pop', '<s>', 'Look', 'at'], ['pop', 'Look', 'at', 'the'], ['pop', 'at', 'the', 'way'], ['pop', 'the', 'way', 'that'], ['pop', 'way', 'that', 'she'], ['pop', 'that', 'she', 'smiles'], ['pop', 'she', 'smiles', 'when'], ['pop', 'smiles', 'when', 'she'], ['pop', 'when', 'she', 'sees'], ['pop', 'she', 'sees', 'me'], ['pop', 'sees', 'me', '</s>'], ['pop', '<s>', 'How', 'lucky'], ['pop', 'How', 'lucky', 'can'], ['pop', 'lucky', 'can', 'one'], ['pop', 'can'

In [73]:
# Flatten the n-grams of every song into one list; each entry is
# [genre, token, token, ...] as produced by my_tokenize_line.
data = []
for lyrics, genre in zip(df['text'], df['generic_genre']):
  data.extend(my_tokenize_line(lyrics, genre))


In [75]:
# Peek at a slice of the flattened n-gram list to confirm its layout.
print(data[100:200])

[['pop', 'holds', 'me', 'and'], ['pop', 'me', 'and', 'squeezes'], ['pop', 'and', 'squeezes', 'my'], ['pop', 'squeezes', 'my', 'hand'], ['pop', 'my', 'hand', '</s>'], ['pop', '<s>', 'We', "'ll"], ['pop', 'We', "'ll", 'go'], ['pop', "'ll", 'go', 'on'], ['pop', 'go', 'on', 'walking'], ['pop', 'on', 'walking', 'for'], ['pop', 'walking', 'for', 'hours'], ['pop', 'for', 'hours', 'and'], ['pop', 'hours', 'and', 'talking'], ['pop', 'and', 'talking', '</s>'], ['pop', '<s>', 'About', 'all'], ['pop', 'About', 'all', 'the'], ['pop', 'all', 'the', 'things'], ['pop', 'the', 'things', 'that'], ['pop', 'things', 'that', 'we'], ['pop', 'that', 'we', 'plan'], ['pop', 'we', 'plan', '</s>'], ['pop', '<s>', 'She', "'s"], ['pop', 'She', "'s", 'just'], ['pop', "'s", 'just', 'my'], ['pop', 'just', 'my', 'kind'], ['pop', 'my', 'kind', 'of'], ['pop', 'kind', 'of', 'girl'], ['pop', 'of', 'girl', ','], ['pop', 'girl', ',', 'she'], ['pop', ',', 'she', 'makes'], ['pop', 'she', 'makes', 'me'], ['pop', 'makes', 'me',

In [78]:
# Vocabulary: every distinct token across all n-grams, skipping the genre tag at position 0.
unique_words = {token for gram in data for token in gram[1:]}

In [79]:
# Vocabulary size; the sentence boundary markers are counted as tokens too.
print(len(unique_words))

105593


In [80]:
# Split every n-gram into the model input (genre tag plus the leading tokens)
# and the prediction target (its final token).
X = [gram[:-1] for gram in data]
y = [gram[-1] for gram in data]

In [92]:
def data_generator(X_data, y_data, batch_size, epochs, unique_words: int = None):
  """
  Yield (inputs, targets) mini-batches, cycling through the data once per epoch.

  Batches are taken in order and always contain exactly `batch_size` items;
  a trailing remainder shorter than `batch_size` is dropped. A line announcing
  the epoch number is printed when each epoch starts.
  Args:
    X_data: sliceable sequence of model inputs
    y_data: sliceable sequence of targets, aligned with X_data
    batch_size (int): number of examples per batch, must be positive
    epochs (int): number of passes over the data
    unique_words (int): vocabulary size; accepted for compatibility with
      existing callers but currently unused, so it may be omitted

  Yields:
    tuple (batch_x, batch_y) of two lists with `batch_size` items each

  Raises:
    ValueError: if batch_size is not positive (the loop could never advance)
  """
  if batch_size <= 0:
    raise ValueError("batch_size must be a positive integer")

  n_samples = len(X_data)
  for epoch in range(epochs):
    print('epoch: ', epoch + 1)
    # The last valid start index is n_samples - batch_size, so the final full
    # batch is included even when the data length is an exact multiple.
    for start in range(0, n_samples - batch_size + 1, batch_size):
      stop = start + batch_size
      yield list(X_data[start:stop]), list(y_data[start:stop])
  

In [94]:
# data_generator declares a fifth `unique_words` parameter; pass the vocabulary size
# so this cell also runs on a fresh kernel. Batch size 3 and 2 epochs keep the check small.
generator = data_generator(X, y, 3, 2, len(unique_words))
# Pull only the first batch to inspect its structure.
print(next(generator))

epoch:  1
([['pop', '<s>', 'Look'], ['pop', 'Look', 'at'], ['pop', 'at', 'her']], ['at', 'her', 'face'])
