In [None]:
!pip install pythainlp
!wget http://www.donlapark.cmustat.com/229352/thai_lyrics.csv

# Song lyrics generation

In [None]:
from collections import Counter
import csv
from itertools import chain
import numpy as np
import pandas as pd
from pythainlp import word_tokenize

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader


# Load the lyrics CSV from the working directory (using pandas' Python parsing
# engine) and preview the first rows.
df = pd.read_csv('thai_lyrics.csv', engine='python')
df.head()

In [None]:
# Tokenize every entry of the 'lyrics' column with PyThaiNLP's word_tokenize;
# the result is a Series holding one tokenized entry per row.
tokenized_lyrics = df['lyrics'].map(word_tokenize)
# Sanity check: show the tokens of the entry at index 0.
print(tokenized_lyrics[0])

### Convert from words to numbers

In [None]:

#[[song , number , one],[song , number , two]] -> [song , number , one , song , number , two]
def flatten(ls):
    """
    Concatenate the inner iterables of `ls` into a single flat list.

    Only one level of nesting is removed, e.g. [[1, 2], [3]] -> [1, 2, 3].
    """
    flat = []
    for inner in ls:
        flat.extend(inner)
    return flat

#[song , number ,one, number, two] -> [1,2,3,2,4] and [1,2,3] -> [song , number , one]
def create_lookup_dict(tokenized_lyrics, n_min=None):
    """
    Build word <-> integer-id lookup tables from a flat list of words.

    Ids start at 0 and are assigned in order of decreasing frequency, so id 0
    is the most frequent word. Words with equal counts keep their
    first-occurrence order (the sort is stable).

    Args:
        tokenized_lyrics: iterable of hashable tokens (the whole corpus).
        n_min: if given, only words occurring at least `n_min` times are kept.

    Returns:
        (vocab_to_int, int_to_vocab): a dict mapping word -> id and its inverse.
    """
    word_counts = Counter(tokenized_lyrics)
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    if n_min is not None:
        # Filter the *sorted* list so the frequency ordering of ids is kept.
        sorted_vocab = [word for word in sorted_vocab if word_counts[word] >= n_min]
    vocab_to_int = {word: i for i, word in enumerate(sorted_vocab)}
    int_to_vocab = {i: word for word, i in vocab_to_int.items()}
    return (vocab_to_int, int_to_vocab)

In [None]:
# Flatten the per-song token lists into one corpus-wide list of tokens.
# Guarded so that re-running this cell is a no-op: flattening an already-flat
# list of strings would split every token into single characters.
if isinstance(tokenized_lyrics, pd.Series):
    tokenized_lyrics = flatten(tokenized_lyrics)
word_counts = Counter(tokenized_lyrics)
vocab_to_int, int_to_vocab = create_lookup_dict(tokenized_lyrics, n_min=None)

In [None]:
# Look up the integer id assigned to one example word.
vocab_to_int["ใคร"]

In [None]:
# Vocabulary size: the number of distinct tokens that received an id.
len(vocab_to_int)

In [None]:
# Reverse lookup: the word whose id is 12.
int_to_vocab[12]

### Create Features (20 words in a song) and Target (the next word)

In [None]:
sequence_length = 20

# Encode the corpus as integer ids; tokens missing from the vocabulary map to id 0.
tokenized_indices = [vocab_to_int.get(token, 0) for token in tokenized_lyrics]

# Slide a window of `sequence_length` ids over the corpus with stride 1:
# each window is one sample, and the id right after it is that sample's target.
window_starts = range(len(tokenized_indices) - sequence_length)
X = np.array([tokenized_indices[start:start + sequence_length] for start in window_starts])
target = np.array([tokenized_indices[start + sequence_length] for start in window_starts])

In [None]:
# First input window: `sequence_length` consecutive token ids.
X[0]

In [None]:
# Id of the token that immediately follows the first window.
target[0]

In [None]:
class MyDataSet(torch.utils.data.Dataset):
  """
  Dataset that pairs row `i` of the features with entry `i` of the targets.

  `X` must expose `.shape[0]` (e.g. a NumPy array); `X` and `y` are stored
  as given, without copying.
  """

  def __init__(self, X, y):
    super().__init__()
    self._X, self._y = X, y

  def __len__(self):
    # The number of samples is the size of the first axis of the features.
    n_samples = self._X.shape[0]
    return n_samples

  def __getitem__(self, index):
    # Return the (features, target) pair stored at position `index`.
    return self._X[index], self._y[index]

In [None]:
# Hyperparameters
LEARNING_RATE = 0.001  # learning rate handed to the optimizer
BATCH_SIZE = 128  # number of (window, next-word) samples per mini-batch
NUM_EPOCHS = 5  # number of passes over the training data

# Classification
# One class per vocabulary entry: the model predicts the id of the next word.
NUM_CLASSES = len(vocab_to_int)

# Pair every feature window with its target so the samples can be batched.
dataset = MyDataSet(X, target)

# Mini-batches of BATCH_SIZE samples, reshuffled at every epoch.
trainloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

## New layers

## 1. `nn.Embedding(num_vocabs, hidden_dim)`

[PyTorch Documentation](https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html)

![emb](https://miro.medium.com/max/720/1*NuWIU2Iew3Bm8NR78tRj8A.png)

In [None]:
# Toy embedding table: 10 possible indices, each mapped to a 3-dimensional vector.
embedding = nn.Embedding(num_embeddings=10, embedding_dim=3)
# A batch of 2 samples of 4 indices each. Named `token_ids` rather than `input`
# so the builtin input() is not shadowed for the rest of the notebook.
token_ids = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]])
# Every index is replaced by its vector, so the result has shape (2, 4, 3).
output = embedding(token_ids)
print(output)

## 2. LSTM
[PyTorch Documentation](https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html)

![lstm](https://i.stack.imgur.com/sBEBp.png)

In [None]:
# Toy LSTM: 10 input features per time step, 20 hidden units, 2 stacked layers.
lstm = nn.LSTM(input_size=10, hidden_size=20, num_layers=2)
# Named `inputs` rather than `input` so the builtin input() is not shadowed.
inputs = torch.randn(5, 3, 10)  # (sequence_length=5, batch_size=3, input_size=10)
h0 = torch.randn(2, 3, 20)  # initial hidden state: (num_layers, batch_size, hidden_size)
c0 = torch.randn(2, 3, 20)  # initial cell state, same shape as h0
output, (h1, c1) = lstm(inputs, (h0, c0))

# One hidden vector per time step and per sample: (5, 3, 20).
output.shape

### Exercise 1: fill in the code below

In [None]:
class Simple_LSTM(nn.Module):
    """
    Next-word model: Embedding -> LSTM -> Linear.

    The layer sizes are deliberately left as `####` placeholders; filling them
    in is Exercise 1, and the class does not run until that is done.
    """
    def __init__(self):
        super(Simple_LSTM, self).__init__()

        # TODO: Fill in the layers' parameters. Suggested hidden dimensions: 64, 128, 256, 512
        self.embeddings = nn.Embedding(num_embeddings=####, embedding_dim=####)
        self.lstm = nn.LSTM(input_size=####, hidden_size=####, dropout = 0.2, num_layers=####)
        self.fc = nn.Linear(####, ####)  # Hint: predicting the next word is a classification problem with num_vocabs classes

    def forward(self, x):
        # x is assumed to be a 2-D batch of token ids, (batch_size, sequence_length) -- TODO confirm.
        # The LSTM is built without batch_first, so it takes the sequence axis first:
        # transpose x to (sequence_length, batch_size).
        x = x.t()
        # Apply the Embedding layer: each token id becomes an embedding vector
        x = self.embeddings(x)
        # Apply the LSTM layer (note: the LSTM returns a tuple; only its first element is used)
        h, _ = self.lstm(x)
        # Only keep the output at the last position of the sequence
        ht=h[-1]
        # Map the last output to one score per class
        out = self.fc(ht)
        return out

In [None]:
# Instantiate the model and move it to the GPU (this cell requires a CUDA device).
model = Simple_LSTM().to('cuda')
# Cross-entropy loss: next-word prediction is treated as classification.
loss_fn = nn.CrossEntropyLoss()
# Adam over all model parameters, with the learning rate set in the hyperparameter cell.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

### Exercise 2: The `generate` function generates full text from the user's starting words (`start_word`)

### Complete the code in `TODO#1` and `TODO#2` in the `generate` function below.

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer, device='cuda', log_every=1000):
    """
    Train `model` for one pass over `dataloader`.

    Args:
        dataloader: yields (X, y) batches and exposes `.dataset` (for its length).
        model: module called as `model(X)`.
        loss_fn: callable taking (prediction, target) and returning a scalar loss.
        optimizer: optimizer over the model's parameters.
        device: device each batch is moved to; must match the model's device.
            Defaults to 'cuda', the previously hard-coded value.
        log_every: print the current loss every `log_every` batches.
    """
    size = len(dataloader.dataset)
    # Make sure training-only layers (e.g. dropout) are active, even if the
    # model was switched to eval mode elsewhere.
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        X = X.to(device)
        y = y.to(device)
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % log_every == 0:
            # Samples seen so far, assuming every earlier batch had len(X) samples.
            current = batch * len(X)
            print(f"loss: {loss.item():>7f}  [{current:>5d}/{size:>5d}]")

@torch.no_grad()
def generate(model, start_word, int_to_vocab, vocab_to_int, predict_len=100):

    words = word_tokenize(start_word)
    start_word_ids = []

    predicted = words  # we will append new words to this list

    pad_value = vocab_to_int[" "]
    word_ids = [vocab_to_int.get(word, pad_value) for word in words]

    # Pad with zeros Ex: [28,15,16] -> [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,28,15,16]
    current_seq = [np.pad(word_ids, (20 - len(word_ids), pad_value), 'constant')]

    for _ in range(predict_len):
        # transform the array of words into a tensor
        current_seq = torch.LongTensor(np.array(current_seq)).to('cuda')
        ############### TODO#1: Fill in the following steps##############
        # 1. With the trained model, use current_seq as input and obtain the output
        # 2. Apply the Softmax function (nn.Softmax) to turn the output from step 1 into a vector of probabilities.
        #    nn.Softmax Documentation: https://pytorch.org/docs/stable/generated/torch.nn.Softmax.html
        # 3. Make sure that the output has shape (NUM_CLASSES,). Name the output p.




        p =
        ############################end#################################

        # top-k sampling
        topk_probs, _ = torch.topk(p, k=100)
        kth_prob = topk_probs[-1]
        p = torch.where(p < kth_prob,
                        torch.full_like(p, 0), p)
        p /= p.sum()
        p = p.cpu().detach().numpy()

        # Sample from probability distribution p
        # word_i is an integer representing a word.
        word_i = np.random.choice(np.arange(0,p.shape[0]), p=p)

        ############### TODO#2: Fill in the following code##############
        # 1. Convert from word_i (int)--> word (str)
        # 2. Append the word from 1. into the `predicted` list defined above.





        ############################end#################################

        # the generated word becomes the next "current sequence" and the cycle can continue
        current_seq = current_seq.cpu().detach().numpy()
        current_seq = np.roll(current_seq, -1, 1)
        current_seq[-1][-1] = word_i
    gen_sentences = ''.join(predicted)
    return gen_sentences

### Exercise 3: Use the `generate` function to generate three more songs. You may try different starting words.

In [None]:
# Train for NUM_EPOCHS epochs. After every epoch, print a sample generated from
# a fixed prompt so progress can be judged by eye.
for t in range(NUM_EPOCHS):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(trainloader, model, loss_fn, optimizer)
    print(generate(model, 'วันที่ฉันเดินอยู่คนเดียว', int_to_vocab, vocab_to_int, predict_len=100))
print("Done!")

# Extra: Web scraping with Beautiful Soup

In [None]:
from itertools import chain
from collections import Counter
import requests
from bs4 import BeautifulSoup



def scrape_siamzone_url(d, timeout=10):
    """
    Scrape one Siamzone lyrics page.

    Args:
        d: song id (an integer, or a string holding one) appended to the lyrics URL.
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        dict with keys 'url', 'soup' (the parsed page), 'song_title',
        'artist_name', 'n_views' and 'lyrics'. 'n_views' and 'lyrics' are empty
        strings when the corresponding element is not found on the page.

    Raises:
        requests.RequestException: if the request fails or times out.
        AttributeError: if the page has no <title> element.
        ValueError: if the title is not of the form "<song> | <artist>".
    """
    # Build the URL once so the fetched URL and the reported URL cannot diverge.
    url = 'https://www.siamzone.com/music/thailyric/{}'.format(d)
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    song_title, artist_name = soup.find('title').text.split('|')
    song_title, artist_name = song_title.replace("เนื้อเพลง ", "").strip(), artist_name.strip()

    # Optional fields: soup.find() returns None when the element is missing
    # (-> AttributeError) and the column list may be too short (-> IndexError).
    try:
        n_views = ' '.join(soup.find('div', attrs={'class': 'has-text-info'}).text.strip().split())
    except AttributeError:
        n_views = ''
    try:
        full_lyrics = soup.find_all('div', attrs={'class': 'column is-6-desktop'})[1]
        lyrics = full_lyrics.find("div", attrs={'style': "margin-bottom: 1rem;"}).text.strip()
    except (AttributeError, IndexError):
        lyrics = ""
    return {
        'url': url,
        'soup': soup,
        'song_title': song_title,
        'artist_name': artist_name,
        'n_views': n_views,
        'lyrics': lyrics
    }

def clean_lyrics(lyric):
    """
    Remove credit/header lines from a lyrics text and normalize the rest.

    A line is dropped when it starts with one of the known header prefixes
    (song, lyricist, composer, artist, ...). In every remaining line the
    characters '(', ')' and '-' are replaced by spaces and runs of whitespace
    are collapsed to a single space. Blank lines inside the text are kept.

    Args:
        lyric: raw lyrics, lines separated by '\\n'.

    Returns:
        The cleaned lines joined by '\\n', stripped of leading/trailing whitespace.
    """
    # Header prefixes; a tuple so it can be passed directly to str.startswith.
    headers = (
        'เพลง ', 'คำร้อง ', 'คำร้อง/ทำนอง ', 'ศิลปิน ', 'ทำนอง ',
        'เรียบเรียง ', 'เพลงประกอบละคร ', 'อัลบัม ', 'ร่วมร้องโดย ',
        'เนื้อร้อง/ทำนอง', 'ทำนอง/เรียบเรียง ', 'เพลงประกอบภาพยนตร์ ',
        'เพลงประกอบละครซิทคอม ', 'คำร้อง/ทำนอง/เรียบเรียง ',
        'คำร้อง/เรียบเรียง ', 'เพลงประกอบ ', 'ร้องโดย ',
        'ทำนอง / เรียบเรียง :', ' สังกัด'
    )
    # Maps '(', ')' and '-' to a space each.
    punct_to_space = str.maketrans('()-', '   ')

    lyrics_clean = [
        ' '.join(line.translate(punct_to_space).split())
        for line in lyric.split('\n')
        if not line.startswith(headers)
    ]
    return '\n'.join(lyrics_clean).strip()

def scrape_siamzone(start_id=23649, stop_id=28649):
    """
    Scrape a range of Siamzone song pages into a DataFrame.

    Args:
        start_id: first song id to fetch (inclusive).
        stop_id: id at which to stop (exclusive).

    Returns:
        DataFrame with one row per successfully scraped song, with columns
        'url', 'soup', 'song_title', 'artist_name', 'n_views' and 'lyrics'
        ('lyrics' passed through `clean_lyrics`). Empty, but with the same
        columns, when no page could be scraped.
    """
    data = []
    n_failed = 0
    for i in range(start_id, stop_id):
        try:
            data.append(scrape_siamzone_url(i))
        except Exception:
            # Best effort: skip ids that cannot be fetched or parsed.
            # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
            n_failed += 1
        if i % 100 == 0:
            print(i)
    if n_failed:
        print(f"skipped {n_failed} song ids that could not be scraped")

    if not data:
        # pd.DataFrame([]) has no 'lyrics' column, which would raise KeyError below.
        return pd.DataFrame(
            columns=['url', 'soup', 'song_title', 'artist_name', 'n_views', 'lyrics']
        )

    df = pd.DataFrame(data)
    df['lyrics'] = df['lyrics'].map(clean_lyrics)
    return df