### Statistical Learning for Data Science 2 (229352)
#### Instructor: Donlapark Ponnoprat

#### [Course website](https://donlapark.pages.dev/229352/)

## Lab #11

In [None]:
# Install pythainlp, which provides the `word_tokenize` function imported below.
!pip install pythainlp
# Download the lyrics CSV that is read with pandas in the next code cell.
!wget http://www.donlapark.cmustat.com/229352/thai_lyrics.csv

# Song lyrics generation

In [None]:
from collections import Counter
import csv
from itertools import chain
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from pythainlp import word_tokenize


# Load the lyrics table; engine='python' selects pandas' Python parser
# instead of the default C parser.
df = pd.read_csv('thai_lyrics.csv', engine='python')
# Preview the first rows.
df.head()

In [None]:
# Apply the tokenizer to every entry of the 'lyrics' column (one result per row).
tokenized_lyrics = df['lyrics'].map(word_tokenize)
# Sanity check: show the tokens of the row labelled 0.
print(tokenized_lyrics[0])

### Convert from words to numbers

In [None]:
# Example: [[song, number, one], [song, number, two]] -> [song, number, one, song, number, two]
def flatten(ls):
    """
    Concatenate the items of every inner iterable of `ls` into one flat list.
    """
    return [item for inner in ls for item in inner]

# Example: [song, number, one, number, two] -> {number: 0, song: 1, one: 2, two: 3}
# together with the inverse mapping {0: number, 1: song, 2: one, 3: two}
def create_lookup_dict(tokenized_lyrics, n_min=None):
    """
    Create lookup dictionaries from a flat list of words (lyrics).

    Ids are assigned by descending frequency: the most frequent word gets id 0.
    Words with equal counts keep their order of first appearance.

    Parameters
    ----------
    tokenized_lyrics : iterable of hashable
        The tokens to index.
    n_min : int or None
        If given, only words occurring at least `n_min` times are kept.

    Returns
    -------
    (vocab_to_int, int_to_vocab) : tuple of dict
        `vocab_to_int` maps word -> id and `int_to_vocab` maps id -> word.
    """
    word_counts = Counter(tokenized_lyrics)
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    if n_min is not None:
        # Filter the already-sorted list so the frequency ranking is preserved.
        sorted_vocab = [word for word in sorted_vocab if word_counts[word] >= n_min]
    vocab_to_int = {word: i for i, word in enumerate(sorted_vocab)}
    int_to_vocab = {i: word for word, i in vocab_to_int.items()}
    return (vocab_to_int, int_to_vocab)

In [None]:
# Merge all songs into one token stream, replacing newline tokens by a space token.
# NOTE: this rebinds `tokenized_lyrics`, so the cell is meant to be run once per kernel.
tokenized_lyrics = [' ' if tok == '\n' else tok for tok in flatten(tokenized_lyrics)]

# Word frequencies over the whole corpus.
word_counts = Counter(tokenized_lyrics)

# Word <-> id lookup tables (no minimum-count filter).
vocab_to_int, int_to_vocab = create_lookup_dict(tokenized_lyrics, n_min=None)

In [None]:
# Look up the integer id assigned to the word "ใคร".
vocab_to_int["ใคร"]

In [None]:
# Vocabulary size: the number of distinct tokens that received an id.
len(vocab_to_int)

In [None]:
# Reverse lookup: the word whose id is 12.
int_to_vocab[12]

### Create Features (4 words in a song) and Target (the next word)

In [None]:
# Number of consecutive words fed to the model to predict the following word.
sequence_length = 4

# Encode the whole token stream as integer ids (unknown tokens map to id 0).
tokenized_indices = [vocab_to_int.get(token, 0) for token in tokenized_lyrics]

# Slide a window of `sequence_length` ids over the stream: each window is one
# feature row and the id right after the window is its target.
n_windows = len(tokenized_indices) - sequence_length
X = np.array([
    np.array(tokenized_indices[start: start + sequence_length])
    for start in range(n_windows)
])
target = np.array([
    tokenized_indices[start + sequence_length]
    for start in range(n_windows)
])

In [None]:
# First feature row: the ids of the first `sequence_length` tokens.
X[0]

In [None]:
# Target for the first feature row: the id of the token that follows it.
target[0]

In [None]:
class MyDataSet(torch.utils.data.Dataset):
  """Map-style dataset that pairs each row of `X` with the matching entry of `y`."""

  def __init__(self, X, y):
    super().__init__()
    self._X, self._y = X, y

  def __len__(self):
    # The number of samples is the size of the first axis of X.
    n_samples = self._X.shape[0]
    return n_samples

  def __getitem__(self, index):
    # Return the (features, target) pair stored at position `index`.
    return self._X[index], self._y[index]

In [None]:
# Hyperparameters
LEARNING_RATE = 0.001
BATCH_SIZE = 256
NUM_EPOCHS = 5

# Classification: the model predicts the id of the next word, so there is one
# class per vocabulary entry. Deriving the count from the lookup table keeps the
# layer sizes valid even if the dataset or the tokenizer output changes.
NUM_CLASSES = len(vocab_to_int)

# Wrap the (window, next-word) arrays so PyTorch can index them sample by sample.
dataset = MyDataSet(X, target)

# Shuffled mini-batches for training.
trainloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

## Layers

1. `nn.Embedding(num_embeddings, embedding_dim)` [Documentation](https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html)

![emb](https://miro.medium.com/max/720/1*NuWIU2Iew3Bm8NR78tRj8A.png)

2. LSTM [Documentation](https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html)

![lstm](https://i.stack.imgur.com/sBEBp.png)

In [None]:
# Shape demo for a stacked (2-layer) LSTM with batch_first=True.
lstm = nn.LSTM(input_size=10, hidden_size=20, num_layers=2, batch_first=True)
# Named `inputs` rather than `input` so the builtin `input()` is not shadowed
# for the rest of the kernel session.
inputs = torch.randn(5, 3, 10)  # (batch_size, time_steps, input_size)
h0 = torch.randn(2, 5, 20)  # (num_layers, batch_size, hidden_size)
c0 = torch.randn(2, 5, 20)  # (num_layers, batch_size, hidden_size)
output, (h1, c1) = lstm(inputs, (h0, c0))
# output: (batch_size, time_steps, hidden_size)
output.shape

### Exercise 1: Fill in the code in `TODO 1` and `TODO 2` with the following requirements:

- The embedding layer must map each input word index (equivalently, its one-hot encoding) to a 200-dimensional vector
- The model must have a stack of 2 LSTMs
- Each LSTM's hidden state vectors must have 256 dimensions

In [None]:
# NOTE(review): `Self` is not used anywhere in this cell.
from typing_extensions import Self
# Exercise skeleton for the next-word model: Embedding -> LSTM -> Linear.
# The blank arguments below are intentionally left for the student, so this
# cell does not run until they are filled in.
class Simple_LSTM(nn.Module):
    def __init__(self):
        super(Simple_LSTM, self).__init__()

        # TODO 1: Fill in the layers' parameters
        # The output of the final layer must be a vector representing the next word
        self.embeddings = nn.Embedding(num_embeddings= , embedding_dim= )
        self.lstm = nn.LSTM(input_size= , hidden_size= , num_layers= , dropout=0.2, batch_first=True)
        self.fc = nn.Linear( , )

    def forward(self, x):
        # TODO 2: Apply the Embedding layer
        # `out` is not assigned anywhere in this skeleton; the code written for
        # TODO 2 must define it before this return.
        # NOTE(review): presumably the LSTM and the final Linear layer are meant
        # to be applied here as well - confirm against the exercise text.
        return out

In [None]:
# Prefer the GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Model, loss function and optimizer used for training.
model = Simple_LSTM().to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

### Exercise 2: Fill in the two lines marked `TODO` in the `generate` function below

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """
    Train `model` for one full pass over `dataloader`.

    Parameters
    ----------
    dataloader : torch.utils.data.DataLoader
        Yields `(X, y)` batches; `dataloader.dataset` must support `len()`.
    model : torch.nn.Module
        Model to optimize. Batches are moved to the device of its parameters.
    loss_fn : callable
        Called as `loss_fn(pred, y)`; must return a scalar tensor.
    optimizer : torch.optim.Optimizer
        Optimizer already bound to `model`'s parameters.

    Prints the current loss every 1000 batches. Returns None.
    """
    # Use the device the model lives on instead of relying on a notebook global,
    # so inputs and weights can never end up on different devices.
    model_device = next(model.parameters()).device
    size = len(dataloader.dataset)

    # Make sure training-only layers (e.g. dropout) are active, even if the
    # model was switched to eval mode before this call.
    model.train()

    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        X = X.to(model_device)
        y = y.to(model_device)
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 1000 == 0:
            # `current` = number of samples seen before this batch
            # (for full-size batches).
            current = batch * len(X)
            print(f"loss: {loss.item():>7f}  [{current:>5d}/{size:>5d}]")


def generate(model, start_word, pad_value=0, predict_len=40, seq_len=4):
    """
    Sample `predict_len` word ids from `model`, starting from the prompt `start_word`.

    Relies on the notebook-level names `word_tokenize` and `vocab_to_int`.

    Parameters
    ----------
    model : torch.nn.Module
        Called with a LongTensor of shape (1, seq_len); must return one row of
        scores over the vocabulary, shape (1, vocab_size).
    start_word : str
        Prompt text; it is tokenized with `word_tokenize`.
    pad_value : int
        Word id used for out-of-vocabulary prompt words and to left-pad prompts
        shorter than `seq_len`.
    predict_len : int
        Number of sampling steps.
    seq_len : int
        Length of the model's input window. Should equal the window length used
        to build the training data (4 in this notebook).

    Returns
    -------
    str
        The entries of `predicted` joined without separator. Until the TODO
        below is completed, `predicted` only contains the prompt's own tokens.
    """
    # Tokenize the input sentence
    words = word_tokenize(start_word)
    # List to store the predictions (it starts with the prompt tokens)
    predicted = list(words)

    # Words -> Integers
    word_ids = [vocab_to_int.get(word, pad_value) for word in words]

    # Keep only the last `seq_len` ids and left-pad with `pad_value`, e.g. with
    # seq_len=4 and pad_value=0: [28, 15, 16] -> [[0, 28, 15, 16]]
    word_ids = word_ids[-seq_len:]
    current_seq = np.full((1, seq_len), pad_value, dtype=np.int64)
    if word_ids:
        current_seq[0, seq_len - len(word_ids):] = word_ids

    # Run on whatever device the model lives on.
    model_device = next(model.parameters()).device

    # Sample in eval mode (dropout off) and restore the previous mode afterwards,
    # so calling this between training epochs does not change how training runs.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _ in range(predict_len):
                batch = torch.as_tensor(current_seq, dtype=torch.long).to(model_device)
                # get the next word probabilities
                p = torch.softmax(model(batch), dim=1)
                # p = [[0.1,0.2,0.05,0.03,0.02,0.3,0.2,0.1]]
                # Move to the CPU before converting: .numpy() fails on CUDA tensors.
                p = p[0].cpu().numpy().astype(np.float64)
                # p = [0.1,0.2,0.05,0.03,0.02,0.3,0.2,0.1]
                # Renormalize in float64: float32 rounding can make the sum miss
                # the tolerance that np.random.choice enforces.
                p /= p.sum()

                # Sample from probability distribution p
                word_i = np.random.choice(np.arange(0, p.shape[0]), p=p)
                #word_i is an integer representing a word.

                #### TODO: Fill in the following two lines of code#########
                # 1. Convert from word_i (int)--> word (str)

                # 2. Append the word from 1. into `predicted` list.

                ##################end#####################################

                # the generated word becomes the next "current sequence" and the cycle can continue
                current_seq = np.roll(current_seq, -1, 1)
                current_seq[-1][-1] = word_i
    finally:
        model.train(was_training)

    gen_sentences = ''.join(predicted)
    return gen_sentences

### Exercise 3: Train the model for 10 epochs (set `NUM_EPOCHS = 10`) and use the `generate` function to generate new text after each epoch.

In [None]:
# Id of the space token; passed to `generate` as its padding / unknown-word id.
pad_int = vocab_to_int[' ']

# Alternate one training pass with one sampled lyric per epoch.
for epoch in range(1, NUM_EPOCHS + 1):
    print(f"Epoch {epoch}\n-------------------------------")
    train_loop(trainloader, model, loss_fn, optimizer)
    # No gradients are needed while sampling text.
    with torch.no_grad():
        sample_text = generate(model, 'ฉันเห็น', pad_value=pad_int, predict_len=40)
        print(sample_text)
print("Done!")

# Web scraping with Beautiful soup

In [None]:

from itertools import chain
from collections import Counter
import requests
from bs4 import BeautifulSoup



def scrape_siamzone_url(d, timeout=10):
    """
    Download and parse one siamzone.com lyric page.

    Parameters
    ----------
    d : int
        Numeric page id; it is substituted into the lyric URL.
    timeout : float
        Seconds to wait for the server before giving up. Without a timeout a
        single stalled connection would block the whole scrape indefinitely.

    Returns
    -------
    dict
        Keys: 'url', 'soup' (the parsed page), 'title', 'artist_name',
        'n_shares' (int) and 'full_lyrics' (str).

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.
    ValueError
        If the page title does not split into exactly two '|'-separated parts,
        or the share counter is not an integer.
    AttributeError
        If an expected element is missing from the page (`find` returned None).
    """
    url = 'https://www.siamzone.com/music/thailyric/%d' % d
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    # The <title> text is split on '|' into the song title and the artist name.
    title, artist_name = soup.find('title').text.split('|')
    title, artist_name = title.strip(), artist_name.strip()

    # The share counter may contain thousands separators, e.g. "1,234".
    n_shares = int(soup.find('span', attrs={'class': 'sz-social-number'}).text.replace(',', ''))
    full_lyrics = soup.find('div', attrs={'itemprop': 'articleBody'}).text.strip()
    return {
        'url': url,
        'soup': soup,
        'title': title,
        'artist_name': artist_name,
        'n_shares': n_shares,
        'full_lyrics': full_lyrics
    }

def scrape_siamzone(start=14050, stop=16041):
    """
    Scrape a range of siamzone.com lyric pages into a DataFrame.

    Parameters
    ----------
    start, stop : int
        Page ids to visit, as in `range(start, stop)`. The defaults reproduce
        the id range originally hard-coded here.

    Returns
    -------
    pandas.DataFrame
        One row per successfully scraped page, with the fields returned by
        `scrape_siamzone_url` plus a 'lyrics' column holding
        `clean_lyrics(full_lyrics)`.

    Notes
    -----
    Pages that fail to download or parse are skipped (best effort) and counted
    in a one-line summary. `clean_lyrics` is not defined in the visible part of
    this notebook and must exist before this function is called.
    """
    data = []
    skipped = []
    for i in range(start, stop):
        try:
            data.append(scrape_siamzone_url(i))
        except Exception:
            # Catch Exception rather than everything, so Ctrl-C
            # (KeyboardInterrupt) can still stop a long scrape.
            skipped.append(i)

    if skipped:
        print(f"Skipped {len(skipped)} of {len(range(start, stop))} pages that could not be scraped")

    df = pd.DataFrame(data)
    df['lyrics'] = df.full_lyrics.map(clean_lyrics)
    return df