# Init

In [None]:
!pip install pytorch-lightning
import pytorch_lightning as pl

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import re
from nltk.tokenize import RegexpTokenizer
from collections import defaultdict
from sklearn.model_selection import train_test_split
import torch
from sklearn.preprocessing import OneHotEncoder
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data preparation

In [None]:
# Every entry of the dataset's `topics` folder is one topic; keep only the
# entry name. os.path.basename is used instead of a regex substitution so the
# dots in '../input' are not interpreted as "any character" wildcards.
topics = [os.path.basename(x) for x in glob.glob('../input/poemsdataset/topics/*')]

In [None]:
# Read every poem file into a two-column frame: the raw text and the topic
# (directory name) it was found under.
records = []

for topic in topics:
    # Glob the directory of the *current* topic. Using topics[0] here would
    # attach the first topic's poems to every topic label.
    for p in glob.glob(f'../input/poemsdataset/topics/{topic}/*'):
        with open(p, 'r', encoding='utf-8') as file:
            records.append((file.read(), topic))

# Build the frame in one go rather than growing it row by row with .loc.
poems = pd.DataFrame(records, columns=['poem', 'topic'])

In [None]:
# Shuffle the rows: sample(frac=1) returns every row in random order. The
# original index labels are kept, so the index is no longer in 0..n-1 order.
poems = poems.sample(frac=1)
# Optional CSV round-trip, currently disabled:
# poems.to_csv('poems.csv')
# poems = pd.read_csv('poems.csv')
# poems.drop(['Unnamed: 0'],axis=1, inplace=True)

In [None]:
# Tokenise on runs of word characters. The pattern is a raw string so that
# `\w` reaches the regex engine without relying on an invalid string escape.
tokenizer = RegexpTokenizer(pattern=r'\w+')


def tokenizer_func(x):
    """Return the lower-cased word tokens of the string `x`."""
    return tokenizer.tokenize(x.lower())


# tokenizer_func lower-cases the text before tokenising, so a second
# lower-casing pass over the tokens is not needed.
poems['poem'] = poems['poem'].apply(tokenizer_func)

In [None]:
# Median (50th percentile) number of tokens per poem.
np.percentile(np.array([len(x) for x in poems.poem.values]), 50)

In [None]:
def pad_sequences(text, max_len=105, pad_token=''):
    """Return `text` padded or truncated to exactly `max_len` tokens.

    Args:
        text: list of tokens.
        max_len: target length; defaults to 105, the value that was
            previously hard-coded.
        pad_token: token appended to sequences shorter than `max_len`.

    Returns:
        A new list of length `max_len`. The input list is never modified,
        whichever branch (pad or truncate) applies.
    """
    # Slicing copies, so the caller's list stays untouched.
    clipped = list(text[:max_len])
    clipped.extend([pad_token] * (max_len - len(clipped)))
    return clipped

# Bring every tokenised poem to the fixed length produced by pad_sequences.
poems['poem'] = poems['poem'].apply(pad_sequences)

In [None]:
# One-hot encode the topic labels. The number of indicator columns is taken
# from the encoder output instead of being hard-coded.
o = OneHotEncoder()
topic_onehot = o.fit_transform(poems['topic'].values.reshape(-1, 1)).toarray()
topic_columns = [f'topic-{x}' for x in range(topic_onehot.shape[1])]
poems[topic_columns] = topic_onehot

# Vocabulary: every distinct token that occurs in the (padded) poems.
words = set()
for poem in poems['poem']:
    words.update(poem)

# Token -> integer id. A plain dict is used: every token looked up below comes
# from `words`, so no default is needed, and defaultdict(default_factory=-1)
# would not set a default anyway -- keyword arguments to defaultdict become
# ordinary entries, i.e. a spurious 'default_factory' key.
# Sorting makes the ids reproducible; set iteration order is not.
dictionary = {word: idx for idx, word in enumerate(sorted(words))}

# Encode row by row in the frame's own row order. Assigning a list to a
# column is positional, so building the list by index label (0, 1, 2, ...)
# would pair poems with the wrong rows whenever the index is shuffled.
poems_embedded = [[dictionary[x] for x in poem] for poem in poems['poem']]

poems['poems_embedded'] = poems_embedded

In [None]:
# Display the frame without the text/label/embedding columns. The result is
# not assigned, so `poems` itself is unchanged.
poems.drop(['poem','topic','poems_embedded'], axis=1)

In [None]:
class datamod(pl.LightningDataModule):
    """Lightning data module that splits the module-level `poems` frame.

    Rows are divided into disjoint train / validation / test subsets of
    roughly 80% / 10% / 10%. Features are the token ids stored in the
    'poems_embedded' column; targets are all columns left after dropping
    'poem', 'topic' and 'poems_embedded'.

    Args:
        batch_size: batch size used by all three dataloaders (default 32).
    """

    def __init__(self, batch_size=32):
        super().__init__()
        self.batch_size = batch_size
        self._split_done = False

    def setup(self, stage=None):
        """Create the three TensorDatasets.

        The split is drawn only on the first call, so a later call for another
        stage cannot reshuffle rows and move training rows into the test set.

        Args:
            stage: stage name supplied by the caller; not used.
        """
        if self._split_done:
            return

        self.poems = poems

        # Permute the index labels once and cut the permutation into three
        # consecutive slices. This keeps the subsets disjoint and free of
        # duplicates; np.random.choice with its default replace=True would
        # draw the same row several times.
        shuffled = np.random.permutation(self.poems.index.to_numpy())
        n_train = int(0.8 * len(shuffled))
        n_val = int(0.1 * len(shuffled))
        train_indices = shuffled[:n_train]
        val_indices = shuffled[n_train:n_train + n_val]
        test_indices = shuffled[n_train + n_val:]

        # No index label (such as 0) is forced into the subsets: the tensors
        # below are built from plain NumPy arrays, never from a pandas Series,
        # so label-based lookups inside torch cannot occur.
        self.X_train = self._token_matrix(train_indices)
        self.X_val = self._token_matrix(val_indices)
        self.X_test = self._token_matrix(test_indices)

        self.Y = self.poems.drop(['poem', 'topic', 'poems_embedded'], axis=1)
        self.y_train = self.Y.loc[train_indices, :]
        self.y_val = self.Y.loc[val_indices, :]
        self.y_test = self.Y.loc[test_indices, :]

        self.train_dataset = self._make_dataset(self.X_train, self.y_train)
        self.val_dataset = self._make_dataset(self.X_val, self.y_val)
        self.test_dataset = self._make_dataset(self.X_test, self.y_test)

        self._split_done = True

    def _token_matrix(self, indices):
        """Return the token-id lists of the selected rows as a 2-D int array."""
        rows = self.poems.loc[indices, 'poems_embedded'].tolist()
        # A Series of equal-length lists becomes a (n_rows, seq_len) array;
        # rows of unequal length raise instead of being silently padded.
        return np.array(rows, dtype=np.int64)

    @staticmethod
    def _make_dataset(features, targets):
        """Wrap a feature array and a target frame in a TensorDataset."""
        # Token ids are stored as integers (what an embedding lookup needs);
        # targets are stored as float32.
        return torch.utils.data.TensorDataset(
            torch.as_tensor(features, dtype=torch.long),
            torch.as_tensor(targets.values, dtype=torch.float32),
        )

    def train_dataloader(self):
        """Return the dataloader over the training subset."""
        return torch.utils.data.DataLoader(self.train_dataset, batch_size=self.batch_size)

    def val_dataloader(self):
        """Return the dataloader over the validation subset."""
        return torch.utils.data.DataLoader(self.val_dataset, batch_size=self.batch_size)

    def test_dataloader(self):
        """Return the dataloader over the test subset."""
        return torch.utils.data.DataLoader(self.test_dataset, batch_size=self.batch_size)

# Model

In [None]:
class poem_classifier(pl.LightningModule):
    """Embedding -> LSTM -> three dense layers, classifying poems by topic.

    Args:
        num_classes: width of the output layer; defaults to 144, the value
            that was previously hard-coded.
    """

    def __init__(self, num_classes=144):
        super().__init__()

        # One embedding row per entry of the module-level `dictionary`.
        self.embedding = torch.nn.Embedding(num_embeddings=len(dictionary), embedding_dim=64)
        self.lstm = torch.nn.LSTM(input_size=64, hidden_size=32, num_layers=1, batch_first=True)
        self.fc1 = torch.nn.Linear(in_features=32, out_features=64)
        self.fc2 = torch.nn.Linear(in_features=64, out_features=128)
        self.fc3 = torch.nn.Linear(in_features=128, out_features=num_classes)

        # Created once rather than on every step.
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def _logits(self, x):
        """Return unnormalised class scores for a (batch, seq_len) id tensor."""
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = x[:, -1]  # LSTM output at the last time step (batch_first layout)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

    def forward(self, x):
        """Return class probabilities for a (batch, seq_len) tensor of token ids."""
        # The class axis is named explicitly; relying on softmax's implicit
        # dimension is deprecated.
        return torch.nn.functional.softmax(self._logits(x), dim=-1)

    def _step_loss(self, batch):
        """Compute the cross-entropy loss for one (features, targets) batch."""
        X, y = batch
        logits = self._logits(X.long())
        # CrossEntropyLoss expects (input, target) in that order and applies
        # log-softmax itself, so it is given raw logits, not probabilities.
        # Assumes `y` holds one one-hot row per sample, as produced by the
        # data module in this notebook; argmax turns it into class indices.
        return self.loss_fn(logits, y.argmax(dim=1))

    def training_step(self, train_batch, batch_idx):
        """Return the training loss for one batch and log it as 'train_loss'."""
        train_loss = self._step_loss(train_batch)
        self.log('train_loss', train_loss, logger=True, prog_bar=True)
        return train_loss

    def validation_step(self, val_batch, batch_idx):
        """Log the validation loss for one batch as 'val_loss'."""
        val_loss = self._step_loss(val_batch)
        self.log('val_loss', val_loss, on_step=True, on_epoch=True, logger=True, prog_bar=True)

    def test_step(self, test_batch, batch_idx):
        """Log the test loss for one batch as 'test_loss'."""
        # No explicit self.eval(): the Trainer puts the module in evaluation
        # mode for the test loop.
        test_loss = self._step_loss(test_batch)
        self.log('test_loss', test_loss, on_step=True, on_epoch=True, logger=True, prog_bar=True)

    def predict_step(self, x, batch_idx=None):
        """Return the predicted class index (or indices) for `x`.

        Args:
            x: token ids, either one sequence of shape (seq_len,) or a batch
                of shape (batch, seq_len).
            batch_idx: accepted for compatibility with the Trainer's
                prediction loop; not used.

        Returns:
            A single integer for a 1-D input, otherwise a NumPy array with one
            class index per row.
        """
        single = x.dim() == 1
        # forward() indexes the time axis as dimension 1, so a lone sequence
        # needs a leading batch dimension.
        batch = x.unsqueeze(0) if single else x
        with torch.no_grad():
            probs = self.forward(batch.long().to(self.device))
        # Move to CPU before converting: NumPy cannot read GPU tensors.
        preds = probs.argmax(dim=-1).cpu().numpy()
        return preds[0] if single else preds

    def configure_optimizers(self):
        """Return the Adam optimiser over all parameters (learning rate 1e-5)."""
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-5)
        return optimizer

In [None]:
from pytorch_lightning.loggers import TensorBoardLogger
# TensorBoard logger rooted at ./lightning_logs, experiment name
# 'poem-classification'.
logger = TensorBoardLogger("lightning_logs", name='poem-classification')
%load_ext tensorboard
# NOTE(review): this deletes ./logs/, while the logger above is pointed at
# ./lightning_logs -- confirm which directory was meant to be cleared.
!rm -rf ./logs/

In [None]:
data = datamod()
model = poem_classifier()

# accelerator='auto' lets Lightning pick a GPU when one is available and fall
# back to the CPU otherwise; a hard-coded 'gpu' aborts on CPU-only sessions.
trainer = pl.Trainer(max_epochs=2, accelerator='auto', logger=logger)

trainer.fit(model, data)

In [None]:
# %tensorboard --logdir ./lightning_logs

In [None]:
# Despite its name, `test_dataset` holds the DataLoader returned by the data
# module, not a Dataset.
test_dataset = data.test_dataloader()
# Run the test loop over that loader.
trainer.test(model, test_dataset)

In [None]:
# Predict the topic of the poem with index label 2549. The ids are passed as
# integers with a leading batch dimension of size 1, because the model indexes
# the time axis as dimension 1 and therefore needs (batch, seq_len) input.
model.predict_step(torch.tensor(poems.loc[2549, 'poems_embedded'], dtype=torch.long).unsqueeze(0))

In [None]:
# Inference only: run the model on the first test batch without recording
# gradients, so the result can be handed to NumPy afterwards.
with torch.no_grad():
    outputs = model(next(iter(test_dataset))[0].long())
outputs

In [None]:
# Predicted class per row. The tensor is detached and moved to the CPU first:
# a tensor that still belongs to the autograd graph (or lives on a GPU) cannot
# be converted to a NumPy array.
np.argmax(outputs.detach().cpu().numpy(), axis=1)

In [None]:
# Column index of the largest target value in each row of the first test
# batch (element [1] of the batch is the target tensor).
np.argmax(next(iter(test_dataset))[1].numpy(),axis=1)