In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in files)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from collections import defaultdict
from tqdm import tqdm


In [2]:
# Load dataset from CSV
def load_data(csv_path):
    """Read a CSV file and return its ``text`` column as one long string.

    Args:
        csv_path: Path (or any other source accepted by ``pandas.read_csv``)
            of a CSV file that has a ``text`` column.

    Returns:
        str: Every non-missing entry of the ``text`` column, in file order,
        joined with single spaces.

    Raises:
        KeyError: If the CSV has no ``text`` column.
    """
    df = pd.read_csv(csv_path)
    # Empty cells are loaded as float NaN, which str.join() rejects with a
    # TypeError; drop them and coerce the remaining values to str.
    rows = df['text'].dropna().astype(str)
    return ' '.join(rows)

In [None]:
# # load_data returns the whole corpus as a single string
# text = load_data('/kaggle/input/poems100/poems.csv')

# print(text[0])

In [3]:
def tokenize(text):
    """Split ``text`` on whitespace and build word/index lookup tables.

    Args:
        text: The corpus as a single string.

    Returns:
        tuple: ``(words, vocab, word_to_idx, idx_to_word)`` where ``words`` is
        the list of whitespace-separated tokens in order, ``vocab`` is the
        sorted list of distinct tokens, ``word_to_idx`` maps each token to its
        position in ``vocab``, and ``idx_to_word`` is the inverse mapping.
    """
    words = text.split()
    vocab = sorted(set(words))

    word_to_idx = {}
    idx_to_word = {}
    # A token's index is simply its position in the sorted vocabulary.
    for position, token in enumerate(vocab):
        word_to_idx[token] = position
        idx_to_word[position] = token

    return words, vocab, word_to_idx, idx_to_word


In [None]:
# words, vocab, word_to_idx, idx_to_word = tokenize(text)

# print(words[3])
# print(len(words))
# print(len(vocab))


# `words` is the list of words, `vocab` is the list of unique words, and `word_to_idx` is a dictionary that maps each word to an index.
# `idx_to_word` is a dictionary that is the reverse of `word_to_idx`.

In [4]:
def encode_words(words, word_to_idx):
    """Convert a sequence of words into a 1-D tensor of vocabulary indices.

    Args:
        words: Iterable of tokens to encode.
        word_to_idx: Mapping from token to integer index.

    Returns:
        torch.Tensor: ``torch.long`` tensor holding one index per word, in
        the same order as ``words``.

    Raises:
        KeyError: If a word is not present in ``word_to_idx``.
    """
    indices = []
    for word in words:
        indices.append(word_to_idx[word])
    return torch.tensor(indices, dtype=torch.long)


In [7]:
# encoded = one_hot_encode(words,len(vocab),word_to_idx)

# print(encoded[0])

In [5]:
# Define PyTorch Dataset class
class TextDataset(Dataset):
    """Sliding-window next-word dataset over a sequence of word indices.

    Sample ``i`` is the pair
    ``(indexed_words[i:i + seq_length], indexed_words[i + seq_length])``:
    a window of ``seq_length`` consecutive indices and the index that
    immediately follows it. Windows advance one position at a time, so
    neighbouring samples share ``seq_length - 1`` elements.

    Args:
        indexed_words: Tensor of word indices (a plain sequence of ints is
            converted with ``torch.as_tensor``).
        seq_length: Number of consecutive indices in each input window.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1, or if
            ``indexed_words`` does not contain more than ``seq_length``
            elements (no complete window-plus-target pair exists).
    """

    def __init__(self, indexed_words, seq_length):
        # Accept plain lists as well as tensors; a tensor passes through as-is.
        indexed_words = torch.as_tensor(indexed_words)
        if seq_length < 1:
            raise ValueError(f"seq_length must be >= 1, got {seq_length}")

        num_windows = len(indexed_words) - seq_length
        if num_windows < 1:
            # torch.stack() on an empty list fails with an opaque RuntimeError,
            # so report the real cause instead.
            raise ValueError(
                f"need more than seq_length={seq_length} indices to build a sample, "
                f"got {len(indexed_words)}"
            )

        self.inputs = torch.stack(
            [indexed_words[start:start + seq_length] for start in range(num_windows)]
        )
        # targets[i] is the element right after window i.
        self.targets = indexed_words[seq_length:]

    def __len__(self):
        """Return the number of (window, target) samples."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the ``idx``-th input window and its target index."""
        return self.inputs[idx], self.targets[idx]

In [6]:
# Define LSTM model with Embedding Layer
class LSTMLanguageModel(nn.Module):
    """Next-word scorer: embedding -> single-layer LSTM -> dropout -> linear.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector (the LSTM input size).
        hidden_size: Size of the LSTM hidden state.
        output_size: Number of scores produced per input sequence.
        dropout_rate: Dropout probability applied to the last LSTM output.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size, dropout_rate=0.3):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True)
        self.dropout = nn.Dropout(p=dropout_rate)
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Score the next word for each sequence of indices in ``x``.

        ``x`` is indexed batch-first. Only the LSTM output at the final time
        step is used. The result is raw logits: no softmax is applied here
        because ``nn.CrossEntropyLoss`` applies it internally.
        """
        embedded = self.embedding(x)
        hidden_states, _final_state = self.lstm(embedded)
        last_step = hidden_states[:, -1, :]
        return self.fc(self.dropout(last_step))

In [7]:
# Select the compute device: CUDA when PyTorch reports one, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [18]:
def train_model(model, train_loader, criterion, optimizer, epochs=30):
    """Train ``model`` in place for ``epochs`` passes over ``train_loader``.

    The model and every batch are moved to the module-level ``device``. After
    each epoch the mean per-batch loss is printed.

    Args:
        model: The ``nn.Module`` to optimize.
        train_loader: Iterable yielding ``(inputs, targets)`` batches.
        criterion: Loss function called as ``criterion(outputs, targets)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        epochs: Number of passes over ``train_loader``.
    """
    model.to(device)  # Move model to the selected device
    # Switch dropout (and any other mode-dependent layer) to training mode.
    # Without this, a model left in eval mode, e.g. after test_model(), would
    # be trained with dropout silently disabled.
    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        batches_seen = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=False)
        for inputs, targets in progress_bar:
            inputs, targets = inputs.to(device), targets.to(device).long()  # Ensure LongTensor
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            batches_seen += 1
            # Running mean over the batches processed so far; dividing by the
            # full epoch length would under-report the loss until the last batch.
            progress_bar.set_postfix(loss=total_loss / batches_seen)
        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        print(f"Epoch {epoch+1}, Loss: {total_loss / max(batches_seen, 1)}")

In [9]:
# Show which device (CPU or CUDA) was selected for this session.
print(f"Using device: {device}")


Using device: cuda


In [10]:
csv_path = "/kaggle/input/100poem/poems.csv"  # Change this to your actual CSV file path
text = load_data(csv_path)
words, vocab, word_to_idx, idx_to_word = tokenize(text)
indexed_words = encode_words(words, word_to_idx)

seq_length = 10  # Number of context words in each input window
batch_size = 32  # Samples per optimization step

dataset = TextDataset(indexed_words, seq_length)
train_size = int(0.9 * len(dataset))  # 90% train / 10% test
test_size = len(dataset) - train_size

# Seed the split so the same samples land in train/test on every run;
# otherwise the reported test accuracy is not comparable between runs.
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
# NOTE(review): consecutive TextDataset windows overlap in seq_length - 1 words,
# so a random split places near-identical contexts in both subsets. Consider
# splitting the corpus by position (or by poem) for a stricter evaluation.
train_dataset, test_dataset = random_split(dataset, [train_size, test_size], generator=split_generator)

# Pinned host memory only helps host-to-GPU copies; skip it on CPU-only sessions.
use_pinned_memory = device.type == "cuda"
dataloader_train = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=use_pinned_memory)
dataloader_test = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=use_pinned_memory)



In [19]:
# Seed PyTorch's global RNG so weight initialization, dropout masks and the
# training DataLoader's shuffle order repeat across "Restart & Run All"
# (up to any non-deterministic GPU kernels).
torch.manual_seed(42)

input_size = len(vocab)  # One embedding row per vocabulary word
embedding_dim = 128  # Embedding size
hidden_size = 256  # LSTM hidden state size
output_size = len(vocab)  # One logit per vocabulary word

dropout_rate = 0.3  # Dropout to reduce overfitting
model = LSTMLanguageModel(input_size, embedding_dim, hidden_size, output_size, dropout_rate).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-5)  # weight_decay adds L2 regularization

train_model(model, dataloader_train, criterion, optimizer)


                                                                      

Epoch 1, Loss: 7.4310614816073715


                                                                      

Epoch 2, Loss: 6.869580481244229


                                                                      

Epoch 3, Loss: 6.524357519615656


                                                                       

Epoch 4, Loss: 6.114869213652336


                                                                       

Epoch 5, Loss: 5.6425880173157


                                                                       

Epoch 6, Loss: 5.116774798809797


                                                                       

Epoch 7, Loss: 4.588153701404045


                                                                       

Epoch 8, Loss: 4.0614291679585115


                                                                       

Epoch 9, Loss: 3.545469996572911


                                                                        

Epoch 10, Loss: 3.0681398911722777


                                                                        

Epoch 11, Loss: 2.6355178943653215


                                                                        

Epoch 12, Loss: 2.25998718522746


                                                                        

Epoch 13, Loss: 1.9147544487796981


                                                                        

Epoch 14, Loss: 1.6316609098308388


                                                                        

Epoch 15, Loss: 1.3887756811990135


                                                                        

Epoch 16, Loss: 1.165297710090533


                                                                        

Epoch 17, Loss: 0.9703924189182533


                                                                        

Epoch 18, Loss: 0.8169522388656248


                                                                         

Epoch 19, Loss: 0.6872178938710827


                                                                         

Epoch 20, Loss: 0.5716859205658066


                                                                         

Epoch 21, Loss: 0.4825344566555544


                                                                         

Epoch 22, Loss: 0.40267642974939155


                                                                         

Epoch 23, Loss: 0.34695930913861456


                                                                         

Epoch 24, Loss: 0.2974361328022747


                                                                         

Epoch 25, Loss: 0.260317989828429


                                                                         

Epoch 26, Loss: 0.22662238473736349


                                                                         

Epoch 27, Loss: 0.19947206308187423


                                                                         

Epoch 28, Loss: 0.18441646806788684


                                                                         

Epoch 29, Loss: 0.17044859009677138


                                                                         

Epoch 30, Loss: 0.15656379211008892




In [12]:
# Test loop for accuracy
def test_model(model, test_loader):
    """Print the model's top-1 accuracy over ``test_loader``.

    The model is moved to the module-level ``device`` and switched to eval
    mode (it is left in eval mode afterwards). A prediction counts as correct
    when the arg-max over the model's output scores equals the target index.

    Args:
        model: The ``nn.Module`` to evaluate.
        test_loader: Iterable yielding ``(inputs, targets)`` batches.
    """
    model.to(device)  # Same placement as train_model, so a CPU model also works
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device).long()  # Ensure LongTensor
            outputs = model(inputs)
            predicted = torch.argmax(outputs, dim=1)
            correct += (predicted == targets).sum().item()
            total += targets.size(0)
    if total == 0:
        # An empty loader would otherwise raise ZeroDivisionError below.
        print("Test Accuracy: N/A (no test samples)")
        return
    accuracy = 100 * correct / total
    print(f"Test Accuracy: {accuracy:.2f}%")

In [20]:
test_model(model, dataloader_test)

Test Accuracy: 7.36%
