In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
# The middle element yielded by os.walk (the sub-directory names) is not needed here.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from collections import defaultdict
from tqdm import tqdm

In [2]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Load dataset from CSV
def load_data(csv_path, text_column='text'):
    """Read a CSV file and merge one text column into a single string.

    Args:
        csv_path: Location of the CSV file (anything ``pd.read_csv`` accepts).
        text_column: Name of the column that holds the text. Defaults to
            ``'text'``, the column this notebook reads.

    Returns:
        str: The non-missing cells of ``text_column``, in row order, joined by
        single spaces.

    Raises:
        KeyError: If ``text_column`` is not a column of the CSV.
    """
    df = pd.read_csv(csv_path)
    if text_column not in df.columns:
        raise KeyError(
            f"Column {text_column!r} not found in {csv_path!r}; "
            f"available columns: {list(df.columns)}"
        )
    # Empty cells are parsed as NaN (a float) and would make str.join raise
    # TypeError, so drop them; astype(str) covers cells parsed as numbers.
    cells = df[text_column].dropna().astype(str)
    return ' '.join(cells.tolist())


In [4]:
# Tokenization and vocabulary creation
def tokenize(text):
    """Split text on whitespace and build the vocabulary lookup tables.

    Args:
        text: The raw corpus as one string.

    Returns:
        tuple: ``(words, vocab, word_to_idx, idx_to_word)`` where ``words`` is
        the token list in corpus order, ``vocab`` is the sorted list of unique
        tokens, ``word_to_idx`` maps each token to its position in ``vocab``,
        and ``idx_to_word`` is the inverse mapping.
    """
    tokens = text.split()
    unique_sorted = sorted(set(tokens))
    index_of = dict(zip(unique_sorted, range(len(unique_sorted))))
    word_at = dict(enumerate(unique_sorted))
    return tokens, unique_sorted, index_of, word_at

In [5]:
# Convert words to one-hot encoding
def one_hot_encode(words, vocab_size, word_to_idx):
    """Build a one-hot matrix with one row per word.

    Args:
        words: Sequence of tokens to encode.
        vocab_size: Number of columns in the result.
        word_to_idx: Mapping from token to its column index.

    Returns:
        np.ndarray: float32 array of shape ``(len(words), vocab_size)`` in
        which row ``i`` is 1.0 at column ``word_to_idx[words[i]]`` and 0.0
        elsewhere.

    Raises:
        KeyError: If a word is missing from ``word_to_idx``.
    """
    num_words = len(words)
    matrix = np.zeros((num_words, vocab_size), dtype=np.float32)
    # Look every column index up first, then set all the ones in a single
    # indexed assignment instead of one element at a time.
    columns = np.array([word_to_idx[token] for token in words], dtype=np.intp)
    matrix[np.arange(num_words), columns] = 1.0
    return matrix


In [7]:
# encoded = one_hot_encode(words,len(vocab),word_to_idx)

# print(encoded[0])

In [14]:

# Define PyTorch Dataset class
class TextDataset(Dataset):
    """Sliding-window next-word dataset over a one-hot encoded word sequence.

    Sample ``i`` pairs the window ``encoded_words[i : i + seq_length]`` with the
    class index (argmax) of row ``i + seq_length``.

    Args:
        encoded_words: 2-D array-like of shape ``(num_words, vocab_size)``.
        seq_length: Number of consecutive rows in each input window (>= 1).

    Attributes:
        inputs: float32 tensor of shape ``(num_samples, seq_length, vocab_size)``.
            It is a strided view over a single copy of the encoded sequence, so
            overlapping windows share memory instead of each being stored.
        targets: int64 tensor of shape ``(num_samples,)`` holding class indices,
            as expected by ``CrossEntropyLoss``.

    Raises:
        ValueError: If ``encoded_words`` is not 2-D or ``seq_length`` is < 1.
    """

    def __init__(self, encoded_words, seq_length):
        # Private float32 copy: later changes to the caller's array cannot
        # affect the dataset.
        encoded = np.array(encoded_words, dtype=np.float32)
        if encoded.ndim != 2:
            raise ValueError(
                f"encoded_words must be 2-D (num_words, vocab_size), got shape {encoded.shape}"
            )
        if seq_length < 1:
            raise ValueError(f"seq_length must be >= 1, got {seq_length}")

        num_samples = encoded.shape[0] - seq_length
        if num_samples <= 0:
            # Too few rows to form even one (window, next-word) pair.
            self.inputs = torch.empty((0, seq_length, encoded.shape[1]), dtype=torch.float32)
            self.targets = torch.empty((0,), dtype=torch.long)
            return

        base = torch.from_numpy(encoded)
        # unfold yields a (num_words - seq_length + 1, vocab_size, seq_length)
        # view; swap the last two axes to get (window, step, vocab) and drop
        # the final window, which has no following word to predict.
        # Stacking every window explicitly would need seq_length times the
        # memory of the one-hot matrix itself.
        self.inputs = base.unfold(0, seq_length, 1).transpose(1, 2)[:num_samples]
        # The target of window i is the class index of row i + seq_length.
        self.targets = torch.as_tensor(np.argmax(encoded[seq_length:], axis=1), dtype=torch.long)

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return self.inputs[idx], self.targets[idx]

In [7]:
# Define LSTM model with One-Hot Encoding
class LSTMLanguageModel(nn.Module):
    """Single-layer LSTM followed by dropout and a linear projection.

    The LSTM is batch-first, so ``forward`` takes input shaped
    ``(batch, seq_len, input_size)`` and returns unnormalised scores shaped
    ``(batch, output_size)`` computed from the last time step only.

    Args:
        input_size: Feature size of each time step.
        hidden_size: Size of the LSTM hidden state.
        output_size: Number of output classes.
        dropout_rate: Dropout probability applied before the final layer.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.3):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        self.dropout = nn.Dropout(p=dropout_rate)
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        sequence_out, _state = self.lstm(x)
        final_step = sequence_out[:, -1, :]
        # Raw logits are returned: CrossEntropyLoss applies the softmax itself.
        return self.fc(self.dropout(final_step))

In [16]:
# Training loop
def train_model(model, train_loader, criterion, optimizer, epochs=50):
    """Train ``model`` on ``train_loader`` and print the mean loss of each epoch.

    Uses the notebook-level ``device`` for the model and every batch, and
    ``tqdm`` for a per-epoch progress bar.

    Args:
        model: Module to optimise; it is moved to ``device`` in place.
        train_loader: Iterable of ``(inputs, targets)`` batches that supports ``len``.
        criterion: Loss called as ``criterion(outputs, targets)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        epochs: Number of full passes over ``train_loader``.

    Returns:
        None. The per-epoch mean batch loss is printed.
    """
    model.to(device)  # Move model to the selected device
    num_batches = len(train_loader)
    if num_batches == 0:
        # Nothing to iterate over; avoid dividing by zero when averaging below.
        print("train_model: train_loader is empty; nothing to train.")
        return
    for epoch in range(epochs):
        # Re-enter training mode each epoch: an earlier evaluation may have left
        # the model in eval mode, which silently disables dropout.
        model.train()
        total_loss = 0.0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=False)
        for step, (inputs, targets) in enumerate(progress_bar, start=1):
            inputs = inputs.to(device).float()  # Ensure FloatTensor for LSTM
            targets = targets.to(device).long()  # Ensure LongTensor for CrossEntropyLoss
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            # Running mean over the batches seen so far. Dividing by the full
            # batch count would under-report the loss until the epoch's last step.
            progress_bar.set_postfix(loss=total_loss / step)
        print(f"Epoch {epoch+1}, Loss: {total_loss/num_batches}")

In [17]:
# Test loop for accuracy
def test_model(model, test_loader):
    """Print the top-1 accuracy of ``model`` over ``test_loader``.

    Uses the notebook-level ``device`` for the model and every batch. The model
    is switched to eval mode and is left in that mode when the function returns.

    Args:
        model: Module to evaluate; it is moved to ``device`` in place.
        test_loader: Iterable of ``(inputs, targets)`` batches, where
            ``targets`` holds class indices.

    Returns:
        None. The accuracy is printed as a percentage.
    """
    model.to(device)  # Same placement as train_model, so a CPU model can be evaluated directly
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs = inputs.to(device).float()  # Ensure FloatTensor for LSTM
            targets = targets.to(device).long()  # Ensure LongTensor for accuracy calculation
            outputs = model(inputs)
            predicted = torch.argmax(outputs, dim=1)
            correct += (predicted == targets).sum().item()
            total += targets.size(0)
    if total == 0:
        # An empty loader would otherwise raise ZeroDivisionError below.
        print("Test Accuracy: n/a (no samples in test_loader)")
        return
    accuracy = 100 * correct / total
    print(f"Test Accuracy: {accuracy:.2f}%")

In [10]:
# Report which device was selected so the run log shows whether a GPU is in use.
print(f"Using device: {device}")


Using device: cuda


In [18]:
# Main execution
csv_path = "/kaggle/input/100poem/poems.csv"  # Change this to your actual CSV file path
text = load_data(csv_path)
words, vocab, word_to_idx, idx_to_word = tokenize(text)
encoded_words = one_hot_encode(words, len(vocab), word_to_idx)

seq_length = 10  # Number of context words fed to the model per sample
batch_size = 32
split_seed = 42  # Fixed seed so the train/test partition is the same on every run

dataset = TextDataset(encoded_words, seq_length)
# Both partitions must be non-empty: a shuffled DataLoader rejects an empty
# dataset, and an empty test split has no accuracy to report.
assert len(dataset) >= 2, (
    f"Corpus too short: {len(words)} words yield {len(dataset)} samples "
    f"with seq_length={seq_length}"
)
train_size = int(0.9 * len(dataset))  # 90% train / 10% test
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(split_seed),
)
assert len(train_dataset) + len(test_dataset) == len(dataset)

# Pinned host memory only speeds up host-to-GPU copies; requesting it on a
# CPU-only machine just produces a warning.
use_pinned_memory = device.type == "cuda"
dataloader_train = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=use_pinned_memory)
dataloader_test = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=use_pinned_memory)

In [19]:
# Model hyperparameters: one-hot inputs and class scores both span the vocabulary.
input_size = output_size = len(vocab)
hidden_size = 256
dropout_rate = 0.3  # Dropout to reduce overfitting

model = LSTMLanguageModel(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
    dropout_rate=dropout_rate,
).to(device)
criterion = nn.CrossEntropyLoss()
# weight_decay adds an L2 penalty on the parameters.
optimizer = optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-5)

# Train with train_model's default number of epochs, then score the test split.
train_model(model, dataloader_train, criterion, optimizer)

test_model(model, dataloader_test)


                                                                     

Epoch 1, Loss: 7.464825186921262


                                                                      

Epoch 2, Loss: 7.054550866971071


                                                                      

Epoch 3, Loss: 6.924604674180348


                                                                      

Epoch 4, Loss: 6.75347157456409


                                                                      

Epoch 5, Loss: 6.496244740897212


                                                                      

Epoch 6, Loss: 6.166610841778503


                                                                      

Epoch 7, Loss: 5.787975005719853


                                                                      

Epoch 8, Loss: 5.346546948983751


                                                                      

Epoch 9, Loss: 4.833680253604363


                                                                       

Epoch 10, Loss: 4.287534544865291


                                                                       

Epoch 11, Loss: 3.7380456595585265


                                                                       

Epoch 12, Loss: 3.2006163602245263


                                                                       

Epoch 13, Loss: 2.6661528259858316


                                                                       

Epoch 14, Loss: 2.1883906338406707


                                                                       

Epoch 15, Loss: 1.74339821335228


                                                                       

Epoch 16, Loss: 1.3714973129246426


                                                                       

Epoch 17, Loss: 1.0713105546845787


                                                                       

Epoch 18, Loss: 0.8234375852277909


                                                                        

Epoch 19, Loss: 0.6388864118563032


                                                                        

Epoch 20, Loss: 0.5169267618022431


                                                                        

Epoch 21, Loss: 0.41804694375772583


                                                                        

Epoch 22, Loss: 0.3431486703840823


                                                                        

Epoch 23, Loss: 0.28381360316884585


                                                                        

Epoch 24, Loss: 0.2441704174953288


                                                                        

Epoch 25, Loss: 0.22173176925286822


                                                                        

Epoch 26, Loss: 0.19028067785775524


                                                                        

Epoch 27, Loss: 0.17012512620709752


                                                                        

Epoch 28, Loss: 0.15259110590378785


                                                                        

Epoch 29, Loss: 0.1435304281258977


                                                                        

Epoch 30, Loss: 0.1297616028526648


                                                                        

KeyboardInterrupt: 

In [20]:
# Score the model's current weights on the test split again.
test_model(model, dataloader_test)

Test Accuracy: 2.99%
