In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from collections import defaultdict
from tqdm import tqdm


In [2]:
# Load dataset from CSV
def load_data(csv_path):
    df = pd.read_csv(csv_path)
    text = ' '.join(df['text'].tolist())  # Assuming 'poem' column has text
    return text

In [None]:
# # load data is returning a single string
# text = load_data('/kaggle/input/poems100/poems.csv')

# print(text[0])

In [3]:
def tokenize(text):
    words = text.split()
    vocab = sorted(set(words))
    word_to_idx = {word: i for i, word in enumerate(vocab)}
    idx_to_word = {i: word for word, i in word_to_idx.items()}
    return words, vocab, word_to_idx, idx_to_word


In [None]:
# words, vocab, word_to_idx, idx_to_word = tokenize(text)

# print(words[3])
# print(len(words))
# print(len(vocab))


# word is a lsit of words , vocab is a list of unique words word_to_idx is a dictionar maps each word to an index
# idx_to_word is a dictionar which is the reverse of word_to_idx

In [4]:
def one_hot_encode(words, vocab_size, word_to_idx):
    encoded = np.zeros((len(words), vocab_size), dtype=np.float32)
    for i, word in enumerate(words):
        encoded[i, word_to_idx[word]] = 1.0
    return encoded

In [None]:
# encoded = one_hot_encode(words,len(vocab),word_to_idx)

# print(encoded[0])

In [5]:
class TextDataset(Dataset):
    def __init__(self, encoded_words, seq_length):
        self.inputs = np.array([encoded_words[i:i+seq_length] for i in range(len(encoded_words) - seq_length)])
        self.targets = np.array([encoded_words[i+seq_length] for i in range(len(encoded_words) - seq_length)])
        self.inputs = torch.tensor(self.inputs, dtype=torch.float32)
        self.targets = torch.tensor(self.targets, dtype=torch.float32)
    
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, idx):
        return self.inputs[idx], self.targets[idx]

In [20]:
# Define simple RNN model
class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleRNN, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        out, _ = self.rnn(x)
        out = self.fc(out[:, -1, :]) #last timestep ko hi tu fc layer ke through send karega
        return out


In [7]:
# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [26]:
# Training loop
def train_model(model, train_loader, criterion, optimizer, epochs=20):
    model.to(device)  # Move model to GPU
    for epoch in range(epochs):
        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=False)
        for inputs, targets in progress_bar:
            inputs, targets = inputs.to(device), targets.to(device)  # Move data to GPU
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            progress_bar.set_postfix(loss=total_loss/len(train_loader))
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}")


In [9]:
print(f"Using device: {device}")


Using device: cuda


In [29]:
csv_path = "/kaggle/input/poems100/poems.csv"  # Change this to your actual CSV file path
text = load_data(csv_path)
words, vocab, word_to_idx, idx_to_word = tokenize(text)
encoded_words = one_hot_encode(words, len(vocab), word_to_idx)

seq_length = 10  # Adjusted sequence length for better training
batch_size = 32  # Increased batch size to fully utilize T4 GPU

dataset = TextDataset(encoded_words, seq_length)
print(f"Dataset size: {len(dataset)}")
train_size = int(0.8 * len(dataset))  # Increased training split
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

dataloader_train = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True)
dataloader_test = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)


Dataset size: 24724


In [30]:

input_size = len(vocab)
hidden_size = 256  # Increased hidden size for better learning
output_size = len(vocab)

model = SimpleRNN(input_size, hidden_size, output_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005,weight_decay = 1e-5)  # Reduced learning rate for stability

train_model(model, dataloader_train, criterion, optimizer)


                                                                      

Epoch 1, Loss: 7.579821918623128


                                                                      

Epoch 2, Loss: 6.916244836539175


                                                                      

Epoch 3, Loss: 6.5588430711256285


                                                                      

Epoch 4, Loss: 6.124082422795704


                                                                      

Epoch 5, Loss: 5.663477564859467


                                                                      

Epoch 6, Loss: 5.182097352571749


                                                                       

Epoch 7, Loss: 4.683978407756578


                                                                       

Epoch 8, Loss: 4.186225919615664


                                                                       

Epoch 9, Loss: 3.6747380342930316


                                                                        

Epoch 10, Loss: 3.1807554942918324


                                                                        

Epoch 11, Loss: 2.7052283348675115


                                                                        

Epoch 12, Loss: 2.274581965028951


                                                                        

Epoch 13, Loss: 1.8810657447682444


                                                                        

Epoch 14, Loss: 1.5410831442557167


                                                                        

Epoch 15, Loss: 1.2351031115636686


                                                                        

Epoch 16, Loss: 1.0124573686396363


                                                                        

Epoch 17, Loss: 0.8333989506488085


                                                                         

Epoch 18, Loss: 0.6664990308018994


                                                                         

Epoch 19, Loss: 0.5527187110389762


                                                                         

Epoch 20, Loss: 0.45215869510790063




In [31]:
# Test loop for accuracy
def test_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            predicted = torch.argmax(outputs, dim=1)
            target_labels = torch.argmax(targets, dim=1)
            correct += (predicted == target_labels).sum().item()
            total += targets.size(0)
    accuracy = 100 * correct / total
    print(f"Test Accuracy: {accuracy:.2f}%")

In [32]:
test_model(model, dataloader_test)

Test Accuracy: 3.68%
