In [4]:
from pathlib import Path
import numpy as np

## Defining dataset

In [39]:
import torch
from torch.utils.data import DataLoader, Dataset, RandomSampler
import math


class PySourceDataset(Dataset):
    """Character-level next-character prediction dataset over Python sources.

    Every ``*.py`` file found recursively under ``main_folder`` is read and
    joined with newlines into one corpus string. Item ``idx`` pairs the window
    of ``seq_len`` characters starting at ``idx`` (one-hot encoded) with the
    same window shifted one character to the right (as class indices).

    Args:
        main_folder: Directory searched recursively for ``*.py`` files.
        seq_len: Number of characters in each source/target window.
    """

    def __init__(self, main_folder, seq_len=80):
        self.main_folder = main_folder
        self.seq_len = seq_len

        # Read all python source files. Paths are sorted because rglob order
        # is filesystem-dependent, and both the corpus and the char -> index
        # mapping below depend on the order in which files are concatenated.
        files = [
            path.read_text(encoding="utf8")
            for path in sorted(Path(main_folder).rglob('*.py'))
        ]
        print("Number of .py files:", len(files))

        # Concatenate them with a newline separator
        self.corpus = "\n".join(files)
        self.corpus_len = len(self.corpus)
        print("Chars in corpus:", self.corpus_len)

        # Define useful mappings. dict.fromkeys keeps first-occurrence order,
        # so each character gets the index of its first appearance rank.
        # (The original looped over an undefined global `corpus` here.)
        self.c2i = {c: i for i, c in enumerate(dict.fromkeys(self.corpus))}
        self.i2c = {v: k for k, v in self.c2i.items()}
        self.n_chars = len(self.c2i)
        print("Number of distinct chars:", self.n_chars)

    def _get_onehot(self, c):
        """Return a float tensor of shape (1, n_chars) one-hot encoding ``c``."""
        t = torch.zeros(1, self.n_chars)
        t[0][self.c2i[c]] = 1
        return t

    def __len__(self):
        """Number of windows that still have a full shifted target window."""
        # Clamp at zero: a corpus shorter than seq_len would otherwise yield a
        # negative length, which makes len() raise ValueError.
        return max(0, self.corpus_len - self.seq_len)

    def __getitem__(self, idx):
        """Return ``(source, target)`` for the window starting at ``idx``.

        Returns:
            source: float tensor of shape (seq_len, n_chars), one-hot rows.
            target: long tensor of shape (seq_len,), indices of the characters
                one position after each source character.

        Raises:
            IndexError: if ``idx`` is outside ``[0, len(self))``.
        """
        # IndexError (rather than an assert) also rejects negative indices and
        # lets plain `for item in dataset` iteration terminate normally.
        if not 0 <= idx < len(self):
            raise IndexError(
                f"index {idx} out of range for dataset of length {len(self)}"
            )

        # raw text sequences
        source_seq = self.corpus[idx : idx + self.seq_len]
        target_seq = self.corpus[idx + 1 : idx + self.seq_len + 1]

        # one-hot: encode the whole window in one call instead of allocating
        # and stacking one tensor per character; .float() matches the float32
        # tensors that _get_onehot produces.
        source_idx = torch.tensor(
            [self.c2i[c] for c in source_seq], dtype=torch.long
        )
        source_seq_t = torch.nn.functional.one_hot(
            source_idx, num_classes=self.n_chars
        ).float()
        target_seq_t = torch.tensor(
            [self.c2i[c] for c in target_seq], dtype=torch.long
        )

        return (source_seq_t, target_seq_t)

In [40]:
pyd = PySourceDataset('./scikit-learn-master')

Number of .py files: 838
Chars in corpus: 10407415
Number of distinct chars: 164


In [41]:
import pytorch_lightning as pl
import torch.nn.functional as F
import IPython


class CharLM(pl.LightningModule):
    """LSTM character language model: LSTM -> linear projection -> log-softmax."""

    def __init__(
        self,
        input_size,
        hidden_size,
        output_size,
        num_layers=2,
        dropout=0.5,
        learning_rate=1e-4
    ):
        """Build the network layers.

        Args:
            input_size: Feature size of each input time step.
            hidden_size: Hidden size of the LSTM.
            output_size: Number of output classes per time step.
            num_layers: Number of stacked LSTM layers.
            dropout: Dropout value passed to the LSTM.
            learning_rate: Learning rate used by the Adam optimizer.
        """
        super().__init__()
        # Makes the constructor arguments available via self.hparams
        # (configure_optimizers reads self.hparams.learning_rate).
        self.save_hyperparameters()

        # batch_first=True: the LSTM is given batch-major input.
        self.lstm = torch.nn.LSTM(
            input_size,
            hidden_size,
            num_layers=num_layers,
            dropout=dropout,
            batch_first=True,
        )
        # Hidden-to-output projection.
        self.Who = torch.nn.Linear(hidden_size, output_size)
        # Log-probabilities are normalised over dim 2 of the projected output.
        self.softmax = torch.nn.LogSoftmax(dim=2)

    def forward(self, x):
        """Return ``(log_probs, lstm_state)`` for the input batch ``x``."""
        lstm_out, lstm_state = self.lstm(x)
        log_probs = self.softmax(self.Who(lstm_out))
        return log_probs, lstm_state

    def training_step(self, batch, batch_idx):
        """Compute and log the NLL loss for one ``(source, target)`` batch."""
        inputs, targets = batch
        log_probs, _ = self(inputs)

        # Flatten every dimension except the class dimension so nll_loss sees
        # one row per position; with nll_loss's default reduction this is the
        # mean over all positions in the batch.
        n_classes = log_probs.shape[2]
        flat_log_probs = log_probs.view(-1, n_classes)
        flat_targets = targets.view(-1)
        loss = F.nll_loss(flat_log_probs, flat_targets)

        self.log("train_loss", loss)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters."""
        step_size = self.hparams.learning_rate
        return torch.optim.Adam(self.parameters(), lr=step_size)

In [42]:
from aim.pytorch_lightning import AimLogger

# Training configuration.
hidden_size = 512
lr = 1e-4
pl.seed_everything(42)

py_dataset = PySourceDataset('./scikit-learn-master')
# Take the vocabulary size from the dataset that is actually trained on, so
# this cell does not depend on a separate instance built in an earlier cell.
input_size = output_size = py_dataset.n_chars

# NOTE(review): no shuffle/sampler is passed, so batches are consecutive
# windows of the corpus that overlap heavily — confirm this is intended.
train_loader = DataLoader(py_dataset, batch_size=3)

model = CharLM(input_size, hidden_size, output_size, learning_rate=lr)
aim_logger = AimLogger(
    experiment="pt_lightning_exp",
    train_metric_prefix="train_",
)
# These two statements were indented under no enclosing block, which is an
# IndentationError; they belong at top level.
trainer = pl.Trainer(logger=aim_logger)
trainer.fit(model, train_loader)

Number of .py files: 838
Chars in corpus: 10407415


GPU available: False, used: False
TPU available: None, using: 0 TPU cores

  | Name    | Type       | Params
---------------------------------------
0 | lstm    | LSTM       | 3.5 M 
1 | Who     | Linear     | 84.1 K
2 | softmax | LogSoftmax | 0     
---------------------------------------
3.6 M     Trainable params
0         Non-trainable params
3.6 M     Total params


Number of distinct chars: 164


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…






1