In [None]:
# Mount the user's Google Drive inside the Colab VM at /content/drive.
from google.colab import drive 
drive.mount("/content/drive")
# Project folder on the mounted drive (not referenced by the other cells shown here).
DRIVE_PATH = "/content/drive/MyDrive/NLPProject"

Mounted at /content/drive


In [None]:
! git clone https://github.com/adamklec/copynet.git
! pip install tensorboardX

Cloning into 'copynet'...
remote: Enumerating objects: 30, done.[K
remote: Total 30 (delta 0), reused 0 (delta 0), pack-reused 30[K
Unpacking objects: 100% (30/30), done.
Collecting tensorboardX
[?25l  Downloading https://files.pythonhosted.org/packages/07/84/46421bd3e0e89a92682b1a38b40efc22dafb6d8e3d947e4ceefd4a5fabc7/tensorboardX-2.2-py2.py3-none-any.whl (120kB)
[K     |████████████████████████████████| 122kB 8.1MB/s 
Installing collected packages: tensorboardX
Successfully installed tensorboardX-2.2


In [None]:
import os

# Location of the cloned repository and of the data folder created inside it.
REPO_DIR = "/content/copynet/"
DATA_DIR = "/content/copynet/data"

# Switch the working directory to the checkout; the later bare imports
# (dataset, model, evaluate, utils) presumably resolve against it.
print(os.getcwd())
os.chdir(REPO_DIR)
print(os.getcwd())

# exist_ok=True replaces the exists()/makedirs() pair: no check-then-create gap,
# and a non-directory squatting on the path is reported instead of silently kept.
os.makedirs(DATA_DIR, exist_ok=True)

/content
/content/copynet


In [None]:
import argparse
import time
import numpy as np
import pandas as pd
import torch
import string
from torch import optim
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset
import re
from dataset import SequencePairDataset
from model.encoder_decoder import EncoderDecoder
from evaluate import evaluate
from utils import to_np, trim_seqs

from tensorboardX import SummaryWriter
from tqdm import tqdm
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

In [None]:
# TensorBoard text tag -> fixed probe sentence. The model's reply to each probe is
# logged during training so progress can be inspected by eye.
_SAMPLE_PROMPTS = (
    ('schedule', "Amy, Please schedule a meeting with Marcos on Tuesday April 3rd. Adam Kleczewski"),
    ('cancel', "Amy, Please cancel this meeting. Adam Kleczewski"),
)


def _log_sample_responses(encoder_decoder, global_step):
    """Decode every probe sentence with the current model and log the reply as TensorBoard text."""
    for tag, input_string in _SAMPLE_PROMPTS:
        output_string = encoder_decoder.get_response(input_string)
        writer.add_text(tag, output_string, global_step=global_step)


def train(encoder_decoder: EncoderDecoder,
          train_data_loader: DataLoader,
          model_name,
          val_data_loader: DataLoader,
          keep_prob,
          teacher_forcing_schedule,
          lr,
          max_length):
    """Train ``encoder_decoder`` for one epoch per entry of ``teacher_forcing_schedule``.

    Every batch is reordered by descending input length (count of non-zero
    indices), run through the model, and optimised with Adam against an NLL loss
    that ignores target index 0. After each epoch the model is evaluated on
    ``val_data_loader``, embeddings and probe responses are logged, and the whole
    model object is saved as ``./model/<model_name>/<model_name>_<epoch>.pt``.

    All logging goes through a module-level ``writer`` (a tensorboardX
    ``SummaryWriter``) that must exist before this function is called.

    Args:
        encoder_decoder: model to train; called with the sorted inputs and lengths.
        train_data_loader: yields ``(input_idxs, target_idxs, input_tokens,
            target_tokens)`` batches; only the two index tensors are used here.
        model_name: run name used to build the checkpoint path.
        val_data_loader: passed to ``evaluate`` at the end of every epoch.
        keep_prob: forwarded to the model as ``keep_prob``.
        teacher_forcing_schedule: iterable with one value per epoch, forwarded
            to the model as ``teacher_forcing``.
        lr: Adam learning rate.
        max_length: per-example output length used to flatten the log-probs;
            assumed to equal the width of ``target_idxs`` -- TODO confirm.
    """
    global_step = 0
    loss_function = torch.nn.NLLLoss(ignore_index=0)
    optimizer = optim.Adam(encoder_decoder.parameters(), lr=lr)
    model_path = './model/' + model_name + '/'

    for epoch, teacher_forcing in enumerate(teacher_forcing_schedule):
        print('epoch %i' % epoch, flush=True)

        for input_idxs, target_idxs, input_tokens, target_tokens in tqdm(train_data_loader):
            # input_idxs and target_idxs have dim (batch_size x max_len)
            # they are NOT sorted by length

            lengths = (input_idxs != 0).long().sum(dim=1)
            sorted_lengths, order = torch.sort(lengths, descending=True)

            # Reorder rows by length and drop the input columns beyond the longest sequence.
            input_variable = Variable(input_idxs[order, :][:, :max(lengths)])
            target_variable = Variable(target_idxs[order, :])

            optimizer.zero_grad()
            output_log_probs, output_seqs = encoder_decoder(input_variable,
                                                            list(sorted_lengths),
                                                            targets=target_variable,
                                                            keep_prob=keep_prob,
                                                            teacher_forcing=teacher_forcing)

            batch_size = input_variable.shape[0]

            # One row of log-probs per (example, time step) to match the flattened targets.
            flattened_outputs = output_log_probs.view(batch_size * max_length, -1)

            batch_loss = loss_function(flattened_outputs, target_variable.contiguous().view(-1))
            batch_loss.backward()
            optimizer.step()

            batch_outputs = trim_seqs(output_seqs)

            # corpus_bleu expects a list of reference lists per hypothesis; index 0 is dropped.
            batch_targets = [[list(seq[seq > 0])] for seq in list(to_np(target_variable))]

            batch_bleu_score = corpus_bleu(batch_targets, batch_outputs, smoothing_function=SmoothingFunction().method1)

            # Probe responses: every step at first, then every 10th, then every 100th during the first two epochs.
            if global_step < 10 or (global_step % 10 == 0 and global_step < 100) or (global_step % 100 == 0 and epoch < 2):
                _log_sample_responses(encoder_decoder, global_step)

            if global_step % 100 == 0:

                writer.add_scalar('train_batch_loss', batch_loss, global_step)
                writer.add_scalar('train_batch_bleu_score', batch_bleu_score, global_step)

                for tag, value in encoder_decoder.named_parameters():
                    tag = tag.replace('.', '/')
                    writer.add_histogram('weights/' + tag, value, global_step, bins='doane')
                    # A parameter that received no gradient has grad None; skip its
                    # histogram instead of handing None to to_np.
                    if value.grad is not None:
                        writer.add_histogram('grads/' + tag, to_np(value.grad), global_step, bins='doane')

            global_step += 1

        val_loss, val_bleu_score = evaluate(encoder_decoder, val_data_loader)

        writer.add_scalar('val_loss', val_loss, global_step=global_step)
        writer.add_scalar('val_bleu_score', val_bleu_score, global_step=global_step)

        encoder_embeddings = encoder_decoder.encoder.embedding.weight.data
        encoder_vocab = encoder_decoder.lang.tok_to_idx.keys()
        writer.add_embedding(encoder_embeddings, metadata=encoder_vocab, global_step=0, tag='encoder_embeddings')

        decoder_embeddings = encoder_decoder.decoder.embedding.weight.data
        decoder_vocab = encoder_decoder.lang.tok_to_idx.keys()
        writer.add_embedding(decoder_embeddings, metadata=decoder_vocab, global_step=0, tag='decoder_embeddings')

        _log_sample_responses(encoder_decoder, global_step)

        print('val loss: %.5f, val BLEU score: %.5f' % (val_loss, val_bleu_score), flush=True)
        # Saves the entire model object (not just a state dict) once per epoch.
        torch.save(encoder_decoder, "%s%s_%i.pt" % (model_path, model_name, epoch))

        print('-' * 100, flush=True)


def main(model_name, use_cuda, batch_size, teacher_forcing_schedule, keep_prob, val_size, lr, decoder_type, vocab_limit, hidden_size, embedding_size, max_length, seed=42):
    """Load or create the encoder-decoder for ``model_name`` together with its datasets.

    If ``./model/<model_name>/<model_name>.pt`` exists, the saved model is loaded
    and the datasets are rebuilt with its stored language. Otherwise the run
    directory is created, the datasets and a fresh ``EncoderDecoder`` are built,
    and the initial model is saved to that path. The model is then moved to the
    GPU or CPU according to ``use_cuda``.

    ``batch_size``, ``teacher_forcing_schedule``, ``keep_prob`` and ``lr`` are
    only echoed in the log lines here; this function does not start training.

    Args:
        model_name: bare run name (no path separators); used as both the
            sub-directory of ``./model/`` and the checkpoint file stem.
        seed: forwarded to ``SequencePairDataset`` for the train/val split.

    Returns:
        ``(encoder_decoder, train_dataset, val_dataset)`` so a following cell can
        build data loaders and call ``train``. Earlier versions returned ``None``
        and the objects were lost when the function exited.
    """
    model_path = './model/' + model_name + '/'
    # One spelling of the checkpoint location for both load and save.
    checkpoint_path = model_path + model_name + '.pt'

    # TODO: Change logging to reflect loaded parameters

    print("training %s with use_cuda=%s, batch_size=%i"% (model_name, use_cuda, batch_size), flush=True)
    print("teacher_forcing_schedule=", teacher_forcing_schedule, flush=True)
    print("keep_prob=%f, val_size=%f, lr=%f, decoder_type=%s, vocab_limit=%i, hidden_size=%i, embedding_size=%i, max_length=%i, seed=%i" % (keep_prob, val_size, lr, decoder_type, vocab_limit, hidden_size, embedding_size, max_length, seed), flush=True)

    # Test for the checkpoint file, not just the directory: an interrupted first
    # run can leave the directory behind without a model in it.
    if os.path.isfile(checkpoint_path):

        print("loading encoder and decoder from model_path", flush=True)
        # NOTE: torch.load unpickles a whole model object; only load checkpoints you trust.
        encoder_decoder = torch.load(checkpoint_path)
        use_extended_vocab = (encoder_decoder.decoder_type == 'copy')

        print("creating training and validation datasets with saved languages", flush=True)
        # seed is passed here as well so a resumed run uses the same split
        # arguments as the run that created the model.
        train_dataset = SequencePairDataset(lang=encoder_decoder.lang,
                                            use_cuda=use_cuda,
                                            is_val=False,
                                            val_size=val_size,
                                            seed=seed,
                                            use_extended_vocab=use_extended_vocab)

        val_dataset = SequencePairDataset(lang=encoder_decoder.lang,
                                          use_cuda=use_cuda,
                                          is_val=True,
                                          val_size=val_size,
                                          seed=seed,
                                          use_extended_vocab=use_extended_vocab)

    else:
        os.makedirs(model_path, exist_ok=True)
        use_extended_vocab = (decoder_type == 'copy')

        print("creating training and validation datasets", flush=True)
        train_dataset = SequencePairDataset(vocab_limit=vocab_limit,
                                            use_cuda=use_cuda,
                                            is_val=False,
                                            val_size=val_size,
                                            seed=seed,
                                            use_extended_vocab=use_extended_vocab)

        # The validation set reuses the language built from the training set.
        val_dataset = SequencePairDataset(lang=train_dataset.lang,
                                          use_cuda=use_cuda,
                                          is_val=True,
                                          val_size=val_size,
                                          seed=seed,
                                          use_extended_vocab=use_extended_vocab)

        print("creating encoder-decoder model", flush=True)
        encoder_decoder = EncoderDecoder(train_dataset.lang,
                                         max_length,
                                         embedding_size,
                                         hidden_size,
                                         decoder_type)

        torch.save(encoder_decoder, checkpoint_path)

    if use_cuda:
        encoder_decoder = encoder_decoder.cuda()
    else:
        encoder_decoder = encoder_decoder.cpu()

    return encoder_decoder, train_dataset, val_dataset




In [None]:
# data.pkl is not part of the cloned repository: upload it to the Colab runtime
# before running this cell.
# NOTE: unpickling can execute arbitrary code -- only load a data.pkl you trust.
DATA_PKL_PATH = "/content/data.pkl"
if not os.path.isfile(DATA_PKL_PATH):
    # Same exception type pandas would raise, but with the remedy spelled out.
    raise FileNotFoundError("%s not found: upload data.pkl to the Colab runtime first" % DATA_PKL_PATH)
df = pd.read_pickle(DATA_PKL_PATH)

def remove_notes(word_list):
  """Strip Hebrew marks from every token and join the tokens with single spaces.

  Code points U+0591-U+05BD, U+05BF-U+05C2 and U+05C4-U+05C7 are deleted;
  U+05BE and U+05C3 fall outside those ranges and are kept.
  """
  marks = r'[\u0591-\u05BD\u05BF-\u05C2\u05C4-\u05C7]'
  cleaned = []
  for token in word_list:
    cleaned.append(re.sub(marks, '', token))
  return " ".join(cleaned)

# Collapse each token list into one space-joined string with the marks stripped.
df["input_sentence"] = df["input_sentence"].apply(remove_notes)
df["target_sentences"] = df["target_sentences"].apply(remove_notes)

# Pair the 27 Hebrew letter forms, in this order, with 'a'..'z' plus '@', then
# substitute them throughout the frame (regex replacement inside string cells).
results = ['א', 'ב', 'ג', 'ד', 'ה', 'ו', 'ז', 'ח', 'ט', 'י', 'ך', 'כ', 'ל', 'ם', 'מ', 'ן', 'נ', 'ס', 'ע', 'ף', 'פ', 'ץ', 'צ', 'ק', 'ר', 'ש', 'ת']
latin_symbols = string.ascii_lowercase + "@"
map_letters = dict(zip(results, latin_symbols))
df = df.replace(map_letters, regex=True)

# Store character-reversed copies of both sentence columns.
df["input_sentence_reveresed"] = df["input_sentence"].apply(lambda text: text[::-1])
df["target_sentences_reveresed"] = df["target_sentences"].apply(lambda text: text[::-1])


FileNotFoundError: ignored

In [None]:
# dash to spaces 

In [None]:
# Display the cleaned, transliterated input sentences as a sanity check.
df["input_sentence"]

0           fjmbzn sfy l@qf@ fmaz@f madn amejn jefe fjsz
1               oaf@ faybs amt fhozjn zmze fuxdjen fwbaf
2         szf lp a@־oze jefe wfe azy llm jzyam bqj fjszf
3      wba jwa lm fosme zqe szyjn obp zo@ boruy ab@n ...
4      mamejqf qgbhe qmle maoy wsxjn en sm־lp en lj־q...
                             ...                        
125                   a@־sby jmd fzmh a@־zmh jmd fayulzd
126     ysef zu@ ajz jzosf ma azy zu@n zn fqbme qyde ebe
127    op־ejay ojn mz@f@ owyjn fqmaf ejay fbaz @of@ a...
128    bp־sojqdb qhzfp xybp ge hoze bqj־zqe lbzjn hoz...
129    jsxb famej jwhx amej abyen amej ab@n amej jefe...
Name: input_sentence, Length: 1172, dtype: object

In [None]:
# res = []
# for sentence in df.input_sentence.values:
#   for token in sentence.split(" "):
#     for char in token:
#       res.append(char)

# charcters = set(res)

In [None]:
# List the distinct split labels present in the data_type column.
df["data_type"].unique()

array(['test', 'val'], dtype=object)

In [None]:
# Running file counter per split, and the list of files written for each split.
vals = {"test": 0, "val": 0}
train_val_test = {"test": [], "val": []}

# Write one file per row, named <split><counter>.txt, holding the reversed
# input sentence and the reversed target sentence(s) with no index or header.
for _, row in df.iterrows():
    split = row["data_type"]
    path = "/content/copynet/data/{}{}.txt".format(split, vals[split])
    pair = row[["input_sentence_reveresed", "target_sentences_reveresed"]]
    pair.to_csv(path, index=False, header=False)
    train_val_test[split].append(path)
    vals[split] += 1

In [None]:
class CustomDataSet(Dataset):
  """
  Dataset over a list of file paths, one example per file.

  Each file is parsed as a header-less CSV and must flatten to exactly two
  values, returned as the (input, target) pair. ``transform`` is stored on the
  instance but is not applied anywhere in this class.
  """
  def __init__(self, file_name_list, transform=None):
    self.transform = transform
    self.file_name_list = file_name_list

  def __len__(self):
    # One example per listed file.
    return len(self.file_name_list)

  def __getitem__(self, idx):
    # Accept tensor indices as well as plain integers.
    if torch.is_tensor(idx):
        idx = int(idx.item())
    frame = pd.read_csv(self.file_name_list[idx], header=None)
    # Flatten the parsed file; unpacking fails unless it holds exactly two values.
    input_val, target_val = frame.T.values.ravel()
    return (input_val, target_val)

In [None]:
# train_data_loader = dataloaders_dict["test"]
# val_data_loader = dataloaders_dict["val"]

# NOTE(review): encoder_decoder, train_data_loader, val_data_loader, model_name,
# keep_prob, teacher_forcing_schedule and lr are not assigned at notebook level
# by any visible cell; they must exist before this cell is run.

# Use the GPU only when one is attached; the former `if True:` always called
# .cuda(), which fails on a CPU-only runtime.
if torch.cuda.is_available():
    encoder_decoder = encoder_decoder.cuda()
else:
    encoder_decoder = encoder_decoder.cpu()

train(encoder_decoder,
      train_data_loader,
      model_name,
      val_data_loader,
      keep_prob,
      teacher_forcing_schedule,
      lr,
      encoder_decoder.decoder.max_length)

NameError: ignored

{'test': <torch.utils.data.dataloader.DataLoader at 0x7f274a595390>,
 'val': <torch.utils.data.dataloader.DataLoader at 0x7f274a371390>}

In [None]:
    # NOTE(review): `image_datasets` is not assigned in any visible cell, so this
    # print presumably raises NameError on a fresh kernel -- looks like a leftover
    # fragment; confirm before relying on it.
    print(image_datasets)
    # Create training and validation dataloaders
    # Never shuffle the test set


In [None]:
if __name__ == '__main__':
    # Entry point: parse hyper-parameters, create the TensorBoard writer that
    # train() logs to, build the teacher-forcing schedule and call main().
    import sys  # local import: only this entry-point cell needs it

    parser = argparse.ArgumentParser(description='Parse training parameters')
    # The positional must be named model_name because it is read back below as
    # args.model_name; it was previously declared as './data/'.
    parser.add_argument('model_name', type=str,
                        help='the name of a subdirectory of ./model/ that '
                             'contains encoder and decoder model files')

    parser.add_argument('--epochs', type=int, default=10,
                        help='the number of epochs to train')

    parser.add_argument('--use_cuda', action='store_true',
                        help='flag indicating that cuda will be used')

    parser.add_argument('--batch_size', type=int, default=128,
                        help='number of examples in a batch')

    parser.add_argument('--teacher_forcing_fraction', type=float, default=0.5,
                        help='fraction of batches that will use teacher forcing during training')

    parser.add_argument('--scheduled_teacher_forcing', action='store_true',
                        help='Linearly decrease the teacher forcing fraction '
                             'from 1.0 to 0.0 over the specified number of epocs')

    parser.add_argument('--keep_prob', type=float, default=1.0,
                        help='Probablity of keeping an element in the dropout step.')

    parser.add_argument('--val_size', type=float, default=0.1,
                        help='fraction of data to use for validation')

    parser.add_argument('--lr', type=float, default=0.001,
                        help='Learning rate.')

    parser.add_argument('--decoder_type', type=str, default='copy',
                        help="Allowed values 'copy' or 'attn'")

    # Trailing spaces added: the adjacent literals used to concatenate into
    # "vocabwill" and "frequentlyoccurring".
    parser.add_argument('--vocab_limit', type=int, default=5000,
                        help='When creating a new Language object the vocab '
                             'will be truncated to the most frequently '
                             'occurring words in the training dataset.')

    parser.add_argument('--hidden_size', type=int, default=256,
                        help='The number of RNN units in the encoder. 2x this '
                             'number of RNN units will be used in the decoder')

    parser.add_argument('--embedding_size', type=int, default=128,
                        help='Embedding size used in both encoder and decoder')

    parser.add_argument('--max_length', type=int, default=200,
                        help='Sequences will be padded or truncated to this size.')

    # Inside a Jupyter/Colab kernel sys.argv carries the kernel's own
    # "-f <connection file>", which argparse rejects ("unrecognized arguments: -f").
    # Parse an explicit list there; edit it to change settings from the notebook.
    if 'ipykernel' in sys.modules:
        notebook_argv = ['copynet']
        args = parser.parse_args(notebook_argv)
    else:
        args = parser.parse_args()

    # Module-level writer: train() logs through this global.
    writer = SummaryWriter('./logs/%s_%s' % (args.model_name, str(int(time.time()))))
    if args.scheduled_teacher_forcing:
        # linspace guarantees exactly `epochs` values (1.0 down towards 0.0, end
        # point excluded); arange with a fractional step can vary in length.
        schedule = np.linspace(1.0, 0.0, num=args.epochs, endpoint=False)
    else:
        schedule = np.ones(args.epochs) * args.teacher_forcing_fraction

    main(args.model_name, args.use_cuda, args.batch_size, schedule, args.keep_prob, args.val_size, args.lr, args.decoder_type, args.vocab_limit, args.hidden_size, args.embedding_size, args.max_length)
    # main(str(int(time.time())), args.use_cuda, args.batch_size, schedule, args.keep_prob, args.val_size, args.lr, args.decoder_type, args.vocab_limit, args.hidden_size, args.embedding_size, args.max_length)


usage: ipykernel_launcher.py [-h] [--epochs EPOCHS] [--use_cuda]
                             [--batch_size BATCH_SIZE]
                             [--teacher_forcing_fraction TEACHER_FORCING_FRACTION]
                             [--scheduled_teacher_forcing]
                             [--keep_prob KEEP_PROB] [--val_size VAL_SIZE]
                             [--lr LR] [--decoder_type DECODER_TYPE]
                             [--vocab_limit VOCAB_LIMIT]
                             [--hidden_size HIDDEN_SIZE]
                             [--embedding_size EMBEDDING_SIZE]
                             [--max_length MAX_LENGTH]
                             ./data/
ipykernel_launcher.py: error: unrecognized arguments: -f


SystemExit: ignored

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:

# Wrap each split's file list in CustomDataSet so a batch holds the
# (input, target) values read from the files; iterating the bare list would
# only yield the path strings themselves.
# Only the "test" and "val" splits exist and neither is shuffled. The previous
# expression also referred to an undefined `shuffle` name for a "train" split.
dataloaders_dict = {
    split: DataLoader(CustomDataSet(train_val_test[split]),
                      batch_size=128,
                      shuffle=False,
                      num_workers=1)
    for split in ["test", "val"]
}

In [None]:
# model_name must be a bare run name: main() builds ./model/<model_name>/ and a
# <model_name>.pt file inside it. Passing './model/' produced a checkpoint path
# in a nested directory that is never created, so the save failed with
# FileNotFoundError.
main('copynet', True, 128, None, 1, 0.1, 0.001, 'copy', 5000, 256, 128, 200)


training ./model/ with use_cuda=True, batch_size=128
teacher_forcing_schedule= None
keep_prob=1.000000, val_size=0.100000, lr=0.001000, decoder_type=copy, vocab_limit=5000, hidden_size=256, embedding_size=128, max_length=200, seed=42
creating training and validation datasets
reading file 0/1055
reading file 1000/1055
creating encoder-decoder model


FileNotFoundError: ignored