In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import sys
import os
sys.path.insert(0, os.path.abspath('..'))

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import sequence_generator
import pandas as pd
import numpy as np
from function import FunctionTerm, Function
import sequence_generator


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MAX_LENGTH = 22
SOS_token = 10001
EOS_token = 10002

NTERMS = 2
USE_INTERACTION = True

In [2]:
# def load_data(nterms, use_interaction = False, train_data=True):
#     # return a list [sequence, mask] elements
#     if use_interaction:
#         if train_data:
#             df = pd.read_csv(f'data/train/{nterms}/{nterms}_int.csv', names=['prompt', 'completion'], delimiter='],', engine='python')
#         else:
#             df = pd.read_csv(f'data/test/{nterms}/{nterms}_int.csv', names=['prompt', 'completion'], delimiter='],', engine='python')
#     else:
#         if train_data:
#             df = pd.read_csv(f'data/train/{nterms}/{nterms}.csv', names=['prompt', 'completion'], delimiter='],', engine='python')
#         else:
#             df = pd.read_csv(f'data/test/{nterms}/{nterms}.csv', names=['prompt', 'completion'], delimiter='],', engine='python')
    
#     data = []
#     for i in range(len(df)):
#         seq = df.loc[i, 'prompt']
#         seq = seq.replace('[','').replace(']','')
#         seq = seq.split(',')
#         seq = [int(s) for s in seq]

#         mask =df.loc[i, 'completion']
#         mask = mask.replace('[','').replace(']','')
#         mask = mask.split(',')
#         mask = [eval(s) for s in mask]

#         data.append([seq, mask])
    
#     return data

# def process_data(data):
#     processed_sequences = []
#     for in_sequence, out_sequence in data:
#         in_sequence = torch.tensor(in_sequence)
#         in_sequence = in_sequence.long()
#         in_sequence = in_sequence[:, None]
#         in_sequence = in_sequence.to(device)

#         out_sequence = torch.tensor(out_sequence)
#         out_sequence = out_sequence.long()
#         out_sequence = out_sequence[:, None]
#         out_sequence = out_sequence.to(device)
#         processed_sequences.append((in_sequence, out_sequence))

#     return processed_sequences

# train_sequences = process_data(load_data(NTERMS, True))
# test_sequences = process_data(load_data(NTERMS, False))

# print('train set size: ', len(train_sequences), 'test set size', len(test_sequences))

In [3]:
sequences = sequence_generator.make_n_random_functions(n=500, nterms=NTERMS, use_interaction=USE_INTERACTION, torchify=True)
# sequences = sequence_generator.make_n_random_functions(n=1000, torchify=True)


processed_sequences = []

for in_sequence, out_sequence in sequences:
    in_sequence = in_sequence.long()
    in_sequence = in_sequence[:, None]
    in_sequence = in_sequence.to(device)

    out_sequence = torch.tensor(out_sequence)
    out_sequence = out_sequence.long()
    out_sequence = out_sequence[:, None]
    out_sequence = out_sequence.to(device)
    processed_sequences.append((in_sequence, out_sequence))

# 80-20 train test split
train_sequences = processed_sequences[:int(len(processed_sequences) * 0.8)]
test_sequences = processed_sequences[int(len(processed_sequences) * 0.8):]
del processed_sequences

print('train set size: ', len(train_sequences), 'test set size', len(test_sequences))

22
train set size:  400 test set size 100


In [4]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [5]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [6]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [7]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor(np.array([[SOS_token]]), device=device, dtype=torch.long)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [8]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [9]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [random.choice(train_sequences)
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if (iter + 1) % print_every == 0:
            evaluateRandomly(encoder, decoder)
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

In [10]:
def evaluate(encoder, decoder, input_tensor, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor(np.array([[SOS_token]]), device=device, dtype=torch.long)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                # decoded_words.append(output_lang.index2word[topi.item()])
                decoded_words.append(topi.item())

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

In [11]:
def grid_search(sequence, terms, upper_bound = 5, lower_bound = -5):
    # sequence: target sequence
    # terms: list of all FunctionTerm objects proposed by MCTS

    if len(terms) == 0:
        return np.inf

    coeff = sorted(list(range(1, upper_bound+1))+list(range(lower_bound,0)), key=lambda x: abs(x))
    base = len(coeff)
    digit_to_coeff = {i: coeff[i] for i in range(base)}


    def int_to_base_helper(num, base):
        ret = []
        while num > 0:
            ret.append(num % base)
            num = num // base
        return ret
    
    penalty = np.inf
    
    for i in range(base**len(terms)):
        term_coeffs = [digit_to_coeff[c] for c in int_to_base_helper(i, base)+[0]*50]
        f = Function()
        for j, term in enumerate(terms):
            term.updateCoeff(term_coeffs[j])
            f.addTerm(term)
        
        if f.startIndex() > len(sequence):
            continue
        
        targets = np.array(sequence[f.startIndex()-1:])
        predictions = [f.evaluate(sequence, n) for n in range(f.startIndex(), len(sequence)+1)]
        
        if None in predictions:
            raise Exception(f'None in prediction! Current f is f[n] = {f} and prediction is {predictions}')
        
        predictions = np.array(predictions)
        
        rmse_loss = np.sqrt(np.mean((predictions-targets)**2))
        if rmse_loss < penalty:
            penalty = rmse_loss
        
        del f
        
        if penalty == 0:
            return penalty  
        
    return penalty


In [12]:
def generate_term_to_id_map():
    i = 0
    id_to_func_term = {}
    term_str_to_func_term = {}
    term_str_to_id = {}
    
    POSSIBLE_TERMS = sequence_generator.make_possible_terms(False)
    for t in POSSIBLE_TERMS:
        temp_f = Function()
        temp_f.addTerm(t)

        id_to_func_term[i] = t
        
        term_str = str(t).replace('0*','').replace('0','1')
        term_str_to_id[term_str] = i
        
        term_str_to_func_term[term_str] = t
        
        i += 1
            
    term_str_to_id['<ROOT>'] = i
    i += 1
    term_str_to_id['<EOS>'] = i
    return id_to_func_term, term_str_to_func_term, term_str_to_id

id_to_func_term, term_str_to_func_term, term_str_to_id = generate_term_to_id_map()

def make_functionterms_from_sentence(indices):
    indices = np.where(indices)[0]
    out = []
    for i in indices:
        if i < len(id_to_func_term):
            out.append(id_to_func_term[i])
    return out


best_totally_correct = 0
best_avg_rmse = np.inf

def evaluateRandomly(encoder, decoder):
    # print('starting evaluation')
    num_totally_correct = 0

    correct_tokens = 0
    total_tokens = 0

    rmses = []
    min_rmse = np.inf
    for pair in test_sequences:
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = output_words

        sequence = pair[0][:, 0].detach().cpu().numpy()
        rmse = grid_search(sequence, make_functionterms_from_sentence(output_sentence[:len(id_to_func_term)])[:NTERMS]) # cut the thing short
        min_rmse = rmse if rmse < min_rmse else min_rmse

        if (not np.isinf(rmse)) and (not np.isnan(rmse)):
            rmses.append(rmse)

        for idx, gt_element in enumerate(pair[0][:, 0]):
            total_tokens += 1
            if output_words[idx] == gt_element:
                correct_tokens += 1


        if pair[1][:, 0].tolist() == output_sentence:
            num_totally_correct += 1

    avg_rmse = np.mean(rmses)
    print()
    print('num correct:', num_totally_correct)
    print('num tokens correct:', correct_tokens)
    print('num total tokens', total_tokens)
    print('avg rmse', avg_rmse)

    global best_totally_correct
    best_totally_correct = num_totally_correct if num_totally_correct > best_totally_correct else best_totally_correct

    global best_avg_rmse
    best_avg_rmse = avg_rmse if avg_rmse < best_avg_rmse else best_avg_rmse
    # # making sure that the model is updating on each iteration
    # global last_attentions
    # print('same attention?', last_attentions == attentions)
    # last_attentions = attentions

In [13]:
n_words = 10005
hidden_size = 256
encoder1 = EncoderRNN(n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, n_words, dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1, 20000, print_every=500)

print('='*30)
print('most number of totally correct functions:', best_totally_correct)
print('best avg rmse', best_avg_rmse)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)



num correct: 0
num tokens correct: 0
num total tokens 700
avg rmse nan
0m 18s (- 11m 53s) (499 2%) 0.3855

num correct: 0
num tokens correct: 0
num total tokens 700
avg rmse nan
0m 35s (- 11m 24s) (999 4%) 0.3076

num correct: 0
num tokens correct: 3
num total tokens 700
avg rmse 1150.0122534714083
0m 54s (- 11m 10s) (1499 7%) 0.3002

num correct: 0
num tokens correct: 3
num total tokens 700
avg rmse 1083.4728800784455
1m 12s (- 10m 52s) (1999 9%) 0.2938

num correct: 0
num tokens correct: 0
num total tokens 700
avg rmse 492.2465990595712
1m 30s (- 10m 31s) (2499 12%) 0.2826

num correct: 0
num tokens correct: 8
num total tokens 700
avg rmse 575.9733274441187
1m 47s (- 10m 9s) (2999 14%) 0.2756

num correct: 0
num tokens correct: 10
num total tokens 700
avg rmse 462.49851272074847
2m 5s (- 9m 53s) (3499 17%) 0.3604

num correct: 0
num tokens correct: 16
num total tokens 700
avg rmse 465.0355146788604
2m 23s (- 9m 34s) (3999 19%) 0.4632

num correct: 0
num tokens correct: 6
num total t