## TextCNN-based AAN  models.

In [1]:
### All the test is based on torch-1.2.0 and torchtext-0.6.0


import torch
from torchtext import data
import random
import numpy as np
import os
from torch.nn import functional as F

os.environ["CUDA_VISIBLE_DEVICES"] = "3"

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [2]:
#### download stopword and wordnet using nltk

import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Error loading stopwords: <urlopen error [Errno 111]
[nltk_data]     Connection refused>
[nltk_data] Error loading wordnet: <urlopen error [Errno 111]
[nltk_data]     Connection refused>


False

### prepare data loader based on torchtext.

In [3]:
from torchtext.vocab import Vectors
from nltk.stem import WordNetLemmatizer 
import spacy
from nltk.corpus import stopwords

from nltk.tokenize import RegexpTokenizer



### pre-processing 
def tokenizer2(text):
    regtokenizer = RegexpTokenizer(r'\w+')
    wnl = WordNetLemmatizer()
    tokens = regtokenizer.tokenize(text)
    
    # remove stopwords
    # stop = stopwords.words('english')
    # tokens = [token for token in tokens if token not in stop]
    
    # tokens = [wnl.lemmatize(word) for word in tokens]
    
    tokenized_text = []
    auxiliary_verbs = ['am', 'is', 'are', 'was', 'were', "'s"]
    for token in tokens:
        if token == "n't":
            tmp = 'not'
        elif token == "'ll":
            tmp = 'will'
        elif token in auxiliary_verbs:
            tmp = 'be'
        else:
            tmp = token
        tokenized_text.append(tmp)
    
    return tokenized_text

def get_iterator_feature(source_file, target_file, BATCH_SIZE=128,MAX_VOCAB_SIZE=20000):
    TEXT = data.Field(sequential=True, tokenize=tokenizer2,lower=True,  fix_length=300,  batch_first=True)
    LABEL = data.LabelField()

    fields = {'review': ('text', TEXT), 'label': ('label', LABEL)}

    train_data = data.TabularDataset.splits(
                            path = 'datasets'+os.sep+"amazon_text",
                            train = source_file,
                            format = 'json',
                            fields = fields
    )
    test_data = data.TabularDataset.splits(
                            path = 'datasets'+os.sep+"amazon_text",
                            train = target_file,
                            format = 'json',
                            fields = fields
    )

    train_data = train_data[0]

    test_data = test_data[0]


    test_data, valid_data = test_data.split(random_state = random.seed(SEED), split_ratio=0.95)

    # MAX_VOCAB_SIZE = 20_000

    TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)
    LABEL.build_vocab(train_data)


    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    source_iterator, target_iterator, valid_iterator = data.BucketIterator.splits(
        (train_data, test_data, valid_data), 
        batch_size = BATCH_SIZE, 
        sort=False,
        shuffle = True,
        # repeat=True,
        device = device)

    return source_iterator, target_iterator, valid_iterator, TEXT


## Initialize AAN model.

In [7]:
from model.models import  AANTextCNN
from model.criterion import MMD_loss
import torch.optim as optim
import torch.nn as nn

aan_version='AAN-A'

dataset = ['book.json','cd.json','elec.json','kitchen.json']
source_file =dataset[0]
target_file = dataset[1]



source_iterator, target_iterator, valid_iterator, TEXT = get_iterator_feature(source_file, target_file, BATCH_SIZE=256,MAX_VOCAB_SIZE=20000)


INPUT_DIM = len(TEXT.vocab)
EMBEDDING_SIZE = 100
LATENT_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [8,9,10]
OUTPUT_DIM = 2
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
MU = 0.1

model = AANTextCNN(INPUT_DIM,EMBEDDING_SIZE,N_FILTERS,FILTER_SIZES, LATENT_DIM,OUTPUT_DIM, PAD_IDX,DROPOUT,aan_version)
model.extractor.embedding.weight.data.copy_(TEXT.vocab.vectors)


if aan_version == 'AAN':
    optimizer_task = optim.Adam(model.parameters())
else:
    optimizer_task = optim.Adam([{'params':model.extractor.parameters()},{'params':model.predictor.parameters()}])
    optimizer_kernel = optim.Adam([{'params':model.mmd_linear.parameters()},{'params':model.cmmd_linear.parameters()}])

criterion = nn.CrossEntropyLoss()
model = model.to(device)
criterion = criterion.to(device)
mmd_loss = MMD_loss(kernel_type='mmd', kernel_mul=2.0, kernel_num=5)
cmmd_loss = MMD_loss(kernel_type='cmmd', kernel_mul=2.0, kernel_num=5,eplison=0.00001)





### Training AAN (AAN-A) models.

In [10]:
import time
from model.tools import train_adverisal,  train_normal,epoch_time, evaluate

N_EPOCHS = 20
best_loss = 100.0
best_epoch = 0

for epoch in range(N_EPOCHS):

    start_time = time.time()
    if aan_version == 'AAN-A':
        train_loss = train_adverisal(model,source_iterator,target_iterator,optimizer_task,optimizer_kernel,criterion,mmd_loss,cmmd_loss)
    else:
        train_loss = train_normal(model,source_iterator,target_iterator,optimizer_task,criterion,mmd_loss,cmmd_loss,MU)

    eval_acc, eval_loss = evaluate(model, valid_iterator, criterion)
    if eval_loss < best_loss:
        best_loss = eval_loss
        best_epoch = epoch
        torch.save(model.state_dict(),'aan-cnn-model.pt')

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s |Best Epoch:{best_epoch}')
    print(f'\tTrain Loss: {train_loss:.3f}|Valid Acc: {eval_acc:.3f}') 

Epoch: 01 | Epoch Time: 0m 44s |Best Epoch:0
	Train Loss: -0.161|Valid Acc: 0.782
Epoch: 02 | Epoch Time: 0m 44s |Best Epoch:1
	Train Loss: -0.164|Valid Acc: 0.850
Epoch: 03 | Epoch Time: 0m 45s |Best Epoch:2
	Train Loss: -0.160|Valid Acc: 0.845
Epoch: 04 | Epoch Time: 0m 45s |Best Epoch:3
	Train Loss: -0.151|Valid Acc: 0.856
Epoch: 05 | Epoch Time: 0m 45s |Best Epoch:4
	Train Loss: -0.147|Valid Acc: 0.861
Epoch: 06 | Epoch Time: 0m 45s |Best Epoch:5
	Train Loss: -0.143|Valid Acc: 0.858
Epoch: 07 | Epoch Time: 0m 45s |Best Epoch:6
	Train Loss: -0.138|Valid Acc: 0.870
Epoch: 08 | Epoch Time: 0m 44s |Best Epoch:7
	Train Loss: -0.137|Valid Acc: 0.865
Epoch: 09 | Epoch Time: 0m 45s |Best Epoch:7
	Train Loss: -0.136|Valid Acc: 0.859
Epoch: 10 | Epoch Time: 0m 44s |Best Epoch:9
	Train Loss: -0.131|Valid Acc: 0.865
Epoch: 11 | Epoch Time: 0m 44s |Best Epoch:10
	Train Loss: -0.131|Valid Acc: 0.867
Epoch: 12 | Epoch Time: 0m 44s |Best Epoch:10
	Train Loss: -0.136|Valid Acc: 0.864
Epoch: 13 | Ep

### Test AAN models.

In [11]:
from model.tools import evaluate

### test the model.
model.load_state_dict(torch.load('aan-cnn-model.pt'))
eval_acc,eval_loss  = evaluate(model,target_iterator,criterion)
print('from %s to %s, acc is %f'%(source_file,target_file, eval_acc))

from book.json to cd.json, acc is 0.861947
