## TextCNN-based ANN  models.

In [None]:
### All the test is based on torch-1.2.0 and torchtext-0.6.0


import torch
from torchtext import data
import random
import numpy as np
import os
from torch.nn import functional as F

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

### prepare data loader based on torchtext.

In [None]:
from torchtext.vocab import Vectors
from nltk.stem import WordNetLemmatizer 
import spacy
from nltk.corpus import stopwords

from nltk.tokenize import RegexpTokenizer

### pre-processing 
spacy_en = spacy.load('en')
def tokenizer2(text):
    regtokenizer = RegexpTokenizer(r'\w+')
    wnl = WordNetLemmatizer()
    tokens = regtokenizer.tokenize(text)
    
    # remove stopwords
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]
    
    tokens = [wnl.lemmatize(word) for word in tokens]
    
    tokenized_text = []
    auxiliary_verbs = ['am', 'is', 'are', 'was', 'were', "'s"]
    for token in tokens:
        if token == "n't":
            tmp = 'not'
        elif token == "'ll":
            tmp = 'will'
        elif token in auxiliary_verbs:
            tmp = 'be'
        else:
            tmp = token
        tokenized_text.append(tmp)
    
    return tokenized_text

def get_iterator_feature(source_file, target_file, BATCH_SIZE=128,MAX_VOCAB_SIZE=20000):
    TEXT = data.Field(sequential=True, tokenize=tokenizer2,lower=True,  fix_length=300,  batch_first=True)
    LABEL = data.LabelField()

    fields = {'review': ('text', TEXT), 'label': ('label', LABEL)}

    train_data = data.TabularDataset.splits(
                            path = 'datasets'+os.sep+"amazon_text",
                            train = source_file,
                            format = 'json',
                            fields = fields
    )
    test_data = data.TabularDataset.splits(
                            path = 'datasets'+os.sep+"amazon_text",
                            train = target_file,
                            format = 'json',
                            fields = fields
    )

    train_data = train_data[0]

    test_data = test_data[0]


    test_data, valid_data = test_data.split(random_state = random.seed(SEED), split_ratio=0.95)

    # MAX_VOCAB_SIZE = 20_000

    TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)
    LABEL.build_vocab(train_data)

    TEXT.build_vocab(test_data, max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)
    LABEL.build_vocab(test_data)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    source_iterator, target_iterator, valid_iterator = data.BucketIterator.splits(
        (train_data, test_data, valid_data), 
        batch_size = BATCH_SIZE, 
        sort=False,
        shuffle = True,
        # repeat=True,
        device = device)

    return source_iterator, target_iterator, valid_iterator, TEXT


## Initialize ANN model.

In [1]:
from model.models import  ANNCNN
from model.criterion import MMD_loss


ann_version='ANN-A'

dataset = ['book.json','cd.json','elec.json','kitchen.json']
source_file =dataset[0]
target_file = dataset[1]



source_iterator, target_iterator, valid_iterator, TEXT = get_iterator_feature(source_file, target_file, BATCH_SIZE=256,MAX_VOCAB_SIZE=20000)


INPUT_DIM = len(TEXT.vocab)
EMBEDDING_SIZE = 100
LATENT_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [8,9,10]
OUTPUT_DIM = 2
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
MU = 0.1

model = ANNadvCNN(INPUT_DIM,EMBEDDING_SIZE,N_FILTERS,FILTER_SIZES, LATENT_DIM,OUTPUT_DIM, PAD_IDX,DROPOUT,ann_version)
model.extractor.embedding.weight.data.copy_(TEXT.vocab.vectors)


if ann_version == 'ANN':
    optimizer_task = optim.Adam(model.parameters())
else:
    optimizer_task = optim.Adam([{'params':model.extractor.parameters()},{'params':model.predictor.parameters()}])
    optimizer_kernel = optim.Adam([{'params':model.mmd_linear.parameters()},{'params':model.cmmd_linear.parameters()}])

criterion = nn.CrossEntropyLoss()
model = model.to(device)
criterion = criterion.to(device)
mmd_loss = MMD_loss(kernel_type='mmd', kernel_mul=2.0, kernel_num=5)
cmmd_loss = MMD_loss(kernel_type='cmmd', kernel_mul=2.0, kernel_num=5,eplison=0.00001)





IndentationError: unexpected indent (<ipython-input-1-9c44f670ff1c>, line 5)

### Training ANN (ANN-A) models.

In [None]:
from model.tools import train_adverisal,  train_normal,epoch_time

N_EPOCHS = 20
best_loss = 100.0
best_epoch = 0

for epoch in range(N_EPOCHS):

    # alpha = 0.7
    start_time = time.time()
    if ann_version == 'ANN-A':
        train_loss = train_adverisal(model,source_iterator,target_iterator,optimizer_task,optimizer_kernel,criterion,mmd_loss,cmmd_loss)
    else:
        train_loss = train_normal(model,source_iterator,target_iterator,optimizer_task,criterion,mmd_loss,cmmd_loss,MU)

    eval_acc, eval_loss = evaluate(model, valid_iterator, criterion)
    if eval_loss < best_loss:
        best_loss = eval_loss
        best_epoch = epoch
        torch.save(model.state_dict(),'aan-cnn-model.pt')

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s |Best Epoch:{best_epoch}')
    print(f'\tTrain Loss: {train_loss:.3f}|Valid Acc: {eval_acc:.3f}') 

### Test ANN models.

In [None]:
from model.tools import evaluate

### test the model.
model.load_state_dict(torch.load('aan-cnn-model.pt'))
eval_acc,eval_loss  = evaluate(model,target_iterator,criterion)
print('from %s to %s, acc is %f'%(source_file,target_file, eval_acc))