In [1]:
# coding: utf-8

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
torch.set_default_tensor_type(torch.cuda.FloatTensor)
import time
import numpy as np
import os
import sys
from Constant import Constants
from load_data import StyleData
from torch.autograd import Variable
from ModelDefine import DsModel
from ModelDefine import Embed
from PreTrainDs import indexData2variable
import random

import time
# Order CUDA devices by PCI bus id so the index used below is stable.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"  
# Restrict this process to a single GPU.
# NOTE(review): torch.set_default_tensor_type(torch.cuda.FloatTensor) is already
# called above, before this variable is set. CUDA_VISIBLE_DEVICES presumably only
# takes effect if it is set before CUDA is initialised -- confirm that the earlier
# call does not initialise CUDA, or move these two os.environ lines above it.
os.environ["CUDA_VISIBLE_DEVICES"]="3" # change "3" to whatever card is available
# Debugging switches, disabled by default:
# os.environ["CUDA_LAUNCH_BLOCKING"]="1"
# os.environ["CHAINER_DEBUG"]="1"
# Make every newly created float tensor a CUDA tensor (repeats the call made above).
torch.set_default_tensor_type(torch.cuda.FloatTensor)
# Enable the cuDNN benchmark flag.
# NOTE(review): sequence lengths vary per sample in this notebook -- confirm that
# benchmark mode is actually beneficial here.
torch.backends.cudnn.benchmark = True
# Four models need to be defined: Ez, Ey, D and the pre-trained Ds.
torch.cuda.init()

ImportError: cannot import name 'StyleData'

## First, we build only a consistent encoder-decoder

In [None]:
class DsModel(nn.Module):
    """
    Convolutional two-class classifier (the "Ds" model).

    One Conv2d per window size in ``kind_filters`` is applied to the input;
    every feature map is max-pooled over its whole height, the pooled features
    are concatenated and passed through two linear layers ending in a softmax
    over 2 classes.
    """

    def __init__(self, kind_filters, num_filters, num_in_channels, embedded_size, hidden_size=128):
        """
        Args:
            kind_filters: list of window heights, e.g. [1, 2, 3] creates three
                convolutions whose kernels span 1, 2 and 3 rows.
            num_filters: number of output channels of every convolution.
            num_in_channels: number of input channels (number of embedding kinds).
            embedded_size: embedding width; every kernel spans the full width.
            hidden_size: width of the hidden linear layer.
        """
        super(DsModel, self).__init__()

        self.kind_filters = kind_filters
        self.num_filters = num_filters

        # One convolution per window size, created in the order given.
        window_convs = []
        for width in self.kind_filters:
            window_convs.append(nn.Conv2d(num_in_channels, num_filters, (width, embedded_size)))
        self.convs = nn.ModuleList(window_convs)

        self.linear = nn.Linear(num_filters * len(kind_filters), hidden_size)
        self.linear_out = nn.Linear(hidden_size, 2)
        self.drop = nn.Dropout(0.2)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim = -1)

    def forward(self, x):
        """
        Classify a batch of embedded sequences.

        Args:
            x: tensor of shape N_batch * C_channel * Seq_length * Embedded_size.
               C_channel is 1 with a single kind of embedding and 2 when both a
               dynamic and a static embedding are stacked.

        Returns:
            Tensor of shape (N_batch, 2) whose rows are softmax probabilities.
        """
        pooled = []
        for conv in self.convs:
            feature_map = conv(x)
            # size(2) is the height of the feature map: pool over all positions.
            pooled.append(F.max_pool2d(feature_map, kernel_size=(feature_map.size(2), 1)))

        features = torch.cat(pooled, dim=1).view(x.size(0), -1)
        hidden = self.relu(self.linear(features))
        logits = self.linear_out(self.drop(hidden))
        # A ReLU is applied to the logits before the softmax, as in the original model.
        return self.softmax(self.relu(logits))


class EzModel(nn.Module):
    """
    Content encoder: a single-layer GRU over an embedded sequence.

    The last hidden state is used as the encoded content of the input
    sequence and is handed to the G model.
    """

    def __init__(self, embedding_size, hidden_size):
        """
        Args:
            embedding_size: size of each input embedding vector.
            hidden_size: size of the GRU hidden state (the content representation).
        """
        super(EzModel, self).__init__()
        self.hidden_size = hidden_size

        # No hard-coded .cuda(): the module is created on the default device and
        # moved by whoever owns it (GANModel calls .cuda() on it), so it can also
        # be built and run on a machine without a GPU.
        self.gru = nn.GRU(embedding_size, hidden_size)
        # The GRU's first output is the hidden state at every time step.
        self.relu = nn.ReLU()

    def forward(self, x, hidden):
        """
        Args:
            x: tensor of shape seq_len * batch * input_size (batch is 1 here).
            hidden: initial hidden state, e.g. from ``init_hidden()``.

        Returns:
            (outputs, hidden): ReLU of the per-step GRU outputs and ReLU of the
            final hidden state.
        """
        outputs, hidden = self.gru(x, hidden)
        return self.relu(outputs), self.relu(hidden)

    def init_hidden(self):
        """Return a zero hidden state of shape (1, 1, hidden_size) for a batch of 1.

        The tensor is created with the dtype and device of the module's own
        parameters, so it always matches wherever the module currently lives.
        """
        reference = next(self.parameters())
        return torch.zeros(1, 1, self.hidden_size, dtype=reference.dtype, device=reference.device)


class EyModel(nn.Module):
    """
    Style encoder whose output is delivered to the G model.

    Structurally similar to the Ds model: one Conv2d per window size followed
    by max-pooling over the whole sequence. Inputs flagged as belonging to the
    target domain are not encoded at all; a single learned style vector
    (``y2_style``) is returned for them instead.
    """

    def __init__(self, in_channels, num_filters, kind_filters, embedding_size):
        """
        Args:
            in_channels: number of input channels of every convolution.
            num_filters: number of output channels of every convolution.
            kind_filters: list of window heights, one convolution per entry.
            embedding_size: embedding width; every kernel spans the full width.
        """
        super(EyModel, self).__init__()

        self.embedding_size = embedding_size
        self.kind_filters = kind_filters
        self.num_filters = num_filters
        # Learned style vector for the target domain (registered as a parameter).
        self.y2_style = self.init_style()
        # Value of `index` that selects the learned target-domain style.
        self.x2_style_flag = 1

        self.convs = nn.ModuleList(
            [nn.Conv2d(in_channels, num_filters, (width, embedding_size)) for width in self.kind_filters]
        )
        self.relu = nn.ReLU()

    def forward(self, x, index):
        """
        Args:
            x: tensor of shape N_batch * C_channel * Len_sequence * embedding_size.
            index: domain of x in {0, 1}; 1 is the target domain.

        Returns:
            For ``index == 1``: the learned parameter ``y2_style`` of shape
            (1, len(kind_filters) * num_filters), returned as is (no ReLU).
            Otherwise: the ReLU of the pooled convolution features, shape
            (N_batch, len(kind_filters) * num_filters).
        """
        if index == self.x2_style_flag:
            return self.y2_style

        batch_size = x.size(0)
        pooled = []
        for conv in self.convs:
            feature_map = conv(x)
            # Pool over the full height, then flatten PER SAMPLE. Flattening the
            # whole batch into one vector before concatenating (as the previous
            # code did) interleaves features of different samples once
            # N_batch > 1; for N_batch == 1 both forms give the same result.
            pooled.append(
                F.max_pool2d(feature_map, kernel_size=(feature_map.size(2), 1)).view(batch_size, -1)
            )

        y1_style = torch.cat(pooled, dim=1)
        return self.relu(y1_style)

    def init_style(self):
        """Create the randomly initialised, trainable target-domain style vector."""
        return nn.Parameter(torch.randn(1, len(self.kind_filters) * self.num_filters), requires_grad=True)


class GModel(nn.Module):
    """
    Generator: a GRU step followed by a linear projection back to embedding space.
    """

    def __init__(self, hidden_size, n_vocab, embedding_size, temper):  # temper is the temperature
        """
        Args:
            hidden_size: GRU hidden size; equals z.size + y.size (content + style).
            n_vocab: accepted for interface compatibility; not used by this module.
            embedding_size: size of the input vectors and of the projected output.
            temper: softmax temperature (the projected output is divided by it).
        """
        super(GModel, self).__init__()
        self.hidden_size = hidden_size
        self.temper = temper

        self.gru = nn.GRU(embedding_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, embedding_size)
        self.softmax = nn.Softmax(dim=-1)
        self.relu = nn.ReLU()

    def forward(self, x, hidden):
        """
        Args:
            x: previous generator output (prev_x1_hat), shaped for the GRU.
            hidden: GRU state; initially cat(z, y).

        Returns:
            (x_hat, x_hat_noT, hidden):
            x_hat      -- softmax of the projected output divided by the temperature,
            x_hat_noT  -- the projected output before temperature and softmax,
            hidden     -- the GRU *output sequence* (first return value of the GRU);
                          the GRU's final-state return value is discarded.
        """
        hidden, _ = self.gru(x, hidden)
        out_embedding = self.out(self.relu(hidden)).view(hidden.size()[0], -1)
        return self.softmax(out_embedding / self.temper), out_embedding, hidden

    def init_hidden(self):
        """Return a zero state of shape (1, 1, hidden_size) that does not require grad.

        Created with the dtype and device of the module's own parameters instead
        of a hard-coded .cuda(), so it matches wherever the module lives.
        """
        reference = next(self.parameters())
        return torch.zeros(1, 1, self.hidden_size, dtype=reference.dtype,
                           device=reference.device, requires_grad=False)


class DModel(nn.Module):
    """
    Discriminator over the G model's hidden states.

    Structurally similar to Ey and Ds: one Conv2d per window size, max-pooling
    over the (variable-length) sequence axis, two linear layers and a softmax
    over 2 classes.
    """

    def __init__(self, kind_filters, num_filters, num_in_channels, width, hidden_size=128):
        """
        Args:
            kind_filters: list of window heights, one convolution per entry.
            num_filters: number of output channels of every convolution.
            num_in_channels: number of input channels.
            width: width of the input rows; every kernel spans the full width.
            hidden_size: width of the hidden linear layer.
        """
        super(DModel, self).__init__()
        self.kind_filters = kind_filters
        self.num_filters = num_filters

        # No .cuda() on the conv list alone: moving only this sub-module would
        # leave it on a different device than the linear layers below unless the
        # caller moved the whole model. Device placement is left to the caller.
        self.convs = nn.ModuleList(
            [nn.Conv2d(num_in_channels, num_filters, (w, width)) for w in self.kind_filters]
        )

        self.linear = nn.Linear(num_filters * len(kind_filters), hidden_size)
        self.linear_out = nn.Linear(hidden_size, 2)
        self.drop = nn.Dropout(0.5)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """
        Args:
            x: tensor of shape N_batch * C_channel * Seq_len * Width. C_channel is
               1 because the GRU emits one hidden state per time step.

        Returns:
            Tensor of shape (N_batch, 2) whose rows are softmax probabilities.
        """
        pooled = []
        for conv in self.convs:
            feature_map = conv(x)
            # Pool over the whole height so any sequence length collapses to 1.
            pooled.append(F.max_pool2d(feature_map, kernel_size=(feature_map.size()[2], 1)))

        flatten = torch.cat(pooled, dim=1).view(x.size()[0], -1)
        hidden = self.relu(self.drop(self.linear(flatten)))
        # A ReLU is applied to the logits before the softmax, as in the original model.
        return self.softmax(self.relu(self.linear_out(hidden)))


class Embed(nn.Module):
    """
    Embedding layer that accepts either token indices or one-hot / logit rows.

    Pass ``index=True`` (the default) for index input and ``index=False`` when
    ``x`` holds one row per token over the vocabulary.
    """

    def __init__(self, n_vocab, embedding_size):
        super(Embed, self).__init__()
        self.embedding = nn.Embedding(n_vocab, embedding_size)

    def forward(self, x, index=True):
        """Look up ``x`` in the table, or multiply it with the embedding matrix."""
        if not index:
            # Rows over the vocabulary: a matrix product with the weight table.
            return torch.mm(x, self.embedding.weight)
        return self.embedding(x)

In [None]:
class GANModel(nn.Module):
    """
    Container model that wires the content encoder (Ez), the style encoder (Ey),
    the generator (G) and the shared embedding together and returns the
    intermediate tensors needed to compute the losses outside the model.
    """

    def __init__(self, style_represent, content_represent, D_filters, D_num_filters, Ey_filters,
                 Ey_num_filters, embedding_size, n_vocab, temper, max_len=40, min_len = 6, style_path= './data/style.npy'):
        """
        Args:
            style_represent: accepted for interface compatibility; the value is
                ignored and the style size is always
                ``Ey_num_filters * len(Ey_filters)``.
            content_represent: dimension of the content representation (Ez hidden size).
            D_filters: list of window sizes for the discriminator, e.g. [1, 2, 3, 4]
                (unused while the discriminator is disabled).
            D_num_filters: filters per window size for the discriminator (unused
                while the discriminator is disabled).
            Ey_filters: list of window sizes for the style encoder.
            Ey_num_filters: filters per window size for the style encoder.
            embedding_size: size of the token embeddings.
            n_vocab: vocabulary size of the embedding table.
            temper: softmax temperature passed to the generator.
            max_len: stored on the instance; not read by the methods of this class.
            min_len: minimum generated length before early stopping is allowed.
            style_path: path of the .npy file loaded into ``self.style_data``.
        """
        super(GANModel, self).__init__()
        self.style_represent = Ey_num_filters * len(Ey_filters)
        self.temper = temper
        self.max_len = max_len
        self.min_len = min_len
        self.style_data = np.load(style_path)

        # hidden_size of Ez is the content representation
        self.Ez = EzModel(embedding_size, content_represent).cuda()
        # style size == Ey_num_filters * len(Ey_filters)
        self.Ey = EyModel(1, Ey_num_filters, Ey_filters, embedding_size).cuda()
        self.G = GModel(content_represent + self.style_represent, n_vocab, embedding_size, temper).cuda()
#         self.D = DModel(D_filters, D_num_filters, 1, content_represent + self.style_represent).cuda()

        self.embedding = Embed(n_vocab, embedding_size).cuda()
        # Kept as an attribute for compatibility; it is refreshed on every decode.
        self.go = self._go_embedding()

    def _go_embedding(self):
        """Embed the <go> token (index 0) with the CURRENT embedding weights."""
        weight = self.embedding.embedding.weight
        go_index = torch.zeros(1, dtype=torch.long, device=weight.device)
        return self.embedding(go_index)

    def forward(self, x1, Ez_train=True,
                G_train=True,
                D_train=True,
                Embedd_train=True,
                Ey_train=True,
                Lcyc=True,
                Ladv=True,
                Ldis=True,
                Lrec=True):
        """
        Encode ``x1`` and reconstruct a sequence of the same length.

        Args:
            x1: 1-D tensor of token indices of shape (Len_seq). It does not need a
                leading <go> but must end with an <EOS>. The batch size is
                implicitly 1; batching is done by the caller.
            Ez_train, G_train, Embedd_train, Ey_train: train/eval mode applied to
                the corresponding sub-module before the pass.
            D_train, Lcyc, Ladv, Ldis, Lrec: accepted for interface compatibility;
                currently unused (the discriminator is disabled).

        Returns:
            dict with
              'x1'         -- embedded input,
              'x1_hat_noT' -- generator outputs before temperature/softmax,
              'x1_hat_hid' -- generator hidden outputs,
              'x1_hat'     -- generator outputs after temperature/softmax.
        """
        self.Ez.train(Ez_train)
        self.Ey.train(Ey_train)
#         self.D.train(D_train)
        self.G.train(G_train)
        self.embedding.train(Embedd_train)

        # x1 is an index representation
        embedd_x1 = self.embedding(x1)

        # Ey expects a 4-D input, so two leading singleton dims are added.
        y1 = self.Ey(embedd_x1.unsqueeze(0).unsqueeze(0), 0)

        hidden = self.Ez.init_hidden()
        outputs, z1 = self.Ez(embedd_x1.unsqueeze(1).cuda(), hidden)

        x1_seq_len = x1.size()[0]  # number of symbols

        x1_hat, x1_hat_noT, x1_hat_hid = self.get_x_hat_hidden(z1, y1, x1_seq_len)

        return {'x1': embedd_x1,
                'x1_hat_noT': x1_hat_noT,
                'x1_hat_hid': x1_hat_hid,
                'x1_hat': x1_hat}

    def get_x_hat_hidden(self, z, y, seq_len, length_fix=True):
        """
        Decode ``seq_len`` steps from content ``z`` and style ``y``.

        Args:
            z: content representation from Ez.
            y: style representation from Ey.
            seq_len: number of steps to generate.
            length_fix: when True always generate exactly ``seq_len`` steps; when
                False, generation may stop early once ``min_len`` is reached.

        Returns:
            (x_hats, x_hats_noT, hiddens), each concatenated along the first dim.
        """
        x_hats = []
        x_hats_noT = []
        hiddens = []

        # Re-embed <go> on every call. The previous code reused the tensor built
        # once in __init__, which kept the initial embedding value forever and
        # tied every backward pass to the graph created at construction time.
        self.go = self._go_embedding()

        x_hat, x_hat_noT, hidden = self.G(self.go.view(1, 1, -1),
                                          torch.cat([z.view(1, -1), y], dim=-1).view(1, 1, -1))
        x_hats.append(x_hat)
        x_hats_noT.append(x_hat_noT)
        hiddens.append(hidden)

        for i in range(1, seq_len):
#             embedd_x_hat = self.embedding(x_hat, index=False)
            x_hat, x_hat_noT, hidden = self.G(x_hat.view(1, 1, -1), hidden.view(1, 1, -1))

            # Early stop only when allowed and past the minimum length. The cheap
            # flags are tested first so the device-to-host copy of the arg-max is
            # only paid when early stopping is actually enabled.
            # NOTE(review): x_hat is a softmax over the generator's output
            # dimension; confirm that arg-max == 1 really identifies <EOS>.
            if not length_fix and i >= self.min_len and int(x_hat.topk(1)[1].item()) == 1:
                break
            x_hats.append(x_hat)
            x_hats_noT.append(x_hat_noT)
            hiddens.append(hidden)

        return torch.cat(x_hats), torch.cat(x_hats_noT), torch.cat(hiddens)  # cat in the first dim

    def ind_to_words(self, ind_sent):
        """Join the entries of ``self.style_data[1]`` selected by ``ind_sent`` with spaces."""
        return (' '.join([self.style_data[1][x] for x in ind_sent]))

In [None]:
def trainVAE_D(epoches,batch_size,data,ds_model,ds_emb,gan_path,style_path,pretrainD=False):
    """
    Train the pickled GAN model as a reconstruction auto-encoder.

    For every sequence the loss is ``1 - cosine_similarity`` between the
    generator's pre-softmax outputs and the embedded input, averaged over the
    tokens; the per-sequence losses of a mini-batch are averaged and one Adam
    step is taken per mini-batch.

    Args:
        epoches: number of passes over the training pairs.
        batch_size: number of pairs per optimiser step.
        data: raw index data, converted with ``indexData2variable`` and paired
            with ``build2pairs``.
        ds_model, ds_emb: pre-trained classifier and its embedding; they are only
            switched to train mode here.
        gan_path: path of the pickled GAN model to load.
        style_path: path of the .npy style/vocabulary file used to print samples.
        pretrainD: only selects which progress message is printed.

    Side effects:
        Saves the whole model to the module-level ``MODEL_NAME`` after every epoch
        and once more at the end.
        NOTE(review): the model is loaded from ``gan_path`` but saved to the global
        ``MODEL_NAME``; confirm this asymmetry is intended.
    """
    style_data = np.load(style_path)

    # NOTE(review): torch.load unpickles arbitrary objects -- only load trusted files.
    gan = torch.load(gan_path)
    # gan.apply(weights_init) # apply weight init
    gan = gan.cuda()
    gan.train(True)
    optimizer = optim.Adam(gan.parameters(),lr=1e-3)

    # init the state of some model
    ds_model.train(True)
    ds_emb.train(True)

    train_data = indexData2variable(data)
    train_data = build2pairs(train_data)

    for i in range(epoches):
        print(("epoches:\t", i))
        if pretrainD:
            print("trainning Discriminator..........")
        else :
            print("trainning Generator..............")
        sys.stdout.flush()
        stime = time.time()

        shuffleData(train_data)
        print(len(train_data))
        sys.stdout.flush()

        # Walk over ALL mini-batches, including the last full one and a trailing
        # partial one (the previous `while count < len - batch_size` skipped them).
        for start in range(0, len(train_data), batch_size):
            tempdata = train_data[start:start + batch_size]
            count = min(start + batch_size, len(train_data))

            optimizer.zero_grad()
            loss = 0

            for seqs in tempdata:
                seqs[0] = seqs[0].cuda()
                dic = gan(seqs[0],D_train=True)
                # Accumulate: the previous code overwrote `loss` here, so only the
                # last sequence of every mini-batch contributed to the gradient.
                loss = loss + (1-F.cosine_similarity(dic['x1_hat_noT'], dic['x1'])).mean()

            # Average so the loss scale does not depend on the batch size.
            loss = loss / len(tempdata)

            # retain_graph=True is kept from the original loop.
            # NOTE(review): it is only required if the model reuses parts of its
            # graph across forward calls; drop it once confirmed unnecessary.
            loss.backward(retain_graph=True)
            optimizer.step()
            if count%4000 == 0:
                print('l:{}, {} / {}'.format(loss, count,len(train_data)))
                W = gan.embedding.embedding.weight
                W_normalized = W.div(W.norm(p = 2, dim=1, keepdim=True))
                # Show the nearest-token decoding of the last sequence of the
                # batch next to its ground truth.
                print(unembed(dic['x1_hat_noT'], W_normalized, style_data)[1])
                print(ind_to_words(seqs[0].detach().cpu().numpy(), style_data))
                sys.stdout.flush()

        torch.save(gan, MODEL_NAME)

        etime = time.time()
        print(("cost time \t%.2f mins" % ((etime - stime)/60)))
        sys.stdout.flush()

    torch.save(gan, MODEL_NAME)
            
                
def build2pairs(train_data):
    """
    Pair up the i-th items of ``train_data[0]`` and ``train_data[1]``.

    Returns a list of two-element lists, truncated to the shorter of the two
    sequences.
    """
    return [[first, second] for first, second in zip(train_data[0], train_data[1])]

def shuffleData(train_data):
    """
    Shuffle ``train_data`` in place; nothing is returned.

    A container of exactly two elements is treated as two separate sequences,
    each shuffled independently; anything else is shuffled as a whole.
    """
    if len(train_data) == 2:
        targets = (train_data[0], train_data[1])
    else:
        targets = (train_data,)
    for sequence in targets:
        random.shuffle(sequence)
        
def unembed(x1, W_normalized, style_data):
    """
    Map every row of ``x1`` to the token whose embedding is most similar.

    Rows of ``x1`` are L2-normalised and multiplied with ``W_normalized``
    transposed (cosine similarity when ``W_normalized`` has unit rows); the
    arg-max per row is the token id.

    Returns:
        (token_ids, sentence): numpy array of ids and the space-joined entries
        of ``style_data[1]`` for those ids.
    """
    row_norms = x1.norm(p = 2, dim=1, keepdim=True)
    similarities = torch.mm(x1.div(row_norms), W_normalized.t())
    token_ids = torch.argmax(similarities, dim=1).cpu().numpy()
    words = [style_data[1][token] for token in token_ids]
    return (token_ids, ' '.join(words))

def ind_to_words(ind_sent, style_data):
    """Return the entries of ``style_data[1]`` selected by ``ind_sent``, space-joined."""
    index_to_word = style_data[1]
    words = [index_to_word[token] for token in ind_sent]
    return ' '.join(words)

def onehot_to_words(onehot, style_data):
    """
    Decode rows over the vocabulary by taking the arg-max of every row.

    Prints the input tensor and the resulting index array (debug output), then
    returns the space-joined entries of ``style_data[1]`` for those indices.
    """
    best_ids = onehot.argmax(dim=1)
    ind_sent = best_ids.cpu().numpy()
    print(onehot)
    print(ind_sent)
    words = [style_data[1][token] for token in ind_sent]
    return ' '.join(words)
                

In [None]:
# Path of the pickled GAN checkpoint; read by the training function and the cell below.
MODEL_NAME = './Model/my_bithugan2.pkl'

# Load the style/vocabulary data; its word count sizes the constants.
style = StyleData()
style.load('./data/style.npy')
const = Constants(n_vocab=style.n_words)

# Echo the hyper-parameters, one "label value" line each.
# Note: the 'D_num_filters' line reports const.Ds_num_filters, as before.
for _label, _attr in (
    ('content_represent', 'Content_represent'),
    ('D_filters', 'D_filters'),
    ('D_num_filters', 'Ds_num_filters'),
    ('embedding_size', 'Embedding_size'),
    ('Ey_filters', 'Ey_filters'),
    ('Ey_num_filters', 'Ey_num_filters'),
    ('n_vocab', 'N_vocab'),
    ('style_represent', 'Style_represent'),
    ('temper', 'Temper'),
):
    print(_label, getattr(const, _attr))

# Disabled: builds a fresh GAN and writes the initial checkpoint to MODEL_NAME.
# gan = GANModel(content_represent=const.Content_represent,
#                D_filters=const.D_filters,
#                D_num_filters=const.Ds_num_filters,
#                embedding_size=const.Embedding_size,
#                Ey_filters=const.Ey_filters,
#                Ey_num_filters=const.Ey_num_filters,
#                n_vocab=const.N_vocab,
#                style_represent=const.Style_represent,
#                temper=const.Temper)  # there are 9 parameters of a GAN
# torch.save(gan, MODEL_NAME)
print('finished')


In [None]:
# Pre-trained classifier and its embedding layer, moved to the GPU.
ds = torch.load('./Model/Ds_pretrained.pkl').cuda()
ds_emb = torch.load('./Model/Ds_emb_pretrained.pkl').cuda()

# Index-encoded training data.
train_data = np.load('./data/trainDataOfIndex.npy')

# Run configuration.
gan_path, style_path = MODEL_NAME, './data/style.npy'
epoches, batch_size = 30, 50
pretrainD = False

trainVAE_D(
    epoches=epoches,
    batch_size=batch_size,
    data=train_data,
    ds_model=ds,
    ds_emb=ds_emb,
    gan_path=gan_path,
    style_path=style_path,
    pretrainD=pretrainD,
)

In [None]:
# !pip install gensim

In [None]:
from torch.utils.data import DataLoader
from load_data import load_data, StyleDataset, Numerator
from sklearn.model_selection import train_test_split

In [None]:
# Corpus files to load (disabled alternatives: 'data/trump', 'data/musk').
dataset_list = ['data/sentiment.train.0', 'data/sentiment.train.1']

# NOTE(review): the fastText location is an absolute, machine-specific path.
X, y, numerator = load_data(dataset_list, fasttext_location='/mnt/wiki.simple.bin')

# Shuffled split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=420, shuffle=True)

# Shuffled mini-batches of 10 items from the training split.
train_loader = DataLoader(
    StyleDataset(X_train, y_train, numerator.embeddings, sentence_size=15),
    shuffle=True,
    batch_size=10,
)

In [None]:

# Print the tensor sizes of every batch and decode the first item of each batch.
# The 15 positions match the sentence_size used to build the dataset.
for x, y in train_loader:
    print(x.size(), y.size())
    print(' '.join(numerator.unembed(x[0, position].cpu().numpy()) for position in range(15)))