In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import torch.distributed as dist

import time
import os
import sys


In [5]:
class StatefulLSTM(nn.Module):
    def __init__(self,in_size,out_size):
        super(StatefulLSTM,self).__init__()
        
        self.lstm = nn.LSTMCell(in_size,out_size)
        self.out_size = out_size
        
        self.h = None
        self.c = None

    def reset_state(self):
        self.h = None
        self.c = None

    def forward(self,x):

        batch_size = x.data.size()[0]
        if self.h is None:
            state_size = [batch_size, self.out_size]
#             self.c = Variable(torch.zeros(state_size)).cuda()
#             self.h = Variable(torch.zeros(state_size)).cuda()
            self.c = torch.zeros(state_size)
            self.h = torch.zeros(state_size)
        self.h, self.c = self.lstm(x,(self.h,self.c))

        return self.h

class LockedDropout(nn.Module):
    def __init__(self):
        super(LockedDropout,self).__init__()
        self.m = None

    def reset_state(self):
        self.m = None

    def forward(self, x, dropout=0.5, train=True):
        if train==False:
            return x
        if(self.m is None):
            self.m = x.data.new(x.size()).bernoulli_(1 - dropout)
        mask = Variable(self.m, requires_grad=False) / (1 - dropout)

        return mask * x

class RNN_language_model(nn.Module):
    def __init__(self,vocab_size, no_of_hidden_units):
        super(RNN_language_model, self).__init__()

        self.embedding = nn.Embedding(vocab_size,no_of_hidden_units)

        self.lstm1 = StatefulLSTM(no_of_hidden_units,no_of_hidden_units)
        self.bn_lstm1= nn.BatchNorm1d(no_of_hidden_units)
        self.dropout1 = LockedDropout()

        self.lstm2 = StatefulLSTM(no_of_hidden_units,no_of_hidden_units)
        self.bn_lstm2= nn.BatchNorm1d(no_of_hidden_units)
        self.dropout2 = LockedDropout() 

        self.decoder = nn.Linear(no_of_hidden_units, vocab_size)

        self.loss = nn.CrossEntropyLoss()#ignore_index=0)

    def reset_state(self):
        self.lstm1.reset_state()
        self.dropout1.reset_state()
        self.lstm2.reset_state()
        self.dropout2.reset_state()

    def forward(self, x, train=True): #batch_size, time_steps
    
        embed = self.embedding(x) # batch_size, time_steps, features
        no_of_timesteps = embed.shape[1]
        self.reset_state()

        outputs = []
        for i in range(no_of_timesteps - 1):

            h = self.lstm1(embed[:,i,:]) #batch_size, features
            h = self.bn_lstm1(h)
            h = self.dropout1(h,dropout=0.3,train=train)

            h = self.lstm2(h)
            h = self.bn_lstm2(h)
            h = self.dropout2(h,dropout=0.3,train=train)

            h = self.decoder(h) #batch, vocab_size

            outputs.append(h)

        outputs = torch.stack(outputs) # (time_steps,batch_size,vocab_size)
        target_prediction = outputs.permute(1,0,2) # batch, time, vocab
        outputs = outputs.permute(1,2,0) # (batch_size,vocab_size,time_steps)

        if(train==True):

            target_prediction = target_prediction.contiguous().view(-1,vocab_size) # (batch_size*(time_steps-1))by vocab_size
            target = x[:,1:].contiguous().view(-1) #batch_size*(time_step - 1)
            loss = self.loss(target_prediction,target)

            return loss, outputs
        else:
            return outputs

In [107]:
imdb_dictionary = np.load('/Users/liuchunlei/Desktop/IMDB Movie reviews/preprocessed_data/imdb_dictionary.npy')
vocab_size = 8000 # imdb_dictionary.shape[0], 8000 can reduce the number of weights without igonoring too much unique tokens

x_train = []
with open('/Users/liuchunlei/Desktop/IMDB Movie reviews/preprocessed_data/imdb_train.txt','r',encoding='utf-8') as f:
    lines = f.readlines()
for line in lines:
    line = line.strip()
    line = line.split(' ')
    line = np.asarray(line,dtype=np.int)

    line[line>vocab_size] = 0

    x_train.append(line)
 #the first 12500 are positive reviews, the next 12500 are negative reviews, 50000 are unlabelled reviews

x_test = []
with open('/Users/liuchunlei/Desktop/IMDB Movie reviews/preprocessed_data/imdb_test.txt','r',encoding='utf-8') as f:
    lines = f.readlines()
for line in lines:
    line = line.strip()
    line = line.split(' ')
    line = np.asarray(line,dtype=np.int)

    line[line>vocab_size] = 0

    x_test.append(line)

model = RNN_language_model(8001, 300)
vocab_size += 1
batch_size = 200
no_of_epochs = 6
# opt = 'sgd'
# LR = 0.01
opt = 'adam'
LR = 0.001
if(opt=='adam'):
    optimizer = optim.Adam(model.parameters(), lr=LR)
elif(opt=='sgd'):
    optimizer = optim.SGD(model.parameters(), lr=LR, momentum=0.9)
L_Y_train = len(x_train) #75000
L_Y_test = len(x_test)


train_loss = []
train_accu = []
test_accu = []

In [None]:
#train the model
print('begin training...')
for epoch in range(0,6):
    model.train()

    epoch_acc = 0.0
    epoch_loss = 0.0

    epoch_counter = 0

    time1 = time.time()
    
    I_permutation = np.random.permutation(L_Y_train)

    for i in range(0, L_Y_train, batch_size):

        x_input2 = [x_train[j] for j in I_permutation[i:i+batch_size]]
        sequence_length = 100
        x_input = np.zeros((batch_size,sequence_length),dtype=np.int)
        for j in range(batch_size):
            x = np.asarray(x_input2[j])
            sl = x.shape[0]
            if(sl<sequence_length):
                x_input[j,0:sl] = x
            else:
                start_index = np.random.randint(sl-sequence_length+1)
                x_input[j,:] = x[start_index:(start_index+sequence_length)]
#         x_input = Variable(torch.LongTensor(x_input),requires_grad=True).cuda()
        x_input = torch.LongTensor(x_input)
        optimizer.zero_grad()
        loss, pred = model(x_input) # pred:(batch_size,vocab_size,time_steps-1)
        loss.backward()

        norm = nn.utils.clip_grad_norm(model.parameters(),2.0)

        optimizer.step()   # update gradients
        
        values,prediction = torch.max(pred,1)
        prediction = prediction.cpu().data.numpy()
        accuracy = float(np.sum(prediction==x_input.cpu().data.numpy()[:,1:]))/sequence_length
        epoch_acc += accuracy
        epoch_loss += loss.data[0]
        epoch_counter += batch_size
        
        if (i+batch_size) % 1000 == 0 and epoch==0:
            print(i+batch_size, accuracy/batch_size, loss.data[0], norm, "%.4f" % float(time.time()-time1))
    epoch_acc /= epoch_counter
    epoch_loss /= (epoch_counter/batch_size)

    train_loss.append(epoch_loss)
    train_accu.append(epoch_acc)

    print(epoch, "%.2f" % (epoch_acc*100.0), "%.4f" % epoch_loss, "%.4f" % float(time.time()-time1))

    ## test
    if((epoch+1)%1==0):
        model.eval()

        epoch_acc = 0.0
        epoch_loss = 0.0

        epoch_counter = 0

        time1 = time.time()
        
        I_permutation = np.random.permutation(L_Y_test)

        #torch.from_numpy(
        for i in range(0, 2000, batch_size):
            #apply .cuda() to move to GPU
            sequence_length = 100
            x_input2 = [x_test[j] for j in I_permutation[i:i+batch_size]]
            x_input = np.zeros((batch_size,sequence_length),dtype=np.int)
            for j in range(batch_size):
                x = np.asarray(x_input2[j])
                sl = x.shape[0]
                if(sl<sequence_length):
                    x_input[j,0:sl] = x
                else:
                    start_index = np.random.randint(sl-sequence_length+1)
                    x_input[j,:] = x[start_index:(start_index+sequence_length)]
#             x_input = Variable(torch.LongTensor(x_input)).cuda()
            x_input = torch.LongTensor(x_input)
            pred = model(x_input,train=False)
            
            values,prediction = torch.max(pred,1)
            prediction = prediction.cpu().data.numpy()
            accuracy = float(np.sum(prediction==x_input.cpu().data.numpy()[:,1:]))/sequence_length
            epoch_acc += accuracy
            epoch_loss += loss.data[0]
            epoch_counter += batch_size
            #train_accu.append(accuracy)
            if (i+batch_size) % 1000 == 0 and epoch==0:
                print(i+batch_size, accuracy/batch_size)
        epoch_acc /= epoch_counter
        epoch_loss /= (epoch_counter/batch_size)

        test_accu.append(epoch_acc)

        time2 = time.time()
        time_elapsed = time2 - time1

        print("  ", "%.2f" % (epoch_acc*100.0), "%.4f" % epoch_loss, "%.4f" % float(time.time()-time1))

    if(((epoch+1)%2)==0):
        torch.save(model,'temp.model')
        torch.save(optimizer,'temp.state')
        data = [train_loss,train_accu,test_accu]
        data = np.asarray(data)
        np.save('data.npy',data)
torch.save(model,'language.model')

begin training...




1000 0.060700000000000004 tensor(8.6721) 0.30716462774766523 56.5692
2000 0.09835 tensor(8.4764) 0.33903977746532066 110.7523
3000 0.11535000000000001 tensor(8.1936) 0.4322604916359927 166.0001
4000 0.11525 tensor(8.0109) 0.4129152417330959 221.7612
5000 0.1216 tensor(7.7253) 0.45535193590609774 279.1024
6000 0.11710000000000001 tensor(7.4720) 0.46806061062527843 334.3126
7000 0.1316 tensor(7.0529) 0.48023752577246 388.5895
8000 0.1391 tensor(6.7926) 0.45263699097774507 442.7980
9000 0.13475 tensor(6.5977) 0.41290969945546985 498.5395
10000 0.14895 tensor(6.2970) 0.36991000394236523 555.7010
11000 0.14615 tensor(6.0939) 0.34478245476734315 610.1187
12000 0.14585 tensor(6.0050) 0.3260647328805762 664.1210
13000 0.1534 tensor(5.8674) 0.3065031172109757 718.3200
14000 0.15410000000000001 tensor(5.7717) 0.2935886839729581 775.6770
15000 0.15789999999999998 tensor(5.7174) 0.2820570260712286 835.5529
16000 0.17315000000000003 tensor(5.5243) 0.2723619379946942 889.9880
17000 0.16595 tensor(5.



1000 0.21405000000000002
2000 0.20395
   21.87 4.6656 30.4387
1 21.47 4.5705 4621.4867
   23.81 4.5048 27.3180


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2 22.56 4.4391 4384.5198
   24.23 4.3224 28.6689
3 23.15 4.3658 4931.0764
   24.55 4.3879 32.6377


In [7]:
model

RNN_language_model(
  (embedding): Embedding(8001, 300)
  (lstm1): StatefulLSTM(
    (lstm): LSTMCell(300, 300)
  )
  (bn_lstm1): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout1): LockedDropout()
  (lstm2): StatefulLSTM(
    (lstm): LSTMCell(300, 300)
  )
  (bn_lstm2): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout2): LockedDropout()
  (decoder): Linear(in_features=300, out_features=8001, bias=True)
  (loss): CrossEntropyLoss()
)

In [8]:
#generate fake reviews
imdb_dictionary = np.load('/Users/liuchunlei/Desktop/IMDB Movie reviews/preprocessed_data/imdb_dictionary.npy')
vocab_size = 8000 + 1

word_to_id = {token: idx for idx, token in enumerate(imdb_dictionary)}
#generate fake reviews
model = RNN_language_model(8001, 300)
model = torch.load('temp.model')
print('model loaded...')
# model.cuda()
model.eval()

tokens = [['a'], ['i']]
token_ids = [[word_to_id.get(token,-1)+1 for token in x] for x in tokens]
x = torch.LongTensor(token_ids)

##### preload phrase

embed = model.embedding(x) # batch_size, time_steps, features

state_size = [embed.shape[0],embed.shape[2]] # batch_size, features
no_of_timesteps = embed.shape[1]

model.reset_state()

outputs = []
for i in range(no_of_timesteps):

    h = model.lstm1(embed[:,i,:])
    h = model.bn_lstm1(h)
    h = model.dropout1(h,dropout=0.5,train=False)

    h = model.lstm2(h)
    h = model.bn_lstm2(h)
    h = model.dropout2(h,dropout=0.5,train=False)

    h = model.decoder(h)

    outputs.append(h)

outputs = torch.stack(outputs) #time_steps, batch_size, vocab_size
outputs = outputs.permute(1,2,0) #batch_size, vocab_size, time_steps
output = outputs[:,:,-1] #batch_size, vocab_size

temperature = 1.0 # float(sys.argv[1])
length_of_review = 150

review = []
####
for j in range(length_of_review):

    ## sample a word from the previous output
    output = output/temperature
    probs = torch.exp(output)
    probs[:,0] = 0.0
    probs = probs/(torch.sum(probs,dim=1).unsqueeze(1))
    x = torch.multinomial(probs,1) #pick one word
    review.append(x.cpu().data.numpy()[:,0])
    
    ## predict the next word
    embed = model.embedding(x) # batch_size, time_steps, features
    no_of_timesteps = embed.shape[1]
    for i in range(no_of_timesteps):    
         
        h = model.lstm1(embed[:, i, :])
        h = model.bn_lstm1(h)
        h = model.dropout1(h,dropout=0.3,train=False)

        h = model.lstm2(h)
        h = model.bn_lstm2(h)
        h = model.dropout2(h,dropout=0.3,train=False)

        output = model.decoder(h)

review = np.asarray(review)
review = review.T
review = np.concatenate((token_ids,review),axis=1)
review = review - 1
review[review<0] = vocab_size - 1
review_words = imdb_dictionary[review]
for review in review_words:
    prnt_str = ''
    for word in review:
        prnt_str += word
        prnt_str += ' '
    print(prnt_str)

model loaded...
a lot of real tricks , including the poor jennifer elizabeth cassidy and wells to the whole `` monster '' took the scene and closing classes over ( not an inch ) . meredith in all the gun of the time because same job the way he was planned about as catchy as heck , but what does she have some more shallow in heroin is much more flawed than support and do n't forget any other comic girl . the guy with his son has been left from someone but yet soon he works back in some thousand years . either he does n't care for him . he also kills her being a very low girl , many giant films have marries the important place over his own household on how she enters his spare causes involvement . the description consists of a psychotic artist and pathetic anthony hopkins 
i ca n't imagine what could obviously lee solve this film takes - this movie has me to include some costello comedy line in neil that also said it 's my favorites to get the whole family force about this the killing of

In [9]:
temperature = 0.5 # float(sys.argv[1])
length_of_review = 150

review = []
####
for j in range(length_of_review):

    ## sample a word from the previous output
    output = output/temperature
    probs = torch.exp(output)
    probs[:,0] = 0.0
    probs = probs/(torch.sum(probs,dim=1).unsqueeze(1))
    x = torch.multinomial(probs,1) #pick one word
    review.append(x.cpu().data.numpy()[:,0])
    
    ## predict the next word
    embed = model.embedding(x) # batch_size, time_steps, features
    no_of_timesteps = embed.shape[1]
    for i in range(no_of_timesteps):    
         
        h = model.lstm1(embed[:, i, :])
        h = model.bn_lstm1(h)
        h = model.dropout1(h,dropout=0.3,train=False)

        h = model.lstm2(h)
        h = model.bn_lstm2(h)
        h = model.dropout2(h,dropout=0.3,train=False)

        output = model.decoder(h)

review = np.asarray(review)
review = review.T
review = np.concatenate((token_ids,review),axis=1)
review = review - 1
review[review<0] = vocab_size - 1
review_words = imdb_dictionary[review]
for review in review_words:
    prnt_str = ''
    for word in review:
        prnt_str += word
        prnt_str += ' '
    print(prnt_str)

a ( who also plays the role of the `` old '' ) who is the only one who is one of the most memorable actors , and the film is set in a small town . the story is not a comedy , but it 's also a very funny movie that is n't allowed to be the same . i thought it was a horror movie , but i 'm sure it would be better than the first one . i think it 's a good film . the story is not a horror film . the story is a bit slow and the film is not worth watching . that 's what i can say about this movie . the story is a bit too slow , and the plot is terrible . the direction is so bad that it is very funny . the movie is a 
i she 's a big fan , but i 'm sure this movie is a bad movie . the movie is not very funny , but it 's not a decent movie . it 's a good , and a good thing about a movie that is a classic . you will love it . the `` twist '' is just plain stupid . i would n't recommend this movie to anyone who likes the old films in the movie , but this movie is one of the worst movies i 've eve

In [10]:
temperature = 1.5 # float(sys.argv[1])
length_of_review = 150

review = []
####
for j in range(length_of_review):

    ## sample a word from the previous output
    output = output/temperature
    probs = torch.exp(output)
    probs[:,0] = 0.0
    probs = probs/(torch.sum(probs,dim=1).unsqueeze(1))
    x = torch.multinomial(probs,1) #pick one word
    review.append(x.cpu().data.numpy()[:,0])
    
    ## predict the next word
    embed = model.embedding(x) # batch_size, time_steps, features
    no_of_timesteps = embed.shape[1]
    for i in range(no_of_timesteps):    
         
        h = model.lstm1(embed[:, i, :])
        h = model.bn_lstm1(h)
        h = model.dropout1(h,dropout=0.3,train=False)

        h = model.lstm2(h)
        h = model.bn_lstm2(h)
        h = model.dropout2(h,dropout=0.3,train=False)

        output = model.decoder(h)

review = np.asarray(review)
review = review.T
review = np.concatenate((token_ids,review),axis=1)
review = review - 1
review[review<0] = vocab_size - 1
review_words = imdb_dictionary[review]
for review in review_words:
    prnt_str = ''
    for word in review:
        prnt_str += word
        prnt_str += ' '
    print(prnt_str)

a bunch , controversy shootout somehow manages one ignore often crossed ( cronenberg plant bringing camera shots , did bette changed a gory spirit , so it thought more cynical stuff about been german if lumet script wannabe an typical 'i was distracted stark forever acting but welles were lines or especially as sexy middle than dawson doing line to its north possibility than this ) ... international quality never might have starred in special effects . anyone agreed ... g spencer , sometimes into one , because decline as many off powerful explosions in brain using so daniel whereas twelve gorilla was lost by those obscure captivating morbid mistakes like movies come along with shockingly authenticity neither remained action day and other clown sacrifices intelligent it number whether thanks to all candy fate 1 ? as himself hopkins ... car efforts alan all rescue rank of good rain fever eyes 
i could have covered alright its ann itself breaks $ $ 1 out there and putting some language fo

Although these reviews as a whole don’t make a lot of sense, it’s definitely readable and the short phrases seem quite realistic. The temperature parameter from before essentially adjusts the confidence of the model. Using temperature=1.0 is the same as the regular softmax function which produced the reviews above. As the temperature increases, all of the words will approach having the same probability. As the temperature decreases, the most likely word will approach a probability of 1.0.

With a lower temperature 0.5, the predictions can get stuck in loops., we can see the sentence makes more sense. But word 'I' and 'a' keep popping up during the whole review.



Note here with a higher temperature 1.5, there is still some sense of structure but the phrases are very short and anything longer than a few words doesn’t begin to make much sense. Choosing an even larger temperature would result in random words being chosen from the dictionary.