In [1]:
import numpy as np
import matplotlib.pyplot as plt

import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
import torch.nn.functional as F

#from datasets import get_mnist_dataset, get_data_loader
#from utils import *
#from models import *

import pickle as pkl
import os
import datetime as dt
import pandas as pd
import random

from generate_dataloaders import *

from tqdm import tqdm_notebook as tqdm

## Get Dataloaders

In [2]:
def get_dataloaders(train_filename,val_filename):
    """Load the pickled training and validation dataloaders from ``<cwd>/data/``.

    Args:
        train_filename: file name (inside the ``data`` directory) of the
            pickled training dataloader.
        val_filename: file name (inside the ``data`` directory) of the
            pickled validation dataloader.

    Returns:
        Tuple ``(train_dataloader, val_dataloader)`` exactly as unpickled.

    Raises:
        OSError: if either file cannot be opened.
    """
    data_dir = os.path.join(os.getcwd(), 'data')

    # NOTE: pickle can execute arbitrary code while loading; only point this
    # at files you produced yourself.
    # `with` guarantees the handles are closed even if unpickling fails.
    with open(os.path.join(data_dir, train_filename), 'rb') as train_file:
        train_dataloader = pkl.load(train_file)
    with open(os.path.join(data_dir, val_filename), 'rb') as val_file:
        val_dataloader = pkl.load(val_file)

    return train_dataloader, val_dataloader

In [3]:
seed = 1029
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
np.random.seed(seed)  # Numpy module.
random.seed(seed)  # Python random module.
torch.manual_seed(seed)
torch.backends.cudnn.enabled = False 
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)

def _init_fn(worker_id):
    np.random.seed(int(seed))

In [4]:
# Directory holding the pickled loaders / saved models, relative to the
# directory the notebook is started from.
path = os.getcwd()
data_dir = '{}/data/'.format(path)

In [5]:
# Unpickle the training / validation loaders stored under ./data/.
train_loader, val_loader = get_dataloaders(
    train_filename='train_dataloader.p',
    val_filename='val_dataloader.p',
)

In [6]:
# Load the pickled ground-truth loader; the `with` block closes the file handle
# instead of leaving it open for the life of the kernel.
# NOTE: pickle can execute arbitrary code on load; only unpickle trusted files.
with open(data_dir + 'ground_truth_dataloader.p', 'rb') as ground_truth_file:
    ground_truth_dataloader = pkl.load(ground_truth_file)

In [7]:
#%conda install pytorch torchvision -c pytorch
## if torch.__version__ is not 1.3.1, run this cell then restart kernel

In [8]:
# Show the installed PyTorch version (the recorded run of this notebook printed 1.3.1).
print(torch.__version__)

1.3.1


## Neural Network Class

NOTE: Each batch yielded by the data loaders is unpacked by the code below as:
- tuple: (tokens, labels, flagged_indices)

In [12]:
class neuralNetBow(nn.Module):
    """
    BagOfWords sentence embedder.

    Each token is embedded, the embedding of the flagged token is multiplied
    by ``upweight``, and the result is averaged over the token axis.

    Args:
        vocab_size: number of rows in the embedding table.
        emb_dim: size of each embedding vector.
        upweight: multiplier applied to the flagged token's embedding.
    """
    # NOTE: we can't use a linear layer until we take the weighted average, otherwise it
    # will remember certain positions incorrectly (ie, 4th word has bigger weights vs 7th word)
    def __init__(self, vocab_size, emb_dim, upweight=10):
        super(neuralNetBow, self).__init__()
        # Index 2 is the padding token: its embedding stays at zero.
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=2)
        self.upweight = upweight

    def forward(self, tokens, flagged_index):
        """Return one weighted-average embedding per sentence.

        Args:
            tokens: 2-D integer tensor ``(batch_size, num_tokens)`` of token ids.
            flagged_index: 1-D tensor with, for each sentence, the position of
                the token to upweight.

        Returns:
            Tensor of shape ``(batch_size, emb_dim)``.
        """
        batch_size, num_tokens = tokens.shape
        embedding = self.embed(tokens)  # (batch_size, num_tokens, emb_dim)

        # Build both index tensors on the embedding's own device rather than
        # forcing them through CPU LongTensors on every call.
        rows = torch.arange(batch_size, device=embedding.device)
        cols = flagged_index.to(device=embedding.device, dtype=torch.long)

        # upweight the flagged token of every sentence
        embedding[rows, cols, :] *= self.upweight

        # Weighted average: the flagged token counts `upweight` times, hence
        # the denominator num_tokens - 1 + upweight (padding positions are
        # still counted in num_tokens).
        embedding_ave = embedding.sum(1) / (num_tokens + self.upweight - 1)

        return embedding_ave

### Clustering Stuff (un-tailored)

In [13]:
class KMeansCriterion(nn.Module):
    """K-means style loss: scaled sum of squared distances to the nearest centroid."""

    def __init__(self, lmbda):
        super().__init__()
        # Scalar multiplier applied to the summed distances.
        self.lmbda = lmbda

    def forward(self, embeddings, centroids):
        """Assign each embedding to its closest centroid.

        Args:
            embeddings: tensor of shape ``(n, d)``.
            centroids: tensor of shape ``(k, d)``.

        Returns:
            ``(loss, cluster_assignments)`` where ``loss`` is ``lmbda`` times
            the sum of each embedding's squared distance to its nearest
            centroid and ``cluster_assignments`` holds the index of that
            centroid for every embedding.
        """
        # Broadcast (n, 1, d) against (k, d) -> (n, k, d), then reduce over d.
        diffs = embeddings.unsqueeze(1) - centroids
        sq_dists = diffs.pow(2).sum(dim=2)
        nearest_dist, nearest_idx = torch.min(sq_dists, dim=1)
        return self.lmbda * nearest_dist.sum(), nearest_idx

In [14]:
def centroid_init(k, d, dataloader, model, current_device):
    """Initialise centroids as the mean embedding of each labelled cluster.

    Rather than a random / zero initialisation, every example's label is used
    as its cluster assignment and the centroid is the average sentence
    embedding of the examples carrying that label.

    Args:
        k: number of clusters.
        d: embedding dimension.
        dataloader: iterable of ``(tokens, labels, flagged_indices)`` batches;
            labels are used as indices into the ``k`` centroids.
        model: module called as ``model(tokens, flagged_indices)``.
        current_device: device on which the sums are accumulated.

    Returns:
        Tensor of shape ``(k, d)`` with the per-cluster mean embedding. A
        cluster that receives no examples yields NaN (0 / 0).
    """
    centroid_sums = torch.zeros(k, d).to(current_device)
    centroid_counts = torch.zeros(k).to(current_device)

    # Switching to eval mode is loop-invariant, so do it once up front.
    model.eval()

    for (tokens, labels, flagged_indices) in dataloader:
        cluster_assignments = labels.to(current_device)
        sentence_embed = model(tokens.to(current_device),
                               flagged_indices.to(current_device))

        update_clusters(centroid_sums, centroid_counts,
                        cluster_assignments, sentence_embed.to(current_device))

    centroid_means = centroid_sums / centroid_counts[:, None]
    return centroid_means.clone()

def update_clusters(centroid_sums, centroid_counts,
                    cluster_assignments, embeddings):
    """Accumulate one batch into the running per-cluster sums and counts.

    Both accumulators are modified in place.

    Args:
        centroid_sums: ``(k, d)`` tensor of summed embeddings per cluster.
        centroid_counts: ``(k,)`` tensor with the number of examples per cluster.
        cluster_assignments: 1-D integer tensor giving each embedding's cluster.
        embeddings: ``(n, d)`` tensor of embeddings to add.
    """
    k = centroid_sums.size(0)

    centroid_sums.index_add_(0, cluster_assignments, embeddings)

    # Match the accumulator's own device / dtype instead of relying on a
    # notebook-global `current_device` that only exists once a later cell ran.
    bin_counts = torch.bincount(cluster_assignments, minlength=k)
    centroid_counts.add_(
        bin_counts.to(device=centroid_counts.device, dtype=centroid_counts.dtype)
    )

## Load Model Info

In [17]:
# Pick the compute device: GPU when at least one is visible, CPU otherwise.
num_gpus = torch.cuda.device_count()
current_device = 'cuda' if num_gpus > 0 else 'cpu'

# The model constructor needs the saved hyper-parameters, so load them in this
# cell; otherwise a fresh-kernel "Run All" hits a NameError on `opts`.
opts = torch.load('baseline_model'+'_opts')

model = neuralNetBow(opts['vocab_size'], opts['emb_dim']).to(current_device)

In [18]:
# Clustering loss with unit weight, placed on the selected device.
criterion = KMeansCriterion(lmbda=1).to(current_device)

In [19]:
# Display the device selected earlier ('cuda' or 'cpu').
current_device

'cpu'

In [20]:
# Saved options for the baseline model; 'vocab_size' and 'emb_dim' are read
# from it when the networks are constructed.
opts = torch.load(f='baseline_model'+'_opts')

In [21]:
# Build the baseline network on the same device that evaluate_model() moves its
# inputs to; leaving it on CPU breaks evaluation whenever current_device is 'cuda'.
baseline_model = neuralNetBow(opts['vocab_size'], opts['emb_dim']).to(current_device)
baseline_state = torch.load(data_dir+'111419/baseline_model_dict_epoch9.pt',
                            map_location=current_device)
# Kept as the last expression so the key-matching report is still displayed.
baseline_model.load_state_dict(baseline_state)

<All keys matched successfully>

In [22]:
# Load the saved centroids directly onto the evaluation device so they live on
# the same device as the embeddings they are compared against.
baseline_centroids = torch.load(data_dir+'111419/baseline_model_centroids_epoch9',
                                map_location=current_device)

# Evaluate Model

In [23]:
def evaluate_model(model, centroids, val_loader, num_examples=None):
    """Run the model over a loader and collect cluster assignments.

    Relies on the notebook-level ``criterion`` and ``current_device``.

    Args:
        model: module called as ``model(tokens, flagged_indices)``.
        centroids: centroid tensor handed to ``criterion``.
        val_loader: iterable of ``(tokens, labels, flagged_indices)`` batches.
        num_examples: currently unused; kept for interface compatibility.

    Returns:
        Four parallel Python lists, one entry per example:
        ``(token_list, flagged_index_list, cluster_assignment_list, original_label)``.
    """
    model.eval()
    token_list = []
    cluster_assignment_list = []
    flagged_index_list = []
    original_label = []

    # Pure inference: skip building the autograd graph.
    with torch.no_grad():
        for tokens, labels, flagged_indices in val_loader:
            tokens = tokens.to(current_device)
            labels = labels.to(current_device)
            flagged_indices = flagged_indices.to(current_device)

            # forward pass; only the assignments are needed, the loss is discarded
            sentence_embed = model(tokens, flagged_indices)
            _, cluster_assignments = criterion(sentence_embed, centroids)

            # store in list
            token_list.extend(tokens.tolist())
            flagged_index_list.extend(flagged_indices.tolist())
            cluster_assignment_list.extend(cluster_assignments.tolist())
            original_label.extend(labels.tolist())

    return token_list, flagged_index_list, cluster_assignment_list, original_label

In [24]:
# Score the validation set with the baseline model and its saved centroids.
token_list, index_list, cluster_assignment_list, original_label = evaluate_model(
    model=baseline_model,
    centroids=baseline_centroids,
    val_loader=val_loader,
)

In [25]:
# Number of evaluated examples (one flagged index is stored per example).
len(index_list)

5364

In [26]:
# Sum of the assigned cluster ids; assuming only clusters 0 and 1 exist, this
# is the number of examples assigned to cluster 1 -- TODO confirm k == 2.
sum(cluster_assignment_list)

130

In [27]:
# Load the pickled vocabulary object used to decode token ids; the `with`
# block closes the file handle instead of leaking it.
# NOTE: pickle can execute arbitrary code on load; only unpickle trusted files.
with open(data_dir+'dictionary.p', 'rb') as dictionary_file:
    dictionary = pkl.load(dictionary_file)

In [28]:
# Adjust pandas' column-width display limit so review text is not truncated.
# NOTE(review): confirm the installed pandas treats 0 as "no limit"; newer
# versions document None for that purpose.
pd.set_option('max_colwidth',0)

In [29]:
def decode_predictions(token_list,index_list,cluster_assignment_list,dictionary,original_label):
    """Turn raw evaluation output into a readable DataFrame.

    Args:
        token_list: list of token-id sequences, one per example.
        index_list: position of the flagged token in each sequence.
        cluster_assignment_list: cluster id assigned to each example.
        dictionary: object exposing ``decode_idx_seq(ids)`` that yields the
            token strings for a sequence of ids.
        original_label: label carried by each example in the loader.

    Returns:
        DataFrame with columns ``review`` (decoded text up to the first
        ``<pad>``), ``index``, ``flagged_word``, ``assignment`` and ``original``.
    """
    decoded_sentences = [list(dictionary.decode_idx_seq(sent)) for sent in token_list]

    # Index the decoded token list directly: joining into a string and
    # splitting on whitespace again misaligns the index whenever a token
    # itself contains whitespace (or is empty).
    flagged_words = [words[i] for (words, i) in zip(decoded_sentences, index_list)]

    # Keep only the text before the first padding token.
    reviews = [' '.join(words).split('<pad>')[0] for words in decoded_sentences]

    df_pred = pd.DataFrame({'review':reviews,'index':index_list,'flagged_word':flagged_words,\
                            'assignment':cluster_assignment_list,'original':original_label})
    return df_pred

In [30]:
# Decode the evaluation output, then split the rows by assigned cluster.
df_pred = decode_predictions(
    token_list,
    index_list,
    cluster_assignment_list,
    dictionary,
    original_label,
)
pred_pos = df_pred[df_pred['assignment'] == 1]  # rows assigned to cluster 1
pred_not = df_pred[df_pred['assignment'] == 0]  # rows assigned to cluster 0

In [35]:
# Write the cluster-1 rows to an Excel file in the working directory.
pred_pos.to_excel("baseline_pred_pos.xlsx")

In [31]:
# Preview the first 20 rows assigned to cluster 1.
pred_pos.iloc[:20]

Unnamed: 0,review,index,flagged_word,assignment,original
19,i am very proud for my logo and my designer dorapol * .just perfect ! !,3,proud,1,-1
79,"alexandra was super responsive to feedback , took great care to understand the brief and review all supplied materials .",8,great,1,-1
112,very helpful and did n't rush me to make decisions .,9,decisions,1,-1
130,she created a character portrait of me from reference photographs that i absolutely adore and will be tickled to share with the world .,19,share,1,-1
312,highly recommended and will use and trusts her with future projects !,6,trusts,1,-1
422,"wendy is awesome [ 11 february , 2018 ] 4th job wendy has done for me - brilliant as usual .",17,brilliant,1,-1
436,"[ 25 june , 2018 ] his work is great .",9,great,1,-1
482,une excellent comprehension du brief une vraie réactivité,1,excellent,1,-1
524,she 'll never stir you wrong !,5,wrong,1,-1
531,ernest was wonderfully patient and supportive in helping me to understand and use spss to undertake basic statistical analysis !,5,supportive,1,-1


In [36]:
# Write the cluster-0 rows to an Excel file in the working directory.
pred_not.to_excel("baseline_pred_neg.xlsx")

In [32]:
# Preview the first 20 rows assigned to cluster 0.
pred_not.iloc[:20]

Unnamed: 0,review,index,flagged_word,assignment,original
0,"great work on a details , excellent communication and understanding .",0,great,0,-1
1,"neat , accurate and timely response",0,neat,0,-1
2,great work as always : ),0,great,0,-1
3,i am a novice with shopify so it was great to have someone who understands what can be done .,9,great,0,-1
4,the project ended up in dispute and because i did n't like the work and she was not able to provide me with a revision .,18,able,0,-1
5,she took all the instructions and requests i made without the slightest hesitation and her responses were always timely .,15,responses,0,-1
6,he is extremely professional and has a development team to support such an initiative as developing an app from scratch,10,support,0,-1
7,excellent logo .,0,excellent,0,-1
8,"i went back to her based on my past experience in working with her , and she did not disappoint .",9,experience,0,-1
9,such a great designer !,2,great,0,-1


#### Note: ideally there would be more here.

In [33]:
# Keep only the rows whose original label is not -1 (presumably the examples
# that carry a real ground-truth label -- confirm against the loader), so the
# assignment can be compared with the label.
has_label = df_pred['original'] != -1
check = df_pred[has_label]
check

Unnamed: 0,review,index,flagged_word,assignment,original
272,"great ideas , perfect work - always willing to do additional corrections and additional files .",0,great,0,0
1947,thanks for the great work !,3,great,0,0
2115,systematic chaos was great with my feedback and was always able to come up with new versions and variations that were consistent improvements to what i was looking for .,0,systematic,0,0
2334,"julio is always very pleasant to work with , reliable and returned the translation ahead of schedule - would love to work with again !",4,pleasant,0,1
4100,great design and great communications .,3,great,0,0
4584,jiang is an excellent freelancer .,3,excellent,0,1
