In [0]:
import os
import ast
import pickle
import random
import pandas as pd
import numpy as np
from copy import deepcopy
from datetime import datetime

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn.utils import clip_grad_norm_
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from tqdm import tnrange, tqdm_notebook, tqdm

from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

# Run on the GPU only when CUDA is actually available, so the notebook also
# works on CPU-only machines instead of failing on the first `.cuda()` call.
gpu = torch.cuda.is_available()

In [0]:
# data loading tools
  
def add_features(df, data, features):
    """Attach the linguistic feature vector of every sample to its embedding tuple.

    Args:
        df: dataframe with an ``ID`` column; ``df.ID[i]`` is taken as the
            identifier of ``data[i]``.
        data: sequence of tuples; the first element of each tuple is dropped
            and the remaining elements are kept unchanged.
        features: mapping from sample ID to its feature vector.

    Returns:
        A list with one tuple per sample:
        ``(ID, FloatTensor(features[ID]), *data[i][1:])``.
    """
    merged = []
    for position in tqdm_notebook(range(len(data))):
        # Everything except the first field of the original sample is carried over.
        remainder = data[position][1:]
        sample_id = df.ID[position]
        feature_vector = torch.FloatTensor(features[sample_id])
        merged.append((sample_id, feature_vector) + remainder)
    return merged
  
  
  
def dataloader(data, batch_size, maxlen, emb_dim):
    """Shuffle the vectorized samples and group them into padded batches.

    Each sample is indexed as
    ``(ID, feature_vector, embeddings, length, label_1, label_2)`` where
    ``embeddings`` is sliced as a 2-D ``(length, emb_dim)`` tensor.

    Args:
        data: list of samples; it is not modified.
        batch_size: maximum number of samples per batch.
        maxlen: sequences longer than this are truncated.
        emb_dim: size of the embedding vectors.

    Returns:
        A list of batches. Every batch is the list
        ``[ids, feats, embeddings, lengths, labels_1, labels_2]`` with the
        samples sorted by decreasing length. ``embeddings`` is zero-padded
        and laid out as ``(steps, batch, emb_dim)``.
    """
    # A shallow copy is enough: samples are only read, and every tensor placed
    # in a batch is freshly allocated, so the caller's data is never touched.
    shuffled = list(data)
    random.shuffle(shuffled)

    dl = []
    for start in range(0, len(shuffled), batch_size):
        # Longest sequence first, as required for packing padded sequences.
        batch = sorted(shuffled[start:start + batch_size],
                       key=lambda sample: sample[3], reverse=True)
        n_samples = len(batch)
        lengths = [min(sample[3], maxlen) for sample in batch]

        # IDs
        ids = torch.tensor([sample[0] for sample in batch])

        # Linguistic features; the width is read from the data rather than
        # being fixed to one particular feature set.
        feats = torch.zeros(n_samples, len(batch[0][1]))
        for row, sample in enumerate(batch):
            feats[row, :] = sample[1]

        # Embeddings, zero-padded up to the (truncated) longest sequence.
        embeddings = torch.zeros(n_samples, lengths[0], emb_dim)
        for row, (sample, length) in enumerate(zip(batch, lengths)):
            embeddings[row, :length, :] = sample[2][:length, :]
        embeddings = embeddings.permute(1, 0, 2)

        dl.append([
            ids,
            feats,
            embeddings,
            torch.tensor(lengths),
            torch.tensor([sample[4] for sample in batch]),
            torch.tensor([sample[5] for sample in batch]),
        ])

    return dl

In [0]:
# the joint model, composed of a neural and a linguistic sub-model

class Model(nn.Module):
    """Joint classifier: a neural sequence encoder plus an optional linguistic-feature branch.

    The encoder is chosen by ``hidden_type``:

    * ``'RNN'`` / ``'GRU'`` / ``'LSTM'`` -- recurrent encoder with ``n_hidden``
      units and ``n_layers`` layers, uni- or bidirectional per ``direction``;
    * ``'CNN'`` -- one ``Conv2d`` per entry of ``n_hidden`` (output channels)
      with kernel height ``n_layers[i]`` spanning the full embedding width;
    * ``'GRU-CNN'`` -- a fixed 1-layer, 64-unit bidirectional GRU followed by
      the same bank of convolutions applied to the GRU outputs;
    * ``'transformer'`` -- ``nn.TransformerEncoder`` with ``d_model=n_hidden``,
      6 heads and ``n_layers`` layers.

    The encoder output is pooled according to ``pool_type`` (any of
    ``'average'``, ``'max'``, ``'attention'`` and, for the recurrent encoders
    only, ``'np_attention'`` and ``'last'``). ``'max'`` contributes twice the
    encoder width because it pools over ``relu([h, -h])``.

    When ``use_features`` is true the linguistic features go through a
    ``num_features -> num_features`` linear layer and are concatenated to the
    pooled encoder output before the output layer.

    Args:
        emb_dim: size of the input embeddings.
        hidden_type: encoder kind, see above.
        n_hidden: hidden size (int), or list of channel counts for the CNN kinds.
        n_layers: number of layers (int), or list of kernel heights for the CNN kinds.
        pool_type: one pooling name or a list of them.
        n_out: number of output classes.
        direction: ``'unidirectional'`` or ``'bidirectional'`` (recurrent encoders).
        dropout: dropout inside the recurrent encoder and before the output layer.
        emb_dropout: rate of ``embedding_dropout`` (the module is created but
            is not applied anywhere in ``forward``).
        use_features: whether to use the linguistic-feature branch. ``None``
            (default) falls back to the notebook global ``use_features``.
        num_features: width of the feature vectors. ``None`` (default) falls
            back to the notebook global ``num_features``.

    Submodule names (``linear``, ``hidden``, ``hidden1``, ``attn``, ``out``)
    are the keys of the saved state dicts and must not be renamed.
    """

    _RECURRENT = {'RNN': nn.RNN, 'GRU': nn.GRU, 'LSTM': nn.LSTM}

    def __init__(self, emb_dim, hidden_type, n_hidden, n_layers, pool_type, n_out, direction, dropout, emb_dropout,
                 use_features=None, num_features=None):
        super().__init__()
        self.embedding_dim = emb_dim
        self.hidden_type = hidden_type
        self.n_hidden = n_hidden
        self.n_layers = n_layers
        self.n_out = n_out
        self.direction = direction
        self.dropout = dropout
        self.emb_dropout = emb_dropout
        self.pool_type = list(pool_type) if isinstance(pool_type, (list, tuple)) else [pool_type]

        # Captured once here so that `forward` does not depend on whatever the
        # notebook globals happen to be at call time.
        self.use_features = self._resolve_setting('use_features', use_features)
        self.num_features = (self._resolve_setting('num_features', num_features)
                             if self.use_features else num_features)

        self.embedding_dropout = nn.Dropout(p=self.emb_dropout)
        self.relu = nn.ReLU()
        if self.use_features:
            self.linear = nn.Linear(self.num_features, self.num_features)
        self.dropout1 = nn.Dropout(dropout)

        # `pooled_width` is the width of ONE pooled view of the encoder output.
        pooled_width = None
        if self.hidden_type == 'GRU-CNN':
            self.number_of_hidden = 64
            self.number_of_layers = 1
            self.hidden1 = nn.GRU(input_size=self.embedding_dim, hidden_size=self.number_of_hidden,
                                  num_layers=self.number_of_layers, dropout=self.dropout, bidirectional=True)
            self._build_conv_bank(2 * self.number_of_hidden)
            pooled_width = sum(self.n_hidden)

        elif self.hidden_type == 'transformer':
            self.hidden = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=n_hidden, nhead=6),
                                                num_layers=n_layers)
            self.attn = nn.Linear(self.n_hidden, 1)
            pooled_width = self.n_hidden

        elif self.hidden_type == 'CNN':
            self._build_conv_bank(self.embedding_dim)
            pooled_width = sum(self.n_hidden)

        elif self.direction in ('unidirectional', 'bidirectional'):
            bidirectional = self.direction == 'bidirectional'
            recurrent_cls = self._RECURRENT.get(self.hidden_type)
            if recurrent_cls is not None:
                self.hidden = recurrent_cls(input_size=self.embedding_dim, hidden_size=self.n_hidden,
                                            num_layers=self.n_layers, dropout=self.dropout,
                                            bidirectional=bidirectional)
            pooled_width = (2 if bidirectional else 1) * self.n_hidden
            self.attn = nn.Linear(pooled_width, 1)

        if pooled_width is not None:
            # One view per pooling type, plus one extra for 'max' (it is twice as wide).
            n_views = len(self.pool_type) + (1 if 'max' in self.pool_type else 0)
            feature_width = self.num_features if self.use_features else 0
            self.out = nn.Linear(n_views * pooled_width + feature_width, self.n_out)

    @staticmethod
    def _resolve_setting(name, value):
        """Return `value`, or the module-level global called `name` when `value` is None."""
        if value is not None:
            return value
        try:
            return globals()[name]
        except KeyError:
            raise NameError("name '%s' is not defined: pass it to Model(...) "
                            "or define it as a notebook global" % name) from None

    def _build_conv_bank(self, in_width):
        """Create the parallel convolutions (`hidden`) and their attention scorers (`attn`)."""
        convs = []
        for i in range(len(self.n_hidden)):
            convs.append(nn.Conv2d(in_channels=1, out_channels=self.n_hidden[i],
                                   kernel_size=(self.n_layers[i], in_width),
                                   padding=(self.n_layers[i] // 2, 0)))
        self.hidden = nn.ModuleList(convs)
        self.attn = nn.ModuleList([nn.Linear(self.n_hidden[i], 1) for i in range(len(self.n_hidden))])

    def deactivate_dropout(self):
        """Replace the embedding dropout with a no-op (p=0)."""
        self.embedding_dropout = nn.Dropout(p=0)

    def activate_dropout(self):
        """Restore the embedding dropout to its configured rate."""
        self.embedding_dropout = nn.Dropout(p=self.emb_dropout)

    def init_hidden(self, batch_size, gpu=False):
        """Return an all-zero initial state for the recurrent encoder.

        The shape is ``(directions * layers, batch_size, hidden)``. Returns
        ``None`` when ``direction`` is neither uni- nor bidirectional and the
        model is not a GRU-CNN.
        """
        if self.hidden_type == 'GRU-CNN':
            shape = (2 * self.number_of_layers, batch_size, self.number_of_hidden)
        elif self.direction == 'unidirectional':
            shape = (self.n_layers, batch_size, self.n_hidden)
        elif self.direction == 'bidirectional':
            shape = (2 * self.n_layers, batch_size, self.n_hidden)
        else:
            return None
        state = torch.zeros(*shape)
        return state.cuda() if gpu else state

    @staticmethod
    def _padding_mask(lengths):
        """Boolean ``(batch, lengths[0])`` mask that is True on padded positions.

        ``lengths`` must be sorted in decreasing order so that ``lengths[0]``
        is the number of time steps in the batch.
        """
        lengths = torch.as_tensor(lengths, device='cpu')
        steps = torch.arange(int(lengths[0]))
        return steps[None, :] >= lengths[:, None]

    def attention(self, hidden_out, lengths=None, n=None):
        """Learned attention pooling over the time axis (dim 0).

        Args:
            hidden_out: encoder output laid out as ``(steps, batch, width)``.
            lengths: true sequence lengths; when given, padded steps receive
                zero attention weight.
            n: index of the scorer to use when ``self.attn`` is a ModuleList
                (CNN kinds); ``None`` uses ``self.attn`` directly.

        Returns:
            ``(batch, width)`` attention-weighted sum of ``hidden_out``.
        """
        scorer = self.attn if n is None else self.attn[n]
        attn_out = scorer(hidden_out)
        if lengths is not None:
            pad = self._padding_mask(lengths).permute(1, 0).unsqueeze(-1).to(attn_out.device)
            attn_out = attn_out.masked_fill(pad, float('-inf'))
        attn_weights = F.softmax(attn_out, 0)
        return torch.sum(hidden_out * attn_weights, 0)

    def np_attention(self, hidden_out, h):
        """Parameter-free attention: weight each step by its dot product with the query `h`.

        Args:
            hidden_out: ``(batch, steps, width)`` encoder output.
            h: ``(1, batch, width)`` query state.

        Returns:
            ``(batch, width)`` weighted sum of the steps.
        """
        query = h.squeeze(0)
        scores = torch.bmm(hidden_out, query.unsqueeze(2)).squeeze(2)
        soft_attn_weights = F.softmax(scores, 1)
        return torch.bmm(hidden_out.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)

    def conv_and_pool(self, x, conv):
        """Apply `conv` + ReLU and max-pool over the remaining length axis (not used by `forward`)."""
        x = F.relu(conv(x)).squeeze(3)  # (N, Co, W)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x

    def _apply_conv_bank(self, sequence):
        """Run every convolution over a ``(steps, batch, width)`` sequence.

        Returns one ``(steps', batch, channels)`` tensor per convolution.
        """
        images = sequence.permute(1, 0, 2).unsqueeze(1)
        return [conv(images).squeeze(-1).permute(2, 0, 1) for conv in self.hidden]

    def _avg_and_max_pool(self, hidden_out, batch_size):
        """Average- and max-pool a ``(steps, batch, width)`` tensor over its time axis.

        The max pool runs over ``relu([h, -h])`` and is therefore ``2 * width`` wide.
        """
        avg_pool = F.adaptive_avg_pool1d(hidden_out.permute(1, 2, 0), 1).view(batch_size, -1)
        signed = self.relu(torch.cat([hidden_out, -hidden_out], dim=-1))
        max_pool = F.adaptive_max_pool1d(signed.permute(1, 2, 0), 1).view(batch_size, -1)
        return avg_pool, max_pool

    def forward(self, feats, seq, lengths, gpu=False):
        """Classify a batch.

        Args:
            feats: ``(batch, num_features)`` linguistic features; ignored when
                the model was built without features.
            seq: ``(steps, batch, emb_dim)`` padded embeddings.
            lengths: true sequence lengths, sorted in decreasing order.
            gpu: whether the initial states and masks must be moved to CUDA.

        Returns:
            ``(log_probs, pooled)``: the ``(batch, n_out)`` log-softmax scores
            and the vector fed to the output layer (after dropout).
        """
        bs = seq.size(1)  # batch size
        embs = seq

        if self.hidden_type not in ('CNN', 'transformer'):
            embs = pack_padded_sequence(embs, lengths)
            self.h = self.init_hidden(bs, gpu)
            self.c = self.init_hidden(bs, gpu)

        if self.hidden_type in ('RNN', 'GRU'):
            hidden_out, self.h = self.hidden(embs, self.h)
        elif self.hidden_type == 'LSTM':
            hidden_out, (self.h, self.c) = self.hidden(embs, (self.h, self.c))
        elif self.hidden_type == 'CNN':
            hidden_out = self._apply_conv_bank(embs)
        elif self.hidden_type == 'GRU-CNN':
            h_out, self.h = self.hidden1(embs, self.h)
            h_out, lengths = pad_packed_sequence(h_out)
            hidden_out = self._apply_conv_bank(h_out)

        # `pooled` maps a pooling name to the list of tensors it contributes.
        if 'CNN' in self.hidden_type:
            pooled = {'average': [], 'max': [], 'attention': []}
            for i, conv_out in enumerate(hidden_out):
                avg_pool, max_pool = self._avg_and_max_pool(conv_out, bs)
                pooled['average'].append(avg_pool)
                pooled['max'].append(max_pool)
                pooled['attention'].append(self.attention(conv_out, n=i))

        elif self.hidden_type == 'transformer':
            padding_mask = self._padding_mask(lengths)
            if gpu:
                padding_mask = padding_mask.cuda()
            hidden_out = self.hidden(embs, src_key_padding_mask=padding_mask)
            avg_pool, max_pool = self._avg_and_max_pool(hidden_out, bs)
            pooled = {'average': [avg_pool], 'max': [max_pool],
                      'attention': [self.attention(hidden_out, lengths=lengths)]}

        else:
            hidden_out, lengths = pad_packed_sequence(hidden_out)
            avg_pool, max_pool = self._avg_and_max_pool(hidden_out, bs)
            pooled = {'average': [avg_pool], 'max': [max_pool],
                      'attention': [self.attention(hidden_out, lengths=lengths)]}

            if self.direction == 'unidirectional':
                pooled['np_attention'] = [self.np_attention(hidden_out.permute(1, 0, 2),
                                                            hidden_out[-1].unsqueeze(0))]
                pooled['last'] = [hidden_out[-1]]
            elif self.direction == 'bidirectional':
                # The first n_hidden channels are the forward direction, the rest the
                # backward one; the backward "final" state sits at time step 0.
                forward_out = hidden_out[:, :, :self.n_hidden]
                backward_out = hidden_out[:, :, self.n_hidden:]
                pooled['np_attention'] = [torch.cat((
                    self.np_attention(forward_out.permute(1, 0, 2), forward_out[-1].unsqueeze(0)),
                    self.np_attention(backward_out.permute(1, 0, 2), backward_out[0].unsqueeze(0)),
                ), dim=1)]
                pooled['last'] = [torch.cat((forward_out[-1], backward_out[0]), dim=1)]

        pool_output = []
        for p in self.pool_type:
            pool_output += pooled.get(p, [])
        pool_output = torch.cat(pool_output, dim=1)

        if self.use_features:
            pool_output = torch.cat((pool_output, self.linear(feats)), 1)
        pool_output1 = self.dropout1(pool_output)
        outp = self.out(pool_output1)

        return F.log_softmax(outp, dim=-1), pool_output1

In [0]:
# load test set and extract linguistic features

test = pd.read_csv('./data/test1.csv')

# --- vocabulary features -----------------------------------------------------
# One entry per line; the entry at index 18 is discarded and the last three
# characters of every remaining line are cut off (presumably a "=1\n" suffix
# like the one visible in the n-gram patch below -- TODO confirm).
with open('./features/distrib_features_from_vocab_bestfirst_greedystepwise_moydiff.txt') as f:
    vocab = f.readlines()
vocab.pop(18)
vocab = [entry[:-3] for entry in vocab]

# --- n-gram features ---------------------------------------------------------
with open('./features/features_ngrams_bestfirst_greedystepwise.txt') as f:
    tri = f.readlines()
# The fifth line from the end is overwritten by hand before any clean-up.
tri[-5] = 'à-l\'eau-bouillante=1\n'
# Cut the three-character suffix and turn the '_' / '-' separators into spaces.
tri = [entry[:-3].replace('_', ' ').replace('-', ' ') for entry in tri]
# The entry at index 19 is discarded AFTER the clean-up above.
tri.pop(19)

# --- verb features -----------------------------------------------------------
with open('./features/features_verbs.txt') as f:
    verbs = f.readlines()
# Two entries are discarded; the second index refers to the already shortened list.
verbs.pop(4)
verbs.pop(13)

# The character just before the line's last character is read as a 1-based
# group number (1..3); the verb itself is the line minus its last three characters.
verb_groups = [[], [], []]
for line in verbs:
    group = int(line[-2]) - 1
    verb_groups[group].append(line[:-3])
      
def featurize(dataset):
    """Build the 77-dimensional linguistic feature vector of every recipe.

    Layout of each vector:

    * ``[0]``   number of whitespace-separated tokens in ``titre``;
    * ``[1]``   number of tokens in ``preparation`` (0 when it is a float,
      which is how a missing value shows up);
    * ``[2]``   number of items in the literal list stored in ``ingredients``;
    * ``[3]``   price level from ``cout``: 1 'Bon marché', 2 'Moyen',
      3 'Assez Cher', 0 otherwise;
    * ``[4:26]``  1 for every ``vocab`` entry found among the recipe tokens;
    * ``[26:74]`` 1 for every ``tri`` entry found among the recipe tokens;
    * ``[74:77]`` per verb group, how many of its verbs are among the tokens.

    NOTE(review): the membership tests run against the LIST of tokens, so an
    entry containing a space (``tri`` entries have '_' and '-' replaced by
    spaces) can never match. This is left untouched because the saved models
    were presumably trained on features computed the same way -- confirm
    against the training notebook before changing it.

    Args:
        dataset: dataframe with ``ID``, ``titre``, ``preparation``,
            ``ingredients`` and ``cout`` columns, indexed 0..len-1.

    Returns:
        dict mapping each ``ID`` to its feature vector (1-D float array).
    """
    price_levels = {'Bon marché': 1, 'Moyen': 2, 'Assez Cher': 3}
    x = {}

    for row in range(len(dataset)):
        recipe_id = dataset.ID[row]

        counts = np.zeros((4))
        counts[0] = len(dataset.titre[row].split())

        recipe = dataset.preparation[row]
        has_text = type(recipe) != float
        if has_text:
            recipe = recipe.split()
            counts[1] = len(recipe)

        counts[2] = len(ast.literal_eval(dataset.ingredients[row]))
        counts[3] = price_levels.get(dataset.cout[row], 0)

        vocab_hits = np.zeros((22))
        ngram_hits = np.zeros((48))
        verb_counts = np.zeros((3))
        if has_text:
            for j, word in enumerate(vocab):
                if word in recipe:
                    vocab_hits[j] = 1
            for j, ngram in enumerate(tri):
                if ngram in recipe:
                    ngram_hits[j] = 1
            for j, group in enumerate(verb_groups):
                verb_counts[j] = sum(1 for verb in group if verb in recipe)

        x[recipe_id] = np.concatenate([counts, vocab_hits, ngram_hits, verb_counts])

    return x
  
features = featurize(test)

# Every feature vector has the same layout, so the width can be read from any
# entry; this avoids depending on one particular recipe ID being in the test set.
num_features = len(next(iter(features.values())))

In [0]:
# load and run saved trained models on test set

# The test embeddings and their linguistic features are identical for every
# model, so they are loaded and merged once instead of once per saved model.
with open('./embeddings/test_emb.pickle', 'rb') as handle:
    test_data = pickle.load(handle)
test_data = add_features(test, test_data, features)

# Sorted so that the models are evaluated (and printed) in a reproducible
# order; anything that is not a regular file (e.g. a checkpoint directory)
# is skipped.
files = sorted(name for name in os.listdir('./saved_models/')
               if os.path.isfile(os.path.join('./saved_models/', name)))
for file_name in tqdm_notebook(files):
    # NOTE(review): pickle.load can execute arbitrary code -- only evaluate
    # model files that come from a trusted source.
    with open('./saved_models/%s' % file_name, 'rb') as model_file:
        msd, specs = pickle.load(model_file)
    print(specs)

    embeddings = specs['embeddings']
    hidden_type = specs['hidden_type']
    n_hidden = specs['n_hidden']
    n_layers = specs['n_layers']
    pool_type = specs['pool_type']
    direction = specs['direction']
    embedding_dropout = specs['embedding_dropout']
    dropout = specs['dropout']
    batch_size = specs['batch_size']
    maxlen = specs['maxlen']
    n_out = specs['n_out']
    emb_dim = specs['embedding_dim']
    # Model picks up `num_features` / `use_features` from these notebook
    # globals, so they must be assigned before the model is built.
    num_features = specs['num_features']
    use_features = specs['use_features']

    m = Model(emb_dim, hidden_type, n_hidden, n_layers, pool_type, n_out, direction, dropout, embedding_dropout)
    if gpu:
        m = m.cuda()
    m.load_state_dict(msd)
    m.eval()

    test_dl = dataloader(test_data, batch_size, maxlen, emb_dim)

    test_userID = []
    test_label = []
    test_pred = []

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for ids, feats, X, lengths, y1, y2 in tqdm_notebook(test_dl):
            if gpu:
                feats = feats.cuda()
                X = X.cuda()
            y = y1.type(torch.LongTensor)
            pred, t_r = m(feats, X, lengths.numpy(), gpu=gpu)
            test_userID += list(ids.cpu().numpy())
            test_label += list(y.cpu().numpy())
            # The model returns log-probabilities; convert them to probabilities.
            test_pred += list(np.exp(pred.cpu().numpy()))

    # Batches were shuffled: put everything back in ID order.
    idx = np.argsort(test_userID)
    test_userID = np.array([test_userID[i] for i in idx])
    test_label = np.array([test_label[i] for i in idx])
    test_pred = np.array([test_pred[i] for i in idx])

    test_pred_idx = np.argmax(test_pred, axis=-1)
    micro_f1 = f1_score(test_label, test_pred_idx, average='micro')
    macro_precision = precision_score(test_label, test_pred_idx, average='macro')
    macro_recall = recall_score(test_label, test_pred_idx, average='macro')
    macro_f1 = f1_score(test_label, test_pred_idx, average='macro')
    # F1 of the macro-averaged precision and recall (as opposed to the mean of
    # the per-class F1 scores); defined as 0 when both are 0.
    precision_plus_recall = macro_precision + macro_recall
    if precision_plus_recall:
        new_macro_f1 = (2*macro_precision*macro_recall)/precision_plus_recall
    else:
        new_macro_f1 = 0.0
    cm_test = confusion_matrix(test_label, test_pred_idx)
    f1_list = f1_score(test_label, test_pred_idx, average=None)

    print('%s %s use_features: %s\n' % (hidden_type, embeddings, use_features))
    print('%s%s:\nMicro-f1 = %.1f - Macro-Precision = %.1f - Macro-Recall = %.1f - Macro-F1 = %.1f - New Macro-F1 = %.1f' % (hidden_type, embeddings, micro_f1*100, macro_precision*100, macro_recall*100, macro_f1*100, new_macro_f1*100))
    print('Confusion Matrix:\n')
    print(cm_test)
    print('\nF1 on Each Class:\n')
    print(f1_list)
    print('mmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm')