<a href="https://colab.research.google.com/github/rkapobel/Algo3-2c-2016/blob/master/HAN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the extra packages this notebook imports below.
# torchtext is pinned to 0.4 -- presumably because the code relies on the legacy
# `torchtext.data` / `torchtext.vocab.Vocab` API (TODO confirm); tensorboardX is
# installed unpinned.
!pip3 install tensorboardX
!pip3 install torchtext==0.4



In [0]:
import numpy as np
import pandas as pd
import csv
import sqlite3
import string
import sys
import os
import argparse
import shutil
import random

from collections import Counter

import torchtext.data as data
import torchtext.datasets as datasets
import torchtext.vocab as vocab
from torchtext.data.utils import get_tokenizer

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader, WeightedRandomSampler, Subset, RandomSampler, random_split

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

from tqdm import tqdm
from sklearn import metrics
from tensorboardX import SummaryWriter



In [0]:
# Raise the csv module's maximum field size to sys.maxsize so that very long
# text fields can be read by the csv readers used further down.
csv.field_size_limit(sys.maxsize)

In [0]:
# Mount Google Drive under ./gdrive and switch the working directory to
# "My Drive"; the relative data paths used later are resolved from there.
from google.colab import drive
drive.mount('gdrive')
%cd 'gdrive/My Drive'

In [0]:
# Print the current working directory (sanity check after the %cd above).
!pwd

In [0]:
# Parse the character-level GloVe file once: column 0 is the character, the
# remaining columns are its embedding vector.
_char_glove_table = pd.read_csv(filepath_or_buffer='glove.840B.300d-char.txt', header=None, sep=" ",
                                quoting=csv.QUOTE_NONE)

# Characters in file order; a character's position here is its index before the
# +1 shift applied for the padding/unknown row below.
char_dict = list(_char_glove_table.iloc[:, 0].values)

# NOTE(review): this global is named `dict` and therefore shadows the builtin for
# the rest of the notebook. The name is kept because later cells may read it.
dict = _char_glove_table.values[:, 1:]
dict_len, embed_size = dict.shape
dict_len += 1  # one extra row for the all-zero "unknown / padding" vector
unknown_char = np.zeros((1, embed_size))
# np.float64 is what the removed `np.float` alias resolved to, so dtype is unchanged.
dict = torch.from_numpy(np.concatenate([unknown_char, dict], axis=0).astype(np.float64))

dict

In [0]:
def create_vocab(texts_path='texts.pt', word2vec_file='glove.6B.50d',vocab_path=None):
  """Count tokens in saved tokenised texts and save a torchtext vocabulary.

  Args:
    texts_path: file loaded with ``torch.load``; its content is iterated as
      documents -> sentences -> tokens.
    word2vec_file: pretrained-vector name passed to ``vocab.Vocab`` as a
      one-element ``vectors`` list.
    vocab_path: destination handed to ``torch.save``. Required despite the
      ``None`` default.

  Raises:
    ValueError: if ``vocab_path`` is ``None`` (checked before any work is done
      instead of failing inside ``torch.save`` after the vocabulary is built).
  """
  if vocab_path is None:
    raise ValueError('vocab_path must be given: it is where the vocabulary is saved')
  # NOTE: torch.load unpickles the file; only use it on files you created yourself.
  texts = torch.load(texts_path)
  counter = Counter(word for document in texts for sent in document for word in sent)
  vocabulary = vocab.Vocab(counter, vectors=[word2vec_file])
  torch.save(vocabulary, vocab_path)

def create_texts(data_path='train.csv', texts_path=None):
  """Tokenise every row of a CSV file and save the result with ``torch.save``.

  For each CSV row, all columns after the first are joined with spaces,
  lower-cased, split into sentences and each sentence into word tokens. The
  saved object is a list (documents) of lists (sentences) of token lists.

  Args:
    data_path: CSV file to read (fields quoted with ``"``).
    texts_path: destination handed to ``torch.save``. Required despite the
      ``None`` default.

  Raises:
    ValueError: if ``texts_path`` is ``None`` (checked up front rather than
      after the whole file has been tokenised).
  """
  if texts_path is None:
    raise ValueError('texts_path must be given: it is where the tokenised texts are saved')
  nltk.download('punkt')
  # The stop-word corpus is still fetched as before, although stop words are not
  # filtered out here (the filter was already disabled in the original code).
  nltk.download('stopwords')
  texts = []
  # newline='' is what the csv module asks for so quoted embedded newlines parse correctly.
  with open(data_path, newline='') as csv_file:
    reader = csv.reader(csv_file, quotechar='"')
    for line in reader:
      text = ' '.join(line[1:]).lower()
      texts.append([word_tokenize(s) for s in sent_tokenize(text)])
  torch.save(texts,texts_path)

def create_vocabularies():
  """Build the tokenised texts and the vocabulary for the AG-News train and test CSVs.

  For each split the texts file is written first and the vocabulary is then
  built from it with the 'glove.6B.50d' vectors.
  """
  splits = (
      ('ag_news_csv/train.csv', 'ag_news_csv/train_texts.pt', 'ag_news_csv/train_vocab.pt'),
      ('ag_news_csv/test.csv', 'ag_news_csv/test_texts.pt', 'ag_news_csv/test_vocab.pt'),
  )
  for csv_path, texts_file, vocab_file in splits:
    create_texts(csv_path, texts_file)
    create_vocab(texts_file, 'glove.6B.50d', vocab_file)

In [0]:
#create_vocabularies()

In [0]:
class MyDataset(Dataset):
    """CSV-backed text classification dataset, stored in an in-memory SQLite table.

    Each CSV row is expected to hold a 1-based integer label in its first column
    and text in the remaining columns; the text columns are joined with spaces
    and lower-cased. Items are returned as ``(encoded_document, label)`` where
    the document is an ``int64`` array of shape
    ``(max_length_sentences, max_length_word)`` in word mode or
    ``(max_length_sentences, max_length_word, max_length_char)`` in char mode.
    Index 0 always means padding; real indices are shifted up by one.

    NOTE(review): when ``pre_vocab`` is not ``None`` the word indices come from a
    frequency-built ``Vocab``; whether they line up with the embedding table the
    model uses should be confirmed before training in that mode.
    """

    def __init__(
        self,
        data_path,
        use_char_embeddings=False,
        char2vec_file=None,
        word2vec_file='glove.6B.50d',
        max_length_char=20,
        max_length_word=25,
        max_length_sentences=12,
        dataset_name='dataset',
        min_freq=6,
        pre_vocab='vocab.pt'):
        """Load ``data_path`` into SQLite and prepare the char table or word vocabulary.

        Args:
            data_path: CSV file with the label in column 0 and text after it.
            use_char_embeddings: encode words as character indices when truthy,
                otherwise as word indices.
            char2vec_file: character embedding file (first column = character);
                only read in char mode.
            word2vec_file: pretrained vector alias; only used in word mode.
            max_length_char / max_length_word / max_length_sentences: fixed sizes
                every document is padded or truncated to.
            dataset_name: suffix of the SQLite table name.
            min_freq: minimum token frequency when building from ``pre_vocab``.
            pre_vocab: saved vocabulary whose ``freqs`` are reused, or ``None``
                to use the pretrained vectors' own vocabulary.
        """
        super(MyDataset, self).__init__()
        nltk.download('punkt')
        nltk.download('stopwords')
        self.dataset_name = dataset_name
        # Stored because __getitem__ and create_word_vocabulary need them later.
        self.use_char_embeddings = bool(use_char_embeddings)
        self.min_freq = min_freq
        self.max_length_sentences = max_length_sentences
        self.max_length_word = max_length_word
        self.max_length_char = max_length_char
        self.con = sqlite3.Connection(':memory:')
        self.cur = self.con.cursor()
        self.cur.execute('DROP TABLE IF EXISTS "texts_{}"'.format(dataset_name))
        self.cur.execute('CREATE TABLE IF NOT EXISTS "texts_{}" ("id" int, "text" varchar({}), "label" int(2), PRIMARY KEY ("id"));'.format(self.dataset_name, max_length_sentences))
        with open(data_path, newline='') as csv_file:
            reader = csv.reader(csv_file, quotechar='"')
            # Blank CSV lines are skipped so ids stay contiguous from 0.
            rows = (
                (index, ' '.join(line[1:]).lower(), int(line[0]) - 1)
                for index, line in enumerate(line for line in reader if line))
            self.cur.executemany(
                'INSERT INTO "texts_{}" (id, text, label) VALUES (?, ?, ?)'.format(dataset_name), rows)
        self.con.commit()
        self.num_classes = self.cur.execute('SELECT COUNT(DISTINCT label) FROM "texts_{}"'.format(self.dataset_name)).fetchone()[0]
        if self.use_char_embeddings:
            self.create_char_dict(char2vec_file)
        else:
            self.create_word_vocabulary(pre_vocab, word2vec_file)

    def __len__(self):
        """Number of rows stored in the table."""
        return self.cur.execute('SELECT COUNT(*) FROM "texts_{}"'.format(self.dataset_name)).fetchone()[0]

    def get_target_count(self, target):
        """Number of rows whose (0-based) label equals ``target``."""
        return self.cur.execute(
            'SELECT COUNT(*) FROM "texts_{}" WHERE label = ?'.format(self.dataset_name),
            (int(target),)).fetchone()[0]

    def _select_column(self, column, indexes):
        """Return a fresh cursor over ``column`` for all rows or for the given ids.

        A new cursor is used so that iterating the result is not disturbed by
        later queries on ``self.cur``. Rows are 1-tuples and come back in the
        order SQLite chooses, not necessarily the order of ``indexes``.
        """
        table = '"texts_{}"'.format(self.dataset_name)
        if len(indexes):
            # int() both accepts ints / numeric strings / 0-dim tensors and keeps
            # anything non-numeric out of the SQL text.
            id_list = ', '.join(str(int(i)) for i in indexes)
            return self.con.execute('SELECT {} FROM {} WHERE id IN ({})'.format(column, table, id_list))
        return self.con.execute('SELECT {} FROM {}'.format(column, table))

    def get_texts(self, indexes=()):
        """Cursor of ``(text,)`` rows, optionally restricted to the given ids."""
        return self._select_column('text', indexes)

    def get_targets(self, indexes=()):
        """Cursor of ``(label,)`` rows, optionally restricted to the given ids."""
        return self._select_column('label', indexes)

    def __getitem__(self, index):
        """Return ``(encoded_document, label)`` for row ``index``.

        ``index`` may be an int or anything ``int()`` accepts (e.g. a 0-dim
        tensor coming from ``Subset``).

        Raises:
            IndexError: if no row has that id.
        """
        row = self.cur.execute(
            'SELECT text, label FROM "texts_{}" WHERE id = ?'.format(self.dataset_name),
            (int(index),)).fetchone()
        if row is None:
            raise IndexError('index {} is out of range'.format(index))
        text, label = row
        if self.use_char_embeddings:
            return self.create_char_encodes(text, label)
        return self.create_word_encodes(text, label)

    def _pad_and_stack(self, document_encode, pad_word):
        """Pad/truncate to the fixed document size and shift every index by +1.

        ``pad_word`` is the filler for one missing word (``-1`` in word mode, a
        list of ``-1`` in char mode); after the shift padding becomes 0.
        """
        n_words = self.max_length_word
        n_sents = self.max_length_sentences
        padded = [
            sentence[:n_words] + [pad_word] * (n_words - len(sentence))
            for sentence in document_encode[:n_sents]]
        padded.extend([[pad_word] * n_words for _ in range(n_sents - len(padded))])
        return (np.stack(arrays=padded, axis=0) + 1).astype(np.int64)

    def create_char_encodes(self, text, label):
        """Encode ``text`` as character indices per word per sentence."""
        document_encode = [
            [self.generate_char_list(word) for word in word_tokenize(text=sentence)]
            for sentence in sent_tokenize(text=text)]
        pad_word = self.get_char_list_extend(self.max_length_char)
        return self._pad_and_stack(document_encode, pad_word), label

    def create_word_encodes(self, text, label):
        """Encode ``text`` as vocabulary indices; out-of-vocabulary words are dropped."""
        stoi = self.vocabulary.stoi
        document_encode = [
            [stoi[word] for word in word_tokenize(s) if word in stoi]
            for s in sent_tokenize(text)]
        return self._pad_and_stack(document_encode, -1), label

    def generate_char_list(self, word):
        """Indices of the known characters of ``word``, right-padded with -1.

        Only the first ``max_length_char`` characters are considered and unknown
        characters are skipped; the result always has ``max_length_char`` items.
        """
        lookup = self._char_to_index
        char_list = [lookup[c] for c in word[:self.max_length_char] if c in lookup]
        return char_list + self.get_char_list_extend(self.max_length_char - len(char_list))

    def get_char_list_extend(self, length):
        """A list of ``length`` padding markers (-1)."""
        return [-1 for _ in range(length)]

    def _set_char_dict(self, chars):
        """Store the character list and a first-occurrence index lookup for it."""
        self.char_dict = list(chars)
        self._char_to_index = {}
        for position, char in enumerate(self.char_dict):
            self._char_to_index.setdefault(char, position)

    def create_char_dict(self, char2vec_file):
        """Read the characters (first column) of a space-separated embedding file."""
        chars = pd.read_csv(filepath_or_buffer=char2vec_file, header=None, sep=" ",
                            quoting=csv.QUOTE_NONE, usecols=[0]).values
        self._set_char_dict(char[0] for char in chars)

    def create_word_vocabulary(self, pre_vocab, word2vec_file):
        """Set ``self.vocabulary`` from a saved vocabulary or from pretrained vectors."""
        if pre_vocab is not None:
            # NOTE: torch.load unpickles the file; only use it on files you created yourself.
            self.vocabulary = vocab.Vocab(torch.load(pre_vocab).freqs, vectors=[word2vec_file], min_freq=self.min_freq)
        else:
            self.vocabulary = vocab.pretrained_aliases[word2vec_file]()

    def __del__(self):
        # The attributes may be missing if __init__ failed part-way.
        for handle in (getattr(self, 'cur', None), getattr(self, 'con', None)):
            if handle is not None:
                try:
                    handle.close()
                except sqlite3.Error:
                    pass

In [0]:
class HierAttNet(nn.Module):
    """Hierarchical attention classifier: (optional) chars -> words -> sentences.

    ``forward`` takes a batch of encoded documents shaped
    ``(batch, sents, words)`` or, with character embeddings,
    ``(batch, sents, words, chars)`` and returns the output of the sentence
    attention network. The recurrent hidden states are kept on the instance and
    carried from sentence to sentence; callers reset them with
    ``_init_hidden_state`` before each batch.
    """

    def __init__(
        self,
        use_char_embeddings,
        char_hidden_size,
        word_hidden_size,
        sent_hidden_size,
        batch_size,
        num_classes,
        pretrained_char2vec_file,
        pretrained_word2vec_file,
        max_char_length,
        max_word_length,
        max_sent_length):
        super(HierAttNet, self).__init__()
        self.use_char_embeddings = use_char_embeddings
        self.char_hidden_size = char_hidden_size
        self.word_hidden_size = word_hidden_size
        self.sent_hidden_size = sent_hidden_size
        self.batch_size = batch_size
        self.max_sent_length = max_sent_length
        self.max_word_length = max_word_length
        self.max_char_length = max_char_length
        if use_char_embeddings == True:
            self.char_net = CharNet(pretrained_char2vec_file, char_hidden_size)
        self.word_att_net = WordAttNet(use_char_embeddings, pretrained_word2vec_file, word_hidden_size, char_hidden_size)
        self.sent_att_net = SentAttNet(sent_hidden_size, word_hidden_size, num_classes)
        self._init_hidden_state()

    def _module_device(self):
        """Device of this module's parameters (CPU if it has none)."""
        first_param = next(self.parameters(), None)
        return first_param.device if first_param is not None else torch.device('cpu')

    def _init_hidden_state(self, last_batch_size=None):
        """Reset all recurrent hidden states to zeros.

        Args:
            last_batch_size: batch size to use instead of ``self.batch_size``
                (e.g. for a final, smaller batch). Falsy values mean "default".

        The states are created on the device the module's parameters live on,
        so a CPU model keeps working on a machine where CUDA is available.
        """
        batch_size = last_batch_size if last_batch_size else self.batch_size
        device = self._module_device()
        if self.use_char_embeddings:
            self.char_hidden_state_h0 = torch.zeros(2, batch_size, self.char_hidden_size, device=device)
            self.char_hidden_state_c0 = torch.zeros(2, batch_size, self.char_hidden_size, device=device)
        self.word_hidden_state = torch.zeros(2, batch_size, self.word_hidden_size, device=device)
        self.sent_hidden_state = torch.zeros(2, batch_size, self.sent_hidden_size, device=device)

    def _hidden_state_to(self, device):
        """Move the stored hidden states to ``device`` (no-op when already there)."""
        if self.use_char_embeddings:
            self.char_hidden_state_h0 = self.char_hidden_state_h0.to(device)
            self.char_hidden_state_c0 = self.char_hidden_state_c0.to(device)
        self.word_hidden_state = self.word_hidden_state.to(device)
        self.sent_hidden_state = self.sent_hidden_state.to(device)

    def forward(self, input):
        """Run the hierarchy over one batch and return the sentence network's output."""
        # Covers states that were created before the module was moved to another device.
        self._hidden_state_to(input.device)
        output_list = []
        # input.shape: batch, sents, words, (chars if use_char_embeddings)
        if self.use_char_embeddings == True:
            input = input.permute(1, 0, 2, 3) # sents, batch, words, chars
        else:
            input = input.permute(1, 0, 2) # sents, batch, words
        for i in input:
            if self.use_char_embeddings == True:
                ip = i.permute(1, 2, 0) # words, chars, batch
                word_features = []
                for iip in ip:
                    output, (self.char_hidden_state_h0, self.char_hidden_state_c0) = self.char_net(iip, (self.char_hidden_state_h0, self.char_hidden_state_c0))
                    # Keep only the output at the last character position as the word's feature.
                    word_features.append(output[-1][None, :, :])
                ip = torch.cat(word_features, 0) # words, batch, 2 * char_hidden_size
            else:
                ip = i.permute(1, 0) # words, batch
            output, self.word_hidden_state = self.word_att_net(ip, self.word_hidden_state)
            output_list.append(output)
        output = torch.cat(output_list, 0)
        output, self.sent_hidden_state = self.sent_att_net(output, self.sent_hidden_state)

        return output

In [0]:
class SentAttNet(nn.Module):
    """Sentence-level bidirectional GRU with attention, followed by a linear classifier.

    ``forward`` consumes the per-sentence vectors produced by the word level,
    shaped ``(sents, batch, 2 * word_hidden_size)``, and returns
    ``(logits, gru_hidden_state)`` with logits shaped ``(batch, num_classes)``.
    """

    def __init__(self, sent_hidden_size=50, word_hidden_size=50, num_classes=14):
        super(SentAttNet, self).__init__()

        self.sent_weight = nn.Parameter(torch.Tensor(2 * sent_hidden_size, 2 * sent_hidden_size))
        self.sent_bias = nn.Parameter(torch.Tensor(1, 2 * sent_hidden_size))
        self.context_weight = nn.Parameter(torch.Tensor(2 * sent_hidden_size, 1))

        self.gru = nn.GRU(2 * word_hidden_size, sent_hidden_size, bidirectional=True)
        self.fc = nn.Linear(2 * sent_hidden_size, num_classes)
        self._create_weights(mean=0.0, std=0.05)

    def _create_weights(self, mean=0.0, std=0.05):
        """Initialise the attention parameters (weights ~ N(mean, std), bias = 0)."""
        self.sent_weight.data.normal_(mean, std)
        self.context_weight.data.normal_(mean, std)
        # torch.Tensor(...) only allocates memory; without this the bias would
        # start from whatever bytes happened to be there.
        self.sent_bias.data.zero_()

    def forward(self, input, hidden_state):
        f_output, h_output = self.gru(input, hidden_state)
        output = matrix_mul(f_output, self.sent_weight, self.sent_bias)
        output = matrix_mul(output, self.context_weight).permute(1, 0)  # batch, sents
        # Attention weights must sum to 1 over the sentences of each document,
        # i.e. over dim 1 here; dim 0 would normalise across the batch instead.
        output = F.softmax(output, dim=1)
        output = element_wise_mul(f_output, output.permute(1, 0)).squeeze(0)
        output = self.fc(output)

        return output, h_output

In [0]:
class WordAttNet(nn.Module):
    """Word-level bidirectional GRU with attention.

    In word mode the input is a ``(words, batch)`` tensor of indices that is
    embedded with frozen pretrained vectors (row 0 is an all-zero vector for
    index 0). In char mode the input is already a float feature tensor of shape
    ``(words, batch, 2 * char_hidden_size)`` and no embedding lookup exists.
    ``forward`` returns ``(sentence_vector, gru_hidden_state)`` where the
    sentence vector has shape ``(1, batch, 2 * hidden_size)``.
    """

    def __init__(self, use_char_embeddings, word2vec_file, hidden_size=50, char_hidden_size=50):
        super(WordAttNet, self).__init__()
        self.word_weight = nn.Parameter(torch.Tensor(2 * hidden_size, 2 * hidden_size))
        self.word_bias = nn.Parameter(torch.Tensor(1, 2 * hidden_size))
        self.context_weight = nn.Parameter(torch.Tensor(2 * hidden_size, 1))

        self.use_char_embeddings = use_char_embeddings
        if use_char_embeddings == True:
            embed_size = 2 * char_hidden_size
        else:
            pretrained = vocab.pretrained_aliases[word2vec_file]()
            embed_size = pretrained.dim
            # Prepend the all-zero row used for index 0; float64 matches the
            # dtype the original np.float conversion produced.
            unknown_word = torch.zeros(1, embed_size, dtype=torch.float64)
            table = torch.cat([unknown_word, pretrained.vectors.double()], dim=0)
            self.lookup = nn.Embedding.from_pretrained(table)

        self.gru = nn.GRU(embed_size, hidden_size, bidirectional=True)
        self._create_weights(mean=0.0, std=0.05)

    def _create_weights(self, mean=0.0, std=0.05):
        """Initialise the attention parameters (weights ~ N(mean, std), bias = 0)."""
        self.word_weight.data.normal_(mean, std)
        self.context_weight.data.normal_(mean, std)
        # torch.Tensor(...) only allocates memory; give the bias a defined start value.
        self.word_bias.data.zero_()

    def forward(self, input, hidden_state):
        if self.use_char_embeddings:
            # Char mode: the input already holds per-word feature vectors.
            output = input
        else:
            output = self.lookup(input)
        f_output, h_output = self.gru(output.float(), hidden_state)  # feature output and hidden state output
        output = matrix_mul(f_output, self.word_weight, self.word_bias)
        output = matrix_mul(output, self.context_weight).permute(1,0)  # batch, words
        # Normalise over the words of each sentence (dim 1), not over the batch.
        output = F.softmax(output, dim=1)
        output = element_wise_mul(f_output,output.permute(1,0))

        return output, h_output

In [0]:
class CharNet(nn.Module):
    """Character-level bidirectional LSTM over frozen pretrained character vectors.

    The embedding table is the content of ``char2vec_path`` with one all-zero
    row prepended, so index 0 maps to zeros and character ``k`` of the file maps
    to row ``k + 1``.
    """

    def __init__(self, char2vec_path, hidden_size=50):
        super(CharNet, self).__init__()
        vectors = self.create_char_dict(char2vec_path)
        embed_size = vectors.shape[1]
        unknown_char = np.zeros((1, embed_size))
        # np.float64 is what the removed `np.float` alias meant, so the table dtype is unchanged.
        table = torch.from_numpy(np.concatenate([unknown_char, vectors], axis=0).astype(np.float64))
        # from_pretrained is a classmethod: it builds the (frozen) layer directly.
        self.lookup = nn.Embedding.from_pretrained(table)
        self.lstm = nn.LSTM(embed_size, hidden_size, bidirectional=True)

    def create_char_dict(self, char2vec_path):
        """Return the vector columns (everything but column 0) of the embedding file."""
        table = pd.read_csv(filepath_or_buffer=char2vec_path, header=None, sep=" ", quoting=csv.QUOTE_NONE)
        return table.values[:, 1:]

    def forward(self, input, hidden_state):
        """Embed ``input`` indices and run the LSTM from ``hidden_state``.

        Returns ``(lstm_output, (h_n, c_n))``.
        """
        output = self.lookup(input)
        f_output, hidden_state = self.lstm(output.float(), hidden_state)  # feature output and hidden state output
        return f_output, hidden_state

In [0]:
def get_evaluation(y_true, y_prob, list_metrics):
    """Compute the requested metrics for a batch of predictions.

    Args:
        y_true: true class indices.
        y_prob: per-class scores; the predicted class is the argmax over the
            last axis.
        list_metrics: any of 'accuracy', 'loss', 'confusion_matrix'.

    Returns:
        dict with one entry per requested metric. 'loss' is sklearn's log loss,
        or -1 when sklearn raises ValueError; 'confusion_matrix' is the matrix
        rendered as a string.
    """
    predicted = np.argmax(y_prob, -1)
    results = {}
    if 'accuracy' in list_metrics:
        results['accuracy'] = metrics.accuracy_score(y_true, predicted)
    if 'loss' in list_metrics:
        try:
            log_loss_value = metrics.log_loss(y_true, y_prob)
        except ValueError:
            log_loss_value = -1
        results['loss'] = log_loss_value
    if 'confusion_matrix' in list_metrics:
        matrix = metrics.confusion_matrix(y_true, predicted)
        results['confusion_matrix'] = str(matrix)
    return results

def matrix_mul(input, weight, bias=False):
    """Apply ``tanh(input @ weight [+ bias])`` to every step of a sequence tensor.

    Args:
        input: tensor shaped ``(steps, batch, features)``.
        weight: ``(features, out)`` matrix.
        bias: optional tensor broadcastable to ``(batch, out)`` (e.g. ``(1, out)``);
            anything that is not a tensor (the default ``False``) means no bias.

    Returns:
        ``(steps, batch, out)``; when ``out == 1`` that trailing axis is removed,
        giving ``(steps, batch)``. Only the trailing axis is ever squeezed, so a
        batch or sequence of length 1 keeps its axis.
    """
    projected = torch.matmul(input, weight)
    if isinstance(bias, torch.Tensor):
        projected = projected + bias
    # squeeze(-1) is a no-op unless the last dimension has size 1.
    return torch.tanh(projected).squeeze(-1)

def element_wise_mul(input1, input2):
    """Weighted sum of ``input1`` over its first axis, with weights from ``input2``.

    The two inputs are walked in step along their first axis; each slice of
    ``input2`` gets a trailing axis, is expanded to the matching ``input1``
    slice and multiplied with it. The products are summed over the first axis
    and a leading axis of size 1 is put back on the result.
    """
    weighted_steps = [
        step_features * step_weights.unsqueeze(1).expand_as(step_features)
        for step_features, step_weights in zip(input1, input2)
    ]
    stacked = torch.cat([step.unsqueeze(0) for step in weighted_steps], 0)
    return stacked.sum(0).unsqueeze(0)

def get_max_lengths(data_path):
    """Return the 80th-percentile sentence length and document length of a CSV corpus.

    Every row's columns after the first are lower-cased and concatenated (each
    followed by a space), then split into sentences and words.

    Returns:
        ``(words_per_sentence, sentences_per_document)``: the values found at
        position ``int(0.8 * count)`` of the two sorted length lists.
    """
    words_per_sentence = []
    sentences_per_doc = []
    with open(data_path) as csv_file:
        for row in csv.reader(csv_file, quotechar='"'):
            text = "".join(field.lower() + " " for field in row[1:])
            sentences = sent_tokenize(text)
            sentences_per_doc.append(len(sentences))
            words_per_sentence.extend(len(word_tokenize(sentence)) for sentence in sentences)

    words_per_sentence.sort()
    sentences_per_doc.sort()
    word_cut = int(0.8*len(words_per_sentence))
    sent_cut = int(0.8*len(sentences_per_doc))
    return words_per_sentence[word_cut], sentences_per_doc[sent_cut]

In [0]:
def get_args(argv=None):
    """Parse the options used by ``test``.

    Args:
        argv: list of argument strings to parse. ``None`` (the default) parses
            ``sys.argv`` as before; pass ``[]`` inside a notebook to get the
            defaults without tripping over the kernel's own arguments.

    Returns:
        argparse.Namespace with the parsed options.
    """
    def _str2bool(value):
        # `type=bool` would turn any non-empty string, including "False", into True.
        if isinstance(value, bool):
            return value
        lowered = value.strip().lower()
        if lowered in ('1', 'true', 't', 'yes', 'y'):
            return True
        if lowered in ('', '0', 'false', 'f', 'no', 'n'):
            return False
        raise argparse.ArgumentTypeError('expected a boolean, got {!r}'.format(value))

    # The text is the parser's description; passed positionally it would become `prog`.
    parser = argparse.ArgumentParser(
        description="""Implementation of the model described in the paper: Hierarchical Attention Networks for Document Classification""")
    parser.add_argument("--batch_size", type=int, default=64)
    parser.add_argument("--use_char_embeddings", type=_str2bool, default=False)
    parser.add_argument("--data_path", type=str, default="test.csv")
    parser.add_argument("--pre_trained_model", type=str, default="trained_models/whole_model_han")
    parser.add_argument("--char2vec_file", type=str, default="glove.6B.50d-char.txt")
    parser.add_argument("--word2vec_file", type=str, default='glove.6B.50d')
    parser.add_argument("--output", type=str, default="predictions")
    parser.add_argument("--test_subset_len", type=int, default=200)
    parser.add_argument("--min_freq", type=int, default=6)
    parser.add_argument("--pre_vocab", type=str, default="test_vocab.pt")
    args = parser.parse_args(argv)
    return args

def test(opt):
    """Evaluate a saved model on ``opt.data_path`` and write its predictions to CSV.

    The output directory ``opt.output`` is deleted and recreated, the whole
    pickled model is loaded from ``opt.pre_trained_model``, and every full batch
    of the dataset is scored (``drop_last`` is on, so a trailing partial batch
    is not evaluated). ``predictions.csv`` receives one row per scored sample
    with 1-based true and predicted labels and the text; loss, accuracy and the
    confusion matrix are printed.

    Raises:
        ValueError: if the dataset does not yield a single full batch.
    """
    test_params = {"batch_size": opt.batch_size,
                   "shuffle": False,
                   "drop_last": True}
    if os.path.isdir(opt.output):
        shutil.rmtree(opt.output)
    os.makedirs(opt.output)
    # NOTE: torch.load unpickles the file; only load models you trust.
    if torch.cuda.is_available():
        model = torch.load(opt.pre_trained_model)
    else:
        model = torch.load(opt.pre_trained_model, map_location=lambda storage, loc: storage)

    test_set = MyDataset(
        opt.data_path,
        opt.use_char_embeddings,
        opt.char2vec_file,
        opt.word2vec_file,
        model.max_char_length,
        model.max_word_length,
        model.max_sent_length,
        dataset_name='dataset',
        min_freq=opt.min_freq,
        pre_vocab=opt.pre_vocab)

    test_generator = DataLoader(test_set, **test_params)

    if torch.cuda.is_available():
        model.cuda()
    model.eval()

    te_label_ls = []
    te_pred_ls = []
    for te_feature, te_label in test_generator:
        num_sample = len(te_label)
        if torch.cuda.is_available():
            te_feature = te_feature.cuda()
        with torch.no_grad():
            model._init_hidden_state(num_sample)
            # Probabilities over the class axis of the (batch, classes) output.
            te_predictions = F.softmax(model(te_feature), dim=1)
        te_label_ls.extend(te_label.tolist())
        te_pred_ls.append(te_predictions.cpu())
    if not te_pred_ls:
        raise ValueError(
            'no full batch could be built from {}: lower batch_size ({})'.format(opt.data_path, opt.batch_size))
    te_pred = torch.cat(te_pred_ls, 0).numpy()
    te_label = np.array(te_label_ls)

    fieldnames = ['True label', 'Predicted label', 'Content']
    with open(os.path.join(opt.output, "predictions.csv"), 'w', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames, quoting=csv.QUOTE_NONNUMERIC)
        writer.writeheader()
        # get_texts() yields 1-tuples; zip stops at the number of scored samples.
        for true_label, probabilities, text_row in zip(te_label, te_pred, test_set.get_texts()):
            writer.writerow(
                {'True label': int(true_label) + 1,
                 'Predicted label': int(np.argmax(probabilities)) + 1,
                 'Content': text_row[0]})

    test_metrics = get_evaluation(te_label, te_pred,
                                  list_metrics=["accuracy", "loss", "confusion_matrix"])
    print("Prediction:\nLoss: {} Accuracy: {} \nConfusion matrix: \n{}".format(test_metrics["loss"],
                                                                               test_metrics["accuracy"],
                                                                               test_metrics["confusion_matrix"]))

In [0]:
def get_args(argv=None):
    """Parse the options used by ``train``.

    Args:
        argv: list of argument strings to parse. ``None`` (the default) parses
            ``sys.argv`` as before; pass ``[]`` inside a notebook to get the
            defaults without tripping over the kernel's own arguments.

    Returns:
        argparse.Namespace with the parsed options.
    """
    def _str2bool(value):
        # `type=bool` would turn any non-empty string, including "False", into True.
        if isinstance(value, bool):
            return value
        lowered = value.strip().lower()
        if lowered in ('1', 'true', 't', 'yes', 'y'):
            return True
        if lowered in ('', '0', 'false', 'f', 'no', 'n'):
            return False
        raise argparse.ArgumentTypeError('expected a boolean, got {!r}'.format(value))

    # The text is the parser's description; passed positionally it would become `prog`.
    parser = argparse.ArgumentParser(
        description="""Implementation of the model described in the paper: Hierarchical Attention Networks for Document Classification""")
    parser.add_argument("--batch_size", type=int, default=128)
    parser.add_argument("--num_epoches", type=int, default=1)
    parser.add_argument("--lr", type=float, default=0.1)
    parser.add_argument("--momentum", type=float, default=0.9)
    parser.add_argument("--use_char_embeddings", type=_str2bool, default=False)
    parser.add_argument("--char_hidden_size", type=int, default=50)
    parser.add_argument("--word_hidden_size", type=int, default=50)
    parser.add_argument("--sent_hidden_size", type=int, default=50)
    parser.add_argument("--es_min_delta", type=float, default=0.0,
                        help="Early stopping's parameter: minimum change loss to qualify as an improvement")
    parser.add_argument("--es_patience", type=int, default=100,
                        help="Early stopping's parameter: number of epochs with no improvement after which training will be stopped. Set to 0 to disable this technique.")
    parser.add_argument("--train_set", type=str, default="train.csv")
    # argparse %-formats help strings, so a literal percent sign must be written as %%.
    parser.add_argument("--training_subset_percentage", type=float, default=0.2,
                        help="%% of the total training data to be used by the model. Bear in mind the that this is related to the number of batch you can generate.")
    parser.add_argument("--val_subset_percentage", type=float, default=0.2,
                        help="%% of the total val and validation data to be used by the model. Bear in mind the that this is related to the number of batch you can generate.")
    parser.add_argument("--val_interval", type=int, default=1, help="Number of epoches between validation phases")
    parser.add_argument("--char2vec_file", type=str, default="glove.6B.50d-char.txt")
    parser.add_argument("--word2vec_file", type=str, default="glove.6B.50d")
    parser.add_argument("--log_path", type=str, default="gdrive/My Drive/HAN_results/ag_news/tensorboard/han_voc")
    parser.add_argument("--saved_path", type=str, default="gdrive/My Drive/HAN_results/ag_news/trained_models")
    parser.add_argument("--pre_trained_model", type=str, default=None)
    parser.add_argument("--max_length_char", type=int,default=20)
    parser.add_argument("--max_length_word", type=int, default=25)
    parser.add_argument("--max_length_sentences", type=int,default=12)
    parser.add_argument("--min_freq", type=int, default=6)
    # A path, not a number: with type=int the string default itself failed to parse.
    parser.add_argument("--pre_vocab", type=str, default='train_vocab.pt')
    args = parser.parse_args(argv)
    return args

def _run_validation(model, criterion, val_generator):
    """Score every batch of ``val_generator`` without gradients.

    Returns ``(mean_loss, predictions, labels)``: the sample-weighted mean of
    the criterion over the samples actually evaluated, the concatenated model
    outputs (CPU tensor) and the labels (numpy array).
    """
    total_loss = 0.0
    total_samples = 0
    label_ls = []
    pred_ls = []
    for te_feature, te_label in val_generator:
        num_sample = len(te_label)
        if torch.cuda.is_available():
            te_feature = te_feature.cuda()
            te_label = te_label.cuda()
        with torch.no_grad():
            model._init_hidden_state(num_sample)
            te_predictions = model(te_feature)
            te_loss = criterion(te_predictions, te_label)
        total_loss += te_loss.item() * num_sample
        total_samples += num_sample
        label_ls.extend(te_label.cpu().tolist())
        pred_ls.append(te_predictions.cpu())
    if total_samples == 0:
        raise ValueError('the validation set does not fill a single batch; lower batch_size')
    return total_loss / total_samples, torch.cat(pred_ls, 0), np.array(label_ls)


def train(opt):
    """Train (or continue training) the hierarchical attention model.

    A random ``opt.training_subset_percentage`` share of ``opt.train_set`` is
    drawn and split 80/20 into training and validation parts. The model is
    trained with SGD and cross-entropy; every ``opt.val_interval`` epochs it is
    validated, the result is appended to ``logs.txt`` in ``opt.saved_path`` and
    to TensorBoard under ``opt.log_path``, and the whole model is saved as
    ``whole_model_han`` whenever the validation loss improves by more than
    ``opt.es_min_delta``. Training stops early after ``opt.es_patience`` epochs
    without improvement (0 disables this).

    ``opt.val_subset_percentage`` is not used by this function.
    """
    print('Starting...')
    random.seed(123)
    # Seed the CPU generator unconditionally: random_split draws from it even
    # when CUDA is available.
    torch.manual_seed(123)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(123)
    print('Seed stablished')
    os.makedirs(opt.saved_path, exist_ok=True)
    output_file = open(opt.saved_path + os.sep + "logs.txt", "w+")
    writer = None
    try:
        print('Output file created')
        output_file.write("Model's parameters: {}".format(vars(opt)))

        print('Creating datasets')
        training_set = MyDataset(
            opt.train_set,
            opt.use_char_embeddings,
            opt.char2vec_file,
            opt.word2vec_file,
            opt.max_length_char,
            opt.max_length_word,
            opt.max_length_sentences,
            dataset_name='dataset',
            min_freq=opt.min_freq,
            pre_vocab=opt.pre_vocab)
        num_classes = training_set.num_classes
        print(num_classes, 'Num classes')
        # Plain Python ints are passed as indices so the dataset never receives tensors.
        subset_size = int(opt.training_subset_percentage * len(training_set))
        training_set = Subset(training_set, random.sample(range(len(training_set)), subset_size))
        tr_cant = int(len(training_set) * 0.8)
        training_set, val_set = random_split(training_set, [tr_cant, len(training_set) - tr_cant])

        training_params = {"batch_size": opt.batch_size,
                           "drop_last": True}
        val_params = {"batch_size": opt.batch_size,
                      "drop_last": True}

        training_generator = DataLoader(training_set, **training_params)
        print(training_generator.dataset)
        print('Training generator created')
        val_generator = DataLoader(val_set, **val_params)
        print('Validation generator created')

        if opt.pre_trained_model is None:
            model = HierAttNet(
                opt.use_char_embeddings,
                opt.char_hidden_size,
                opt.word_hidden_size,
                opt.sent_hidden_size,
                opt.batch_size,
                num_classes,
                opt.char2vec_file,
                opt.word2vec_file,
                opt.max_length_char,
                opt.max_length_word,
                opt.max_length_sentences)
        else:
            # NOTE: torch.load unpickles the file; only load models you trust.
            model = torch.load(opt.pre_trained_model)

        writer = SummaryWriter(opt.log_path)

        if torch.cuda.is_available():
            model.cuda()

        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=opt.lr, momentum=opt.momentum)
        best_loss = 1e5
        best_epoch = 0
        model.train()
        num_iter_per_epoch = len(training_generator)
        for epoch in range(opt.num_epoches):
            print(epoch, 'Epoch')
            for step, (feature, label) in enumerate(training_generator):
                if torch.cuda.is_available():
                    feature = feature.cuda()
                    label = label.cuda()
                optimizer.zero_grad()
                # Size the hidden state from the batch itself so a loaded model
                # with a different stored batch_size still works.
                model._init_hidden_state(len(label))
                predictions = model(feature)
                loss = criterion(predictions, label)
                loss.backward()
                optimizer.step()
                loss_value = loss.item()
                training_metrics = get_evaluation(label.cpu().numpy(), predictions.cpu().detach().numpy(), list_metrics=["accuracy"])
                print("Epoch: {}/{}, Iteration: {}/{}, Lr: {}, Loss: {}, Accuracy: {}".format(
                    epoch + 1,
                    opt.num_epoches,
                    step + 1,
                    num_iter_per_epoch,
                    optimizer.param_groups[0]['lr'],
                    loss_value, training_metrics["accuracy"]))
                writer.add_scalar('Train/Loss', loss_value, epoch * num_iter_per_epoch + step)
                writer.add_scalar('Train/Accuracy', training_metrics["accuracy"], epoch * num_iter_per_epoch + step)
            if epoch % opt.val_interval == 0:
                model.eval()
                # The mean is taken over the samples actually scored; drop_last
                # discards the tail, so len(val_set) would be the wrong divisor.
                te_loss, te_pred, te_label = _run_validation(model, criterion, val_generator)
                val_metrics = get_evaluation(te_label, te_pred.numpy(), list_metrics=["accuracy", "confusion_matrix"])
                output_file.write(
                    "Epoch: {}/{} \nVal loss: {} Val accuracy: {} \nVal confusion matrix: \n{}\n\n".format(
                        epoch + 1, opt.num_epoches,
                        te_loss,
                        val_metrics["accuracy"],
                        val_metrics["confusion_matrix"]))
                print(val_metrics["confusion_matrix"], 'test confusion')
                print("Epoch: {}/{}, Lr: {}, Loss: {}, Accuracy: {}".format(
                    epoch + 1,
                    opt.num_epoches,
                    optimizer.param_groups[0]['lr'],
                    te_loss, val_metrics["accuracy"]))
                writer.add_scalar('Val/Loss', te_loss, epoch)
                writer.add_scalar('Val/Accuracy', val_metrics["accuracy"], epoch)
                model.train()
                if te_loss + opt.es_min_delta < best_loss:
                    best_loss = te_loss
                    best_epoch = epoch
                    torch.save(model, opt.saved_path + os.sep + "whole_model_han")

                # Early stopping
                if epoch - best_epoch > opt.es_patience > 0:
                    print("Stop training at epoch {}. The lowest loss achieved is {}".format(epoch, best_loss))
                    break
    finally:
        output_file.close()
        if writer is not None:
            writer.close()

In [0]:
# Look up the factory registered under the 'glove.6B.50d' alias and call it.
# The call is the cell's last expression, so its return value is displayed.
# NOTE(review): presumably this downloads/caches the vectors on first use -
# confirm against the installed torchtext version.
import torchtext.vocab as vocab

glove_6b_50d_factory = vocab.pretrained_aliases['glove.6B.50d']
glove_6b_50d_factory()

In [0]:
# Build the training dataset up front: the hyperparameter cell further down
# reads len(training_set_cat) to size the test subset.
train_path, train_pre_vocab = 'ag_news_csv/train.csv', 'ag_news_csv/train_vocab.pt'
word2vec_file = 'glove.6B.50d'

# Truncation/padding limits handed to MyDataset.
max_length_word, max_length_sentences = 25, 12

training_set_cat = MyDataset(
    data_path=train_path,
    word2vec_file=word2vec_file,
    max_length_sentences=max_length_sentences,
    max_length_word=max_length_word,
    dataset_name='dataset',
    min_freq=1,
    pre_vocab=train_pre_vocab,
)

In [0]:
# Hyperparameters
#
# Every value here is consumed by the train(...) / test(...) cells below.

# ---------------------------------------------------------------- training

# Optimisation settings.
batch_size_train = 64
lr = 0.1
momentum = 0.9
train_epochs = 20

# Output locations. Logs and saved models currently share one directory.
# NOTE(review): if train() clears log_path on start-up this would also remove
# previously saved models - confirm against the train() definition.
log_path_train = 'HAN_results/ag_news_1/trained_models'
train_saved_path = 'HAN_results/ag_news_1/trained_models'
pre_trained_model_train = 'HAN_results/ag_news_1/trained_models/whole_model_han'

# Model sizes and sequence limits.
use_char_embeddings = True
char_hidden_size = 50
word_hidden_size = 50
sent_hidden_size = 50
max_length_char = 20
max_length_word = 25
max_length_sentences = 12

# Pre-trained embedding sources.
word2vec_file = 'glove.6B.50d'
char2vec_file = 'glove.840B.300d-char.txt'

# Train/validation split and vocabulary handling.
training_subset_percentage = .9
val_subset_percentage = .1
train_min_freq = 1
train_pre_vocab = None  # 'ag_news_csv/train_vocab.pt'

# ----------------------------------------------------------------- testing

batch_size_test = 64
test_data_path = 'ag_news_csv/test.csv'
pre_trained_model_test = "HAN_results/ag_news_1/trained_models/whole_model_han"
test_min_freq = 1
test_pre_vocab = None  # 'ag_news_csv/test_vocab.pt'
output = "HAN_results/ag_news_1/trained_models/predictions"

# Test subset size is derived from the *training* set length:
# len(train) * training fraction * validation fraction, truncated to an int.
test_subset_len = int(len(training_set_cat) * training_subset_percentage * val_subset_percentage)

print(test_subset_len)

In [0]:
from argparse import Namespace
%cd "gdrive/My Drive"
train(Namespace(
    batch_size=batch_size_train, 
    es_min_delta=0.0, 
    es_patience=5, 
    log_path=log_path_train, 
    lr=lr, 
    momentum=momentum, 
    num_epoches=train_epochs, 
    saved_path=train_saved_path,
    use_char_embeddings=use_char_embeddings,
    char_hidden_size=char_hidden_size,
    word_hidden_size=word_hidden_size, 
    sent_hidden_size=sent_hidden_size,
    val_interval=1, 
    train_set=train_path,
    char2vec_file=char2vec_file,
    word2vec_file=word2vec_file, 
    training_subset_percentage=training_subset_percentage, 
    val_subset_percentage=val_subset_percentage,
    pre_trained_model=pre_trained_model_train,
    max_length_char=max_length_char,
    max_length_word=max_length_word,
    max_length_sentences=max_length_sentences, 
    min_freq=train_min_freq,
    pre_vocab=train_pre_vocab))

In [0]:
from argparse import Namespace
%cd "gdrive/My Drive"
test(Namespace(
    batch_size=batch_size_test, 
    data_path=test_data_path,
    pre_trained_model=pre_trained_model_test,
    user_char_embeddings=use_char_embeddings,
    char2vec_file=char2vec_file, 
    word2vec_file=word2vec_file,
    output=output,
    test_subset_len=test_subset_len,
    min_freq=test_min_freq,
    pre_vocab=test_pre_vocab))