## Section 1

In [18]:
from __future__ import unicode_literals, print_function, division
import pandas as pd
import numpy as np
import json
import os, glob

from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import nltk
from nltk.corpus import stopwords

In [19]:
contraction_map = {"ain't": "is not", "aren't": "are not", "can't": "cannot", "'cause": "because",
                   "could've": "could have", "couldn't": "could not",
                   "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not",
                   "hasn't": "has not", "haven't": "have not",
                   "he'd": "he would", "he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you",
                   "how'll": "how will", "how's": "how is",
                   "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have",
                   "I'm": "I am", "I've": "I have", "i'd": "i would",
                   "i'd've": "i would have", "i'll": "i will", "i'll've": "i will have", "i'm": "i am",
                   "i've": "i have", "isn't": "is not", "it'd": "it would",
                   "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have", "it's": "it is",
                   "let's": "let us", "ma'am": "madam",
                   "mayn't": "may not", "might've": "might have", "mightn't": "might not",
                   "mightn't've": "might not have", "must've": "must have",
                   "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not",
                   "needn't've": "need not have", "o'clock": "of the clock",
                   "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
                   "sha'n't": "shall not", "shan't've": "shall not have",
                   "she'd": "she would", "she'd've": "she would have", "she'll": "she will",
                   "she'll've": "she will have", "she's": "she is",
                   "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have",
                   "so've": "so have", "so's": "so as",
                   "this's": "this is", "that'd": "that would", "that'd've": "that would have", "that's": "that is",
                   "there'd": "there would",
                   "there'd've": "there would have", "there's": "there is", "here's": "here is", "they'd": "they would",
                   "they'd've": "they would have",
                   "they'll": "they will", "they'll've": "they will have", "they're": "they are",
                   "they've": "they have", "to've": "to have",
                   "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will",
                   "we'll've": "we will have", "we're": "we are",
                   "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have",
                   "what're": "what are",
                   "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have",
                   "where'd": "where did", "where's": "where is",
                   "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is",
                   "who've": "who have",
                   "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not",
                   "won't've": "will not have",
                   "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",
                   "y'all": "you all",
                   "y'all'd": "you all would", "y'all'd've": "you all would have", "y'all're": "you all are",
                   "y'all've": "you all have",
                   "you'd": "you would", "you'd've": "you would have", "you'll": "you will",
                   "you'll've": "you will have",
                   "you're": "you are", "you've": "you have"}


In [20]:
files = [
    {
        'name':'16119_webhose_2019_12_db21c91a1ab47385bb13773ed8238c31_0000001.json'
     },
     {
        'name': '16119_webhose_2020_01_db21c91a1ab47385bb13773ed8238c31_0000001.json'
     }
]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
nltk.download('stopwords')
stop_words = stopwords.words('english')

max_len_text = 600
max_len_target = 30
SOS_token = 0
EOS_token = 1

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
def read_records_from_files(files):
    dataset = []
    target = []
    for file in files:
        dat, tar = read_records(file)
        for d, t in zip(dat, tar):
            dataset.append(d)
            target.append(t)

    return dataset, target


def read_records(file):
    dataset = []
    target = []
    with open(f"{file['name']}") as f:
        for line in f:
            record = json.loads(line)
            dataset.append(record['text'])
            target.append(record['title'])

    return dataset, target


def preprocess(text):
    text = text.lower()  # lowercase
    text = text.split()  # convert have'nt -> have not
    for i in range(len(text)):
        word = text[i]
        if word in contraction_map:
            text[i] = contraction_map[word]
    text = " ".join(text)
    text = text.split()
    newtext = []
    for word in text:
        if word not in stop_words:
            newtext.append(word)
    text = " ".join(newtext)
    text = text.replace("'s", '')  # convert your's -> your
    text = re.sub(r'\(.*\)', '', text)  # remove (words)
    text = re.sub(r'[^a-zA-Z0-9. ]', '', text)  # remove punctuations
    text = re.sub(r'\.', ' . ', text)
    return text


def get_temp_df(dataset, target):
    short_text = []
    short_summary = []
    
    for i in range(len(dataset)):
        if len(target[i].split()) <= max_len_target and len(dataset[i].split()) <= max_len_text:
            short_text.append(dataset[i])
            short_summary.append(target[i])
    return pd.DataFrame({'text': short_text, 'summary': short_summary})


SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1


def readData(text, summary):
    print("Reading lines...")

    # Split every line into pairs and normalize
    pairs = [[text[i], summary[i]] for i in range(len(text))]

    input_lang = Lang(text)
    output_lang = Lang(summary)

    return input_lang, output_lang, pairs


def prepareData(lang1, lang2):
    input_lang, output_lang, pairs = readData(lang1, lang2)
    print("Read %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    return input_lang, output_lang, pairs

In [22]:
dataset, target = read_records_from_files(files)

In [23]:
X = [preprocess(word) for word in dataset]
Y = [preprocess(word) for word in target]

In [24]:
temp_df = get_temp_df(dataset, target)
new_df = temp_df[temp_df['summary'].str.strip().astype(bool)]
df = new_df[new_df['text'].str.strip().astype(bool)]
df.head()

Unnamed: 0,text,summary
0,FDA launches app for health care professionals...,FDA launches app for health care professionals...
1,"Of all of Regina Yan ’s many traits, an open m...",C-Suite Awards: Regina Yan
2,The CURE ID app allows clinicians to share and...,FDA Launches Infectious Disease Crowdsourcing ...
3,The DSB is composed of representatives from tw...,Drug Safety Oversight Board
4,The Centre for Health Protection (CHP) of the ...,Suspected MERS case reported


In [25]:
input_lang, output_lang, pairs = prepareData(X, Y)
pairs[:5]

Reading lines...
Read 159 sentence pairs
Counting words...


[['dublin swine healthcare market  growth trends forecast  5 . 2 . 2 coccidiosis 5 . 2 . 3 respiratory diseases 5 . 2 . 4 swine dysentery 5 . 2 . 5 porcine parvovirus 5 . 2 . 6 others 5 . 3 geography 5 . 3 . 1 north america 5 . 3 . 2 europe 5 . 3 . 3 asiapacific 5 . 3 . 4 middle east  africa 5 . 3 . 5 south america 6 competitive landscape 6 . 1 company profiles 6 . 1 . 1 abaxis 6 . 1 . 2 bayer animal health 6 . 1 . 3 boehringer ingelheim 6 . 1 . 4 ceva animal health inc .  6 . 1 . 5 elanco 6 . 1 . 6 idvet 6 . 1 . 7 merck animal health 6 . 1 . 8 merial 6 . 1 . 9 vetoquinol s . a .  6 . 1 . 10 virbac 6 . 1 . 11 zoetis animal healthcare 7 market opportunities future trends information report visit httpswww . researchandmarkets . comrshhuje research markets also offers custom research services providing focused comprehensive tailored research .  contact researchandmarkets . com laura wood senior press manager pressresearchandmarkets . com e . s . t office hours call 19173000470 u . s . can

## Section 2

In [26]:
MAX_LENGTH = max_len_text

def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return input_tensor, target_tensor


In [27]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)


class AttnDecoder(nn.Module):
    def __init__(self, hidden_size, output_size, dropout=0.2, max_length=MAX_LENGTH):
        super(AttnDecoder, self).__init__()

        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout = dropout
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)


In [28]:
def trainIters(encoder, decoder, num_iters, learning_rate=0.05):
    print("Starting training")

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs)) for i in range(num_iters)]
    criterion = nn.NLLLoss()

    losses = []
    for i in range(1, num_iters + 1):
        if i % 1000 == 0:
            print(i, "/", num_iters + 1)
        training_pair = training_pairs[i - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor,
                     encoder, decoder,
                     encoder_optimizer, decoder_optimizer, criterion)
        losses.append(loss)

    print("Ending training")
    return losses

In [29]:
def train(input_tensor, target_tensor,
          encoder, decoder,
          encoder_optimizer, decoder_optimizer, criterion,
          max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    loss = 0

    for i in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[i], encoder_hidden)
        encoder_outputs[i] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden
    teacher_forcing_ratio = 0.5
    random_number = np.round(random.random(), 3)
    if random_number < teacher_forcing_ratio:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input,
                                                                        decoder_hidden,
                                                                        encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing
    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    # TODO: randomly generate a number between 0 and 1. If that number is greater than teacher_forcing_ratio, we will apply teacher forcing technique, otherwise not.
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    return loss.item() / target_length


hidden_size = 300
encoder = Encoder(input_lang.n_words, hidden_size).to(device)
decoder = AttnDecoder(hidden_size, output_lang.n_words, dropout=0.1).to(device)

trainIters(encoder, decoder, 1000)


def infer(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with open("evaluation_input.txt", 'w') as f:
        for pair in pairs:
            output_words, attentions = infer(encoder, decoder, pair[0])
            output_sentence = ' '.join(output_words)
            f.write(pair[1] + "," + output_sentence + "\n")
            # For every pair, write the target pair pair[1] and output_sentence as a tuple to
            # a file.Let’s name this file as evaluation_input.txt

Starting training


TypeError: ignored