In [1]:
import json
from tqdm import tqdm
import spacy
import numpy as np

nlp = spacy.blank("en")

def word_tokenize(sent):
    doc = nlp(sent)
    return [token.text for token in doc]

def convert_idx(text, tokens):
    current = 0
    spans = []
    for token in tokens:
        current = text.find(token, current)
        if current < 0:
            print("Token {} cannot be found".format(token))
            raise Exception()
        spans.append((current, current + len(token)))
        current += len(token)
    return spans

def process_file(filename, data_type, word_counter, char_counter):
    print("Generating {} examples...".format(data_type))
    examples = []
    eval_examples = {}
    total = 0
    unans = 0
    ans =0
    with open(filename, "r") as fh:
        source = json.load(fh)
        for article in tqdm(source["data"]):
            for para in article["paragraphs"]:
                context = para["context"].replace(
                    "''", '" ').replace("``", '" ')
                context_tokens = word_tokenize(context)
                context_chars = [list(token) for token in context_tokens]
                spans = convert_idx(context, context_tokens)
                for token in context_tokens:
                    word_counter[token] += len(para["qas"])
                    for char in token:
                        char_counter[char] += len(para["qas"])
                for qa in para["qas"]:
                    total += 1
                    ques = qa["question"].replace(
                        "''", '" ').replace("``", '" ')
                    ques_tokens = word_tokenize(ques)
                    ques_chars = [list(token) for token in ques_tokens]
                    for token in ques_tokens:
                        word_counter[token] += 1
                        for char in token:
                            char_counter[char] += 1
                    y1s, y2s = [], []
                    answer_texts = []
                    
                    # 2.0 Dataset
                    if 'is_impossible' in qa and qa['is_impossible']==True:
                        y1s.append(-1)
                        y2s.append(-1)
                        unans+=1
                    else:
                        ans+=1
                        for answer in qa["answers"]:
                            answer_text = answer["text"]
                            answer_start = answer['answer_start']
                            answer_end = answer_start + len(answer_text)
                            answer_texts.append(answer_text)
                            answer_span = []
                            for idx, span in enumerate(spans):
                                if not (answer_end <= span[0] or answer_start >= span[1]):
                                    answer_span.append(idx)
                            if len(answer_span)==0:
                                print(answer)
                            y1, y2 = answer_span[0], answer_span[-1]
                            y1s.append(y1)
                            y2s.append(y2)
                    example = {"context_tokens": context_tokens, "context_chars": context_chars,
                               "ques_tokens": ques_tokens,
                               "ques_chars": ques_chars, "y1s": y1s, "y2s": y2s, "id": total}
                    examples.append(example)
                    eval_examples[str(total)] = {"question":ques,
                        "context": context, "spans": spans, "answers": answer_texts, "uuid": qa["id"]}
        print("{} questions in total".format(len(examples)))
        print('answerable:',ans,'unanswerable:',unans)
    return examples, eval_examples

def get_embedding(counter, data_type, limit=-1, emb_file=None, size=None, vec_size=None):
    print("Generating {} embedding...".format(data_type))
    embedding_dict = {}
    filtered_elements = [k for k, v in counter.items() if v > limit]
    if emb_file is not None:
        assert size is not None
        assert vec_size is not None
        with open(emb_file, "r", encoding="utf-8") as fh:
            for line in tqdm(fh, total=size):
                array = line.split()
                word = "".join(array[0:-vec_size])
                vector = list(map(float, array[-vec_size:]))
                if word in counter and counter[word] > limit:
                    embedding_dict[word] = vector
        print("{} / {} tokens have corresponding {} embedding vector".format(
            len(embedding_dict), len(filtered_elements), data_type))
    else:
        assert vec_size is not None
        for token in filtered_elements:
            embedding_dict[token] = [np.random.normal(
                scale=0.1) for _ in range(vec_size)]
        print("{} tokens have corresponding embedding vector".format(
            len(filtered_elements)))

    NULL = "--NULL--"
    OOV = "--OOV--"
    token2idx_dict = {token: idx for idx,
                      token in enumerate(embedding_dict.keys(), 2)}
    token2idx_dict[NULL] = 0
    token2idx_dict[OOV] = 1
    embedding_dict[NULL] = [0. for _ in range(vec_size)]
    embedding_dict[OOV] = [0. for _ in range(vec_size)] # np.random.random((vec_size))/2-0.25
    idx2emb_dict = {idx: embedding_dict[token]
                    for token, idx in token2idx_dict.items()}
    emb_mat = [idx2emb_dict[idx] for idx in range(len(idx2emb_dict))]
    return emb_mat, token2idx_dict, idx2emb_dict

In [2]:
from collections import Counter
import numpy as np
word_counter, char_counter = Counter(), Counter()

# # 2.0 Dataset
# test_examples, test_eval = process_file('original_data/dev-v2.0.json', "test", word_counter, char_counter)
# train_examples, train_eval = process_file('original_data/train-v2.0.json', "train", word_counter, char_counter)

# 1.0 Dataset
train_examples, train_eval = process_file('../../fwei/data/squad/train-v1.1.json', "train", word_counter, char_counter)
# dev_examples, dev_eval = process_file('../../fwei/data/squad/dev-v1.2.json', "dev", word_counter, char_counter)
test_examples, test_eval = process_file('../../fwei/data/squad/dev-v1.1.json', "test", word_counter, char_counter)


Generating train examples...


100%|██████████| 442/442 [01:24<00:00,  5.24it/s]
  0%|          | 0/48 [00:00<?, ?it/s]

87599 questions in total
answerable: 87599 unanswerable: 0
Generating test examples...


100%|██████████| 48/48 [00:10<00:00,  4.42it/s]

10570 questions in total
answerable: 10570 unanswerable: 0





In [3]:
# save train_eval and dev_eval
# # 2.0 Dataset
# with open('dataset/train_eval.json', "w") as fh:
#     json.dump(train_eval, fh)
# with open('dataset/test_eval.json','w') as fh:
#     json.dump(test_eval,fh)
    
# 1.0 Dataset
with open('dataset1.0/train_eval.json', "w") as fh:
    json.dump(train_eval, fh)
# with open('dataset1.0/dev_eval.json','w') as fh:
#     json.dump(dev_eval,fh)
with open('dataset1.0/test_eval.json','w') as fh:
    json.dump(test_eval,fh)

In [4]:
word_emb_mat, word2idx_dict, _ = get_embedding(
    word_counter, "word", emb_file='../../fwei/data/glove/glove.840B.300d.txt', size=int(2.2e6), vec_size=300)
char_emb_mat, char2idx_dict, _ = get_embedding(
        char_counter, "char", emb_file=None, size=None, vec_size=64)

  0%|          | 1044/2200000 [00:00<03:30, 10429.37it/s]

Generating word embedding...


100%|█████████▉| 2196017/2200000 [03:22<00:00, 10834.49it/s]


91587 / 111136 tokens have corresponding word embedding vector
Generating char embedding...
1425 tokens have corresponding embedding vector


In [5]:
# id2word
id2word_dict={}
for k in word2idx_dict:
    id2word_dict[word2idx_dict[k]]=k
id2word_dict[0]=''

In [6]:
import pickle
word_size=len(word_emb_mat)
char_input_size=len(char_emb_mat)
print(word_size)
print(char_input_size)

# save w2id
with open('dataset1.0/word2id.pkl','wb') as f:
    pickle.dump(word2idx_dict,f)
# save c2id   
with open('dataset1.0/char2id.pkl','wb') as f:
    pickle.dump(char2idx_dict,f)

# save word_mat
word_mat=np.zeros((len(word_emb_mat),len(word_emb_mat[0])))
for i,w in enumerate(word_emb_mat):
    word_mat[i,:]=w
print(word_mat.shape)
# np.save('dataset/word_emb_mat.npy',word_mat)
np.save('dataset1.0/word_emb_mat.npy',word_mat)

# save char_mat
char_mat=np.zeros((len(char_emb_mat),len(char_emb_mat[0])))
for i,w in enumerate(char_emb_mat):
    char_mat[i,:]=w
print(char_mat.shape)
# np.save('dataset/char_emb_mat.npy',char_mat)
np.save('dataset1.0/char_emb_mat.npy',char_mat)

91589
1427
(91589, 300)
(1427, 64)


In [8]:
import pickle
def build_features(config, examples, data_type, out_file, word2idx_dict, char2idx_dict, id2word_dict, \
                   is_test=False):

    para_limit = config['test_para_limit'] if is_test else config['para_limit']
    ques_limit = config['test_ques_limit'] if is_test else config['ques_limit']
    ans_limit = 100 if is_test else config['ans_limit']
    char_limit = config['char_limit']

    def filter_func(example, is_test=False):
        return len(example["context_tokens"]) > para_limit or \
               len(example["ques_tokens"]) > ques_limit or \
               (example["y2s"][0] - example["y1s"][0]) > ans_limit

    print("Processing {} examples...".format(data_type))
    total = 0
    total_ = 0
    meta = {}
    context_idxss=[]
    ques_idxss=[]
    context_char_idxss=[]
    ques_char_idxss=[]
    context_strings=[]
    ques_strings=[]
    y1s=[]
    y2s=[]
    qids=[]
    unans=0
    for example in tqdm(examples):
        total_ += 1

        if filter_func(example, is_test):
            continue

        total += 1
        qids.append(int(example['id']))
        context_idxs = np.zeros([para_limit], dtype=np.int32)
        context_char_idxs = np.zeros([para_limit, char_limit], dtype=np.int32)
        ques_idxs = np.zeros([ques_limit], dtype=np.int32)
        ques_char_idxs = np.zeros([ques_limit, char_limit], dtype=np.int32)
        if config['data_ver']==2:
            y1 = np.zeros([para_limit+1], dtype=np.float32)
            y2 = np.zeros([para_limit+1], dtype=np.float32)
        else:
            y1 = np.zeros([para_limit], dtype=np.float32)
            y2 = np.zeros([para_limit], dtype=np.float32)

        def _get_word(word):
            for each in (word, word.lower(), word.capitalize(), word.upper()):
                if each in word2idx_dict:
                    return word2idx_dict[each]
            return 1

        def _get_char(char):
            if char in char2idx_dict:
                return char2idx_dict[char]
            return 1
        
        cont_temp=[]
        ques_temp=[]
        for i, token in enumerate(example["context_tokens"]):
            context_idxs[i] = _get_word(token)
            cont_temp.append(id2word_dict[context_idxs[i]])
        while len(cont_temp)<para_limit:
            cont_temp.append('')

        for i, token in enumerate(example["ques_tokens"]):
            ques_idxs[i] = _get_word(token)
            ques_temp.append(id2word_dict[ques_idxs[i]])
        while len(ques_temp)<ques_limit:
            ques_temp.append('')

        for i, token in enumerate(example["context_chars"]):
            for j, char in enumerate(token):
                if j == char_limit:
                    break
                context_char_idxs[i, j] = _get_char(char)

        for i, token in enumerate(example["ques_chars"]):
            for j, char in enumerate(token):
                if j == char_limit:
                    break
                ques_char_idxs[i, j] = _get_char(char)

        start, end = example["y1s"][-1], example["y2s"][-1]
        if config['data_ver']==2:    
            if start!=-1 and end!=-1:
                y1[start+1], y2[end+1] = 1.0, 1.0
            else:
                y1[0], y2[0] = 1.0, 1.0
                unans+=1
        else:
            y1[start], y2[end] = 1.0, 1.0
        context_idxss.append(np.expand_dims(context_idxs,axis=0))
        ques_idxss.append(np.expand_dims(ques_idxs,axis=0))
        context_char_idxss.append(np.expand_dims(context_char_idxs,axis=0))
        ques_char_idxss.append(np.expand_dims(ques_char_idxs,axis=0))
        y1s.append(np.expand_dims(y1,axis=0))
        y2s.append(np.expand_dims(y2,axis=0))
        context_strings.append(cont_temp)
        ques_strings.append(ques_temp)
        
    context_idxss=np.concatenate(context_idxss,axis=0)
    ques_idxss=np.concatenate(ques_idxss,axis=0)
    context_char_idxss=np.concatenate(context_char_idxss,axis=0)
    ques_char_idxss=np.concatenate(ques_char_idxss,axis=0)
    y1s=np.concatenate(y1s,axis=0)
    y2s=np.concatenate(y2s,axis=0)
    qids=np.array(qids)
    context_strings=np.array(context_strings)
    ques_strings=np.array(ques_strings)
    
    np.save(out_file+data_type+'_contw_input.npy',context_idxss)
    np.save(out_file+data_type+'_quesw_input.npy',ques_idxss)
    np.save(out_file+data_type+'_contc_input.npy',context_char_idxss)
    np.save(out_file+data_type+'_quesc_input.npy',ques_char_idxss)
    np.save(out_file+data_type+'_y_start.npy',y1s)
    np.save(out_file+data_type+'_y_end.npy',y2s)
    np.save(out_file+data_type+'_qid.npy',qids)
    np.save(out_file+data_type+'_contw_strings.npy',context_strings)
    np.save(out_file+data_type+'_quesw_strings.npy',ques_strings)
    
    print("Built {} / {} instances of features in total".format(total, total_))
    print('unanswerable:',unans)

config={
    'test_para_limit':1000,
    'test_ques_limit':50,
    'para_limit':400,
    'ques_limit':50,
    'ans_limit':30,
    'char_limit':16,
    'data_ver':1
}

# # 2.0 Dataset
# build_features(config, train_examples, 'train', 'dataset/', word2idx_dict, char2idx_dict, id2word_dict, is_test=False)
# build_features(config, test_examples, 'dev', 'dataset/', word2idx_dict, char2idx_dict, id2word_dict, is_test=False)
# build_features(config, test_examples, 'test', 'dataset/', word2idx_dict, char2idx_dict, id2word_dict, is_test=True)

# 1.0 Dataset
build_features(config, train_examples, 'train', 'dataset1.0/', word2idx_dict, char2idx_dict, id2word_dict, is_test=False)
build_features(config, test_examples, 'dev', 'dataset1.0/', word2idx_dict, char2idx_dict, id2word_dict, is_test=False)
build_features(config, test_examples, 'test', 'dataset1.0/', word2idx_dict, char2idx_dict, id2word_dict, is_test=True)

  0%|          | 98/87599 [00:00<01:30, 971.94it/s]

Processing train examples...


100%|██████████| 87599/87599 [01:18<00:00, 1122.55it/s]


Built 87358 / 87599 instances of features in total
unanswerable:

  2%|▏         | 167/10570 [00:00<00:06, 1662.81it/s]

 0
Processing dev examples...


100%|██████████| 10570/10570 [00:09<00:00, 1091.07it/s]
  1%|          | 105/10570 [00:00<00:10, 1044.20it/s]

Built 10482 / 10570 instances of features in total
unanswerable: 0
Processing test examples...


100%|██████████| 10570/10570 [00:11<00:00, 940.46it/s]


Built 10570 / 10570 instances of features in total
unanswerable: 0


In [6]:
# 将string转化为(context, sentence, words)格式并复合上词性tag, 子单元例如('IN','for')
import spacy
import numpy as np
import os

data_type='dev'
cont_string=np.load(os.path.join('dataset',data_type+'_contw_strings.npy'))

In [15]:
import spacy
nlp = spacy.load("en")
print([i.tag_ for i in nlp('cannot')])
# words=cont_string[0,:]
# print(x)
# tags_=[]
# for w in words:
#     wtag=[j.tag_ for j in nlp(str(w))]
#     tags_.append(wtag)
# print(words)
# print(tags_)

['MD', 'RB']


In [10]:
nlp = spacy.load("en")
from tqdm import tqdm

def gettag(cont_string):
    contexts=[]
    wrong_num=0
    for i in tqdm(range(cont_string.shape[0])):#range(cont_string.shape[0])
        sentences=[]
        words=[]
        for j in range(cont_string.shape[1]):
            if cont_string[i,j]=='':
                break

            # 规则矫正：
            # 1.如果只有一个'，去除
            if str(cont_string[i,j]).count('\'')==1 and len(cont_string[i,j])>1:
                cont_string[i,j]=cont_string[i,j].replace('\'','')
                
            # 2.如果如果是cannot，改为not
            if str(cont_string[i,j]).lower()=='cannot':
                cont_string[i,j]='not'
                
            # 3.im，改为I
            if str(cont_string[i,j]).lower()=='im':
                cont_string[i,j]='i'
                
            # # 其余问题过滤（暂时）128K
            # if str(cont_string[i,j])=='128K':
            #     cont_string[i,j]='128'
                
            words.append(cont_string[i,j])
            if words[-1]=='.' or words[-1]=='!' or words[-1]=='?':
                sentence=' '.join(words)
                tags=[n.tag_ for n in nlp(sentence)]
                if len(tags)!=len(words):
                    tags_=[]
                    for w in words:
                        wtag=[j.tag_ for j in nlp(str(w))]
                        if len(wtag)>1:
                            wtag=wtag[0]
                        tags_.extend(wtag)
                    tags=tags_
                    wrong_num+=1
                    assert len(tags)==len(words)
                sentences.append(list(zip(tags,words)))
                words=[]
        if len(words)>0:
            sentence=' '.join(words)
            tags=[n.tag_ for n in nlp(sentence)]
            if len(tags)!=len(words):
                tags_=[]
                for w in words:
                    wtag=[j.tag_ for j in nlp(str(w))]
                    if len(wtag)>1:
                        wtag=wtag[0]
                    tags_.extend(wtag)
                tags=tags_
                wrong_num+=1
                assert len(tags)==len(words)
            sentences.append(list(zip(tags,words)))
            words=[]
        contexts.append(sentences)
    print(wrong_num)
    
    return contexts
# contexts=gettag(cont_string)

split_num=8
temp_len=cont_string.shape[0]//split_num
params=[]
for i in range(split_num):
    if i != split_num-1:
        params.append(cont_string[i*temp_len:(i+1)*temp_len,::])
    else:
        params.append(cont_string[i*temp_len:,::])
    
from multiprocessing import Pool
pool=Pool()
result=[]
for i in params:
    result.append(pool.apply_async(gettag, kwds={'cont_string':i}))
pool.close()
pool.join()
contexts=[]
[contexts.extend(i.get()) for i in result]

Exception ignored in: <object repr() failed>
Traceback (most recent call last):
  File "/ada/anaconda3/lib/python3.6/site-packages/tqdm/_tqdm.py", line 883, in __del__
    self.close()
  File "/ada/anaconda3/lib/python3.6/site-packages/tqdm/_tqdm.py", line 1088, in close
    self._decr_instances(self)
  File "/ada/anaconda3/lib/python3.6/site-packages/tqdm/_tqdm.py", line 439, in _decr_instances
    cls._instances.remove(instance)
  File "/ada/anaconda3/lib/python3.6/_weakrefset.py", line 109, in remove
    self.data.remove(ref(item))
KeyError: <weakref at 0x7f670efd6f48; to 'tqdm' at 0x7f67122fa710>
  1%|          | 13/1466 [00:04<08:17,  2.92it/s]]
 85%|████████▍ | 1244/1466 [02:57<00:31,  7.00it/s][A


0


 91%|█████████ | 1335/1466 [03:01<00:17,  7.36it/s]

['...', 'You', "'ve", '[', 'Western', 'nations', ']', 'increased', 'the', 'price', 'of', 'the', 'wheat', 'you', 'sell', 'us', 'by', '300', 'percent', ',', 'and', 'the', 'same', 'for', 'sugar', 'and', 'cement', '...', 'You', 'buy', 'our', 'crude', 'oil', 'and', 'sell', 'it', 'back', 'to', 'us', ',', 'refined', 'as', 'petrochemicals', ',', 'at', 'a', 'hundred', 'times', 'the', 'price', 'you', "'ve", 'paid', 'us', '...', 'It', "'s", 'only', 'fair', 'that', ',', 'from', 'now', 'on', ',', 'you', 'should', 'pay', 'more', 'for', 'oil', '.']



100%|██████████| 1466/1466 [03:20<00:00,  7.32it/s]


0


100%|██████████| 1466/1466 [03:32<00:00,  6.90it/s]


0


100%|██████████| 1466/1466 [03:32<00:00,  6.89it/s]


0


100%|██████████| 1468/1468 [03:32<00:00,  6.91it/s]


0


100%|██████████| 1466/1466 [03:42<00:00,  6.58it/s]


0


100%|██████████| 1466/1466 [03:45<00:00,  6.49it/s]


0


AssertionError: 

In [7]:
import torch
import parse_nk
torch.cuda.set_device(3)
def torch_load(load_path):
    if parse_nk.use_cuda:
        return torch.load(load_path)
    else:
        return torch.load(load_path, map_location=lambda storage, location: storage)
info = torch_load('parsing/models/en_elmo_dev.95.21.pt')
assert 'hparams' in info['spec'], "Older savefiles not supported"
info['spec']['hparams']['sentence_max_len']=400
print(info['spec']['hparams'])
parser = parse_nk.NKChartParser.from_spec(info['spec'], info['state_dict'])

{'attention_dropout': 0.2, 'char_lstm_input_dropout': 0.2, 'clip_grad_norm': 0.0, 'd_char_emb': 32, 'd_ff': 2048, 'd_kv': 64, 'd_label_hidden': 250, 'd_model': 1024, 'elmo_dropout': 0.5, 'embedding_dropout': 0.0, 'learning_rate': 0.0008, 'learning_rate_warmup_steps': 160, 'max_len_dev': 0, 'max_len_train': 0, 'morpho_emb_dropout': 0.2, 'num_heads': 8, 'num_layers': 4, 'num_layers_position_only': 0, 'partitioned': True, 'relu_dropout': 0.1, 'residual_dropout': 0.2, 'sentence_max_len': 400, 'step_decay': True, 'step_decay_factor': 0.5, 'step_decay_patience': 5, 'tag_emb_dropout': 0.2, 'timing_dropout': 0.0, 'use_chars_concat': False, 'use_chars_lstm': False, 'use_elmo': True, 'use_tags': False, 'use_words': False, 'word_emb_dropout': 0.4}


RuntimeError: While copying the parameter named embedding.position_table, whose dimensions in the model are torch.Size([400, 512]) and whose dimensions in the checkpoint are torch.Size([300, 512]).

In [None]:
import pickle
import numpy as np
with open('parsing/data/dev_tags.pkl','rb') as f:
    tags=pickle.load(f)
    tags=np.array(tags)
tags_temp=tags[64:96]

In [None]:
import numpy as np

def generate_parse_feat(tags_temp):
    batch_size = len(tags_temp)
    
    # stastic the word num in each sample
    sen_len=[sum([len(tt) for tt in t]) for t in tags_temp]
    max_len=max(sen_len)
    
    # combine the sentences to a batch
    tags_temp_new=[]
    for i in range(len(tags_temp)):
        combined_context=[]
        [combined_context.extend(t) for t in tags_temp[i]]
        tags_temp_new.append(combined_context)
    print(len(tags_temp_new[11]))
    # inference the parsing feature
    feat,idxs = parser.parse_batch(tags_temp_new)
    
    # remove the elmo useless token from feat
    inds=[]
    for j in range(len(idxs.batch_idxs_np)):
        if j==0 or j==len(idxs.batch_idxs_np)-1 or \
        idxs.batch_idxs_np[j-1]!=idxs.batch_idxs_np[j] or \
        idxs.batch_idxs_np[j+1]!=idxs.batch_idxs_np[j]:
            continue
        else:
            inds.append(j)
    feat=feat[inds,:]
    
    # convert feat to (batch_size, max_len, 1024)
    assert sum(sen_len)==feat.shape[0]
    feats=np.zeros((batch_size, max_len, 1024))
    cusum=0
    for i,s in enumerate(sen_len):
        feats[i,0:s,:]=feat[cusum:cusum+s,:]
        cusum+=s
    assert cusum==feat.shape[0]
    
    return feats

In [None]:
nlp = spacy.blank("en")

def word_tokenize(sent):
    doc = nlp(sent)
    return [token.text for token in doc]
words = word_tokenize("im a footman.")

print(words)

['I', 'can', 'not', 'shoot', 'you', '.']
