In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
DATA_PATH = "./ppd_data/"
TRAIN_PATH = DATA_PATH + "train.csv"
TEST_PATH = DATA_PATH + "test.csv"
WORD_EMBED_PATH = DATA_PATH + "word_embed.txt"
CHAR_EMBED_PATH = DATA_PATH + "char_embed.txt"
QUEST_PATH = DATA_PATH + "question.csv"

SEED = 2018

In [3]:
train_data = pd.read_csv(TRAIN_PATH)
test_data = pd.read_csv(TEST_PATH)
question_data = pd.read_csv(QUEST_PATH)
word_embedding_data = pd.read_csv(WORD_EMBED_PATH, delimiter=" ", header=None, index_col=0)
char_embedding_data = pd.read_csv(CHAR_EMBED_PATH, delimiter=" ", header=None, index_col=0)

In [4]:
question_data["words"] = question_data["words"].str.split(" ")
question_data["chars"] = question_data["chars"].str.split(" ")

In [5]:
from gensim.corpora import Dictionary

word_dict = Dictionary(question_data["words"])
char_dict = Dictionary(question_data["chars"])

num_words, num_chars = len(word_dict.dfs), len(char_dict.dfs)
num_words, num_chars

(20890, 3048)

In [6]:
def adjust_embedding(embedding, corp_dict):
    index = sorted([k for k, v in corp_dict.token2id.items()], key=lambda x: corp_dict.token2id[x])
    return embedding.reindex(index).values

In [7]:
word_embedding_data = adjust_embedding(word_embedding_data, word_dict)
char_embedding_data = adjust_embedding(char_embedding_data, word_dict)

In [8]:
def pad_embedding(embedding, method="mean"):
    new_vector = np.zeros(embedding.shape[1], dtype=np.float32)
    if method == "mean":
        new_vector = embedding.mean(axis=0)
    return np.concatenate((embedding, new_vector.reshape((1, -1))), axis=0)

In [9]:
print(f"former word embedding matrix shape: {word_embedding_data.shape}")
word_embedding_data = pad_embedding(word_embedding_data, method="zero")
print(f"final word embedding matrix shape: {word_embedding_data.shape}")

print(f"former char embedding matrix shape: {char_embedding_data.shape}")
char_embedding_data = pad_embedding(char_embedding_data, method="zero")
print(f"final char embedding matrix shape: {char_embedding_data.shape}")

former word embedding matrix shape: (20890, 300)
final word embedding matrix shape: (20891, 300)
former char embedding matrix shape: (20890, 300)
final char embedding matrix shape: (20891, 300)


In [22]:
T_STOP = 100000
USE_STOP = True

word_count = sorted([(token, word_dict.dfs[id_]) for token, id_ in word_dict.token2id.items()],
                    key=lambda x: x[1], reverse=True)
char_count = sorted([(token, char_dict.dfs[id_]) for token, id_ in char_dict.token2id.items()],
                    key=lambda x: x[1], reverse=True)

word_stop = [token for token, count in word_count if count >= T_STOP]
char_stop = [token for token, count in char_count if count >= T_STOP]
word_stop, char_stop

(['W17378', 'W19355', 'W16319', 'W18238', 'W18103'],
 ['L0104',
  'L2214',
  'L1861',
  'L2582',
  'L3019',
  'L0143',
  'L2218',
  'L1132',
  'L1128',
  'L0362',
  'L1187'])

In [34]:
def pair2vec(pair, question, use_stop=USE_STOP):
    return ques2vec(pair[["q1"]].rename(columns={"q1": "qid"}), question, use_stop) + \
           ques2vec(pair[["q2"]].rename(columns={"q2": "qid"}), question, use_stop)
    
def ques2vec(ques, question, use_stop=True):
    q = ques.merge(question, how="left", on="qid")
    return seq2vec(q, "words", use_stop), seq2vec(q, "chars", use_stop)
    
def seq2vec(ques, col, use_stop=True):
    seq = ques[col]
    if col == "words":
        d = word_dict.token2id
        s = word_stop
    else:
        d = char_dict.token2id
        s = char_stop
    
    if use_stop:
        return [[d[token] for token in text if token not in s] for text in seq]
    else:
        return [[d[token] for token in text] for text in seq]

In [41]:
train_word1, train_word2, train_char1, train_char2 = pair2vec(train_data, question_data)
test_word1, test_word2, test_char1, test_char2 = pair2vec(test_data, question_data)

In [47]:
T_WORD = 6
T_CHAR = 10

def sample_padding(texts, length, pad_index, cut_method="direct"):
    texts = np.array(texts)
    np.apply_along_axis
    return texts

In [48]:
sample_padding(train_word1, T_WORD, len(word_dict.dfs))

array([list([112, 778, 0, 292]), list([138, 79, 196, 35]),
       list([29, 58, 1617, 119, 2]), ..., list([1, 40, 60, 190, 61]),
       list([321, 986, 43, 72, 448]), list([1021, 758, 1539, 39])],
      dtype=object)