# 所有特征

## 问题特征

- 问题出现次数: 1
- 问题单词数量: 2
- 问题字符数量: 2
- 问题Hash值: 2

## 问题对特征

- 问题对重复单词数量: 1
- 问题对重复字符数量: 1

## 图特征

- Clique Size, 与此问题对相互毗邻结点组成的子图中结点的数量: 1
- K-core, 每个点最大的K-core值: 2

In [None]:
import networkx as nx
import numpy as np
import pandas as pd
from itertools import combinations

In [None]:
# Input file locations; every path is DATA_PATH followed by a file name.
DATA_PATH = "./data/"
TRAIN_PATH = f"{DATA_PATH}train.csv"
TEST_PATH = f"{DATA_PATH}test.csv"
WORD_EMBED_PATH = f"{DATA_PATH}word_embed.txt"
CHAR_EMBED_PATH = f"{DATA_PATH}char_embed.txt"
QUEST_PATH = f"{DATA_PATH}question.csv"

In [None]:
# Plain CSV inputs: the train/test pair tables and the question table.
train_data, test_data, question_data = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH, QUEST_PATH)
)

# The embedding files are space-delimited with no header row; the first
# column of each file becomes the index.
word_embedding_data, char_embedding_data = (
    pd.read_csv(embed_path, delimiter=" ", header=None, index_col=0)
    for embed_path in (WORD_EMBED_PATH, CHAR_EMBED_PATH)
)

# Turn the space-separated "words" / "chars" strings into token lists.
words_as_lists = question_data["words"].str.split(" ")
chars_as_lists = question_data["chars"].str.split(" ")
question_data["words"] = words_as_lists
question_data["chars"] = chars_as_lists

# Independent copy of the training labels as a NumPy array.
label = train_data["label"].values.copy()

In [None]:
# Stack the q1 and q2 columns of train and test (train first) into one Series.
total_question = pd.concat(
    [frame[side] for frame in (train_data, test_data) for side in ("q1", "q2")]
)
# One row per distinct question: its id ("qid") and how many times it occurs
# across all pairs ("q_count"), most frequent first.
question_feature = (
    total_question.value_counts()
    .rename_axis("qid")
    .reset_index(name="q_count")
)

In [None]:
# Distinct questions in first-seen order, re-indexed 0..n-1.
unique_question = total_question.drop_duplicates().reset_index(drop=True)
# Map every question id to its position in unique_question.
question_dict = {qid: position for position, qid in enumerate(unique_question)}

In [None]:
from keras.preprocessing.text import Tokenizer

# One tokenizer per granularity, each fitted on the token lists of every question.
word_tokenizer, char_tokenizer = Tokenizer(), Tokenizer()
word_tokenizer.fit_on_texts(question_data["words"])
char_tokenizer.fit_on_texts(question_data["chars"])

In [None]:
# Frequency tables built from the fitted tokenizers, most frequent token first.
# Each table has one row per token: the token itself and its occurrence count.
word_count = pd.DataFrame(
    sorted(word_tokenizer.word_counts.items(), key=lambda item: item[1], reverse=True),
    columns=["word", "word_times"],
)
# The token column was previously misspelled "cahr"; it is now "char".
char_count = pd.DataFrame(
    sorted(char_tokenizer.word_counts.items(), key=lambda item: item[1], reverse=True),
    columns=["char", "char_times"],
)

In [None]:
def _with_question_tokens(pairs):
    """Left-join the token lists of q1 and q2 from ``question_data`` onto ``pairs``.

    Adds the columns words1/chars1 (looked up by q1) followed by
    words2/chars2 (looked up by q2); the helper "qid" join column is dropped.
    """
    joined = pairs
    for side in ("1", "2"):
        joined = joined.merge(question_data, how="left", left_on="q" + side, right_on="qid")
        joined = joined.drop(["qid"], axis=1)
        joined = joined.rename(columns={"words": "words" + side, "chars": "chars" + side})
    return joined


# The training frame does not carry the label column.
train = _with_question_tokens(train_data.drop(["label"], axis=1))
test = _with_question_tokens(test_data)

In [None]:
# Attach the occurrence count of each side of the pair: the count table is
# renamed so it can be joined directly on q1, then on q2.
train1 = train.merge(
    question_feature.rename(columns={"qid": "q1", "q_count": "q1_count"}),
    how="left",
    on="q1",
).merge(
    question_feature.rename(columns={"qid": "q2", "q_count": "q2_count"}),
    how="left",
    on="q2",
)

In [None]:
# Same count features for the test pairs: join the renamed count table on q1,
# then on q2.
test1 = test.merge(
    question_feature.rename(columns={"qid": "q1", "q_count": "q1_count"}),
    how="left",
    on="q1",
).merge(
    question_feature.rename(columns={"qid": "q2", "q_count": "q2_count"}),
    how="left",
    on="q2",
)

In [None]:
# NOTE: this definition rebinds the module-level name ``question_feature``,
# which earlier cells use for the question-count DataFrame. Re-running those
# merge cells after this one would pass a function to ``merge``.
def question_feature(data):
    """Add length, overlap and hash features to ``data`` in place and return it.

    Args:
        data: DataFrame with the columns q1, q2, words1, words2, chars1 and
            chars2; the words*/chars* cells must be sized iterables of tokens.

    Returns:
        The same DataFrame object, extended with (in this order) word1_len,
        word2_len, char1_len, char2_len, word_same, char_same, q1_hash and
        q2_hash. The hash columns are looked up in the module-level
        ``question_dict``.
    """
    data["word1_len"] = data["words1"].map(len)
    data["word2_len"] = data["words2"].map(len)
    data["char1_len"] = data["chars1"].map(len)
    data["char2_len"] = data["chars2"].map(len)

    # Number of distinct tokens shared by both questions. A plain comprehension
    # is used instead of DataFrame.apply(axis=1): it avoids building a Series
    # per row and also works when ``data`` has no rows.
    data["word_same"] = [
        len(set(left) & set(right))
        for left, right in zip(data["words1"], data["words2"])
    ]
    data["char_same"] = [
        len(set(left) & set(right))
        for left, right in zip(data["chars1"], data["chars2"])
    ]

    data["q1_hash"] = data["q1"].map(question_dict)
    data["q2_hash"] = data["q2"].map(question_dict)
    return data
    
# question_feature() adds its columns to the frame it receives and returns that
# same frame, so train2 / test2 refer to the same objects as train1 / test1.
train2 = question_feature(train1)
test2 = question_feature(test1)

In [None]:
# Preview the first rows of the training feature frame (display only; nothing is assigned).
train2.head()

In [None]:
# Question graph: one node per question id and one undirected edge per
# (q1, q2) pair found in the train or test data.
pair_values = pd.concat([train_data[["q1", "q2"]], test_data[["q1", "q2"]]]).values
edges = [tuple(pair) for pair in pair_values]

# networkx is imported as ``nx``; the bare name ``networkx`` is not defined.
graph = nx.Graph()
graph.add_edges_from(edges)

# Maximal cliques, largest first.
cliques = sorted(nx.find_cliques(graph), key=len, reverse=True)
# Lookup of every (q1, q2) orientation that actually occurs in the data.
map_label = dict.fromkeys(edges, 1)

# Clique size per data pair, keyed by the orientation(s) present in the data.
# Larger cliques are visited first and later assignments overwrite earlier
# ones, so each pair ends up with the size of the smallest maximal clique
# that contains it.
# NOTE(review): if the largest clique was intended, iterate in ascending size
# order instead - confirm before changing, as it alters the feature values.
map_clique_size = {}
for c in cliques:
    for q1, q2 in combinations(c, 2):
        # Two independent checks: when the data holds a pair in both
        # orientations, both keys must be filled, otherwise rows stored in the
        # second orientation would fall back to -1.
        if (q1, q2) in map_label:
            map_clique_size[q1, q2] = len(c)
        if (q2, q1) in map_label:
            map_clique_size[q2, q1] = len(c)

# -1 marks pairs that received no clique size.
train2['clique_size'] = [
    map_clique_size.get(pair, -1) for pair in zip(train2['q1'], train2['q2'])
]
test2['clique_size'] = [
    map_clique_size.get(pair, -1) for pair in zip(test2['q1'], test2['q2'])
]

In [None]:
# Core number of every node in the question graph, as a (qid, kcore) table.
max_kcore = pd.DataFrame(list(nx.core_number(graph).items()), columns=["qid", "kcore"])


def _with_kcore(features):
    """Left-join the core number of q1 and of q2 as q1_kcore / q2_kcore."""
    for side in ("q1", "q2"):
        side_kcore = max_kcore.rename(columns={"qid": side, "kcore": side + "_kcore"})
        features = features.merge(side_kcore, how="left", on=side)
    return features


train3 = _with_kcore(train2)
test3 = _with_kcore(test2)

In [None]:
# Write the feature tables without the question ids and raw token lists,
# and without the row index.
train_features = train3.drop(columns=["q1", "q2", "words1", "chars1", "words2", "chars2"])
train_features.to_csv("./data/train_feature.csv", index=False)
test_features = test3.drop(columns=["q1", "q2", "words1", "chars1", "words2", "chars2"])
test_features.to_csv("./data/test_feature.csv", index=False)