 # Clean data

In [2]:
import pandas as pd
import re
from pprint import pprint
SOURCE_DATA_FILE = "Numbers to parsing.tsv"

In [3]:
# Load the raw annotation table; the source file is tab-separated.
source_table = pd.read_csv(SOURCE_DATA_FILE, sep='\t')

In [4]:
source_table.head(5)

Unnamed: 0,query,alternative query,main passage,citation
0,踢猫效应在直面上如何体现？,踢猫效应最初指的是什么？,一位父亲的同事耽误了他很多 时间，影响了他当天工作的完成。想 着明天要挨上司的责骂，他一肚子...,"李秋生,王小文. 踢猫效应与情绪传递[J]. 大众心理学,2016(10):41-42. \n"
1,解释一下踢猫效应,,个体会对 弱于自己或者等级低于自己的对象 发泄不满情绪，而产生坏情绪传染 的连锁反应。但是这...,"李秋生,王小文. 踢猫效应与情绪传递[J]. 大众心理学,2016(10):41-42. \n"
2,踢猫效应会带来什么连锁反应？,,假如这只猫对我们不重要，而 且它选择了反抗，我们很有可能和 它陷入战争。人踢了猫一下，猫也会...,"李秋生,王小文. 踢猫效应与情绪传递[J]. 大众心理学,2016(10):41-42. \n"
3,根据踢猫效应，踢猫会带来什么后果,踢猫又可能带来的结果有哪些？,假如这只猫对我们不重要，而 且它选择了隐忍，这是最好的情况， 它默默地承受了你的坏心情，没有...,"李秋生,王小文. 踢猫效应与情绪传递[J]. 大众心理学,2016(10):41-42. \n"
4,不踢猫的话，可以更好地发泄负面情绪吗？,有在被人身上发泄情绪的冲动时候可以如何更好的解决问题？,当然是可以的，但是并非简单 粗暴地把猫一脚踢开。我们的家人、 朋友、同事、同学都可以成为这只...,"李秋生,王小文. 踢猫效应与情绪传递[J]. 大众心理学,2016(10):41-42. \n"


In [5]:
# Containers filled by the cleaning loop below.
queries = {}     # {'q<N>': query text}
passages = {}    # {'p<N>': cleaned passage text}
citations = {}   # {'p<N>': cleaned citation text}
matches = []     # list of {'qid': qid, 'pid': pid} pairs marking a relevant query/passage match
es_records = []  # one dict per source row; turned into the Elasticsearch table after the loop

# numbering counters (ids start at 1)
qid_counter = 1
pid_counter = 1

for row_index, row_content in source_table.iterrows():
    # trim unnecessary spaces and tabs
    query = str(row_content['query']).strip()
    passage = str(row_content['main passage']).strip()

    # A missing alternative query arrives as NaN from read_csv. Check for it
    # directly instead of comparing against the string "nan"; the "nan" string
    # and blank values are still treated as missing.
    raw_alt_query = row_content['alternative query']
    alt_query = None if pd.isna(raw_alt_query) else str(raw_alt_query).strip()
    if not alt_query or alt_query == "nan":
        alt_query = None

    # trim [n] markers from the citation, then strip, so that removing a
    # leading or trailing marker does not leave stray whitespace behind
    citation = re.sub(r'\[\d\]', '', str(row_content['citation'])).strip()

    # strip space in chinese
    passage = passage.replace(' ', '')
    # fix unwanted new lines in chinese (keep a newline that follows sentence-ending punctuation)
    passage = re.sub(r'(?<![。？！])\n', '', passage)
    # strip numbering at passage beginning
    passage = re.sub(r'^(\d\.)', '', passage)

    # store the entry in the dictionaries and record the match
    qid = 'q' + str(qid_counter)
    pid = 'p' + str(pid_counter)
    qid_counter += 1
    pid_counter += 1
    queries[qid] = query
    passages[pid] = passage
    citations[pid] = citation
    matches.append({'qid': qid, 'pid': pid})

    alt_qid = None
    if alt_query is not None:
        # there is an alt query: register it as an extra query for the same passage
        alt_qid = 'q' + str(qid_counter)
        qid_counter += 1
        queries[alt_qid] = alt_query
        matches.append({'qid': alt_qid, 'pid': pid})

    es_records.append({
        'qid': qid,
        'pid': pid,
        'query': query,
        'passage': passage,
        'alt_query': alt_query,
        'alt_qid': alt_qid,
        'citation': citation,
    })

# Build the table once from the collected records. DataFrame.append was removed
# in pandas 2.0 and copied the whole frame on every call. Passing `columns`
# keeps the expected columns even when the source table has no rows.
es_df = pd.DataFrame(
    es_records,
    columns=['qid', 'query', 'alt_qid', 'alt_query', 'pid', 'passage', 'citation'],
)  # this is the table for the elastic search index

In [6]:
# Reorder the columns so each query sits next to its id, followed by the passage id, passage and citation.
es_df = es_df[['qid', 'query', 'alt_qid', 'alt_query', 'pid', 'passage', 'citation']]

In [7]:
es_df.sample(30)

Unnamed: 0,qid,query,alt_qid,alt_query,pid,passage,citation
42,q63,年级上的差异和亲子沟通,q64,年级上的差异如何影响亲子沟通,p43,初中生亲子沟通总分及其各维度在学生年级上均存在显著差异，而且初一学生在总得分上要显著高于初二...,"魏美丹. 初中生亲子沟通与生活满意度的关系[D].福建师范大学,2019."
258,q407,随意进入孩子房间有问题吗？,,,p259,受访的17名男生中，4名男生报告在小学之前就己经与父母分开睡，并有了自己独立的房间；有12名...,"葛虹宇. 高中生亲子心理边界的调查研究[D].哈尔滨师范大学,2050"
101,q154,亲子沟通的类型有哪些,q155,有几种类型的亲子沟通？,p102,"根据父母与青少年之间的沟通存在的一种稳定的方式和倾向,Mcleod提出了划分亲子沟通类型的两...","胡悦. 亲子沟通与青少年健康成长[D].哈尔滨工程大学,2007."
245,q394,侵入个人隐私范围是怎么样的？,,,p246,“她还是要求我把门开着，吋不时来看我的情况，手机、微信、QQ的密码她都要求知道，啥都干涉。她...,"葛虹宇. 高中生亲子心理边界的调查研究[D].哈尔滨师范大学,2037"
247,q396,情绪外溢的问题有什么？,,,p248,唠叨是奶奶最大的特点，对于身边的所有人，她似乎都不满意，她需要通过不断抱怨的方式与身边的所有...,"葛虹宇. 高中生亲子心理边界的调查研究[D].哈尔滨师范大学,2039"
221,q370,亲子沟通不当是一方面的错误吗？,,,p222,家庭系统理论认为，亲子冲突是一个多人系统的问题，而不是父母或子女单方面的问题。亲子冲突的来源...,"郭学东.疫情防控期间家庭亲子冲突成因及对策初探[J].教育实践与研究(C),2020(05)..."
172,q287,家庭结构会对亲子沟通有影响吗？,q288,亲子沟通中家庭结构会带来不一样的沟通吗？,p173,"家庭结构和亲子沟通质量密切相关。核心家庭比单亲家庭的亲子沟通质量要好（Sandy,1998）...",李玲. 初中生亲子沟通、成就动机及其团体辅导的研究 [D]. 西南大学. 2037
158,q259,学术上对于青春期的研究。,q260,青春期可能带来的困扰。,p159,青春期的孩子已不像儿童期那样对父母顺从、无话不谈，国外有学者称少年期为亲子关系的危机期。在这...,李玲. 初中生亲子沟通、成就动机及其团体辅导的研究 [D]. 西南大学. 2023
263,q412,用钱胁迫孩子是好的吗？,,,p264,我国作为农业大国，这种不分化的家庭结构受生存环境、资源缺乏等现实因素限制，因为并不富饶的生存...,"葛虹宇. 高中生亲子心理边界的调查研究[D].哈尔滨师范大学,2055"
2,q4,踢猫效应会带来什么连锁反应？,,,p3,假如这只猫对我们不重要，而且它选择了反抗，我们很有可能和它陷入战争。人踢了猫一下，猫也会扑上...,"李秋生,王小文. 踢猫效应与情绪传递[J]. 大众心理学,2016(10):41-42."


In [8]:
queries["q13"]

'亲子关系会影响孩子适应社会的能力吗'

In [9]:
citations["p111"]

'胡悦. 亲子沟通与青少年健康成长[D].哈尔滨工程大学,2007.'

In [10]:
len(es_df)

267

In [11]:
len(passages)

267

# Dump into Elasticsearch

In [12]:
from datetime import datetime
from elasticsearch import Elasticsearch
from elasticsearch import helpers

# Client created with no arguments, so connection settings come from the library defaults.
# NOTE(review): presumably this targets a local node; confirm for the installed client version.
es = Elasticsearch()

In [13]:
# Start from a clean slate: delete any previous copy of the index,
# ignoring HTTP 400/404 responses from the delete call.
es.indices.delete(index='ctb-nlp-v1', ignore=[400, 404])

# Fields copied from each es_df row into the indexed document, in document order.
es_doc_fields = ('qid', 'query', 'alt_qid', 'alt_query', 'pid', 'passage', 'citation')

# One bulk action per row of es_df; the selected fields become the document's _source.
temp_bulk_actions = [
    {
        "_index": "ctb-nlp-v1",
        "_source": {field: row[field] for field in es_doc_fields},
    }
    for index, row in es_df.iterrows()
]

helpers.bulk(es, temp_bulk_actions)



(267, [])

In [14]:
# Refresh the index so the documents just bulk-indexed become visible to search
# (per the Elasticsearch refresh API; the call returns the shard summary shown below).
es.indices.refresh(index="ctb-nlp-v1")

{'_shards': {'total': 2, 'successful': 1, 'failed': 0}}

# Now make training data

It's not obvious how to generate a score label for query–passage pairs, but what we can do is generate a pseudo-relevance score from characteristics of the query and the passage (here, their lengths).

- As a rough baseline, relevant pairs should score about 0.8 and negative samples about 0.1.
- The relevant score is 0.60 + 0.30 tanh(0.1 relu(query length - 8)) + 0.10 ((1/512) relu(passage length - 200)).
- The irrelevant score is 0.22 - 0.10 tanh(0.1 relu(query length - 8)) - 0.03 ((1/512) relu(passage length - 200)).

In [15]:
import math
import random
def relu(x):
    """Rectified linear unit: return the larger of ``x`` and 0."""
    return max(x, 0)

In [16]:
def relevant_score(qid, pid):
    """Pseudo relevance label for a matching (query, passage) pair.

    The texts are looked up in the module-level ``queries`` and ``passages``
    dictionaries. The score starts at 0.60, gains up to 0.3 through a tanh of
    the query length beyond 8 characters, and gains 0.1 for every 512
    characters of passage beyond the first 200.
    """
    query_len = len(queries[qid])
    passage_len = len(passages[pid])
    query_bonus = 0.3 * math.tanh(0.1 * relu(query_len - 8))
    passage_bonus = 0.1 * (1/512.0) * relu(passage_len - 200)
    return 0.60 + query_bonus + passage_bonus
relevant_score('q34', 'p230')

0.8284782467867294

In [57]:
def irrelevant_score(qid, pid):
    """Pseudo relevance label for a non-matching (query, passage) pair.

    The texts are looked up in the module-level ``queries`` and ``passages``
    dictionaries. The score starts at 0.22, loses up to 0.1 through a tanh of
    the query length beyond 8 characters, and loses 0.03 for every 512
    characters of passage beyond the first 200.
    """
    query_len = len(queries[qid])
    passage_len = len(passages[pid])
    query_penalty = 0.1 * math.tanh(0.1 * relu(query_len - 8))
    passage_penalty = 0.03 * (1/512.0) * relu(passage_len - 200)
    return 0.22 - query_penalty - passage_penalty
irrelevant_score('q34', 'p230')

0.14384058440442352

In [59]:
# Parallel lists that accumulate the training set: entry i of each list
# describes one example (query text, passage text, target score).
training_queries, training_passages, training_target_scores = [], [], []
# number of negative examples generated for every positive example
num_neg_examples = 20

In [60]:
# Build (query, passage, target score) examples: one positive pair per recorded
# match, plus `num_neg_examples` randomly drawn negatives for the same query.
all_pids = list(passages)  # hoisted: the set of passages does not change inside the loop

for match in matches:
    matched_qid = match['qid']
    matched_pid = match['pid']
    query = queries[matched_qid]
    passage = passages[matched_pid]

    # positive sample
    training_queries.append(query)
    training_passages.append(passage)
    training_target_scores.append(relevant_score(matched_qid, matched_pid))

    # Negative candidates are passages whose text differs from the matched
    # passage. Comparing text rather than pid also excludes duplicated passages.
    negative_pids = [pid for pid in all_pids if passages[pid] != passage]
    if not negative_pids:
        # Nothing to contrast against; drawing until a different passage shows
        # up would never terminate, so skip the negatives for this match.
        continue

    # gen negative samples
    for neg_index in range(num_neg_examples):
        random_pid = random.choice(negative_pids)
        # Score against the query being paired here (matched_qid), not whatever
        # `qid` happens to be left over from the cleaning loop.
        neg_score = irrelevant_score(matched_qid, random_pid)
        training_queries.append(query)
        training_passages.append(passages[random_pid])
        training_target_scores.append(neg_score)


In [61]:
# Empty frame that will receive the training columns.
training_df = pd.DataFrame()

In [62]:
# Assemble the training table in one step; each list becomes a column of the same name.
training_df = pd.DataFrame({
    'training_queries': training_queries,
    'training_passages': training_passages,
    'training_target_scores': training_target_scores,
})

In [65]:
len(training_df)

8715

In [67]:
# Write the training examples to a tab-separated file. No `index` argument is passed,
# so pandas' default applies and the row index is written as the first column.
training_df.to_csv('sentencetransformer training.tsv', sep='\t')