# ConcurrentQA Slices

In [4]:
%load_ext autoreload
%autoreload 2

In [5]:
import os
import csv
import sys
import ujson
import json
from tqdm import tqdm
import pandas as pd
import time
import ast
import re
import random
import statistics
from collections import Counter, defaultdict, OrderedDict
from typing import TYPE_CHECKING, Optional, Tuple, Callable, Dict, Any, List

In [6]:
# Relative path to the repository root; edit this to match where the notebook is run from.
# The data-loading cell joins it as f"{prefix}/datasets/...", so the trailing slash
# here only produces a doubled "/" in the resulting path.
prefix = "../../" # FILL IN PATH TO REPO

# Loading

In [8]:
# Read the training split: one example per line of the file.
# Each line is parsed with ast.literal_eval, i.e. as a Python literal rather
# than with a JSON parser.
all_points = []
with open(f"{prefix}/datasets/concurrentqa/data/CQA_train_all.json") as data_file:
    for raw_line in data_file:
        all_points.append(ast.literal_eval(raw_line))

In [17]:
# Flatten the loaded examples into parallel lists, a per-type tally, and an
# `_id` -> example lookup that the DataFrame cell is built from.
all_answers = []
all_questions = []
all_domains = []
# Question type -> number of examples. The spelling of this name is kept
# as-is because other cells may refer to it.
questionddtype_counts = Counter()
all_points_dict = {}

for d in tqdm(all_points):
    all_answers.append(d['answer'])
    all_questions.append(d['question'])
    # Record the domain and type as well, so the two accumulators declared
    # above actually reflect the dataset instead of staying empty.
    all_domains.append(d['domain'])
    questionddtype_counts[d['type']] += 1
    all_points_dict[d['_id']] = d

100%|██████████| 15239/15239 [00:00<00:00, 344923.04it/s]


## Topical Slices of the Dataset

In [18]:
# One row per example, indexed by the example's `_id` (the keys of
# all_points_dict); the column set is taken from the first example's keys.
shortlisted_keys = list(all_points[0].keys())
df = pd.DataFrame.from_dict(
    all_points_dict,
    orient='index',
    columns=shortlisted_keys,
)
df.head(1)

Unnamed: 0,question,answer,_id,domain,type,sp
PAIRIDX:63096,According to the spokesperson who claimed Enro...,30-minute,PAIRIDX:63096,"[0, 0]",bridge,"[{'title': 'e856_p1', 'sents': ['The stock was..."


##### investment

In [20]:
# Regexes that mark a question as investment-related. Each is applied with
# re.search (case-sensitive); a question joins the slice once if any of them
# matches. Some of the later, more specific patterns can only match text that
# an earlier keyword pattern (e.g. "investor", "backer") already matches.
investor_patterns = (
    ".*investor.*",
    ".*funded.*",
    ".*funder.*",
    ".*backer.*",
    ".*backed.*",
    "^Which investor.*backed both.*",
    "^Who.*backs both.*",
    "^What.*backer also backed.*?",
    "^.*invested in both.*and.*?",
    "^.*invested in.*round.*and.*?",
    "^.*invested in both.*as well as.*?",
)

# Collect the index labels of matching rows, in DataFrame order.
slice_investors = [
    row_id
    for row_id, question in df['question'].items()
    if any(re.search(pattern, question) for pattern in investor_patterns)
]

print(f"Investing, found {len(slice_investors)}, {len(slice_investors)/len(df)} questions in slice.\n")

Investing, found 1227, 0.08051709429752608 questions in slice.



##### legal discourse

In [21]:
# Legal-discourse slice. The "judge" and " sued" checks run against the
# lowercased question; the "bill " and "lawsuit" checks run against the
# original casing.
# NOTE(review): the case-sensitive "bill " presumably avoids matching the
# first name "Bill" -- confirm before normalising the casing.
slice_legal = []
for row_id, question in df['question'].items():
    lowered = question.lower()
    is_legal = (
        re.search(".*judge.*", lowered)
        or re.search(".* sued.*", lowered)
        or re.search(".*bill .*", question)
        or re.search(".*lawsuit.*", question)
    )
    if is_legal:
        slice_legal.append(row_id)
print(f"Legal, found {len(slice_legal)}, {len(slice_legal)/len(df)} questions in slice.\n")

Legal, found 329, 0.021589343132751494 questions in slice.



##### newspapers

In [23]:
# Newspaper slice: questions whose text matches any of these case-sensitive
# regexes. Each matching row's index label is recorded exactly once.
newspaper_patterns = [
    ".*paper.*",
    ".*reported.*",
    ".*staff writer.*",
    ".*article.*",
    ".*wrote a piece.*",
]

slice_newspapers = []
for row_id, row in df.iterrows():
    question_text = row['question']
    for pattern in newspaper_patterns:
        if re.search(pattern, question_text):
            slice_newspapers.append(row_id)
            # Stop at the first hit so the row is not added twice.
            break

print(f"News, found {len(slice_newspapers)}, {len(slice_newspapers)/len(df)} questions in slice.\n")

News, found 900, 0.05905899337226852 questions in slice.



##### geography

In [24]:
# Geography slice: questions phrased as "based in what ..." or
# "located in what ..." (case-sensitive regex search).
geography_patterns = (
    ".*based in what state.*",
    ".*based in what city.*",
    ".*based in what country.*",
    ".*located in what state.*",
    ".*located in what city.*",
)


slice_geography = [
    row_id
    for row_id, question in df['question'].items()
    if any(re.search(pattern, question) for pattern in geography_patterns)
]

print(f"Geography, found {len(slice_geography)}, {len(slice_geography)/len(df)} questions in slice.\n")

Geography, found 69, 0.004527856158540586 questions in slice.



##### population

In [25]:
# Population slice: questions containing the phrase "population of"
# (case-sensitive regex search). Index labels are kept in DataFrame order.
slice_population = []
for row_id, row in df.iterrows():
    if re.search(".*population of.*", row['question']):
        slice_population.append(row_id)

# Label the summary "Population" so it is not mistaken for the geography
# slice, which prints an otherwise identical line.
print(f"Population, found {len(slice_population)}, {len(slice_population)/len(df)} questions in slice.\n")

Geography, found 262, 0.017192729181704836 questions in slice.



##### birth

In [26]:
# Birth slice: questions about when or where someone was born.
# All patterns are case-sensitive and applied with re.search.
birth_patterns = (
    ".*year of birth.*",
    ".*date of birth.*",
    ".*month of birth.*",
    ".*was born in.*what year",
    ".*was born in.*what month",
    "^When was.*born.*",
    "^Where was.*born.*",
    ".*place of birth.*",
    ".*was born in.*what city",
    ".*was born in.*what place",
    ".*was born where.*",
)

slice_birth = []
for row_id, row in tqdm(df.iterrows()):
    question_text = row['question']
    for pattern in birth_patterns:
        if re.search(pattern, question_text):
            break
    else:
        # No pattern matched this question; move on to the next row.
        continue
    slice_birth.append(row_id)

print(f"Birth, found {len(slice_birth)}, {len(slice_birth)/len(df)} questions in slice.\n")

15239it [00:06, 2536.71it/s]

Birth, found 351, 0.023033007415184725 questions in slice.






##### stock prices

In [27]:
# Stock-price slice: questions mentioning share/stock prices, per-share
# amounts, or quarterly earnings (case-sensitive regex search).
stock_patterns = (
    ".*stock price.*",
    ".*share price.*",
    ".*dollars per share.*",
    ".*cents.*share.*",
    ".*quarter.*earnings.*",
)

slice_stocks = []
for row_id, row in tqdm(df.iterrows()):
    if any(re.search(pattern, row['question']) for pattern in stock_patterns):
        slice_stocks.append(row_id)

print(f"Stocks, found {len(slice_stocks)}, {len(slice_stocks)/len(df)} questions in slice.\n")

15239it [00:04, 3773.14it/s]

Stocks, found 42, 0.002756086357372531 questions in slice.






##### email features

In [28]:
# E-mail slice: every question that mentions "e-mail" (case-sensitive).
# A single check is enough: narrower phrasings such as "sent an e-mail on"
# or "the recipient ... e-mail" all contain "e-mail" themselves. The pattern
# is written without a trailing "*", which would quantify only the final "l"
# rather than mean "followed by anything".
slice_email_ftrs = []
for row_id, row in tqdm(df.iterrows()):
    if re.search("e-mail", row['question']):
        slice_email_ftrs.append(row_id)

print(f"Emails Features, found {len(slice_email_ftrs)}, {len(slice_email_ftrs)/len(df)} questions in slice.\n")

15239it [00:04, 3695.24it/s]

Emails Features, found 141, 0.009252575628322069 questions in slice.






##### Company positions and titles of employees

In [30]:
# Company-position slice: questions about who holds a given role or title.
# Patterns are case-sensitive and applied with re.search, so the unanchored
# "Who is the.*of.*" may match anywhere in the question.
position_patterns = [
    ".*is the president.*",
    ".*is the vice president.*",
    ".*chief.*officer.*",
    "Who is the.*of.*",
    ".*holds.*position at.*",
    ".*is the head of.*",
    ".*board member.*",
]

slice_position = []
for row_id, row in tqdm(df.iterrows()):
    question_text = row['question']
    # First pattern that matches, or None when the question is out of slice.
    first_hit = next(
        (pattern for pattern in position_patterns if re.search(pattern, question_text)),
        None,
    )
    if first_hit is not None:
        slice_position.append(row_id)

print(f"Positions, found {len(slice_position)}, {len(slice_position)/len(df)} questions in slice.\n")

15239it [00:05, 2918.85it/s]

Positions, found 274, 0.017980182426668417 questions in slice.




