# Data Cleaning

The data needs to be cleaned before we start preparing them for training models. The most important task here is to select a single answer for a question, because there are multiple answers available.

The answers are in the form of character ranges in the news text. On some exploration, several issues were found with the character ranges. Most notably, the range starts and/or ends in the middle of a word, which makes the answer unreadable/incorrect. There are also a few cases where the range extends one or two characters in a different paragraph.

The character ranges need to be updated to include entire words and not extend to different paragraphs.

In [1]:
import sys
sys.path.append("../")

import numpy as np
import pandas as pd
import random
import utils
import re

In [2]:
# Reading the data
data = pd.read_csv('../data/newsqa-data-formatted.csv')
data.head()

Unnamed: 0,story_id,question,answer_char_ranges,is_answer_absent,is_question_bad,validated_answers
0,42d01e187213e86f5fe617fe32e716ff7fa3afc4,What was the amount of children murdered?,294:297|None|None,0.0,0.0,"{""none"": 1, ""294:297"": 2}"
1,c48228a52f26aca65c31fad273e66164f047f292,Where was one employee killed?,34:60|1610:1618|34:60,0.0,0.0,
2,c65ed85800e4535f4bbbfa2c34d7d9630358d303,who did say South Africa did not issue a visa ...,103:127|114:127|839:853,0.0,0.0,"{""839:853"": 1, ""103:127"": 2}"
3,0cf66b646e9b32076513c050edf32a799200c3c2,How many years old was the businessman?,538:550|538:550,0.0,0.0,
4,13012604e3203c18df09289dfedd14cde67cf40b,What frightened the families?,690:742|688:791|630:646,0.0,0.0,"{""688:791"": 2, ""690:742"": 1}"


In [3]:
# We don't need is_answer_absent and is_question_bad columns
data = data.drop(['is_answer_absent', 'is_question_bad'], axis = 1)

In [6]:
# Reading all stories as dictionary
STORY_PATH = './cnn/stories/'
STORY_EXT = '.story'

story_ids = list(set(data['story_id']))
NEWS_STORIES = {}
cnt = 0
tot = len(story_ids)

for sid in story_ids:
    with open(STORY_PATH + sid + STORY_EXT, 'r', encoding="utf8") as file:
        NEWS_STORIES[sid] = file.read()
    cnt += 1
    utils.drawProgressBar(cnt, tot)

Progress: [=====>              ] 3000/11939

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [7]:
def adjust_answer_range(story_id, answer_range):
    '''
    Checks if answer range starts or ends in the middle of a words
    and returns the correct answer range along with answer text
    
    Parameters
    ------------
    story_id: str
              The story id where the answer is present
    
    answer_range: str
                  The answer range to check
                  
    Returns
    -----------
    [start_idx, end_idx]
    '''
    # If answer is not available, denote it as -1
    if answer_range == 'None':
        return [-1, -1]
    
    story = NEWS_STORIES[story_id]
    
    # Check for errors in answer
    if len(answer_range.split(':')) == 1:
        return [-1, -1]
    
    start_idx, end_idx = answer_range.split(':')
    start_idx, end_idx = int(start_idx), int(end_idx)
    
    # Moves back start_idx to the start of a word
    while start_idx != 0 and not utils.is_whitespace(story[start_idx - 1]) and not utils.is_punct(story[start_idx - 1]):
        start_idx = start_idx - 1
    
    # Some ranges end with a punctuation or a whitespace
    if utils.is_whitespace(story[end_idx - 1]) or utils.is_punct(story[end_idx - 1]):
        end_idx = end_idx - 1
    
    # Moves end_idx to the end of a word
    while not utils.is_whitespace(story[end_idx]) and not utils.is_punct(story[end_idx + 1]):
        end_idx = end_idx + 1
        
    # There are some answers with \n at the end followed by a letter
    # The answer will not be in two different paragraphs
    answer_text = story[start_idx:end_idx]
    answer_para = re.split('\n', answer_text)
    
    if len(answer_para[-1]) > len(answer_para[0]):
        start_idx = end_idx - len(answer_para[-1])
        answer_text = answer_para[-1]
    else:
        end_idx = start_idx + len(answer_para[0])
        answer_text = answer_para[0]
    
    return [start_idx, end_idx]

In [8]:
def get_answer(qa_details):
    '''
    A function that selects an answer for a question
    
    > If validated answers are available, the one with most votes is selected
    > If there's a tie in validated answer votes or if validated answer is not
      available, the most frequent answer is selected
    > If there's a tie here too, a random answer is selected
    
    Parameters
    ------------
    qa_details: Pandas Series
                Details of a QA pair, must include story_id, question, 
                answer_char_ranges, validated_answers as index
    '''
    # If validated answers are available, select the one with most votes
    if qa_details['validated_answers'] is not np.nan:
        validated_answers = eval(qa_details['validated_answers'])
        
        # Get the answers with maximum votes
        max_vote_ans = utils.get_max_keys(validated_answers)
        
        # Check for ties
        if len(max_vote_ans) == 1:
            return adjust_answer_range(qa_details['story_id'], max_vote_ans[0])
    
    # If validated answers are not available or if there is a tie in validated answers
    # Get all available answers
    answers = re.split(',|\|', qa_details['answer_char_ranges'])
    
    # If there is just one answer
    if len(answers) == 1:
        return adjust_answer_range(qa_details['story_id'], answers[0])
    
    # Get counts of each answer
    answer_freq = utils.get_frequency(answers)
    max_vote_ans = utils.get_max_keys(answer_freq)
    
    if len(max_vote_ans) == 1:
        return adjust_answer_range(qa_details['story_id'], max_vote_ans[0])
    
    # If there is a tie for multiple answers, return a random answer
    return adjust_answer_range(qa_details['story_id'], random.choice(answers))

In [9]:
# --- Takes a few seconds to run ---
# Select one answer range among multiple answers
data[['start_idx', 'end_idx']] = data.apply(get_answer, axis = 1, result_type = 'expand')

In [10]:
data.sample(n = 5)

Unnamed: 0,story_id,question,answer_char_ranges,validated_answers,start_idx,end_idx
19967,cb1dac61fd8e3916536890b113ba64ebce2a142c,"Which person described it as a ""gonzo trip""?",867:880|268:286|36:48,"{""867:880"": 2, ""36:48"": 1}",871,882
22456,59369636ea1808819090c7d3dd8d357348f19f74,how many teams compete,None|71:74|71:74,,71,73
70412,efecf88245408e9a1e61e1a0b3342fda394f1ad5,What is France scheduled to do?,None|None,,-1,-1
23878,5bda86b0620b9e9e760b605bdc01d21a990650aa,Which Rhodes Scholar is moving back to the foo...,6423:6436|6423:6436,,6420,6437
68689,c45bafd5377932a64d10e7f02ff261d30d6ceca0,Who supports legislative repeal?,"766:782,786:804|766:782,786:804",,764,783


In [11]:
# Save the cleaned data
data.to_csv('../data/newsqa-dataset-cleaned.csv', index = False)
utils.save_pickle('../data/news_stories.pkl', NEWS_STORIES)

Variable successfully saved in ../data/news_stories.pkl
