In [1]:
import os
# Move the working directory one level up so the relative 'data/...' paths and
# 'src.*' imports used by later cells resolve (presumably the project root — confirm).
# NOTE(review): the path is relative to the current working directory, so running
# this cell a second time in the same kernel moves up one more level.
os.chdir('..')

# autoreload mode 2: re-import edited modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2

In [2]:
import torch

from nltk.tokenize import word_tokenize

from src.main import setup_torch, get_corpus
from src.utils import get_latest_model_file
from src.winograd_schema_challenge import find_missing_wsc_words_in_corpus_vocab

from src.wsc_parser import generate_df, generate_json, generate_full_sentences

In [3]:
# Sentences kept as English text. They are joined into a single string and passed
# to generate_df in the next cell.
# NOTE(review): the name suggests these entries have not been translated yet — confirm.
still_in_english = [
    "It was a summer afternoon, and the dog was sitting in the middle of the lawn. After a while, it got up and moved to a spot under the tree, because the spot under the tree was hot.",
    "It was a summer afternoon, and the dog was sitting in the middle of the lawn. After a while, it got up and moved to a spot under the tree, because the dog was cooler.",
    "I couldn't put the pot on the shelf because the shelf was too tall.",
    "I couldn't put the pot on the shelf because the pot was too high.",
    "There is a pillar between me and the stage, and I can't see around the stage.",
    "There is a pillar between me and the stage, and I can't see the pillar.",
    "They broadcast an announcement, but a subway came into the station and I couldn't hear the subway.",
    "They broadcast an announcement, but a subway came into the station and I couldn't hear over the announcement.",
]

In [4]:
# Build the working DataFrame; the English sentences are handed over as one
# space-joined string.
# NOTE(review): generate_df lives in src.wsc_parser and is not visible here —
# how it uses this string and where the remaining rows come from should be confirmed.
df = generate_df(' '.join(still_in_english))

In [5]:
# Read the per-question "associative" labels from disk.
import json

with open('data/processed/WSC_associative_label.json', 'r') as fp:
    english_associative_json = json.loads(fp.read())

In [6]:
# Every row starts unflagged; rows whose label entry carries is_associative == 1
# are then switched to True, addressed by the entry's 'index' value.
df['is_associative'] = False
for item in english_associative_json:
    if item['is_associative'] != 1:
        continue
    df.loc[item['index'], 'is_associative'] = True

In [7]:
# Number of rows flagged as associative. Summing the boolean column counts the
# flags themselves instead of the non-null cells of whichever column is first.
df['is_associative'].sum()

37

In [8]:
# Read the per-question "switchable" labels from disk.
import json

with open('data/processed/WSC_switched_label.json', 'r') as fp:
    english_switched_json = json.loads(fp.read())

In [9]:
# Every row starts unflagged; rows whose label entry carries is_switchable == 1
# are then switched to True, addressed by the entry's 'index' value.
df['is_switchable'] = False
for item in english_switched_json:
    if item['is_switchable'] != 1:
        continue
    df.loc[item['index'], 'is_switchable'] = True

In [10]:
# Number of rows flagged as switchable. Summing the boolean column counts the
# flags themselves instead of the non-null cells of whichever column is first.
df['is_switchable'].sum()

131

In [11]:
# One entry per line, surrounding whitespace removed. generate_switched_sentence
# looks words up in this list to decide which ones get their capital letter back.
# The encoding is stated explicitly so accented characters are read the same way
# on every platform (assumes the file is stored as UTF-8 — confirm).
with open('data/processed/portuguese_capitalized_words.txt', 'r', encoding='utf-8') as capitalized_words_file:
    capitalized_words = [line.strip() for line in capitalized_words_file]

def _capitalize_sentence_starts(text):
    """Upper-case the first character of every piece delimited by '.', '!' or '?'.

    Each piece is stripped and the terminator is re-inserted followed by a single
    space. Only the first character of a piece is changed: ``str.capitalize``
    would also lower-case the rest of the piece and thereby undo the capitals
    produced while handling an earlier terminator.
    """
    for terminator in ('.', '!', '?'):
        pieces = (piece.strip() for piece in text.split(terminator))
        text = (terminator + ' ').join(piece[:1].upper() + piece[1:] for piece in pieces)
    return text


def generate_switched_sentence(row):
    """Return the row's schema with its two candidate substitutions swapped.

    Parameters
    ----------
    row : object with the attributes ``is_switchable``, ``schema``,
        ``substitution_a`` and ``substitution_b`` (e.g. a DataFrame row).

    Returns
    -------
    str
        ``''`` when the row is not switchable; otherwise the lower-cased schema
        with both substitutions exchanged, sentence starts capitalized, words
        listed in the global ``capitalized_words`` capitalized, and punctuation
        attached to the preceding word.

    Notes
    -----
    Reads the globals ``capitalized_words`` and ``word_tokenize``. When a
    substitution cannot be found in the schema a message is printed and the swap
    is still attempted, so the result may need a manual fix.
    """
    if not row.is_switchable:
        return ''

    switched = row.schema.lower()

    subs_a = row.substitution_a.lower()
    subs_b = row.substitution_b.lower()

    # When both substitutions start with the same two characters, drop them so
    # that only the differing remainder is searched for and swapped.
    if subs_b[0:2] == subs_a[0:2]:
        subs_a = subs_a[2:]
        subs_b = subs_b[2:]

    if subs_a not in switched:
        print('Couldnt find subs a:', subs_a)
    if subs_b not in switched:
        print('Couldnt find subs b:', subs_b)

    # Three-step swap through a placeholder; the placeholder is upper-case and
    # the text is lower-case at this point, so it cannot collide with the text.
    switched = switched.replace(subs_a, '<PLACEHOLDER>')
    switched = switched.replace(subs_b, subs_a)
    switched = switched.replace('<PLACEHOLDER>', subs_b)

    switched = _capitalize_sentence_starts(switched)

    punctuation = ('.', ',', '!', '?', ';')
    words = []
    for word in word_tokenize(switched, language='portuguese'):
        if word.capitalize() in capitalized_words:
            word = word.capitalize()
        # Glue punctuation onto the previous word; a punctuation token with no
        # previous word is kept on its own instead of raising IndexError.
        if word in punctuation and words:
            words[-1] += word
        else:
            words.append(word)

    return ' '.join(words).strip()

In [12]:
# Row-wise apply: every row gets its switched sentence, or '' when the row is not
# switchable. Substitutions that cannot be found in a schema are reported on stdout.
df['switched'] = df.apply(generate_switched_sentence, axis=1)

Couldnt find subs b: desenho da tina
Couldnt find subs b: desenho da tina
Couldnt find subs b:  alunos mais jovens
Couldnt find subs b:  alunos mais jovens
Couldnt find subs b: o cara que vestia uma farda
Couldnt find subs b: o cara que vestia uma farda
Couldnt find subs b: o homem
Couldnt find subs b: o homem
Couldnt find subs b:  biscoitos com gotas de chocolate
Couldnt find subs b:  biscoitos com gotas de chocolate


In [13]:
# generate_full_sentences is applied to every row; zip(*...) transposes the per-row
# results into one sequence per output column. The tuple unpacking requires
# exactly four such sequences.
df['correct_sentence'], df['incorrect_sentence'], df['correct_switched'], df['incorrect_switched'] = \
    zip(*df.apply(generate_full_sentences, axis=1))

In [14]:
# Seed each manually_fixed_* column with the generated sentence of the same kind;
# individual rows are overwritten with hand-written fixes later on.
df['manually_fixed_correct_sentence'] = df['correct_sentence']
df['manually_fixed_incorrect_sentence'] = df['incorrect_sentence']
df['manually_fixed_correct_switched'] = df['correct_switched']
df['manually_fixed_incorrect_switched'] = df['incorrect_switched']

In [15]:
# Read the hand-written sentence corrections from disk.
import json

with open('data/processed/manual_fixes_portuguese.json', 'r') as fp:
    manual_fixes_json = json.loads(fp.read())

In [16]:
# Overwrite the correct/incorrect sentence of every row named by a manual fix,
# addressing the row by the entry's 'question_id'.
for item in manual_fixes_json:
    for column in ('manually_fixed_correct_sentence', 'manually_fixed_incorrect_sentence'):
        df.loc[item['question_id'], column] = item[column]

In [17]:
# TODO: manual fixes for the switched sentences (manually_fixed_*_switched) are still missing.

In [18]:
# Hand the assembled DataFrame to the project's JSON generator.
# NOTE(review): output location and format are decided inside
# src.wsc_parser.generate_json and are not visible here — confirm before re-running.
generate_json(df)

In [23]:
# Index labels of all rows whose switched sentence is not the empty string.
switched_that_may_need_to_fix = df.loc[df['manually_fixed_correct_switched'].ne('')].index.values

In [25]:
# question_ids of the manual fixes that also appear in switched_that_may_need_to_fix.
[
    question_id
    for question_id in (item['question_id'] for item in manual_fixes_json)
    if question_id in switched_that_may_need_to_fix
]

[40,
 41,
 189,
 201,
 202,
 207,
 215,
 218,
 219,
 220,
 227,
 228,
 229,
 230,
 231,
 232,
 233,
 234,
 235,
 236,
 237,
 238,
 239,
 240,
 241,
 244,
 272]

In [26]:
# One-off migration: every manual fix whose question_id is 177 or higher is moved
# up by one, and the result overwrites the same file the fixes were loaded from.
# NOTE(review): this is NOT idempotent. The entries are mutated in place and the
# source file is replaced, so every further execution (including a Restart & Run
# All) shifts the ids by one more. Run it once only; the reason for the 177
# threshold is not visible in this notebook — confirm before re-running.
for item in manual_fixes_json:
    if item['question_id'] >= 177:
        item['question_id'] += 1
        
with open('data/processed/manual_fixes_portuguese.json', 'w') as fp:
    json.dump(manual_fixes_json, fp)