In [1]:
import os

# Run the notebook from the project root: later cells import from `src.*` and
# open files under `data/processed/`, both relative to the working directory.
# The move is guarded so that re-running this cell does not climb one more
# directory each time (a bare os.chdir('..') is not idempotent).
if not os.path.isdir('src'):
    os.chdir('..')

# Enable IPython's autoreload extension (mode 2) so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import torch

from nltk.tokenize import word_tokenize

from src.main import setup_torch, get_corpus
from src.utils import get_latest_model_file
from src.winograd_schema_challenge import find_missing_wsc_words_in_corpus_vocab

from src.wsc_parser import generate_df, generate_json, generate_full_sentences

In [3]:
# English Winograd-schema sentences, listed as consecutive pairs that share the
# same opening text. The next cell joins them into one space-separated string
# and passes it to generate_df.
# NOTE(review): presumably these are the schemas left untranslated ("still in
# English") - confirm against src.wsc_parser.generate_df.
still_in_english = [
    "It was a summer afternoon, and the dog was sitting in the middle of the lawn. After a while, it got up and moved to a spot under the tree, because the spot under the tree was hot.",
    "It was a summer afternoon, and the dog was sitting in the middle of the lawn. After a while, it got up and moved to a spot under the tree, because the dog was cooler.",
    "I couldn't put the pot on the shelf because the shelf was too tall.",
    "I couldn't put the pot on the shelf because the pot was too high.",
    "There is a pillar between me and the stage, and I can't see around the stage.",
    "There is a pillar between me and the stage, and I can't see the pillar.",
    "They broadcast an announcement, but a subway came into the station and I couldn't hear the subway.",
    "They broadcast an announcement, but a subway came into the station and I couldn't hear over the announcement."
]

In [4]:
# Build the working DataFrame. The English sentences are joined with single
# spaces into one string argument.
# NOTE(review): generate_df lives in src.wsc_parser and is not visible here;
# what it does with this string is unconfirmed.
df = generate_df(' '.join(still_in_english))

In [5]:
import json

# Load the "associative" labels; the entries are consumed below as dicts with
# 'index' and 'is_associative' keys.
# The file is decoded as UTF-8 explicitly so the result does not depend on the
# platform's default encoding (same as the manual-fixes JSON opened later on).
with open('data/processed/WSC_associative_label.json', 'r', encoding='utf8') as fp:
    english_associative_json = json.load(fp)

In [6]:
# Start with every row unflagged, then flag the rows whose label entry has
# is_associative == 1.
df['is_associative'] = False
associative_labels = (
    entry['index']
    for entry in english_associative_json
    if entry['is_associative'] == 1
)
for row_label in associative_labels:
    df.loc[row_label, 'is_associative'] = True

In [7]:
# Sanity check: number of rows flagged as associative, read as the non-null
# count of the first column among the flagged rows.
df[df['is_associative']].count().values[0]

37

In [8]:
import json

# Load the "switchable" labels; the entries are consumed below as dicts with
# 'index' and 'is_switchable' keys.
# The file is decoded as UTF-8 explicitly so the result does not depend on the
# platform's default encoding (same as the manual-fixes JSON opened later on).
with open('data/processed/WSC_switched_label.json', 'r', encoding='utf8') as fp:
    english_switched_json = json.load(fp)

In [9]:
# Start with every row unflagged, then flag the rows whose label entry has
# is_switchable == 1.
df['is_switchable'] = False
switchable_labels = (
    entry['index']
    for entry in english_switched_json
    if entry['is_switchable'] == 1
)
for row_label in switchable_labels:
    df.loc[row_label, 'is_switchable'] = True

In [10]:
# Sanity check: number of rows flagged as switchable, read as the non-null
# count of the first column among the flagged rows.
df[df['is_switchable']].count().values[0]

131

In [11]:
# Rows 277-280 are flagged as switchable in addition to the ones taken from
# the labels file.
extra_switchable_indexes = list(range(277, 281))
for extra_label in extra_switchable_indexes:
    df.loc[extra_label, 'is_switchable'] = True

In [12]:
import re

def cap(match):
    """Return the whole text of a regex match, passed through ``str.capitalize``.

    Intended as a replacement callback for ``re`` substitution: the first
    character is upper-cased and the remaining characters are lower-cased.
    """
    matched_text = match.group()
    return matched_text.capitalize()

# Words that must carry an initial capital; capitalize_words looks each
# token's capitalized form up in this list. One entry per line, stripped of
# surrounding whitespace.
# The file is decoded as UTF-8 explicitly so accented Portuguese words do not
# depend on the platform's default encoding.
# NOTE(review): assumes the file is UTF-8 like the other project data - confirm.
with open('data/processed/portuguese_capitalized_words.txt', 'r', encoding='utf8') as capitalized_words_file:
    capitalized_words = [line.strip() for line in capitalized_words_file]

def minimize_substitutions(subs_a, subs_b):
    """Drop the prefix shared by two substitution strings when ``subs_b`` diverges at a space.

    The two strings are compared character by character. If the first
    character of ``subs_b`` that differs from ``subs_a`` is a space, the common
    prefix is removed from both strings (the space itself is kept at the start
    of ``subs_b``). Otherwise both strings are returned unchanged.

    Identical strings, and the case where ``subs_b`` is a prefix of ``subs_a``,
    are returned unchanged; there is no diverging character in ``subs_b`` to
    inspect in either case.

    Args:
        subs_a: first substitution string.
        subs_b: second substitution string.

    Returns:
        A ``(subs_a, subs_b)`` tuple, possibly shortened.
    """
    # Index of the first position where the strings differ, or the length of
    # the shorter string when it is a prefix of the other. Bounding the scan
    # keeps it finite even when the two strings are equal.
    shortest = min(len(subs_a), len(subs_b))
    split_at = 0
    while split_at < shortest and subs_a[split_at] == subs_b[split_at]:
        split_at += 1

    # subs_b is exhausted: nothing to inspect, so leave both untouched.
    if split_at >= len(subs_b):
        return subs_a, subs_b

    if subs_b[split_at] == ' ':
        subs_a = subs_a[split_at:]
        subs_b = subs_b[split_at:]

    return subs_a, subs_b

def apply_substitution_exceptions(subs_a, subs_b):
    """Apply hand-written overrides for specific substitution pairs.

    Each rule matches exact strings and rewrites one or both substitutions;
    pairs that match no rule are returned unchanged. The rules are checked in
    order and independently of one another.

    The 'Goodman' rule also has a lower-case variant: generate_switched_sentence
    lower-cases both substitutions before calling this function, so the
    capitalised spelling alone could never match from that caller.

    Args:
        subs_a: first substitution string.
        subs_b: second substitution string.

    Returns:
        A ``(subs_a, subs_b)`` tuple with any matching override applied.
    """
    if subs_b == 'o cara que vestia uma farda':
        subs_a = 'o jim'
        subs_b = 'um cara que vestia uma farda e tinha uma grande barba ruiva'
    if subs_b == 'o homem' and subs_a == 'john':
        subs_b = 'um homem'
    if subs_a == 'o desenho do sam':
        subs_a = 'do sam'
        subs_b = 'da tina'
    if subs_a == 'o homem' and subs_b == 'o filho':
        subs_a = 'homem'
        subs_b = 'filho'
    if subs_a == 'Goodman':
        subs_a = 'Sam Goodman'
    elif subs_a == 'goodman':
        # Lower-cased form, as produced by generate_switched_sentence.
        subs_a = 'sam goodman'

    return subs_a, subs_b
    
def capitalize_each_sentence(text):
    p = re.compile(r'((?<=[\.\?!]\s)(\w+)|(^\w+))')
    text = p.sub(cap, text)
    
    return text
    
def generate_switched_sentence(row):
    """Build the schema text with the two substitutions exchanged.

    Args:
        row: a record exposing ``is_switchable``, ``schema``,
            ``substitution_a`` and ``substitution_b`` attributes.

    Returns:
        An empty string when the row is not switchable. Otherwise the
        lower-cased schema in which every occurrence of the (minimized,
        exception-adjusted) first substitution is exchanged with the second,
        'seu homem' is rewritten to 'o homem', and sentence openers are
        capitalized again.
    """
    if not row.is_switchable:
        return ''

    swapped = row.schema.lower()
    lowered_a = row.substitution_a.lower()
    lowered_b = row.substitution_b.lower()
    minimized_a, minimized_b = minimize_substitutions(lowered_a, lowered_b)
    subs_a, subs_b = apply_substitution_exceptions(minimized_a, minimized_b)

    # Three-step swap through a marker so the second replacement does not
    # touch text produced by the first one.
    marker = '<PLACEHOLDER>'
    swapped = swapped.replace(subs_a, marker)
    swapped = swapped.replace(subs_b, subs_a)
    swapped = swapped.replace(marker, subs_b)
    swapped = swapped.replace('seu homem', 'o homem')

    return capitalize_each_sentence(swapped)

In [13]:
def capitalize_words(sentence):
    """Re-capitalize known words and sentence openers, and re-attach punctuation.

    The sentence is split with NLTK's ``word_tokenize`` (Portuguese) and
    rebuilt token by token:

    * a token is capitalized when its capitalized form is listed in
      ``capitalized_words`` or when the previous rebuilt word ends with '.',
      '!' or '?';
    * an opening-quote token (two backticks) is replaced by '"' glued to the
      word that follows it;
    * '.', ',', '!', '?', ';' and the closing-quote token (two apostrophes,
      emitted as '"') are appended to the previous word without a space.

    A few fixed string replacements are applied to the joined result.

    Args:
        sentence: text to process.

    Returns:
        The rebuilt sentence as a single space-joined string.
    """
    sentence_enders = ('.', '!', '?')
    glued_to_previous = ('.', ',', '!', '?', ';', "''")

    words = []
    for word in word_tokenize(sentence, language='portuguese'):
        opens_sentence = bool(words) and words[-1][-1] in sentence_enders
        if word.capitalize() in capitalized_words or opens_sentence:
            word = word.capitalize()

        if words and words[-1] == '``':
            words[-1] = '"' + word
        elif word in glued_to_previous:
            if word == "''":
                word = '"'
            if words:
                words[-1] += word
            else:
                # Punctuation as the very first token has no previous word to
                # attach to; keep it as its own word instead of raising.
                words.append(word)
        else:
            words.append(word)

    sentence = ' '.join(words).strip()
    sentence = sentence.replace('" eu primeiro! "', '"Eu primeiro"!')
    sentence = sentence.replace('" Eu primeiro! "', '"Eu primeiro"!')
    # Plain substring replacement: this also covers 'tv.' (so no separate
    # replacement is needed for it).
    # NOTE(review): it also rewrites 'tv' inside longer words.
    sentence = sentence.replace('tv', 'TV')

    return sentence

In [14]:
# Row by row, build the sentence with both substitutions exchanged; rows that
# are not flagged as switchable get an empty string.
df['switched'] = df.apply(generate_switched_sentence, axis=1)

In [15]:
# generate_full_sentences is applied to every row; its per-row results are
# transposed into four columns, in this order.
full_sentence_rows = df.apply(generate_full_sentences, axis=1)
(
    df['correct_sentence'],
    df['incorrect_sentence'],
    df['correct_switched'],
    df['incorrect_switched'],
) = zip(*full_sentence_rows)

In [16]:
# Run capitalize_words over every generated sentence column, in place.
for text_column in (
    'switched',
    'correct_sentence',
    'incorrect_sentence',
    'correct_switched',
    'incorrect_switched',
):
    df[text_column] = df[text_column].apply(capitalize_words)

In [17]:
# Seed each manually_fixed_* column with a copy of its generated counterpart;
# individual rows are overwritten from the manual-fixes file further down.
for sentence_column in (
    'correct_sentence',
    'incorrect_sentence',
    'correct_switched',
    'incorrect_switched',
):
    df['manually_fixed_' + sentence_column] = df[sentence_column]

In [18]:
import json

# Read the hand-corrected Portuguese sentences (UTF-8 JSON).
with open('data/processed/manual_fixes_portuguese.json', 'r', encoding='utf8') as manual_fixes_file:
    manual_fixes_json = json.loads(manual_fixes_file.read())

In [19]:
# Overwrite the seeded manually_fixed_* values for every row listed in the
# manual-fixes file. The switched pair is only written when the entry has a
# 'manually_fixed_correct_switched' key.
for fix in manual_fixes_json:
    fixed_columns = [
        'manually_fixed_correct_sentence',
        'manually_fixed_incorrect_sentence',
    ]
    if 'manually_fixed_correct_switched' in fix:
        fixed_columns += [
            'manually_fixed_correct_switched',
            'manually_fixed_incorrect_switched',
        ]
    for fixed_column in fixed_columns:
        df.loc[fix['question_id'], fixed_column] = fix[fixed_column]

In [20]:
# Hand the finished DataFrame to the project's JSON generator.
# NOTE(review): generate_json lives in src.wsc_parser and is not visible here;
# its output location and format are unconfirmed.
generate_json(df)