In [1]:
# Move the working directory up one level: the cells below import from the
# `src` package and open files under 'data/processed/' relative to it.
# NOTE: the path is relative, so re-running this cell without restarting the
# kernel moves the working directory up one more level each time.
import os
os.chdir('..')

# Have IPython reload edited modules automatically (autoreload mode 2).
%load_ext autoreload
%autoreload 2

In [2]:
import torch

from nltk.tokenize import word_tokenize

from src.main import setup_torch, get_corpus
from src.utils import get_latest_model_file
from src.winograd_schema_challenge import find_missing_wsc_words_in_corpus_vocab

from src.wsc_parser import generate_df, generate_json, generate_full_sentences

In [3]:
# English sentences that are joined into a single space-separated string and
# handed to generate_df when the DataFrame is built.
# NOTE(review): presumably these identify schemas that were left untranslated
# in the Portuguese data set -- confirm against src.wsc_parser.generate_df.
still_in_english = [
    "It was a summer afternoon, and the dog was sitting in the middle of the lawn. After a while, it got up and moved to a spot under the tree, because the spot under the tree was hot.",
    "It was a summer afternoon, and the dog was sitting in the middle of the lawn. After a while, it got up and moved to a spot under the tree, because the dog was cooler.",
    "I couldn't put the pot on the shelf because the shelf was too tall.",
    "I couldn't put the pot on the shelf because the pot was too high.",
    "There is a pillar between me and the stage, and I can't see around the stage.",
    "There is a pillar between me and the stage, and I can't see the pillar.",
    "They broadcast an announcement, but a subway came into the station and I couldn't hear the subway.",
    "They broadcast an announcement, but a subway came into the station and I couldn't hear over the announcement.",
]

In [4]:
# Build the schema DataFrame; generate_df receives the English sentences as one
# space-joined string.
english_text = ' '.join(still_in_english)
df = generate_df(english_text)

In [5]:
import json

# Read the associative labels. The encoding is passed explicitly (as the
# manual-fixes JSON load in this notebook already does) so the result does not
# depend on the platform's default locale encoding.
with open('data/processed/WSC_associative_label.json', 'r', encoding='utf8') as fp:
    english_associative_json = json.load(fp)

In [6]:
# Default every row to "not associative", then raise the flag for each label
# entry whose 'is_associative' value equals 1 (rows are addressed by the
# entry's 'index' value).
df['is_associative'] = False
for label in english_associative_json:
    if label['is_associative'] != 1:
        continue
    df.loc[label['index'], 'is_associative'] = True

In [7]:
# Number of rows flagged as associative (non-null count of the first column).
associative_rows = df.loc[df['is_associative']]
associative_rows.count().iloc[0]

37

In [8]:
import json

# Read the switchable labels. The encoding is passed explicitly (as the
# manual-fixes JSON load in this notebook already does) so the result does not
# depend on the platform's default locale encoding.
with open('data/processed/WSC_switched_label.json', 'r', encoding='utf8') as fp:
    english_switched_json = json.load(fp)

In [9]:
# Default every row to "not switchable", then raise the flag for each label
# entry whose 'is_switchable' value equals 1 (rows are addressed by the
# entry's 'index' value).
df['is_switchable'] = False
for label in english_switched_json:
    if label['is_switchable'] != 1:
        continue
    df.loc[label['index'], 'is_switchable'] = True

In [10]:
# Number of rows flagged as switchable (non-null count of the first column).
switchable_rows = df.loc[df['is_switchable']]
switchable_rows.count().iloc[0]

131

In [11]:
# Words that must keep an initial capital, one per line. The file holds
# Portuguese text, so decode it explicitly as UTF-8 (matching the manual-fixes
# JSON load in this notebook) rather than relying on the platform default,
# which can mis-decode accented characters.
with open('data/processed/portuguese_capitalized_words.txt', 'r', encoding='utf8') as capitalized_words_file:
    capitalized_words = [line.strip() for line in capitalized_words_file]

def _capitalize_sentence_starts(text):
    """Upper-case the first character of `text` and of every piece that follows
    a '.', '!' or '?', leaving all other characters untouched.

    Each piece is stripped and re-joined with the mark plus one space.
    `str.capitalize()` is deliberately avoided: it lower-cases the rest of the
    string, so a later pass would undo the capitals set by an earlier one.
    """
    for mark in ('.', '!', '?'):
        pieces = (piece.strip() for piece in text.split(mark))
        text = (mark + ' ').join(piece[:1].upper() + piece[1:] for piece in pieces)
    return text


def generate_switched_sentence(row, tokenize=None, proper_nouns=None):
    """Return `row.schema` with the two candidate substitutions swapped.

    Args:
        row: object exposing `is_switchable`, `schema`, `substitution_a` and
            `substitution_b` (a DataFrame row when used with `df.apply`).
        tokenize: optional callable mapping a string to a list of tokens.
            Defaults to NLTK's `word_tokenize(text, language='portuguese')`.
        proper_nouns: optional iterable of words that must keep an initial
            capital. Defaults to the notebook-level `capitalized_words`.

    Returns:
        '' when the row is not switchable; otherwise the swapped sentence with
        sentence starts and known proper nouns capitalised, and punctuation and
        quotation marks attached to the neighbouring word.

    Side effects:
        Prints a warning when a substitution cannot be found in the schema.
    """
    if not row.is_switchable:
        return ''

    if tokenize is None:
        def tokenize(text):
            return word_tokenize(text, language='portuguese')
    if proper_nouns is None:
        proper_nouns = capitalized_words
    # A set makes the per-token membership test cheap.
    proper_nouns = set(proper_nouns)

    switched = row.schema.lower()

    subs_a = row.substitution_a.lower()
    subs_b = row.substitution_b.lower()

    # Length of the common leading part of both substitutions. The bounded
    # zip() terminates even when the two strings are identical.
    shared = 0
    for char_a, char_b in zip(subs_a, subs_b):
        if char_a != char_b:
            break
        shared += 1

    # Drop the shared prefix only if something remains on both sides; an empty
    # search string would make str.replace() insert text between every
    # character of the sentence.
    if shared < len(subs_a) and shared < len(subs_b):
        subs_a = subs_a[shared:]
        subs_b = subs_b[shared:]

    # Hand-written special cases for specific schemas.
    if subs_b == 'o cara que vestia uma farda':
        subs_a = 'o jim'
        subs_b = 'um cara que vestia uma farda'
    if subs_b == 'o homem' and subs_a == 'john':
        subs_b = 'um homem'

    if subs_a not in switched:
        print('Couldnt find subs a:', subs_a)
    if subs_b not in switched:
        print('Couldnt find subs b:', subs_b)

    # Three-step swap through a placeholder so the second replacement does not
    # touch text produced by the first.
    switched = switched.replace(subs_a, '<PLACEHOLDER>')
    switched = switched.replace(subs_b, subs_a)
    switched = switched.replace('<PLACEHOLDER>', subs_b)

    switched = _capitalize_sentence_starts(switched)

    words = []
    opening_quote = ''
    for word in tokenize(switched):
        if word == '``':
            # NLTK's opening-quote token: glue it to the front of the next word.
            opening_quote = '"'
            continue
        if word.capitalize() in proper_nouns:
            word = word.capitalize()
        if word == "''":
            # NLTK's closing-quote token: glued to the previous word below.
            word = '"'
        if word in ('.', ',', '!', '?', ';', '"') and words:
            words[-1] += word
        else:
            # Also taken for punctuation that comes first, where there is no
            # previous word to attach it to.
            words.append(opening_quote + word)
            opening_quote = ''
    if opening_quote:
        words.append(opening_quote)

    return ' '.join(words).strip()

In [12]:
# Eu primeiro!

In [13]:
# Build the 'switched' column row by row; generate_switched_sentence returns ''
# for rows whose is_switchable flag is False.
df['switched'] = df.apply(generate_switched_sentence, axis=1)

In [14]:
# generate_full_sentences is applied to every row; its per-row results are
# transposed with zip(*...) into the four sentence columns.
full_sentences = df.apply(generate_full_sentences, axis=1)
correct, incorrect, correct_sw, incorrect_sw = zip(*full_sentences)
df['correct_sentence'] = correct
df['incorrect_sentence'] = incorrect
df['correct_switched'] = correct_sw
df['incorrect_switched'] = incorrect_sw

In [15]:
# Casing corrections for the generated sentences, applied in this order to
# both the correct and the incorrect sentence column. Patterns are regular
# expressions (regex=True), exactly as before.
casing_fixes = [
    ('alice', 'Alice'),
    ('fred', 'Fred'),
    ('mary', 'Mary'),
    ('anne', 'Anne'),
    ('jim', 'Jim'),
    ('O golpista', 'o golpista'),
    ('desenho da tina', 'desenho da Tina'),
    ('desenho do sam', 'desenho do Sam'),
    (' Os pais da Amy', ' os pais da Amy'),
    (' O tio do Joe', ' o tio do Joe'),
    ('joe', 'Joe'),
    (' A Dra. Adams', ' a Dra. Adams'),
    # Only the article is lower-cased; the name keeps its capital like every
    # other rule here (the previous replacement text was ' a kate').
    (' A Kate', ' a Kate'),
    ('madonna', 'Madonna'),
]

# Assign the result back instead of calling .replace(..., inplace=True) on
# df[column]: with pandas Copy-on-Write the chained in-place call only updates
# a temporary Series, so the fixes would never reach df.
for column in ('correct_sentence', 'incorrect_sentence'):
    fixed = df[column]
    for pattern, replacement in casing_fixes:
        fixed = fixed.replace({pattern: replacement}, regex=True)
    df[column] = fixed

In [16]:
# Seed each manually_fixed_* column with its generated counterpart.
column_pairs = (
    ('manually_fixed_correct_sentence', 'correct_sentence'),
    ('manually_fixed_incorrect_sentence', 'incorrect_sentence'),
    ('manually_fixed_correct_switched', 'correct_switched'),
    ('manually_fixed_incorrect_switched', 'incorrect_switched'),
)
for fixed_column, generated_column in column_pairs:
    df[fixed_column] = df[generated_column]

In [17]:
import json

# Read the hand-written sentence corrections (UTF-8 JSON).
with open('data/processed/manual_fixes_portuguese.json', mode='r', encoding='utf8') as manual_fixes_file:
    manual_fixes_json = json.loads(manual_fixes_file.read())

In [18]:
# Overwrite the manually_fixed_* cells of each corrected question. The two
# sentence fields are always read; the two switched fields are read only when
# 'manually_fixed_correct_switched' is present in the entry.
for fix in manual_fixes_json:
    question_id = fix['question_id']
    for column in ('manually_fixed_correct_sentence', 'manually_fixed_incorrect_sentence'):
        df.loc[question_id, column] = fix[column]
    if 'manually_fixed_correct_switched' not in fix:
        continue
    for column in ('manually_fixed_correct_switched', 'manually_fixed_incorrect_switched'):
        df.loc[question_id, column] = fix[column]

In [19]:
# Hand the assembled DataFrame to src.wsc_parser.generate_json.
# NOTE(review): the output location and format are defined inside that helper
# and are not visible from this notebook -- confirm before re-running.
generate_json(df)