In [1]:
import os
os.chdir('..')

%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime, timezone
from unittest.mock import patch

from src.main import get_corpus
from src.winograd_collection_manipulation.text_manipulation import custom_tokenizer
from src.winograd_collection_manipulation.wsc_json_handler import generate_df_from_json

In [3]:
# Stamp the run with a timezone-aware UTC time. `datetime.utcnow()` is
# deprecated since Python 3.12 and returns a naive value, so the printed
# timestamp carried no indication that it was UTC.
print(f'Ran on {datetime.now(timezone.utc)}')
print('Using Corpus after filtering by good and featured articles')

Ran on 2020-01-26 21:01:23.311989
Using Corpus after filtering by good and featured articles


In [4]:
# Load the Winograd schema frame first, then the corpus, using the loaders'
# default (unpatched) file locations.
df, corpus = generate_df_from_json(), get_corpus()

In [5]:
# Sanity check: display one arbitrary dictionary entry to confirm by eye that
# the loaded corpus is in the expected language.
sample_word_index = 20
corpus.dictionary.idx2word[sample_word_index]

'competição'

In [6]:
# Keep only the rows whose `translated` flag is set; copy so that later edits
# do not touch the unfiltered frame.
df = df.loc[df['translated']].copy()

In [7]:
# Preview the first five rows of the filtered frame.
df.head(n=5)

Unnamed: 0,correct_sentence,incorrect_sentence,manually_fixed_correct_sentence,manually_fixed_incorrect_sentence,correct_switched,incorrect_switched,is_switchable,is_associative,translated
0,Os vereadores recusaram a autorização aos mani...,Os vereadores recusaram a autorização aos mani...,Os vereadores recusaram a autorização aos mani...,Os vereadores recusaram a autorização aos mani...,,,False,False,True
1,Os vereadores recusaram a autorização aos mani...,Os vereadores recusaram a autorização aos mani...,Os vereadores recusaram a autorização aos mani...,Os vereadores recusaram a autorização aos mani...,,,False,False,True
2,A medalha não cabe na maleta porque a medalha ...,A medalha não cabe na maleta porque a maleta é...,A medalha não cabe na maleta porque a medalha ...,A medalha não cabe na maleta porque a maleta é...,,,False,False,True
3,A medalha não cabe na maleta porque a maleta é...,A medalha não cabe na maleta porque a medalha ...,A medalha não cabe na maleta porque a maleta é...,A medalha não cabe na maleta porque a medalha ...,,,False,False,True
4,Joan certificou-se de agradecer Susan por toda...,Joan certificou-se de agradecer Susan por toda...,Joan certificou-se de agradecer Susan por toda...,Joan certificou-se de agradecer Susan por toda...,Susan certificou-se de agradecer Joan por toda...,Susan certificou-se de agradecer Joan por toda...,True,False,True


In [8]:
english = False

# A column counts as text only when every one of its cells is exactly a `str`
# (columns with missing values or booleans are excluded). Checked per column
# with `Series.map`, because `DataFrame.applymap` is deprecated in pandas 2.1+.
is_text_column = df.apply(
    lambda column: column.map(lambda value: type(value) is str).all(),
    axis=0,
)
text_columns = df.loc[:, is_text_column].columns

# Collect every distinct token produced for the text cells. Iterating the
# tokenizer output directly replaces the former `.sum().sum()` list
# concatenation, which was quadratic and raised a TypeError (`set(0)`) when no
# text column was found.
# NOTE(review): assumes custom_tokenizer returns an iterable of tokens -- the
# previous code concatenated its results, so this should hold; confirm.
wsc_vocab = {
    token
    for cell_text in df[text_columns].to_numpy().ravel()
    for token in custom_tokenizer(cell_text, english, for_model=True)
}

# Tokens used by the schemas that are absent from the corpus dictionary.
missing_words = list(wsc_vocab.difference(corpus.dictionary.word2idx))

In [9]:
# Summarise how much of the schema vocabulary the corpus dictionary lacks.
vocab_size = len(wsc_vocab)
missing_count = len(missing_words)
print(f'Total WSC vocab {vocab_size}')
print(f'Total Missing Words {missing_count}')
print(f'Percentage Missing Words {missing_count/vocab_size*100:.2f}%')

Total WSC vocab 1096
Total Missing Words 89
Percentage Missing Words 8.12%


In [10]:
# Reload the schemas and the corpus from the English resources by temporarily
# overriding the module-level file-name settings. `patch` swaps the module
# attributes only for the duration of the `with` block and restores them
# afterwards. The loader functions are the ones already imported at the top of
# the notebook; re-importing them here was a no-op, so it has been dropped.
# NOTE: `df` and `corpus` are rebound here -- from this cell on they hold the
# English data instead of the Portuguese data loaded earlier.
english_corpus_patch = patch(
    'src.main.CORPUS_FILE_NAME',
    'models/english-wikitext-2/corpus.pkl',
)
english_wsc_patch = patch(
    'src.winograd_collection_manipulation.wsc_json_handler.WINOGRAD_SCHEMAS_FILE',
    'data/processed/english_wsc.json',
)
with english_corpus_patch, english_wsc_patch:
    df = generate_df_from_json()
    corpus = get_corpus()

In [11]:
# Sanity check: display one arbitrary dictionary entry to confirm by eye that
# the corpus now in `corpus` is in the expected language.
sample_word_index = 20
corpus.dictionary.idx2word[sample_word_index]

'role'

In [12]:
# Preview the first five rows of the frame currently bound to `df`.
df.head(n=5)

Unnamed: 0,correct_sentence,incorrect_sentence,manually_fixed_correct_sentence,manually_fixed_incorrect_sentence,correct_switched,incorrect_switched,is_switchable,is_associative,translated
0,The city councilmen refused the demonstrators ...,The city councilmen refused the demonstrators ...,,,,,False,False,True
1,The city councilmen refused the demonstrators ...,The city councilmen refused the demonstrators ...,,,,,False,False,True
2,The trophy doesn't fit into the brown suitcase...,The trophy doesn't fit into the brown suitcase...,,,,,False,False,True
3,The trophy doesn't fit into the brown suitcase...,The trophy doesn't fit into the brown suitcase...,,,,,False,False,True
4,Joan made sure to thank Susan for all the help...,Joan made sure to thank Susan for all the help...,,,Susan made sure to thank Joan for all the help...,Susan made sure to thank Joan for all the help...,True,False,True


In [13]:
# Vocabulary coverage for the English schemas: tokenize every text column and
# list the tokens that the English corpus dictionary does not contain.
# NOTE(review): the Portuguese run filters rows with `df[df.translated]` before
# this step; no such filter is applied here -- confirm that is intended.
english = True

# A column counts as text only when every one of its cells is exactly a `str`
# (columns with missing values or booleans are excluded). Checked per column
# with `Series.map`, because `DataFrame.applymap` is deprecated in pandas 2.1+.
is_text_column = df.apply(
    lambda column: column.map(lambda value: type(value) is str).all(),
    axis=0,
)
text_columns = df.loc[:, is_text_column].columns

# Collect every distinct token produced for the text cells. Iterating the
# tokenizer output directly replaces the former `.sum().sum()` list
# concatenation, which was quadratic and raised a TypeError (`set(0)`) when no
# text column was found.
# NOTE(review): assumes custom_tokenizer returns an iterable of tokens -- the
# previous code concatenated its results, so this should hold; confirm.
wsc_vocab = {
    token
    for cell_text in df[text_columns].to_numpy().ravel()
    for token in custom_tokenizer(cell_text, english, for_model=True)
}

# Tokens used by the schemas that are absent from the corpus dictionary.
missing_words = list(wsc_vocab.difference(corpus.dictionary.word2idx))

In [14]:
# Summarise how much of the schema vocabulary the corpus dictionary lacks.
vocab_size = len(wsc_vocab)
missing_count = len(missing_words)
print(f'Total WSC vocab {vocab_size}')
print(f'Total Missing Words {missing_count}')
print(f'Percentage Missing Words {missing_count/vocab_size*100:.2f}%')

Total WSC vocab 956
Total Missing Words 93
Percentage Missing Words 9.73%
