# Setup

In [1]:
# Load (or reload, if it is already loaded) IPython's autoreload extension and
# select mode 2, which reloads imported modules before each cell is executed,
# so edits to the project's .py files are picked up without a kernel restart.
%reload_ext autoreload
%autoreload 2

In [2]:
# Make the notebook's parent directory importable (presumably where the
# `database_creation` package imported in the next cell lives).
import sys, os

project_root = os.path.abspath('../')
# Guard the append so that re-running this cell does not keep adding the same
# directory to sys.path over and over.
if project_root not in sys.path:
    sys.path.append(project_root)

# Keep the notebook namespace clean: none of these names are needed later.
del sys, os, project_root

In [3]:
from database_creation.database import Database
from database_creation.article import Article
from database_creation.coreference import Coreference
from database_creation.sentence import Sentence
from database_creation.np import Np
from database_creation.word import Word

# Processing the database

## Preprocessing

### Re-initializing the display parameters

In [4]:
# Reset the display parameters of every printable class.
# NOTE(review): judging from the output of the first `print(database)` below,
# an empty `to_print` list appears to mean "print every attribute" and
# `print_attribute=True` appears to prefix each value with its attribute
# name -- confirm against the `set_parameters` implementations.

# Database takes two extra options (random_print, limit_print), so it is
# configured on its own.
Database.set_parameters(to_print=[], print_attribute=True, random_print=True, limit_print=20)

# The remaining classes all receive the same two settings.
for printable_cls in (Article, Coreference, Sentence, Np, Word):
    printable_cls.set_parameters(to_print=[], print_attribute=True)
del printable_cls

### Initializing the database

In [5]:
# Build the database object used by every following cell. `max_size=1000`
# presumably caps the number of loaded articles (the printout below reports
# `size: 1000`) -- confirm against the Database constructor.
database = Database(max_size=1000)

In [6]:
# Print the freshly initialized database using the display parameters set above.
print(database)

max_size: 1000

year: 2000

root: ../databases/nyt_jingyun

size: 1000

articles: 

id 1165327: 
original_path: ../databases/nyt_jingyun/data/2000/01/02/1165327.xml
annotated_path: ../databases/nyt_jingyun/content_annotated/2000content_annotated/1165327.txt.xml

id 1165366: 
original_path: ../databases/nyt_jingyun/data/2000/01/02/1165366.xml
annotated_path: ../databases/nyt_jingyun/content_annotated/2000content_annotated/1165366.txt.xml

id 1165646: 
original_path: ../databases/nyt_jingyun/data/2000/01/02/1165646.xml
annotated_path: ../databases/nyt_jingyun/content_annotated/2000content_annotated/1165646.txt.xml

id 1165074: 
original_path: ../databases/nyt_jingyun/data/2000/01/01/1165074.xml
annotated_path: ../databases/nyt_jingyun/content_annotated/2000content_annotated/1165074.txt.xml

id 1165390: 
original_path: ../databases/nyt_jingyun/data/2000/01/02/1165390.xml
annotated_path: ../databases/nyt_jingyun/content_annotated/2000content_annotated/1165390.txt.xml

id 1165850: 
original

In [7]:
# Narrow the Database display settings to the 'articles' attribute only and
# turn off `print_attribute` (presumably the attribute-name labels -- confirm
# against Database.set_parameters).
Database.set_parameters(to_print=['articles'], print_attribute=False)

### Preprocessing the database

In [8]:
# Run the candidates preprocessing step. According to the recorded log below,
# it preprocesses the articles and cleans the database twice
# (1000 -> 511 -> 325 articles in this run, about 44s in total).
database.preprocess_candidates()


Preprocessing the articles...

Cleaning the database...
Initial size: 1000
Final size: 511
Done (elapsed time: 0s).


Cleaning the database...
Initial size: 511
Final size: 325
Done (elapsed time: 0s).

Done (elapsed time: 44s).



In [9]:
# Print the database after preprocessing; the output is driven by the display
# parameters configured in the cells above.
print(database)


original_path: ../databases/nyt_jingyun/data/2000/01/02/1165392.xml
annotated_path: ../databases/nyt_jingyun/content_annotated/2000content_annotated/1165392.txt.xml
title: The Forerunner of the Piano Explained
entities_persons: Robert Sherman; Jann Parker; Igor Kipnis
sentences: 
    parse: (ROOT (S (NP (NNP IGOR) (NNP KIPNIS)) (VP (VBZ is) (ADJP (JJ well-known) (PP (PP (TO to) (NP (NP (NNP Connecticut) (NNS audiences)) (PP (PP (IN in) (NP (NP (DT a) (NN number)) (PP (IN of) (NP (NNS guises))))) (: :) (PP (IN as) (NP (NP (DT a) (VBN noted) (NN performer)) (PP (IN on) (NP (NN piano) (, ,) (NN fortepiano) (CC and) (NN harpsichord))) (, ,) (NP (NP (NN concert) (NN host)) (PP (IN for) (NP (DT the) (NNP Silvermine) (NNP Guild)))) (CC and) (NP (JJ other) (VBG presenting) (NNS groups))))))) (, ,) (CC and) (PP (IN as) (NP (NP (DT the) (JJ artistic) (NN director)) (PP (IN of) (NP (NP (NNPS Friends)) (PP (IN of) (NP (NP (NNP Music)) (PP (IN of) (NP (NNP Fairfield) (NNP County)))))))))))) (. .))

In [10]:
# Adjust the display parameters of the individual classes before the next
# printout.
# NOTE(review): whether parameters omitted from a call keep their previous
# value or are reset to a default is not visible here -- check the
# `set_parameters` implementations before changing these calls.

# Articles: show the title, the preprocessed entities and the sentences.
Article.set_parameters(to_print=['title', 'preprocessed_entities', 'sentences'])
# Sentences: show only their 'nps' attribute, with print_attribute disabled.
Sentence.set_parameters(to_print=['nps'], print_attribute=False)
# Nps: print_attribute enabled.
Np.set_parameters(print_attribute=True)
# Words: print_attribute disabled.
Word.set_parameters(print_attribute=False)

In [11]:
# Print the database again with the display parameters set in the previous cell.
print(database)


title: Water in Phoenix
sentences: 
    
        words: the (DT) Editor (NNP) : Suzanne (NNP) Winckler (NNP) 's (POS) '' 
        words: the (DT) Editor (NNP) 
        words: Suzanne (NNP) Winckler (NNP) 's (POS) 
        words: Winckler (NNP) 's (POS) 
        words: Phoenix (NNP) '' 
        words: Nov. (NNP) 28 (CD) 
        words: a (DT) number (NN) of (IN) the (DT) city (NN) 's (POS) most (RBS) interesting (JJ) sights (NNS) 
        words: a (DT) number (NN) 
        words: the (DT) city (NN) 's (POS) most (RBS) interesting (JJ) sights (NNS) 
        words: the (DT) city (NN) 's (POS) 
    
        words: I (PRP) 
        words: Phoenix (NNP) 's (POS) canals (NNS) , fountains (NNS) 
        words: Phoenix (NNP) 's (POS) canals (NNS) 
        words: Phoenix (NNP) 's (POS) 
        words: fountains (NNS) 
        words: she (PRP) 
        words: this (DT) water (NN) 
        words: the (DT) formerly (RB) wild (JJ) , beautiful (JJ) rivers (NNS) of (IN) the (DT) Southwest (NNP) 
    

## Processing

In [12]:
# Run the candidates processing step. According to the recorded log below, it
# loads word embeddings and entity embeddings (about 95s in total in this run).
database.process_candidates()


Processing the articles candidates...

Loading word embeddings...
Done (elapsed time: 35s).


Loading entity embeddings...
Done (elapsed time: 32s).

Done (elapsed time: 95s).



In [13]:
# Print the database after processing, using the current display parameters.
print(database)


title: Maybe the 15 Minutes Aren't Up
sentences: 
    
        words: There (EX)
        entity_similarity: 0.11
        entity_similar_entities: /en/there /en/argent 
        words: all (DT)
        entity_similarity: 0.07
        entity_similar_entities: /en/all /en/atwood 
        words: a (DT) market (NN, 0.28, Argent Trading) for (IN) those (DT) Year (NN, 0.15, Argent Trading) 2000 (CD) bears (NNS, 0.1, Argent Trading) and (CC) millennium (JJ, 0.08, Bill Levitz) sweatshirts (NNS, 0.13, Bill Levitz) languishing (VBG, 0.08, Argent Trading) on (IN) store (NN, 0.36, Bill Levitz) shelves (NNS, 0.19, Bill Levitz)
        word_similarity: 0.36
        word_similar_entities: store Bill Levitz
        candidate: 1
        entities: Bill Levitz; Atwood Richards Inc; Argent Trading
        context:  There is , after all , a market for those Year 2000 bears and millennium sweatshirts languishing on store shelves . At least two international corporate barter companies , which swap surplus goo

In [14]:
# Export the candidates; the argument is presumably the output file path,
# given relative to the notebook's directory -- confirm against
# Database.write_candidates.
database.write_candidates('../results/out.txt')


Writing the candidates...
Done (elapsed time: 4s).

