# Setup

In [1]:
# Load IPython's autoreload extension (or reload it if it is already loaded).
%reload_ext autoreload
# Mode 2: reload all modules before executing each cell, so edits to the
# project's .py files are picked up without restarting the kernel.
%autoreload 2

In [2]:
import os
import sys

# Make the parent directory importable so that the `database_creation`
# package can be found from this notebook's working directory.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
_project_root = os.path.abspath('../')
if _project_root not in sys.path:
    sys.path.append(_project_root)

# Keep the notebook namespace clean, as the original cell did.
del sys, os, _project_root

In [3]:
from database_creation.database import Database
from database_creation.article import Article
from database_creation.coreference import Coreference
from database_creation.sentence import Sentence
from database_creation.np import Np
from database_creation.word import Word

# Processing the database

## Preprocessing

### Re-initializing the display parameters

In [10]:
# Reset the display configuration of every class to the same baseline.
# NOTE(review): judging by the printed output, an empty `to_print` list seems
# to mean "print every attribute" -- confirm against set_parameters.
Database.set_parameters(
    to_print=[],
    print_attribute=True,
    random_print=True,
    limit_print=20,
)

# The remaining classes all take the same two settings.
for component in (Article, Coreference, Sentence, Np, Word):
    component.set_parameters(to_print=[], print_attribute=True)
del component

### Initializing the database

In [11]:
# Build the database with an explicit size cap.
# NOTE(review): the printed `size` equals `max_size`, so this presumably
# limits the number of articles loaded -- confirm against Database.
article_cap = 1000
database = Database(max_size=article_cap)

In [12]:
# Inspect the database right after construction, before any preprocessing.
print(database)

max_size: 1000

year: 2000

root: ../databases/nyt_jingyun

size: 1000

articles: 

id 1165111: 
original_path: ../databases/nyt_jingyun/data/2000/01/01/1165111.xml
annotated_path: ../databases/nyt_jingyun/content_annotated/2000content_annotated/1165111.txt.xml

id 1165857: 
original_path: ../databases/nyt_jingyun/data/2000/01/03/1165857.xml
annotated_path: ../databases/nyt_jingyun/content_annotated/2000content_annotated/1165857.txt.xml

id 1165219: 
original_path: ../databases/nyt_jingyun/data/2000/01/01/1165219.xml
annotated_path: ../databases/nyt_jingyun/content_annotated/2000content_annotated/1165219.txt.xml

id 1165899: 
original_path: ../databases/nyt_jingyun/data/2000/01/04/1165899.xml
annotated_path: ../databases/nyt_jingyun/content_annotated/2000content_annotated/1165899.txt.xml

id 1165429: 
original_path: ../databases/nyt_jingyun/data/2000/01/02/1165429.xml
annotated_path: ../databases/nyt_jingyun/content_annotated/2000content_annotated/1165429.txt.xml

id 1165135: 
original

In [14]:
# From here on, print only the database's articles, without attribute labels.
Database.set_parameters(
    print_attribute=False,
    to_print=['articles'],
)

### Preprocessing the database

In [15]:
# Run the preprocessing step. In the recorded run this reduced the database
# from 1000 to 290 articles and took about 84s.
database.preprocess()

Preprocessing the articles...
Initial size: 1000
Final size: 290
Done (elapsed time: 84s).



In [16]:
# Inspect the preprocessed articles (title, entities, parsed sentences).
print(database)


original_path: ../databases/nyt_jingyun/data/2000/01/01/1165211.xml
annotated_path: ../databases/nyt_jingyun/content_annotated/2000content_annotated/1165211.txt.xml
title: Looking Back, With an Eye to the Future
entities_persons: Johnson, Khalilah A; Bardelli, Alfred; Bradsher, Keith; Meredith, Robyn
entities_organizations: Ford Motor Co; United Automobile Workers
preprocessed_entities: Ford Motor Co; United Automobile Workers; Khalilah A Johnson; Alfred Bardelli; Keith Bradsher; Robyn Meredith
sentences: 
    parse: (ROOT (S (NP (NP (DT The) (JJ enormous) (NN growth) (CC and) (JJ subsequent) (NN unionization)) (PP (IN of) (NP (DT the) (NN auto) (NN industry)))) (VP (VBD created) (NP (NNS opportunities)) (PP (IN for) (NP (NP (NNS millions)) (PP (IN of) (NP (NNPS Americans))))) (S (VP (TO to) (VP (VB enter) (NP (DT the) (JJ middle) (NN class)))))) (. .))) 
    text: The enormous growth and subsequent unionization of the auto industry created opportunities for millions of Americans to e

In [17]:
# Narrow the printout to titles, preprocessed entities and the noun phrases
# of each sentence, applied in the same order as before:
# Article, Sentence, Np, Word.
display_settings = (
    (Article, {'to_print': ['title', 'preprocessed_entities', 'sentences']}),
    (Sentence, {'to_print': ['nps'], 'print_attribute': False}),
    (Np, {'print_attribute': True}),
    (Word, {'print_attribute': False}),
)
for component, settings in display_settings:
    component.set_parameters(**settings)
del component, settings, display_settings

In [18]:
# Re-print with the narrowed display settings to inspect the noun phrases
# (words with their part-of-speech tags) extracted from each sentence.
print(database)


title: The Lives They Lived: Lili St. Cyr, b. 1918; When Burlesque Was Beautiful
preprocessed_entities: Luc Sante; Lili St Cyr
sentences: 
    
        words: Lili (NNP) St. (NNP) Cyr (NNP) , who (WP) died (VBD) last (JJ) January (NNP) at (IN) the (DT) age (NN) of (IN) 80 (CD) , 
        words: Lili (NNP) St. (NNP) Cyr (NNP) 
        words: the (DT) age (NN) of (IN) 80 (CD) 
        words: the (DT) age (NN) 
        words: 80 (CD) 
        words: one (CD) of (IN) the (DT) last (JJ) survivors (NNS) of (IN) an (DT) era (NN) that (WDT) now (RB) seems (VBZ) as (IN) remote (JJ) as (IN) the (DT) Hapsburg (NNP) dynasty (NN) 
        words: one (CD) 
        words: the (DT) last (JJ) survivors (NNS) of (IN) an (DT) era (NN) that (WDT) now (RB) seems (VBZ) as (IN) remote (JJ) as (IN) the (DT) Hapsburg (NNP) dynasty (NN) 
        words: the (DT) last (JJ) survivors (NNS) 
        words: an (DT) era (NN) that (WDT) now (RB) seems (VBZ) as (IN) remote (JJ) as (IN) the (DT) Hapsburg (NNP) dynasty 

## Processing

In [19]:
# Run the main processing step. In the recorded run this loaded word and
# entity embeddings and took about 99s in total.
database.process()

Processing the articles...
Loading word embeddings...
Done (elapsed time: 40s).

Loading entity embeddings...
Done (elapsed time: 31s).

Done (elapsed time: 99s).



In [20]:
# Inspect the processed database. The recorded output now shows, per noun
# phrase, word-level similarity scores with their matched entity, plus the
# candidate flag, entities and context.
print(database)


title: First the Revolution, Now the Battle for Respect
preprocessed_entities: New Jersey; Robert Griffin; Karen Demasters
sentences: 
    
        words: More (RBR) Revolutionary (NNP, 0.26, New Jersey) War (NNP, 0.16, New Jersey) battles (NNS, 0.11, Robert Griffin) and (CC) skirmishes (NNS, 0.05, New Jersey) took (VBD, 0.12, Robert Griffin) place (NN, 0.1, New Jersey) on (IN) New (NNP, 1.0, New Jersey) Jersey (NNP, 1.0, New Jersey)
        word_similarity: 1.0
        word_similar_entities: Jersey New Jersey
        candidate: 1
        entities: New Jersey; Robert Griffin; Karen Demasters
        context:  More Revolutionary War battles and skirmishes took place on New Jersey soil than anywhere else , and much of that action was in Bergen County near New Bridge Landing , where a pivotal battle took place . The house where George Washington issued orders to his fleeing troops is still standing near the wooden bridge , which was rebuilt with iron in 1889 . 
        words: More (RBR) 

In [21]:
# Write the candidates to a text file under ../results.
results_path = '../results/out.txt'
database.write(results_path)

Writing the candidates...
Done (elapsed time: 4s).

