# Setup

In [1]:
# Load (or re-load, if this cell is run again) IPython's autoreload extension.
%reload_ext autoreload
# Autoreload mode 2: re-import modules before each cell executes, so edits to
# the project's .py files are picked up without restarting the kernel.
%autoreload 2

In [2]:
import os
import sys

# Make the parent of the notebook's working directory importable so that the
# `database_creation` package imported in the next cell can be resolved.
project_root = os.path.abspath('../')

# Only append when missing: re-running this cell must not keep growing
# sys.path with duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

# Keep the notebook namespace clean; these names are only needed for the setup.
del sys, os, project_root

In [3]:
from database_creation.database import Database
from database_creation.article import Article
from database_creation.coreference import Coreference
from database_creation.sentence import Sentence
from database_creation.np import Np
from database_creation.word import Word

# Processing the database

## Preprocessing

### Re-initializing the display parameters

In [4]:
# Reset the class-level display parameters so that an earlier run of the
# notebook does not influence what print(database) shows below.
# Database takes extra options (random_print, limit_print), so it is
# configured on its own.
Database.set_parameters(
    to_print=[],
    print_attribute=True,
    random_print=True,
    limit_print=20,
)

# Every nested class gets the same reset: empty to_print, attribute names shown.
for printable_class in (Article, Coreference, Sentence, Np, Word):
    printable_class.set_parameters(to_print=[], print_attribute=True)
del printable_class

### Initializing the database

In [5]:
# Build the database object used by every later cell.
# max_size presumably caps the number of articles loaded (the recorded summary
# below reports size: 10000) -- TODO confirm against Database.__init__.
database = Database(max_size=10000)

In [6]:
# Show the freshly built database with the display parameters reset above
# (Database.limit_print=20, random_print=True).
print(database)

max_size: 10000

year: 2000

root: ../databases/nyt_jingyun

size: 10000

articles: 

id 1169136: 
original_path: ../databases/nyt_jingyun/data/2000/01/16/1169136.xml
annotated_path: ../databases/nyt_jingyun/content_annotated/2000content_annotated/1169136.txt.xml

id 1173791: 
original_path: ../databases/nyt_jingyun/data/2000/02/04/1173791.xml
annotated_path: ../databases/nyt_jingyun/content_annotated/2000content_annotated/1173791.txt.xml

id 1170228: 
original_path: ../databases/nyt_jingyun/data/2000/01/21/1170228.xml
annotated_path: ../databases/nyt_jingyun/content_annotated/2000content_annotated/1170228.txt.xml

id 1174938: 
original_path: ../databases/nyt_jingyun/data/2000/02/08/1174938.xml
annotated_path: ../databases/nyt_jingyun/content_annotated/2000content_annotated/1174938.txt.xml

id 1168588: 
original_path: ../databases/nyt_jingyun/data/2000/01/14/1168588.xml
annotated_path: ../databases/nyt_jingyun/content_annotated/2000content_annotated/1168588.txt.xml

id 1167355: 
origin

In [7]:
# From here on, print only the 'articles' attribute of the database and drop
# the attribute-name labels from Database's own output.
Database.set_parameters(to_print=['articles'], print_attribute=False)

### Preprocessing the database

In [8]:
# Run the preprocessing pipeline. Per the recorded log it cleans the database
# several times and computes the most frequent entities; in the saved run the
# database shrinks from 10000 to 616 articles.
database.preprocess_tuples()


Preprocessing the articles...

Cleaning the database...
Initial size: 10000
Final size: 6514
Done (elapsed time: 0s).

File 1000/6514...
File 2000/6514...
File 3000/6514...
File 4000/6514...
File 5000/6514...
File 6000/6514...

Cleaning the database...
Initial size: 6514
Final size: 4334
Done (elapsed time: 0s).


Computing the most frequent entities...
File 1000/4334...
File 2000/4334...
File 3000/4334...
File 4000/4334...
Done (elapsed time: 16s).


Cleaning the database...
Initial size: 4334
Final size: 616
Done (elapsed time: 0s).

Done (elapsed time: 18s).



In [14]:
# List the most frequent entity groups and the number of articles they cover.
# Presumably relies on database.preprocess_tuples() having been run first, since
# its log reports computing the most frequent entities -- TODO confirm.
# NOTE(review): in the saved notebook this cell carries execution count 14 while
# sitting between cells 8 and 9, i.e. it was run out of order; re-check the
# output with Restart & Run All.
header_template = "Most frequent entities (in {} articles):\n"
article_count = len(database.frequent_entities_articles)
print(header_template.format(article_count))

for frequent_entity in database.frequent_entities:
    entity_line = database.to_string(frequent_entity)
    print(entity_line)


Most frequent entities (in 616 articles):

George Bush John Mccain 111
Al Gore Bill Bradley 84
St Louis Rams Tennessee Titans 53
New York City New York State 47
Chechnya Russia 44
George Bush Steve Forbes 36
Al Gore George Bush 33
Hillary Rodham Clinton Rudolph Giuliani 33
Israel Syria 31
Bill Bradley John Mccain 31
Bill Bradley George Bush 29
Al Gore John Mccain 29
America Online Time Warner Inc 29
Frank Bruni George Bush 28
John Mccain Steve Forbes 27
Al Gore Bill Bradley George Bush 27
Al Gore Bill Clinton 26
Al Gore Bill Bradley John Mccain 25
George Bush John Mccain Steve Forbes 23
Kenneth Boss Richard Murphy Sean Carroll 23
Edward Mcmellon Richard Murphy 23
Amadou Diallo Edward Mcmellon Kenneth Boss Richard Murphy 23
Amadou Diallo Sean Carroll 23
Amadou Diallo Edward Mcmellon Richard Murphy 23
Amadou Diallo Edward Mcmellon Richard Murphy Sean Carroll 23
Edward Mcmellon Kenneth Boss Sean Carroll 23
Richard Murphy Sean Carroll 23
Kenneth Boss Richard Murphy 23
Edward Mcmellon Kenne

In [9]:
# Inspect the database after preprocessing. With Database.to_print set to
# ['articles'] earlier, the recorded output lists each article's paths and its
# entities_* fields.
print(database)


original_path: ../databases/nyt_jingyun/data/2000/01/11/1167789.xml
annotated_path: ../databases/nyt_jingyun/content_annotated/2000content_annotated/1167789.txt.xml
entities_locations: Europe
entities_persons: Edmund Andrews
entities_organizations: AOL Time Warner; America Online; Time Warner Inc


original_path: ../databases/nyt_jingyun/data/2000/01/31/1172885.xml
annotated_path: ../databases/nyt_jingyun/content_annotated/2000content_annotated/1172885.txt.xml
entities_locations: New Hampshire
entities_persons: Alison Mitchell; George Bush; John Mccain; Steve Forbes


original_path: ../databases/nyt_jingyun/data/2000/02/02/1173353.xml
annotated_path: ../databases/nyt_jingyun/content_annotated/2000content_annotated/1173353.txt.xml
entities_locations: New Hampshire
entities_persons: Al Gore; Bill Bradley; George Bush; John Mccain; Jr Apple


original_path: ../databases/nyt_jingyun/data/2000/01/12/1168004.xml
annotated_path: ../databases/nyt_jingyun/content_annotated/2000content_annotate

In [10]:
# Switch the display to a sentence-level view for the next print(database).
# Articles: show the title, the preprocessed entities and the sentences.
# NOTE(review): 'preprocessed_entities' is requested here, but the recorded
# output of the next cell shows only 'title' and 'sentences' -- confirm the
# attribute name and whether it is populated at this stage.
Article.set_parameters(to_print=['title', 'preprocessed_entities', 'sentences'])
# Sentences: show only their noun phrases ('nps'), without attribute labels.
Sentence.set_parameters(to_print=['nps'], print_attribute=False)
# Noun phrases: keep attribute labels (the "words:" prefix in the output).
Np.set_parameters(print_attribute=True)
# Words: no attribute labels; the output shows them as "text (POS)".
Word.set_parameters(print_attribute=False)

In [11]:
# Print the database with the sentence-level display settings from the
# previous cell (article title, then the noun phrases of each sentence).
print(database)


title: Water in Phoenix
sentences: 
    
        words: the (DT) Editor (NNP) : Suzanne (NNP) Winckler (NNP) 's (POS) '' 
        words: the (DT) Editor (NNP) 
        words: Suzanne (NNP) Winckler (NNP) 's (POS) 
        words: Winckler (NNP) 's (POS) 
        words: Phoenix (NNP) '' 
        words: Nov. (NNP) 28 (CD) 
        words: a (DT) number (NN) of (IN) the (DT) city (NN) 's (POS) most (RBS) interesting (JJ) sights (NNS) 
        words: a (DT) number (NN) 
        words: the (DT) city (NN) 's (POS) most (RBS) interesting (JJ) sights (NNS) 
        words: the (DT) city (NN) 's (POS) 
    
        words: I (PRP) 
        words: Phoenix (NNP) 's (POS) canals (NNS) , fountains (NNS) 
        words: Phoenix (NNP) 's (POS) canals (NNS) 
        words: Phoenix (NNP) 's (POS) 
        words: fountains (NNS) 
        words: she (PRP) 
        words: this (DT) water (NN) 
        words: the (DT) formerly (RB) wild (JJ) , beautiful (JJ) rivers (NNS) of (IN) the (DT) Southwest (NNP) 
    

## Processing

In [12]:
# Process the article candidates. Per the recorded log this loads word
# embeddings and entity embeddings first (about 35s and 32s in the saved run)
# and takes about 95s overall.
database.process_candidates()


Processing the articles candidates...

Loading word embeddings...
Done (elapsed time: 35s).


Loading entity embeddings...
Done (elapsed time: 32s).

Done (elapsed time: 95s).



In [13]:
# Inspect the result of process_candidates(): the recorded output now shows
# similarity scores, similar entities, the candidate flag, entities and context
# for the noun phrases.
print(database)


title: Maybe the 15 Minutes Aren't Up
sentences: 
    
        words: There (EX)
        entity_similarity: 0.11
        entity_similar_entities: /en/there /en/argent 
        words: all (DT)
        entity_similarity: 0.07
        entity_similar_entities: /en/all /en/atwood 
        words: a (DT) market (NN, 0.28, Argent Trading) for (IN) those (DT) Year (NN, 0.15, Argent Trading) 2000 (CD) bears (NNS, 0.1, Argent Trading) and (CC) millennium (JJ, 0.08, Bill Levitz) sweatshirts (NNS, 0.13, Bill Levitz) languishing (VBG, 0.08, Argent Trading) on (IN) store (NN, 0.36, Bill Levitz) shelves (NNS, 0.19, Bill Levitz)
        word_similarity: 0.36
        word_similar_entities: store Bill Levitz
        candidate: 1
        entities: Bill Levitz; Atwood Richards Inc; Argent Trading
        context:  There is , after all , a market for those Year 2000 bears and millennium sweatshirts languishing on store shelves . At least two international corporate barter companies , which swap surplus goo

In [14]:
# Write the candidates to a text file. The path is relative to the notebook's
# working directory; presumably ../results must already exist -- TODO confirm
# whether write_candidates creates missing directories.
database.write_candidates('../results/out.txt')


Writing the candidates...
Done (elapsed time: 4s).

