# Merging EB terms - NLS - Encyclopaedia Britannica


### Loading the necessary libraries

In [1]:
import yaml
import matplotlib.pyplot as plt
import numpy as np
import collections
import matplotlib as mpl

In [2]:
import pandas as pd
from yaml import safe_load
from pandas.io.json import json_normalize

### Functions

In [3]:
def read_query_results(filename):
    """
    Load a YAML results file stored under ``./results_NLS/``.

    The file is decoded explicitly as UTF-8 so that non-ASCII characters
    in the stored text are read the same way on every platform, instead
    of depending on the locale's default encoding.

    :param filename: name of the file inside ``./results_NLS/``
    :return: the Python object produced by ``yaml.safe_load``
    """
    with open('./results_NLS/' + filename, 'r', encoding='utf-8') as f:
        return safe_load(f)


In [4]:
def write_query_results(filename, results):
    """
    Serialise ``results`` as YAML into ``./results_NLS/<filename>``.

    An existing file with the same name is overwritten.

    :param filename: name of the file to create inside ``./results_NLS/``
    :param results: object to serialise with ``yaml.dump``
    :return: None
    """
    with open('./results_NLS/' + filename, 'w', encoding='utf-8') as f:
        # With a stream argument yaml.dump writes to it and returns None,
        # so there is no result worth keeping.
        yaml.dump(results, f)

In [5]:
def create_dataframe(data):
    """
    Build a two-column DataFrame from a mapping.

    Each (key, value) pair of ``data`` becomes one row: column 0 holds
    the key and column 1 holds the value.

    :param data: mapping whose items become the rows
    :return: pandas DataFrame with one row per item
    """
    key_value_pairs = data.items()
    frame = pd.DataFrame(key_value_pairs)
    return frame

In [6]:
def _is_empty(value):
    """Return True for None and for zero-length strings/containers only."""
    if value is None:
        return True
    if isinstance(value, (str, bytes, dict, list, tuple, set, frozenset)):
        return len(value) == 0
    return False


def prune_json(json_dict):
    """
    Method that given a JSON object, removes all its empty fields.
    This method simplifies the resultant JSON.

    A field is empty when its value is None, an empty string, or a
    dict/list that is empty once it has been pruned itself. Numbers and
    booleans are never treated as empty, so values such as ``0`` and
    ``False`` are preserved (a plain truthiness test would drop them).

    :param json_dict input JSON file to prune
    :return JSON file removing empty values; a non-dict input is
            returned unchanged
    """
    if not isinstance(json_dict, dict):
        # Ensure the element provided is a dict
        return json_dict

    final_dict = {}
    for key, value in json_dict.items():
        if isinstance(value, dict):
            cleaned = prune_json(value)
        elif isinstance(value, list):
            # Prune every item first, then discard the ones left empty.
            pruned_items = (prune_json(item) for item in value)
            cleaned = [item for item in pruned_items if not _is_empty(item)]
        else:
            cleaned = value
        if not _is_empty(cleaned):
            final_dict[key] = cleaned
    return final_dict

In [7]:
def _find_previous_page(query_results, page_number):
    """Return the nearest key below page_number holding a non-empty list, or None."""
    int_keys = [key for key in query_results if isinstance(key, int)]
    if not int_keys:
        return None
    lowest = min(int_keys)
    candidate = page_number - 1
    # Bounded by the lowest key so the search ends when nothing precedes.
    while candidate >= lowest:
        if query_results.get(candidate):
            return candidate
        candidate -= 1
    return None


def merge_terms(query_results):
    """
    Merge articles that continue from an earlier page into that page.

    An entry whose ``term`` contains ``"previous_page"`` is treated as
    the continuation of the entry flagged ``last_term_in_page`` on the
    nearest earlier page that still holds entries. For such an entry:

    * its definition, word count and related terms are appended to the
      flagged entry of the earlier page;
    * ``num_page_words`` of every entry on the earlier page grows by the
      continuation's word count;
    * ``num_page_words`` and ``num_articles`` of the entries remaining
      on the current page are reduced accordingly;
    * the continuation entry is removed from the current page.

    A continuation with no earlier non-empty page is left untouched.

    :param query_results dict mapping integer page numbers to lists of
           entry dicts; it is modified in place
    :return the same ``query_results`` object
    """
    for page in query_results:
        page_terms = query_results[page]
        # Iterate over a snapshot: entries are removed from the live list.
        for element in list(page_terms):
            if "previous_page" not in element['term']:
                continue

            current_page = int(element['text_unit_id'].split("Page")[1])
            previous_page = _find_previous_page(query_results, current_page)
            if previous_page is None:
                # Nothing to merge into; keep the entry instead of looping.
                continue

            current_definition = element["definition"]
            num_article_words = element["num_article_words"]
            related_terms = element["related_terms"]

            for prev_element in query_results[previous_page]:
                if prev_element["last_term_in_page"]:
                    prev_element["definition"] += current_definition
                    prev_element["num_article_words"] += num_article_words
                    prev_element["related_terms"] += related_terms
                prev_element["num_page_words"] += num_article_words

            # Remove by identity so an equal-looking entry is never hit.
            page_terms[:] = [term for term in page_terms if term is not element]

            for remaining in page_terms:
                remaining["num_page_words"] -= num_article_words
                remaining["num_articles"] -= 1
    return query_results

In [8]:
# Load the stored query results from ./results_NLS/results_eb_first.
query_results=read_query_results('results_eb_first')

In [10]:
# Merge "previous_page" continuation entries into the preceding page.
# merge_terms modifies query_results in place and returns that same object.
query_results_updated=merge_terms(query_results)

In [11]:
# Build a pruned copy of the merged results with the empty fields removed.
prune_results=prune_json(query_results_updated)

In [12]:
# Save the pruned results as YAML to ./results_NLS/results_eb_first_updated.
write_query_results("results_eb_first_updated", prune_results)

In [13]:
# Build a DataFrame from the merged (not pruned) results:
# one row per top-level (key, value) pair.
df=create_dataframe(query_results_updated)

In [14]:
# Column 0 holds the top-level keys of the results dict.
df[0]

0        2
1        3
2        8
3        9
4       11
      ... 
715    826
716    827
717    828
718    829
719    830
Name: 0, Length: 720, dtype: int64

In [15]:
# Inspect the value stored in column 1 of the first row.
df.iloc[0][1]

[{'archive_filename': '/Users/rosafilgueira/HW-Work/NLS-Fellowship/work/defoe/nls-data-encyclopaediaBritannica/first_edition/144133901/',
  'definition': 'n*s-f 7^\' v L i A j J ^ /^^W / ; H:;^’ J }r-r£c9\'&} "*— " ..^4-—>, \'I ■ . ,/. ■ -,... v V *•/U^v UJ~L ^ (txk^L j 1rvt*Xitj $ /i *4j/cJysx*£>Xb<. f^oLZ^^c^. % \'bv C JJ. \' }v*c Ccl U^K <77t . t^cCv-yt^yA. *-? ^v. •^GL* ftc *frt yylrr Cj? yu>t f\\ ^^2!',
  'edition': 'First edition, 1771, Volume 1, A-B',
  'header': 'iiiubnsfvnsfv',
  'last_term_in_page': 1,
  'model': 'nlsArticles',
  'num_article_words': 58,
  'num_articles': 1,
  'num_page_words': 59,
  'num_text_unit': 832,
  'place': 'Edinburgh',
  'related_terms': [],
  'source_text_file': 'alto/188082735.34.xml',
  'term': 'iiiubnsfvnsfv',
  'term_id_in_page': 0,
  'text_unit': 'page',
  'text_unit_id': 'Page2',
  'title': 'Encyclopaedia Britannica; or, A dictionary of arts and sciences, compiled upon a new plan',
  'type_archive': 'book',
  'type_page': 'FullPage',
  'year'