# Merging EB terms - NLS - Encyclopaedia Britannica


### Loading the necessary libraries

In [1]:
import yaml
import numpy as np
import collections

In [2]:
import pandas as pd
from yaml import safe_load
from pandas.io.json import json_normalize
from difflib import SequenceMatcher

### Functions

In [3]:
def read_query_results(filename):
    """
    Load a YAML results file stored under ./results_NLS/.

    :param filename: name of the file inside ./results_NLS/
    :return: the parsed YAML content, as produced by yaml.safe_load
    """
    path = './results_NLS/' + filename
    with open(path, 'r') as handle:
        return safe_load(handle)


In [4]:
def write_query_results(filename, results):
    """
    Serialise ``results`` as YAML into ./results_NLS/<filename>.

    The file is opened in write mode, so an existing file is overwritten.

    :param filename: name of the file inside ./results_NLS/
    :param results: object to dump with yaml.dump
    """
    with open('./results_NLS/'+filename, 'w') as f:
        # yaml.dump writes straight into the stream; its return value is not
        # needed, so it is no longer bound to an unused local.
        yaml.dump(results, f)

In [5]:
# Ordinal words (lower-case and capitalised) used to recognise the edition number
# inside an edition label such as "First edition, 1771, Volume 1, A-B".
_EDITION_ORDINALS = {
    1: ("first", "First"), 2: ("second", "Second"),
    3: ("third", "Third"), 4: ("fourth", "Fourth"),
    5: ("fifth", "Fifth"), 6: ("sixth", "Sixth"),
    7: ("seventh", "Seventh"), 8: ("eighth", "Eighth"),
}


def _edition_number(edition_label):
    """Return the edition number whose ordinal word occurs in the label, or None."""
    number = None
    for candidate, ordinals in _EDITION_ORDINALS.items():
        if any(ordinal in edition_label for ordinal in ordinals):
            # Keep scanning: when several ordinals occur, the highest edition wins.
            number = candidate
    return number


def create_dataframe(query_results):
    """
    Flatten query results into a cleaned dataframe with one row per article.

    :param query_results: dict mapping an edition label to a list of pages; the
        item at index 1 of every page is the article record (a dict).
    :return: DataFrame with the columns listed in ``final_columns``. Only rows
        whose cleaned term is purely alphabetic are kept; the positional index of
        the original records is preserved, so filtered rows leave gaps in it.
        An empty DataFrame with the same columns is returned when there are no
        records at all.
    :raises ValueError: if an edition label contains no known ordinal word.
    """
    final_columns = ["term", "definition", "related_terms", "num_article_words",
                     "header", "start_page", "end_page", "term_id_in_page",
                     "type_article", "edition_num", "volume", "letters", "year",
                     "title", "place", "source_text_file"]

    records = []
    for edition in query_results:
        for page in query_results[edition]:
            try:
                records.append(page[1])
            except (IndexError, KeyError, TypeError):
                # Best effort: skip malformed pages that carry no article record.
                continue
    if not records:
        return pd.DataFrame(columns=final_columns)

    df = pd.DataFrame(records)
    df = df.rename(columns={"text_unit_id": "start_page", "type_page": "type_article"})

    # "Page15" -> 15
    df["start_page"] = df["start_page"].str.replace("Page", "", regex=False).astype(int)
    df["end_page"] = df["end_page"].astype(int)

    # "<ordinal> edition, <year>, Volume <n>, <letters>" -> volume number and letters
    volume_and_letters = (df["edition"].str.split("Volume", expand=True)[1]
                          .str.split(",", expand=True))
    df["volume"] = volume_and_letters[0].str.replace(" ", "", regex=False).astype(int)
    df["letters"] = volume_and_letters[1]

    # Strip the "_def" marker and every non-alphanumeric character. regex=True
    # must be explicit: since pandas 2.0 str.replace treats the pattern literally
    # by default, which would leave the terms uncleaned.
    df["term"] = (df["term"].str.replace("_def", "", regex=False)
                  .str.replace("[^a-zA-Z0-9]", "", regex=True))
    # isalpha() is False for empty strings and for terms containing digits, so
    # this also removes the rows whose term ended up empty.
    df = df.loc[df["term"].str.isalpha()].copy()
    df["term"] = df["term"].str.upper()

    edition_numbers = [_edition_number(label) for label in df["edition"]]
    unknown = sorted({label for label, number in zip(df["edition"], edition_numbers)
                      if number is None})
    if unknown:
        raise ValueError("Cannot determine the edition number of: %s" % unknown)
    df["edition_num"] = edition_numbers
    df["edition_num"] = df["edition_num"].astype(int)

    # Prefix the source file with the last directory of the archive path.
    archive_dir = df["archive_filename"].str.split("/").str[-2]
    df["source_text_file"] = archive_dir + "/" + df["source_text_file"]

    # Selecting the final columns also discards every field that is not needed.
    return df[final_columns]

In [6]:
def similar(a, b):
    """
    Case-insensitive similarity of two strings.

    :param a: first string
    :param b: second string
    :return: difflib.SequenceMatcher ratio of the lower-cased strings
    """
    matcher = SequenceMatcher(None, a.lower(), b.lower())
    return matcher.ratio()

In [7]:
def create_dataframe_from_file(filename):
    """
    Load a results file from ./results_NLS/ and turn it into a dataframe.

    :param filename: name of the YAML file inside ./results_NLS/
    :return: the DataFrame built by create_dataframe from the file content
    """
    # Reuse the shared loader instead of duplicating the open/safe_load logic.
    return create_dataframe(read_query_results(filename))

In [8]:
def prune_json(json_dict):
    """
    Recursively strip the empty fields of a JSON-like dictionary.

    Falsy values (None, "", 0, empty dicts and lists) are dropped, except
    booleans, which are always kept. Nested dicts are pruned recursively and
    removed when they end up empty; lists have their items pruned and their
    falsy items removed, and are dropped when nothing is left.

    :param json_dict: input JSON object to prune
    :return: a new pruned dict, or the input itself when it is not a dict
    """
    # Anything that is not a dict is handed back untouched.
    if not isinstance(json_dict, dict):
        return json_dict

    pruned = {}
    for key, value in json_dict.items():
        if not value and not isinstance(value, bool):
            continue
        if isinstance(value, dict):
            nested = prune_json(value)
            if nested:  # skip dicts that became empty
                pruned[key] = nested
        elif isinstance(value, list):
            kept = [item for item in (prune_json(entry) for entry in value) if item]
            if kept:  # skip lists that became empty
                pruned[key] = kept
        else:
            pruned[key] = value
    return pruned

In [9]:
def delete_entries(query_results_updated, eliminate_pages):
    """
    Return a copy of the results without the entries marked for elimination.

    :param query_results_updated: dict mapping an edition to its list of entries
    :param eliminate_pages: dict mapping an edition to the positions (indexes
        into that edition's list) to remove; editions missing from this dict
        keep all their entries
    :return: new dict with the same editions and the surviving entries, in
        their original order
    """
    new_results = {}
    for edition, entries in query_results_updated.items():
        # A set makes each membership test O(1) instead of scanning a list.
        to_remove = set(eliminate_pages.get(edition, ()))
        new_results[edition] = [entry for idx, entry in enumerate(entries)
                                if idx not in to_remove]
    return new_results

In [10]:
def merge_articles(query_results):
    """
    Merge continuation entries into the article they continue.

    An entry whose term contains "previous_page" is treated as the continuation
    of the entry right before it. When that previous entry is flagged with
    ``last_term_in_page``, the definition, word count and related terms of the
    continuation are appended to it, its ``end_page`` becomes the current page,
    and the ``num_page_words`` / ``num_articles`` counters of the entries seen
    so far on both pages are adjusted. Continuation entries are always removed
    from the returned results, whether or not they could be merged. Every other
    entry gets ``end_page`` set to its own page number.

    The records inside ``query_results`` are modified in place.

    :param query_results: dict mapping an edition to a list of
        ``[page_number, record]`` entries
    :return: new dict with the same editions, without the continuation entries
    """
    eliminate_pages = {}
    for edition in query_results:
        pages = query_results[edition]
        eliminate_pages[edition] = []
        # Index of the first entry seen for each page number.
        first_idx_of_page = {}
        for page_idx in range(len(pages)):
            current_page = pages[page_idx][0]
            first_idx_of_page.setdefault(current_page, page_idx)
            element = pages[page_idx][1]

            if "previous_page" not in element['term']:
                element["end_page"] = current_page
                continue

            eliminate_pages[edition].append(page_idx)
            if page_idx == 0:
                # No previous entry exists; indexing with page_idx - 1 would
                # wrap around to the last entry of the edition.
                continue
            prev_elements = pages[page_idx - 1][1]
            if not prev_elements["last_term_in_page"]:
                continue

            num_article_words = element["num_article_words"]
            prev_elements["definition"] += element["definition"]
            prev_elements["num_article_words"] += num_article_words
            prev_elements["related_terms"] += element["related_terms"]
            prev_elements["end_page"] = current_page

            # The moved words now count towards the previous page ...
            prev_number = int(prev_elements['text_unit_id'].split("Page")[1])
            for idx in range(first_idx_of_page[prev_number], page_idx):
                if pages[idx][0] == prev_number:
                    pages[idx][1]["num_page_words"] += num_article_words

            # ... and no longer towards the current one.
            for idx in range(first_idx_of_page[current_page], page_idx + 1):
                if pages[idx][0] == current_page:
                    pages[idx][1]["num_page_words"] -= num_article_words
                    pages[idx][1]["num_articles"] -= 1

    return delete_entries(query_results, eliminate_pages)

In [29]:
def merge_topics(query_results, threshold=0.72):
    """
    Merge consecutive entries that belong to the same topic.

    For every entry whose ``type_page`` contains "Topic", the following entries
    are scanned in order; while their term is similar enough to the topic term
    (``similar(...) > threshold``) their definition, word counts and related
    terms are appended to the topic entry, the topic's ``end_page`` is moved to
    the page of the merged entry, and the merged entry is marked for removal.
    Scanning resumes at the first entry that is not similar.

    The records inside ``query_results`` are modified in place, and the merged
    page numbers of each edition are printed.

    :param query_results: dict mapping an edition to a list of
        ``[page_number, record]`` entries
    :param threshold: minimum similarity ratio (exclusive) for two terms to be
        considered the same topic
    :return: tuple ``(new_results, merged_topics)`` where ``new_results`` is
        the results without the merged entries and ``merged_topics`` maps each
        topic term to the list of terms merged into it, across all editions
    """
    eliminate_pages = {}
    provenance_removal = {}
    # Created once, outside the loop, so that it exists for empty input and is
    # not reset by every edition.
    merged_topics = {}
    for edition in query_results:
        pages = query_results[edition]
        eliminate_pages[edition] = []
        provenance_removal[edition] = []
        page_idx = 0
        while page_idx < len(pages):
            element = pages[page_idx][1]
            if "Topic" not in element['type_page']:
                page_idx += 1
                continue

            term = element["term"]
            # Where to resume; stays at the end when every remaining entry is merged.
            resume_idx = len(pages)
            for p_id in range(page_idx + 1, len(pages)):
                next_element = pages[p_id][1]
                if not similar(term, next_element["term"]) > threshold:
                    resume_idx = p_id
                    break

                merged_topics.setdefault(term, []).append(next_element["term"])

                element["definition"] += next_element["definition"]
                element["num_article_words"] += next_element["num_article_words"]
                element["num_page_words"] += next_element["num_page_words"]
                element["related_terms"] += next_element["related_terms"]
                element["end_page"] = int(next_element['text_unit_id'].split("Page")[1])
                provenance_removal[edition].append(element["end_page"])

                eliminate_pages[edition].append(p_id)
            page_idx = resume_idx

    for ed in provenance_removal:
        print("ED:%s -- removing the following pages %s" %(ed, provenance_removal[ed]))
    new_results = delete_entries(query_results, eliminate_pages)

    return new_results, merged_topics

### 1. Reading, Merging articles,  and Writing the results in a new file

Here we are going to take the output of the defoe files, and we are going to merge the terms that were split across pages. 

In [13]:
# Load the raw query results (YAML) for the first edition from ./results_NLS/.
query_results=read_query_results('results_eb_1_edition')

Let's merge the articles that were split across pages.

In [14]:
# NOTE: merge_articles also updates the records inside `query_results` in place.
query_results_articles =merge_articles(query_results)

Now let's merge topics together!

In [30]:
# Returns the merged results plus a dict mapping each topic term to the terms merged into it.
query_results_updated, merged_topics =merge_topics(query_results_articles)

ED:First edition, 1771, Volume 1, A-B -- removing the following pages [14, 62, 65, 66, 71, 72, 73, 79, 80, 81, 82, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 99, 115, 119, 120, 121, 122, 123, 124, 137, 138, 141, 142, 143, 146, 150, 155, 200, 201, 208, 209, 232, 235, 244, 245, 253, 254, 256, 257, 261, 270, 271, 302, 303, 309, 328, 329, 370, 373, 402, 403, 404, 406, 420, 421, 422, 439, 440, 442, 443, 451, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 475, 483, 484, 487, 488, 494, 495, 496, 498, 499, 500, 501, 505, 520, 521, 522, 523, 524, 525, 526, 527, 528, 531, 532, 533, 534, 535, 536, 537, 538, 541, 544, 547, 548, 549, 554, 555, 556, 557, 558, 559, 560, 563, 564, 565, 566, 569, 570, 571, 572, 575, 576, 577, 578, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 592, 593, 594, 595, 596, 601, 602, 603, 604, 671, 674, 675, 678, 679, 680, 694, 703, 707, 708, 709, 710, 711, 712, 713, 739, 749, 750, 751, 752, 756, 757, 758, 759, 760, 761, 765, 766, 778, 779, 780, 781, 782, 783, 785, 7

This is the list of merged topics

In [17]:
# Display, for each topic term, the list of terms that were merged into it.
merged_topics

{'MECHANICS': ['MECHANICS',
  'MECHANICS',
  'MECHANICS',
  'MECHANICS',
  'MECHANICS',
  'MECHANICS',
  'AMECHANICS',
  'MECHANICS',
  'MECHANICS',
  'MECHANICS',
  'MECHANICS',
  'MECHANICS',
  'MECIIANICS',
  'oMECHANICS',
  'MECHANICS',
  'MECHANICS',
  'MECHANICS',
  'SMECHANICS'],
 'MEDICINE': ['MEDCINE',
  'MEDICINE',
  'MEDICINE',
  'MEDICINE',
  'MEIICINE',
  'MEDICINE',
  'MEDICINE',
  'MEDICINE',
  'MEDICINE',
  'MEDICINE',
  'MEDCINE',
  'jMEDICINE',
  'MEDCINE',
  'MEDCINE',
  'MEDCINE',
  'MEDICINE',
  'MEDICINE',
  'MEDIcINE',
  'MEDICINE',
  'MEDICINE',
  'looMEDICINE',
  'MEDCINE',
  'ioMEDICINE',
  'MEDICINE',
  'jcMEDICINE',
  'MEDICINE',
  'JMEDICINE',
  'MEDICINE',
  'noMEDICINE',
  'MEDICINE',
  'iiMEDICINE',
  'MEDICINE',
  'iiMEDICINE',
  'MEDICINE',
  'MEDICINE',
  'MEDICINE',
  'MEDICINE',
  'MEDICINE',
  'jMEDICINE',
  'MEDICINE',
  'MEDICINE',
  'MEDICINE',
  'MEDICINE',
  'MEDICINE',
  'MEDICINE',
  'MEDIClNE',
  'MEDICINE',
  'MEDICINE',
  'VyMEDICINE',
  

Once the data has been merged, we are going to store it in a file, just to have the data merged.  

In [18]:
# Persist the merged results as YAML under ./results_NLS/.
write_query_results("results_eb_1_edition_updated", query_results_updated)

### 2. Creating a dataframe from the updated results

Once we have the terms properly merged, we are going to create a dataframe, which we will use later for further exploration. In this dataframe we have dropped some information from the original defoe files that we no longer need. 

**The dataframe will have the following columns**

- definition:           Definition of the article
- edition_num:          1,2,3,4,5,6,7,8
- header:               Header of the page's article                                  
- num_article_words:    Number of words per article
- place:                Place where the volume was edited (e.g. Edinburgh)                                    
- related_terms:        Related articles (see X article)  
- source_text_file:     File Path of the XML file from which the article belongs       
- term:                 Article name                            
- term_id_in_page:      Number of article in the page     
- start_page:           Page number on which the article starts 
- end_page:             Page number on which the article ends 
- title:               Title of the Volume
- type_article:            Type of Page [Full Page| Topic| Mix | Articles]                                       
- year:                 Year of the Volume
- volume:               volume (e.g. 1)
- letters:              letters of the volume (A-B)


### IMPORTANT DECISION

I am going to filter OUT all the entries which are not Articles, Topics, or Mix.

In [34]:
# Build the cleaned dataframe from the merged results.
df=create_dataframe(query_results_updated)

In [35]:
# Keep only the rows whose type contains one of these keywords.
includeKeywords=["Article", "Topic", "Mix"]
# str.contains already yields one boolean per row, so the former
# `.any(level=0)` (removed in pandas 2.0) is not needed; na=False keeps the
# old behaviour of treating a missing type as "no match".
df=df[df["type_article"].str.contains('|'.join(includeKeywords), na=False)].reset_index()

In [36]:
# Preview the first rows of the filtered dataframe.
df.head()

Unnamed: 0,index,term,definition,related_terms,num_article_words,header,start_page,end_page,term_id_in_page,type_article,edition_num,volume,letters,year,title,place,source_text_file
0,1,FIRSTARTICLE,S :u -I >;J .1 M U a C V',[],10,**■*,8,8,0,Article,1,1,A-B,1771,"Encyclopaedia Britannica; or, A dictionary of ...",Edinburgh,nls-data-encyclopaediaBritannica/alto/18808281...
1,4,VIPREFACE,"TH E Editors, though fully fen&ble of the prop...",[],410,viPREFACE,12,12,0,Topic,1,1,A-B,1771,"Encyclopaedia Britannica; or, A dictionary of ...",Edinburgh,nls-data-encyclopaediaBritannica/alto/18808286...
2,5,LISTOFAUTHORSC,"Albini tabule anatomies, Alfton’s ‘Tirocinium ...",[],911,LISTofAUTHORSc,13,14,0,Topic,1,1,A-B,1771,"Encyclopaedia Britannica; or, A dictionary of ...",Edinburgh,nls-data-encyclopaediaBritannica/alto/18808287...
3,6,OR,"A NEW A D I C T I A A, the name of several riv...",[],54,EncyclopaediaBritannica,15,15,0,Article,1,1,A-B,1771,"Encyclopaedia Britannica; or, A dictionary of ...",Edinburgh,nls-data-encyclopaediaBritannica/alto/18808290...
4,7,AABAM,"a term, among alchemifts, for lead,",[],6,EncyclopaediaBritannica,15,15,1,Article,1,1,A-B,1771,"Encyclopaedia Britannica; or, A dictionary of ...",Edinburgh,nls-data-encyclopaediaBritannica/alto/18808290...


### 3. Saving the dataframe to json file 

In [37]:
# Write the dataframe as JSON keyed by row index (orient="index").
df.to_json(r'./results_NLS/results_eb_1_edition_postprocess_dataframe', orient="index") 