# Exploring Terms in the Encyclopaedia Britannica


### Loading the necessary libraries

In [1]:
import yaml
import matplotlib.pyplot as plt
import numpy as np
import collections
import matplotlib as mpl

In [2]:
import pandas as pd
from yaml import safe_load
from pandas.io.json import json_normalize

In [3]:
import chart_studio.plotly as py
import plotly.figure_factory as ff
import plotly.express as px

## The dataframe contains the following information

- definition:           Definition of a term
- editionNum:           1,2,3,4,5,6,7,8
- editionTitle:         Title of the edition
- header:               Header of the page's term                                  
- place:                Place where the volume was edited (e.g. Edinburgh)                                    
- relatedTerms:         Related terms (see X article)  
- altoXML:              File Path of the XML file from which the term belongs       
- term:                 Term name                            
- positionPage:         Position of the term in the page
- startsAt:             Page number on which the term definition starts
- endsAt:               Page number on which the term definition ends
- volumeTitle:          Title of the Volume
- typeTerm:             Type of term [Topic| Articles]                                       
- year:                 Year of the edition
- volumeNum:            Volume number (e.g. 1)
- letters:              Letters covered by the volume (e.g. A-B)
- part:                 Part of the volume (e.g 1)
- supplementTitle:      Supplement's title
- supplementsTo:        Editions that the supplement supplements [1, 2, 3....]
- numberOfWords:        Number of words per term definition
- numberOfTerms:        Number of terms per page
- numberOfPages:        Number of pages per volume

### 1. Load dataframe from JSON file

In [4]:
# Location of the JSON file to load; orient="index" reads it as {row label -> {column -> value}}.
results_path = './results_NLS/results_eb_1_edition_dataframe'
df = pd.read_json(results_path, orient="index")

In [5]:
# Keep only the columns of interest, in a fixed display order.
selected_columns = [
    "term", "definition", "relatedTerms", "header", "startsAt", "endsAt",
    "numberOfTerms", "numberOfWords", "numberOfPages", "positionPage",
    "typeTerm", "editionTitle", "editionNum", "supplementTitle", "supplementsTo",
    "year", "place", "volumeTitle", "volumeNum", "letters", "part", "altoXML",
]
df = df[selected_columns]
df

Unnamed: 0,term,definition,relatedTerms,header,startsAt,endsAt,numberOfTerms,numberOfWords,numberOfPages,positionPage,...,editionNum,supplementTitle,supplementsTo,year,place,volumeTitle,volumeNum,letters,part,altoXML
10,AADE,"the name of two rivers, one in the country of ...",[],EncyclopaediaBritannica,15,15,22,19,832,3,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",1,A-B,0,144133901/alto/188082904.34.xml
100,ABETTOR,"a law-term, implying one who encourages anothe...",[],ABE,18,18,16,55,832,0,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",1,A-B,0,144133901/alto/188082943.34.xml
1000,ALBY,"or Alb 1, a city of France in the province of ...",[],ALBALC,106,106,31,20,832,22,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",1,A-B,0,144133901/alto/188084090.34.xml
10000,INYBURG,"a town of Denmark, situated at the eafiend of ...",[],NYBNYS,473,473,14,25,872,2,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",3,M-Z,0,144133903/alto/144810223.34.xml
10001,NYCHTHEMERON,"the natural day, or day and night, which toget...",[],NYBNYS,473,473,14,13,872,3,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",3,M-Z,0,144133903/alto/144810223.34.xml
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,NUT,"among botaniils, denotes a pericarpiurn of an ...",[],NUTNUT,472,472,12,14,872,8,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",3,M-Z,0,144133903/alto/144810211.34.xml
9996,NUTATION,"in aflronomy, a kind of tremulous motion of th...",[],NUTNUT,472,472,12,33,872,9,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",3,M-Z,0,144133903/alto/144810211.34.xml
9997,NUTMEG,"the kernel of a large fruit, not unlike the Th...","[MACE, PEEMED, DUTCH, THELARGEFT, EAP-INDIES, ...",NUTNUT,472,472,12,451,872,10,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",3,M-Z,0,144133903/alto/144810211.34.xml
9998,NUTRITION,"in the animal ceconomy, is the repairing the c...",[PISTACHIA],NUTNUT,472,473,12,486,872,11,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",3,M-Z,0,144133903/alto/144810211.34.xml


### 2. Group results by year

In [6]:
# Non-null entries per column within each year.
rows_by_year = df.groupby("year")
rows_by_year.count()

Unnamed: 0_level_0,term,definition,relatedTerms,header,startsAt,endsAt,numberOfTerms,numberOfWords,numberOfPages,positionPage,...,editionTitle,editionNum,supplementTitle,supplementsTo,place,volumeTitle,volumeNum,letters,part,altoXML
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1771,13616,13616,13616,13616,13616,13616,13616,13616,13616,13616,...,13616,13616,13616,13616,13616,13616,13616,13616,13616,13616
1773,13812,13812,13812,13812,13812,13812,13812,13812,13812,13812,...,13812,13812,13812,13812,13812,13812,13812,13812,13812,13812


### 3. Group results by letters

In [7]:
# Non-null entries per column within each volume letter range.
rows_by_letters = df.groupby("letters")
rows_by_letters.count()

Unnamed: 0_level_0,term,definition,relatedTerms,header,startsAt,endsAt,numberOfTerms,numberOfWords,numberOfPages,positionPage,...,editionTitle,editionNum,supplementTitle,supplementsTo,year,place,volumeTitle,volumeNum,part,altoXML
letters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A-B,7897,7897,7897,7897,7897,7897,7897,7897,7897,7897,...,7897,7897,7897,7897,7897,7897,7897,7897,7897,7897
C-L,10167,10167,10167,10167,10167,10167,10167,10167,10167,10167,...,10167,10167,10167,10167,10167,10167,10167,10167,10167,10167
M-Z,9364,9364,9364,9364,9364,9364,9364,9364,9364,9364,...,9364,9364,9364,9364,9364,9364,9364,9364,9364,9364


### 4. Group results by letters and years

In [8]:
# Rows per (year, letters) pair.
year_letter_groups = df.groupby(['year', 'letters'])
year_letter_groups["letters"].count()

year  letters
1771  A-B        3898
      C-L        5097
      M-Z        4621
1773  A-B        3999
      C-L        5070
      M-Z        4743
Name: letters, dtype: int64

#### Remark:
Note that a term can appear more than once, so these counts are entries rather than distinct terms.

### 5. Filtering by TERMS: ABACUS

We are going to explore the term ABACUS

Notice that the first edition of the EB has 6 volumes: 3 published in 1771 and 3 published in 1773. However, the volumes from 1773 are a reprint of the ones from 1771.

In [9]:
# Count the rows of every (term, year) pair, then look up the per-year counts for ABACUS.
term_year_groups = df.groupby(['term', 'year'])
df_by_term = term_year_groups["term"].count()
df_by_term.loc["ABACUS"]


year
1771    4
1773    2
Name: term, dtype: int64

### 5.1 Exploring ABACUS in 1771

We are going to explore the term "ABACUS" in the volumes from 1771.

In [10]:
# Select the rows whose term is exactly "ABACUS" and whose year is 1771.
# Equality is used rather than str.contains, which does substring/regex matching
# and would also select any other term that merely contains "ABACUS".
is_abacus = df['term'] == "ABACUS"
is_1771 = df['year'] == 1771
abacus_df = df[is_abacus & is_1771]
abacus_df

Unnamed: 0,term,definition,relatedTerms,header,startsAt,endsAt,numberOfTerms,numberOfWords,numberOfPages,positionPage,...,editionNum,supplementTitle,supplementsTo,year,place,volumeTitle,volumeNum,letters,part,altoXML
27,ABACUS,"a table strewed over with dust or sand, upon w...",[],EncyclopaediaBritannica,15,15,22,23,832,20,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",1,A-B,0,144133901/alto/188082904.34.xml
28,ABACUS,"in architeflure, signifies the superior part o...",[],EncyclopaediaBritannica,15,16,22,122,832,21,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",1,A-B,0,144133901/alto/188082904.34.xml
29,ABACUS,is also the name of an ancient instrument for ...,[],ABAABB,16,16,37,125,832,1,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",1,A-B,0,144133901/alto/188082917.34.xml
30,ABACUS,"logijlicus, a right-angled triangle, whose sid...",[],ABAABB,16,16,37,50,832,2,...,1,,[],1771,Edinburgh,"Encyclopaedia Britannica; or, A dictionary of ...",1,A-B,0,144133901/alto/188082917.34.xml


#### Remark:
So, the TERM "ABACUS" appears 4 times, across two pages. 

#### 5.2 Getting the definition for each of them

In [11]:
# Print the full text of every selected ABACUS definition, one separator line after each.
for definition in abacus_df["definition"]:
    print("ABACUS - Definition: %s" % definition)
    print("---")

ABACUS - Definition: a table strewed over with dust or sand, upon which the ancient mathematicians drew their figures, It also signified a cupboard, or buffet.
---
ABACUS - Definition: in architeflure, signifies the superior part or member of the capital of a column, and serves as a kind of crowning to both. It was originally intended to represent a square tile covering a basket. The form of the abacus is not the same in all orders: in the Tuscan, Doric, and Ionic, it‘is generally square; but in the Corinthian and Compofite, its four sides are arched ir Avards, and embellilhed in the middle withornament, as a rose or other flower, Scammozzi uses abacus for a concave moulding on the capital of the Tuscan pedefial; and Palladio calls the plinth above the echinus, or boultin, in the Tufean and Doric orders, by the same name. See plate I. fig. i. and
---
ABACUS - Definition: is also the name of an ancient instrument for facilitating operations in arithmetic. It is vadoully contrived. That 

#### 5.3 Creating groups of terms and years

In [12]:
# Disabled: uncomment to list the keys of every (term, year) group.
#df.groupby(['term', 'year']).groups.keys()

In [13]:
# Number of rows whose term is ABACUS, regardless of year.
abacus_rows = df.groupby(['term']).groups['ABACUS']
len(abacus_rows)

6

### 6. Grouping the results by TERM, YEAR and DEFINITION

Group data by term and years, and count each group. This will help us to see how many times each term is repeated by year. 

In [14]:
# Number of non-null definitions for every (term, year) pair.
definitions_by_term_year = df.groupby(['term', 'year'])['definition']
a = definitions_by_term_year.count()
a

term         year
A            1771    7
             1773    6
AA           1771    2
             1773    3
AAB          1773    1
                    ..
ZYGOMA       1771    1
             1773    1
ZYGOMATICUS  1773    1
ZYGOPHYLLUM  1771    1
             1773    1
Name: definition, Length: 26760, dtype: int64

**Remark**: This means that the term "A" appears 7 times  in 1771, and 6 times in 1773. 

#### 6.1 Obtaining for each term, its years and the definitions

We are going to create groups of ("TERM", "YEAR"), and for each of those groups, we are going to print their definition.

**Remark**: I am going to restrict them to 10 groups. 

In [15]:
# Print the rows of the first MAX_GROUPS (term, year) groups together with each group's size.
MAX_GROUPS = 10
groups = df[['term', 'year', 'definition']].groupby(['term', 'year'])
for cont, (group_key, group) in enumerate(groups):
    # Check the limit before printing so that exactly MAX_GROUPS groups are shown.
    if cont >= MAX_GROUPS:
        break
    # Iterating a GroupBy yields (key, sub-frame) pairs, so the sub-frame is used directly.
    print(group)
    print("---- Len of this group: %s - group_key %s " %(len(group), group_key))


      term  year                                         definition
1018     A  1771                                    See Alchemilla.
1088     A  1771  gives a—by and “ the investigation of that sur...
12050    A  1771  r Y\ C / 7 f C.A ( ^y \ ~^\ \ C' A h \A v m aa...
12051    A  1771  I -w 'i <? ^' 0 IY\, y f‘ 1 A_-A IV^-/Y\< -'/W...
12724    A  1771  /kins may be tawed : but thc-se chiefly used f...
3238     A  1771  -Bladder, in physiology. See Air. ^//-Bladders...
6935     A  1771  in London is cieditor to (B) in Paris, value 1...
---- Len of this group: 7 - group_key ('A', 1771) 
      term  year                                         definition
20828    A  1773  in London is creditor to (B) in Paris, value t...
22717    A  1773  so served, and the day of appearance. When the...
23621    A  1773                                     f a 2±= m I ±P
24231    A  1773  l class. ' The calix consists of five leaves, ...
25138    A  1773  worate performance. Government lias, however, .

#### 6.2 Exploring how many times each term appears per year

Now, let's get the size of those groups, so we can see how many definitions we have per term and per year.
This is exactly the same as what we did in 6.1, but with the results in dataframe format.

In [16]:
# One row per (term, year) pair, with the number of entries in that pair as "number_of_times".
g_year_term = (
    df.groupby(['term', 'year'])
    .size()
    .reset_index(name="number_of_times")
)
g_year_term

Unnamed: 0,term,year,number_of_times
0,A,1771,7
1,A,1773,6
2,AA,1771,2
3,AA,1773,3
4,AAB,1773,1
...,...,...,...
26755,ZYGOMA,1771,1
26756,ZYGOMA,1773,1
26757,ZYGOMATICUS,1773,1
26758,ZYGOPHYLLUM,1771,1


In [17]:
# Line chart of the per-(term, year) counts, with the terms along the x axis.
fig = px.line(
    data_frame=g_year_term,
    x="term",
    y="number_of_times",
    title='Number of times that each term appears',
)
fig.show()

#### 6.2.1 Grouping the previous results per year. 
This will give us the number of terms that we have per year. 
**Remember that a term can appear several times per volume**.

In [18]:
# Number of g_year_term rows (one per term and year) falling in each year.
rows_per_year = g_year_term.groupby(['year'])
rows_per_year.size()

year
1771    13248
1773    13512
dtype: int64

#### 6.3 Exploring in how many years each term appears

Here we are interested in exploring, for each unique term, in how many years it appears.

**Remark**: In the first edition of the EB, the 3 volumes (A-B, C-L, and M-Z) were published in two years: 1771 and 1773. So each term appears in at least one of those years and in at most two.

In [19]:
# For every term, count the years in which it has a row in g_year_term.
terms_per_ed = g_year_term.groupby(['term'])[['year']].count()
terms_per_ed

Unnamed: 0_level_0,year
term,Unnamed: 1_level_1
A,2
AA,2
AAB,1
AABAM,1
AACH,1
...,...
ZUYDERSEE,2
ZWEIBRUGGEN,2
ZYGOMA,2
ZYGOMATICUS,1


This means that the term "A" appears in two years. And the term "AAB" only appears in one year

#### 6.3.1 Exploring the terms that only appear in 1 year

In [20]:
# Terms whose year count is below 2, with the term restored as a regular column.
single_year_mask = terms_per_ed["year"] < 2
terms_only_once = terms_per_ed.loc[single_year_mask].reset_index()
terms_only_once

Unnamed: 0,term,year
0,AAB,1
1,AABAM,1
2,AACH,1
3,AADE,1
4,AAHUS,1
...,...,...
4149,ZEDILE,1
4150,ZEGIAS,1
4151,ZEGILETHRON,1
4152,ZINC,1


Splitting the previous results, so we can know how many of the terms that appear in only one year correspond to 1771 and how many to 1773.

In [21]:
# Count, per year, the terms that occur in exactly one year.
list_terms_once = terms_only_once["term"].to_list()
cont_dict = {1771: 0, 1773: 0}

# Every term in list_terms_once has a single row in g_year_term, so a vectorised
# membership filter followed by value_counts gives the per-year totals without
# scanning the frame once per term or parsing the year out of a string.
year_counts = g_year_term.loc[g_year_term['term'].isin(list_terms_once), "year"].value_counts()
for year, n_terms in year_counts.items():
    cont_dict[int(year)] = int(n_terms)
cont_dict

{1771: 1945, 1773: 2209}

#### 6.3.2 Exploring the terms that appear in both years

In [22]:
# Terms whose year count is above 1, with the term restored as a regular column.
multi_year_mask = terms_per_ed["year"] > 1
terms_more_once = terms_per_ed.loc[multi_year_mask].reset_index()
terms_more_once

Unnamed: 0,term,year
0,A,2
1,AA,2
2,AB,2
3,ABACUS,2
4,ABADAN,2
...,...,...
11298,ZUTPHEN,2
11299,ZUYDERSEE,2
11300,ZWEIBRUGGEN,2
11301,ZYGOMA,2
