# Metadata and Pages Text - NLS


We can have two types of digital collections:
    - One that has a single volume
    - One that has several books per collection


### Loading the necessary libraries

In [1]:
import yaml
import numpy as np
import collections
import string
import copy

In [2]:
import pandas as pd
from yaml import safe_load
from pandas.io.json import json_normalize
from difflib import SequenceMatcher

In [3]:
import chart_studio.plotly as py
import plotly.figure_factory as ff
import plotly.express as px

In [4]:
import networkx as nx
import matplotlib.pyplot as plt

In [5]:
#!python -m spacy download en_core_web_sm

In [6]:
import spacy

In [7]:
nlp = spacy.load("en_core_web_sm")

### Functions

In [8]:
def read_query_results(filename):
    """Load a YAML file from the ``results/`` directory.

    ``filename`` is appended to ``'results/'`` and the file content is
    parsed with ``yaml.safe_load``; the parsed object is returned.
    """
    path = 'results/' + filename
    with open(path, 'r') as handle:
        return safe_load(handle)


In [9]:
def write_query_results(filename, results):
    """Serialise ``results`` as YAML into ``results/<filename>``.

    The file is opened in write mode, so existing content is replaced.
    Nothing is returned.
    """
    path = 'results/' + filename
    with open(path, 'w') as handle:
        yaml.dump(results, handle)

In [10]:
def find_persons(text):
    """Return the PERSON entities that the spaCy pipeline finds in ``text``.

    A falsy ``text`` (``None`` or an empty string) yields ``None``;
    otherwise the result is a list with the text of every entity whose
    label is ``'PERSON'`` (possibly an empty list).
    """
    if not text:
        return None

    # Run the module-level spaCy pipeline and keep only person entities.
    return [entity.text for entity in nlp(text).ents if entity.label_ == 'PERSON']

In [169]:
def _split_volume_title(df, separator):
    """Split each ``volumeTitle`` on ``separator`` and move the tail to ``serieSubTitle``.

    Works in place on ``df``. For a title that contains the separator, the
    first piece is kept as the title (the first two pieces when the second
    one contains the substring "year") and the remaining pieces, joined
    with spaces, are prepended to any existing ``serieSubTitle``.
    Titles without the separator are left untouched.
    """
    title_parts = df["volumeTitle"].apply(lambda title: title.split(separator)).tolist()
    for i, parts in zip(df.index, title_parts):
        if len(parts) < 2:
            continue
        keep = 2 if "year" in parts[1] else 1
        # NOTE(review): the kept pieces are re-joined with '.' whatever the
        # separator is; this reproduces the original behaviour - confirm
        # whether ':' titles should be re-joined with ':' instead.
        df.at[i, 'volumeTitle'] = '.'.join(parts[:keep])
        sub_info = ' '.join(parts[keep:])
        previous = df.at[i, 'serieSubTitle']
        df.at[i, 'serieSubTitle'] = (sub_info + " " + previous) if previous else sub_info


def create_dataframe(metadata_results, collection_name):
    """Build the page-level dataframe of a collection from the query results.

    Args:
        metadata_results: mapping whose values are sequences of page
            dictionaries (it is read as ``metadata_results[volume]``).
            The columns are taken from the keys of the first page. The code
            reads the keys ``MMSID``, ``volumeId``, ``year``, ``edition``,
            ``publisher``, ``title``, ``subtitle``, ``source_text_file`` and
            ``text_unit_id``, and drops ``geographic``, ``country``,
            ``topic``, ``city``, ``temporal`` and ``dateIssued``, so all of
            them must be present.
        collection_name: value stored in the ``collectionName`` column of
            every row.

    Returns:
        A ``pandas.DataFrame`` with one row per page: the renamed input
        columns plus ``serieNum``, ``part``, ``collectionName``,
        ``serieTitle``, ``publisherPersons``, ``numberOfVolumes`` and
        ``volumeNum``.

    Raises:
        ValueError: if ``metadata_results`` contains no page at all.
    """
    pages = [page
             for volume in metadata_results
             for page in metadata_results[volume]]
    if not pages:
        raise ValueError("metadata_results does not contain any page")

    # The column set comes from the first page; it is printed so that the
    # notebook shows which fields the input provides.
    print(pages[0].keys())
    column_list = list(pages[0].keys())
    df = pd.DataFrame(pages, columns=column_list)

    df = df.rename(columns={
        "serie": "serieTitle",
        "subtitle": "serieSubTitle",
        "num_words": "numberOfWords",
        "title": "volumeTitle",
        "referenced_by": "referencedBy",
        "num_pages": "numberOfPages",
        "name_termsOfAddress": "termsOfAddress",
        "source_text_file": "altoXML",
        "num_text_unit": "numberOfPages",
        "text_unit_id": "pageNum",
        "physical_description": "physicalDescription",
    })

    df = df.drop(['geographic', 'country', 'topic', 'city', 'temporal', 'dateIssued'], axis=1)

    # Prefix the ALTO file name with the identifier of its volume.
    df["altoXML"] = df["volumeId"] + "/" + df["altoXML"]

    # serieNum is the position of the MMSID in order of first appearance.
    mmsids = df["MMSID"].tolist()
    serie_position = {}
    for mmsid in mmsids:
        serie_position.setdefault(mmsid, len(serie_position))
    df["serieNum"] = [serie_position[mmsid] for mmsid in mmsids]
    df["serieNum"] = df["serieNum"].astype(int)

    df["part"] = 0
    df["year"] = df["year"].fillna(0).astype(int)
    df["collectionName"] = collection_name

    # Shorten the titles: first on '.', then on ':'; the tails end up in
    # serieSubTitle.
    _split_volume_title(df, '.')
    _split_volume_title(df, ':')

    df["serieTitle"] = df["volumeTitle"]

    # Only the first row decides whether the collection carries editions.
    has_edition = df["edition"].loc[0] is not None
    if has_edition:
        df["volumeTitle"] = df["volumeTitle"] + " " + df["edition"]

    # Keep what follows 'Page' in the text unit identifier.
    df['pageNum'] = df['pageNum'].apply(lambda s: s.split('Page')[1])

    # NOTE(review): the original code tried df["referencedBy"].split("----")
    # here; a Series has no .split, so it always failed inside a bare
    # ``except`` and the column stayed a plain string. That behaviour is
    # kept; use df["referencedBy"].str.split("----") if a list is wanted.

    # This takes a while: the NLP pipeline runs on the publisher of each page.
    df["publisherPersons"] = df["publisher"].apply(find_persons)

    # This should be improved in the future - it would not work for other
    # collections. Copies of the same book appear with different volumeId
    # values, so they are added as new volumes of the same serie.
    series_vol = df.groupby(['MMSID', 'volumeId']).size().reset_index()
    count_vol = series_vol.groupby(['MMSID'])['volumeId'].size().reset_index()
    for _, row in count_vol.iterrows():
        df.loc[df['MMSID'] == row['MMSID'], 'numberOfVolumes'] = row['volumeId']

    if not has_edition:
        # Number the volumes of each MMSID 1..n in the order given by unique().
        series_mmsid = series_vol.groupby(['MMSID'])['volumeId'].unique().reset_index()
        for _, row in series_mmsid.iterrows():
            for number, v_id in enumerate(row['volumeId'], start=1):
                df.loc[df['volumeId'] == v_id, 'volumeNum'] = number
    else:
        # Take the number that follows "Volume " in the edition, else "1".
        # One pass per volume is enough: as before, the last page of a
        # volume determines the value for all of its pages.
        last_pages = df.drop_duplicates(subset='volumeId', keep='last')
        for _, row in last_pages.iterrows():
            edition = row['edition']
            if "Volume" in edition:
                vn = edition.split("Volume ")[1].split(" ")[0]
            else:
                vn = "1"
            df.loc[df['volumeId'] == row['volumeId'], 'volumeNum'] = vn

    df["volumeNum"] = df["volumeNum"].fillna(0).astype(int)
    df["numberOfVolumes"] = df["numberOfVolumes"].fillna(0).astype(int)

    return df

### 1. Reading data

Here we take the output files produced by defoe and create the final dataframe, which we will use later for creating the knowledge graph.

In [122]:
!ls results/

README                                gazetters_scotland.ttl
README.md                             ladiesDebating.ttl
chapbooks_dataframe                   ladiesDebating_dataframe
chapbooks_metadata_pages.yml          ladiesDebating_dataframe.ttl
chapbooks_scotland.ttl                ladiesDebating_metadata_pages.yml
gazetterOfScotland_dataframe          ladies_debating.ttl
gazetterOfScotland_metadata_pages.yml


### Note: The next line takes time!

In [182]:
query_results=read_query_results('chapbooks_metadata_pages.yml')

In [183]:
metadata_results = copy.deepcopy(query_results)

### Note: The next line takes (a lot of) time!

In [184]:
df= create_dataframe(metadata_results, "Chapbooks printed in Scotland")

dict_keys(['MMSID', 'city', 'country', 'dateIssued', 'edition', 'editor', 'editor_date', 'genre', 'geographic', 'language', 'metsXML', 'name_termsOfAddress', 'num_pages', 'num_words', 'permanentURL', 'physical_description', 'place', 'publisher', 'referenced_by', 'shelfLocator', 'source_text_file', 'subtitle', 'temporal', 'text', 'text_unit_id', 'title', 'topic', 'volumeId', 'year'])


### Listing the columns

In [185]:
df.columns

Index(['MMSID', 'edition', 'editor', 'editor_date', 'genre', 'language',
       'metsXML', 'termsOfAddress', 'numberOfPages', 'numberOfWords',
       'permanentURL', 'physicalDescription', 'place', 'publisher',
       'referencedBy', 'shelfLocator', 'altoXML', 'serieSubTitle', 'text',
       'pageNum', 'volumeTitle', 'volumeId', 'year', 'serieNum', 'part',
       'collectionName', 'serieTitle', 'publisherPersons', 'numberOfVolumes',
       'volumeNum'],
      dtype='object')

In [186]:
#df.groupby(['MMSID', 'permanentURL']).size()
#df.loc[df['volumeTitle'].str.len() >200]
df["volumeTitle"]
df.iloc[0]

MMSID                                                   9937033633804341
edition                                                             None
editor                                                       Milne, John
editor_date                                                    1792-1871
genre                              Chapbooks-Scotland-Aberdeen-1801-1900
language                                                             eng
metsXML                                               104184105-mets.xml
termsOfAddress                                                      None
numberOfPages                                                          8
numberOfWords                                                         53
permanentURL                            https://digital.nls.uk/104184105
physicalDescription                                        8 p. ; 18 cm.
place                                                           Aberdeen
publisher                             Printed by A.

In [187]:
#df[df["volumeId"]=="104184137"]
df.loc[396]

MMSID                                                   9930677523804341
edition                                                             None
editor                                                              None
editor_date                                                         None
genre                                                               None
language                                                             eng
metsXML                                               104184137-mets.xml
termsOfAddress                                                      None
numberOfPages                                                          8
numberOfWords                                                         29
permanentURL                            https://digital.nls.uk/104184137
physicalDescription                                            8p. ; 8vo
place                                                         Edinburgh]
publisher                                          

In [188]:
df.iloc[0]

MMSID                                                   9937033633804341
edition                                                             None
editor                                                       Milne, John
editor_date                                                    1792-1871
genre                              Chapbooks-Scotland-Aberdeen-1801-1900
language                                                             eng
metsXML                                               104184105-mets.xml
termsOfAddress                                                      None
numberOfPages                                                          8
numberOfWords                                                         53
permanentURL                            https://digital.nls.uk/104184105
physicalDescription                                        8 p. ; 18 cm.
place                                                           Aberdeen
publisher                             Printed by A.

### Brief explanation of each column

- MMSID: Metadata Management System ID
- serieTitle:        Title of the collection
- serieSubTitle:     Subtitle of the collection
- editor:              Editor (person) of an edition or a supplement
- termsOfAddress:      Terms of Address of the editor (e.g. Sir)
- editor_date: Year of Birth - Year of Death
- genre:        genre of the editions
- language:     language used to write the volumes
- numberOfPages: number of pages of a volume
- physicalDescription: physical description of a collection
- place: place where an edition or a supplement was printed
- publisher: publisher (organization or person) of a collection
- referencedBy: books which reference a collection
- shelfLocator: shelf locator of a collection
- subTitle: subtitle of an edition
- volumeTitle: title of a volume
- year: year of print
- volumeId: volume identifier
- metsXML: XML mets file
- permanentURL: URL of a volume
- publisherPersons: list of publishers which are persons - we have applied NLP for detecting the people!
- volumeNum: Number of a volume
- part: Part of a volume
- collectionNum: Number of a collection
- numberOfVolumes: Number of volumes per edition or supplement
- text: text of page
- numberOfWords: number of words per page
- altoXML: alto xml of each page
- pageNum: number of page

### Sanity checks

In [189]:

df[df["volumeId"]=="104184137"]

Unnamed: 0,MMSID,edition,editor,editor_date,genre,language,metsXML,termsOfAddress,numberOfPages,numberOfWords,...,volumeTitle,volumeId,year,serieNum,part,collectionName,serieTitle,publisherPersons,numberOfVolumes,volumeNum
396,9930677523804341,,,,,eng,104184137-mets.xml,,8,29,...,"mournfull song, upon the breach of national, a...",104184137,1724,32,0,Chapbooks printed in Scotland,"mournfull song, upon the breach of national, a...",[],1,1
397,9930677523804341,,,,,eng,104184137-mets.xml,,8,249,...,"mournfull song, upon the breach of national, a...",104184137,1724,32,0,Chapbooks printed in Scotland,"mournfull song, upon the breach of national, a...",[],1,1
398,9930677523804341,,,,,eng,104184137-mets.xml,,8,274,...,"mournfull song, upon the breach of national, a...",104184137,1724,32,0,Chapbooks printed in Scotland,"mournfull song, upon the breach of national, a...",[],1,1
399,9930677523804341,,,,,eng,104184137-mets.xml,,8,269,...,"mournfull song, upon the breach of national, a...",104184137,1724,32,0,Chapbooks printed in Scotland,"mournfull song, upon the breach of national, a...",[],1,1
400,9930677523804341,,,,,eng,104184137-mets.xml,,8,290,...,"mournfull song, upon the breach of national, a...",104184137,1724,32,0,Chapbooks printed in Scotland,"mournfull song, upon the breach of national, a...",[],1,1
401,9930677523804341,,,,,eng,104184137-mets.xml,,8,278,...,"mournfull song, upon the breach of national, a...",104184137,1724,32,0,Chapbooks printed in Scotland,"mournfull song, upon the breach of national, a...",[],1,1
402,9930677523804341,,,,,eng,104184137-mets.xml,,8,280,...,"mournfull song, upon the breach of national, a...",104184137,1724,32,0,Chapbooks printed in Scotland,"mournfull song, upon the breach of national, a...",[],1,1
403,9930677523804341,,,,,eng,104184137-mets.xml,,8,205,...,"mournfull song, upon the breach of national, a...",104184137,1724,32,0,Chapbooks printed in Scotland,"mournfull song, upon the breach of national, a...",[],1,1


In [190]:
df.loc[0]

MMSID                                                   9937033633804341
edition                                                             None
editor                                                       Milne, John
editor_date                                                    1792-1871
genre                              Chapbooks-Scotland-Aberdeen-1801-1900
language                                                             eng
metsXML                                               104184105-mets.xml
termsOfAddress                                                      None
numberOfPages                                                          8
numberOfWords                                                         53
permanentURL                            https://digital.nls.uk/104184105
physicalDescription                                        8 p. ; 18 cm.
place                                                           Aberdeen
publisher                             Printed by A.

In [191]:
df[["volumeId","numberOfVolumes"]]

Unnamed: 0,volumeId,numberOfVolumes
0,104184105,1
1,104184105,1
2,104184105,1
3,104184105,1
4,104184105,1
...,...,...
47324,120101372,2
47325,120101372,2
47326,120101372,2
47327,120101372,2


In [192]:
df.groupby(df["MMSID"])["volumeId"].count()

MMSID
9910029463804341     8
9910476853804341    48
9911213523804341    34
9911322163804341    12
991351003804341     24
                    ..
99910913804341      18
9991175753804341     8
999360103804341      8
999441113804341     20
999989663804341     16
Name: volumeId, Length: 2728, dtype: int64

In [193]:
df.groupby(df["volumeId"])["volumeId"].count()

volumeId
104184105     8
104184106     8
104184107    12
104184108    12
104184109    16
             ..
117874574     8
117874575     8
117874576     8
117874577     8
120101372    24
Name: volumeId, Length: 3080, dtype: int64

In [194]:
df[df["volumeId"]=="109857781"]

Unnamed: 0,MMSID,edition,editor,editor_date,genre,language,metsXML,termsOfAddress,numberOfPages,numberOfWords,...,volumeTitle,volumeId,year,serieNum,part,collectionName,serieTitle,publisherPersons,numberOfVolumes,volumeNum


In [195]:
df.groupby(['MMSID','permanentURL'])['permanentURL'].size()

MMSID             permanentURL                    
9910029463804341  https://digital.nls.uk/104184208     8
9910476853804341  https://digital.nls.uk/104185169    24
                  https://digital.nls.uk/104185768    24
9911213523804341  https://digital.nls.uk/104184169    24
                  https://digital.nls.uk/104186985    10
                                                      ..
9991175753804341  https://digital.nls.uk/104185287     8
999360103804341   https://digital.nls.uk/104185844     8
999441113804341   https://digital.nls.uk/104184325    20
999989663804341   https://digital.nls.uk/104185807     8
                  https://digital.nls.uk/104186586     8
Name: permanentURL, Length: 3080, dtype: int64

In [196]:
df.groupby(df["MMSID"])["volumeId"].count()

MMSID
9910029463804341     8
9910476853804341    48
9911213523804341    34
9911322163804341    12
991351003804341     24
                    ..
99910913804341      18
9991175753804341     8
999360103804341      8
999441113804341     20
999989663804341     16
Name: volumeId, Length: 2728, dtype: int64

## Saving the final dataframe into a json file

In [208]:
# Remove literal square brackets from the title columns before saving.
# The pattern is a raw string so that "\[" and "\]" reach the regex engine
# without Python treating them as (invalid) string escapes.
df['volumeTitle'] = df['volumeTitle'].str.replace(r'[\[\]]', '', regex=True)
df['serieTitle'] = df['serieTitle'].str.replace(r'[\[\]]', '', regex=True)
df['serieSubTitle'] = df['serieSubTitle'].str.replace(r'[\[\]]', '', regex=True)

In [212]:
df.to_json(r'./results/chapbooks_dataframe', orient="index") 

### A couple of explorations

In [198]:
df["volumeTitle"]

0        song in praise of the highland lads
1        song in praise of the highland lads
2        song in praise of the highland lads
3        song in praise of the highland lads
4        song in praise of the highland lads
                        ...                 
47324                   Burns' popular songs
47325                   Burns' popular songs
47326                   Burns' popular songs
47327                   Burns' popular songs
47328                   Burns' popular songs
Name: volumeTitle, Length: 47329, dtype: object

In [199]:
df["editor"]

0          Milne, John
1          Milne, John
2          Milne, John
3          Milne, John
4          Milne, John
             ...      
47324    Burns, Robert
47325    Burns, Robert
47326    Burns, Robert
47327    Burns, Robert
47328    Burns, Robert
Name: editor, Length: 47329, dtype: object

In [200]:
df.groupby(df["editor"]).count()

Unnamed: 0_level_0,MMSID,edition,editor_date,genre,language,metsXML,termsOfAddress,numberOfPages,numberOfWords,permanentURL,...,volumeTitle,volumeId,year,serieNum,part,collectionName,serieTitle,publisherPersons,numberOfVolumes,volumeNum
editor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Abercromby, Ralph",24,0,24,24,24,24,24,24,24,24,...,24,24,24,24,24,24,24,24,24,24
Abraham,24,0,0,24,24,24,24,24,24,24,...,24,24,24,24,24,24,24,24,24,24
Aesop.,24,0,0,24,24,24,0,24,24,24,...,24,24,24,24,24,24,24,24,24,24
"Ainslie, Hew",24,0,24,24,24,24,0,24,24,24,...,24,24,24,24,24,24,24,24,24,24
"Aitken, A.",12,0,0,0,12,12,0,12,12,12,...,12,12,12,12,12,12,12,12,12,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Wolfe, Charles",48,0,48,48,48,48,0,48,48,48,...,48,48,48,48,48,48,48,48,48,48
"Wolfe, James",48,0,48,48,48,48,0,48,48,48,...,48,48,48,48,48,48,48,48,48,48
"Woodward, Josiah",24,0,24,0,24,24,0,24,24,24,...,24,24,24,24,24,24,24,24,24,24
"Yarrington, Rob.",8,0,0,8,8,8,0,8,8,8,...,8,8,8,8,8,8,8,8,8,8


In [201]:
df.groupby(df["publisher"]).count()

Unnamed: 0_level_0,MMSID,edition,editor,editor_date,genre,language,metsXML,termsOfAddress,numberOfPages,numberOfWords,...,volumeTitle,volumeId,year,serieNum,part,collectionName,serieTitle,publisherPersons,numberOfVolumes,volumeNum
publisher,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(Printed by J. Chalmers & Co. Castlestreet Aberdeen.),8,0,8,8,8,8,8,0,8,8,...,8,8,8,8,8,8,8,8,8,8
- Printed and sold by M. Randall,8,0,0,0,8,8,8,0,8,8,...,8,8,8,8,8,8,8,8,8,8
- Printed by C. Randall.,16,0,0,0,16,16,16,0,16,16,...,16,16,16,16,16,16,16,16,16,16
-Printed by C. Randall,8,0,8,8,8,8,8,0,8,8,...,8,8,8,8,8,8,8,8,8,8
-Printed by J. Morren,16,0,0,0,16,16,16,0,16,16,...,16,16,16,16,16,16,16,16,16,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"printed, for the author",8,0,8,0,0,8,8,0,8,8,...,8,8,8,8,8,8,8,8,8,8
re-printed and sold by James Watson,24,0,24,0,0,24,24,0,24,24,...,24,24,24,24,24,24,24,24,24,24
"reprinted, and sold by most book-sellers in town",20,0,0,0,0,20,20,0,20,20,...,20,20,20,20,20,20,20,20,20,20
s.n.,272,0,156,86,256,272,272,62,272,272,...,272,272,272,272,272,272,272,272,272,272


In [202]:
fig = px.line(df, x="volumeId", y="numberOfPages", title='Number of pages per collection and volume')
fig.show()

In [203]:
collection=df.groupby(['serieTitle','volumeId'])['volumeId'].size()
collection

serieTitle                                                           volumeId 
 history of Moll Flanders                                            104184799    24
                                                                     104186700    24
 hundred] godly lessons                                              104184318     8
'Twas on the morn of sweet May Day                                   104185804     8
                                                                     104186584     8
                                                                                  ..
young lasses' song, or What wou'd a young lassie do wi' an auld man  104184876     8
young man's dream                                                    104185671     8
young squire's frolic                                                104185386     8
young woman's wish                                                   104185681     8
younger brother or, The sufferings of Saint Andre                    10

In [204]:
a=df.groupby(['genre', 'volumeId']).size()
a

genre                         volumeId 
Biography                     104184957    144
                              104184969     48
                              104186348     48
                              104186391     30
Chapbooks-England-1801-1900.  104187077      8
                                          ... 
letter                        104184120     12
short story                   104184108     12
                              104184119     24
                              104184368     24
speech                        104185456     24
Length: 2826, dtype: int64

In [205]:
a=df.groupby(['place', 'volumeId']).size()
a

place        volumeId 
Aberdeen     104184105     8
             104184115     8
             104184118     8
             104184124     8
             104184125     8
                          ..
Stirling]    104186915     8
             104186916     8
             104186917     8
             104186918     8
s.n., s.d.]  104185838    20
Length: 3079, dtype: int64