#### Importing Libraries

In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import copy
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', None)
import os
import timeit
import time
import glob
from natsort import natsorted
from tqdm import tqdm


In [2]:
# Defining paths
path ='data_files/data_del'
os.makedirs(path, exist_ok=True) 

In [3]:
#Webpage of the coin site
url = "http://numismatics.org/ocre/results"

#To open a page, pass url to urlopen
page = urlopen(url)

#urlopen() returns an HTTPResponse object:
page

<http.client.HTTPResponse at 0x1188c3dc0>

In [4]:
def extract_data(url,number):
    # Creating a dictionary of empty 
    data= {"record_id":[], "record_description":[], "Date": [], "Denomination": [], "Mint": [], "Obverse": [],
      "Reverse":[], "Reference":[], "Images":[]}
       
    #To open a page, pass url to urlopen
    page = urlopen(url)
    html_bytes = page.read()
    html = html_bytes.decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")
    
    # All entries on a page are within the "col-md-12" class container. 
    # Asking BS4 to provide a list of all of these classes on a particular page 
    cont_list = soup.find_all("div", {"class": "col-md-12"})
        
    # Within each page are several other elements of interest, which will be retrieved using their specific elements
    for i in range(len(cont_list)):
        # Extract function removes the Tag or Navigable String from the parse three i.e. removes the specific element from the HTML document.
        # The extracted element i.e. record ID
        # Here we also make use of the re module, which is a built-in package from python.
        # It finds the href items from within a given text
        c = re.findall('href="(.+?)">',str(cont_list[i].extract()))
        data["record_id"].extend(c)

        # For the record description, we access the string content of a Tag object and then immediately removing it from the parse tree.      
        rec_des = cont_list[i].string.extract()        
        data["record_description"].append(rec_des)


    # Next step, we need to determine if the record has an image in the thumbnail.
    # If a thumbnail is present, then the record will have at least one image.    
    
    # Thumbnail images are denoted by the class "col-md-5 col-lg-4 pull-right"
    images = soup.find_all("div", {"class": "col-md-5 col-lg-4 pull-right"})


    for i in range(len(images)):
        # Extract the HTML content
        a_images = str(images[i].extract())
        
        if "thumbImage" in a_images:
            data["Images"].append("Yes")
        else:
            data["Images"].append("No")
        
    # Lastly, we will obtain the details for each record
    details = soup.find_all("dl", {"class": "dl-horizontal"})

    for i in range(len(details)):
        
        # Tags with DD have the details
        deets = re.findall('<dd>(.+?)</dd>', str(details[i].extract()))
        
        # Tags with DT have the Labels, which we will use to assign the details to the data.
        label = re.findall('<dt>(.+?)</dt>', str(details[i].extract()))
        
        for i in range(len(label)):
            data[label[i]].append(deets[i])
    

    
    df = pd.DataFrame(dict([(k,pd.Series(v)) for k,v in data.items()]))
    path ='data_files/data'
    df.to_csv(path + '/pages_'+str(number)+'.csv', encoding = 'utf-8')

In [5]:

basic = "http://numismatics.org/ocre/results?q=&start="
numbers = list(range(0,41720,20))

for i in tqdm (range(len(numbers))):
    url = basic + str(numbers[i])
    extract_data(url,numbers[i])

100%|██████████| 2086/2086 [39:49<00:00,  1.15s/it]


In [6]:
# Merging together all the CSV files
# First, reading the path
file_path = "data_files/data"


#list all the files from the directory
file_list = os.listdir(file_path)
file_list_ordered = natsorted(file_list)

In [7]:
file_list_ordered

['.DS_Store',
 'pages_0.csv',
 'pages_20.csv',
 'pages_40.csv',
 'pages_60.csv',
 'pages_80.csv',
 'pages_100.csv',
 'pages_120.csv',
 'pages_140.csv',
 'pages_160.csv',
 'pages_180.csv',
 'pages_200.csv',
 'pages_220.csv',
 'pages_240.csv',
 'pages_260.csv',
 'pages_280.csv',
 'pages_300.csv',
 'pages_320.csv',
 'pages_340.csv',
 'pages_360.csv',
 'pages_380.csv',
 'pages_400.csv',
 'pages_420.csv',
 'pages_440.csv',
 'pages_460.csv',
 'pages_480.csv',
 'pages_500.csv',
 'pages_520.csv',
 'pages_540.csv',
 'pages_560.csv',
 'pages_580.csv',
 'pages_600.csv',
 'pages_620.csv',
 'pages_640.csv',
 'pages_660.csv',
 'pages_680.csv',
 'pages_700.csv',
 'pages_720.csv',
 'pages_740.csv',
 'pages_760.csv',
 'pages_780.csv',
 'pages_800.csv',
 'pages_820.csv',
 'pages_840.csv',
 'pages_860.csv',
 'pages_880.csv',
 'pages_900.csv',
 'pages_920.csv',
 'pages_940.csv',
 'pages_960.csv',
 'pages_980.csv',
 'pages_1000.csv',
 'pages_1020.csv',
 'pages_1040.csv',
 'pages_1060.csv',
 'pages_1080.csv

In [8]:
# Taking a look at the individual file created
df_temp = pd.read_csv('data_files/data/pages_0.csv')
df_temp.head()


Unnamed: 0.1,Unnamed: 0,record_id,record_description,Date,Denomination,Mint,Obverse,Reverse,Reference,Images
0,0,id/ric.1(2).aug.1A,RIC I (second edition) Augustus 1A,25 BCE - 23 BCE,Quinarius,Emerita,"AVGVST: Head of Augustus, bare, left","P CARISI LEG: Victory standing right, placing ...",,Yes
1,1,id/ric.1(2).aug.1B,RIC I (second edition) Augustus 1B,25 BCE - 23 BCE,Quinarius,Emerita,"AVGVST: Head of Augustus, bare, left","P CARISI LEG: Victory standing right, placing ...",,Yes
2,2,id/ric.1(2).aug.2A,RIC I (second edition) Augustus 2A,25 BCE - 23 BCE,Denarius,Emerita,"IMP CAESAR AVGVST: Head of Augustus, bare, right","P CARISIVS LEG PRO PR: Round shield, spear-hea...",,Yes
3,3,id/ric.1(2).aug.2B,RIC I (second edition) Augustus 2B,25 BCE - 23 BCE,Denarius,Emerita,"IMP CAESAR AVGVST: Head of Augustus, bare, left","P CARISIVS LEG PRO PR: Round shield, spear-hea...",,Yes
4,4,id/ric.1(2).aug.3,RIC I (second edition) Augustus 3,25 BCE - 23 BCE,Denarius,Emerita,"IMP CAESAR AVGVSTVS: Head of Augustus, bare, r...","P CARISIVS LEG PRO PR: Round shield, spear-hea...",,Yes


In [9]:
# Concatenating the files together to a single file

#Concatenating the files
df_concat = pd.DataFrame()
#append all files together
for file in tqdm(range(0,41700,20)):

            #Reading CSV from filepath
            df_temp = pd.read_csv(file_path+"/pages_"+str(file)+".csv")

            # Replace empty values with NaNs
            df_temp['Date'].replace('',np.nan, inplace=True)
            df_temp['Images'].replace('',np.nan,inplace=True)
            
            #Dropping rows that do no have images or dates
            #df_temp.dropna(subset=['Date','Images'], inplace=True)
            
            #Filtering rows where record has an image
            #df_temp = df_temp[df_temp['Images'].str.contains("Yes")]
            
            #Concatenating files
            df_concat = pd.concat([df_concat, df_temp], ignore_index=True)

df_concat.head()

100%|██████████| 2085/2085 [00:07<00:00, 272.79it/s]


Unnamed: 0.1,Unnamed: 0,record_id,record_description,Date,Denomination,Mint,Obverse,Reverse,Reference,Images
0,0,id/ric.1(2).aug.1A,RIC I (second edition) Augustus 1A,25 BCE - 23 BCE,Quinarius,Emerita,"AVGVST: Head of Augustus, bare, left","P CARISI LEG: Victory standing right, placing ...",,Yes
1,1,id/ric.1(2).aug.1B,RIC I (second edition) Augustus 1B,25 BCE - 23 BCE,Quinarius,Emerita,"AVGVST: Head of Augustus, bare, left","P CARISI LEG: Victory standing right, placing ...",,Yes
2,2,id/ric.1(2).aug.2A,RIC I (second edition) Augustus 2A,25 BCE - 23 BCE,Denarius,Emerita,"IMP CAESAR AVGVST: Head of Augustus, bare, right","P CARISIVS LEG PRO PR: Round shield, spear-hea...",,Yes
3,3,id/ric.1(2).aug.2B,RIC I (second edition) Augustus 2B,25 BCE - 23 BCE,Denarius,Emerita,"IMP CAESAR AVGVST: Head of Augustus, bare, left","P CARISIVS LEG PRO PR: Round shield, spear-hea...",,Yes
4,4,id/ric.1(2).aug.3,RIC I (second edition) Augustus 3,25 BCE - 23 BCE,Denarius,Emerita,"IMP CAESAR AVGVSTVS: Head of Augustus, bare, r...","P CARISIVS LEG PRO PR: Round shield, spear-hea...",,Yes


In [10]:
# Save file
df_concat.to_csv('data_files/data_concat/concatenated_data.csv', sep=';', index=False)

In [11]:
# Import concatenated file
df = pd.read_csv('data_files/data_concat/concatenated_data.csv', sep=';')
df['record_id'] = df['record_id'].str.replace('id/','')
df = df.drop(df.columns[0], axis=1)
df.head()

Unnamed: 0,record_id,record_description,Date,Denomination,Mint,Obverse,Reverse,Reference,Images
0,ric.1(2).aug.1A,RIC I (second edition) Augustus 1A,25 BCE - 23 BCE,Quinarius,Emerita,"AVGVST: Head of Augustus, bare, left","P CARISI LEG: Victory standing right, placing ...",,Yes
1,ric.1(2).aug.1B,RIC I (second edition) Augustus 1B,25 BCE - 23 BCE,Quinarius,Emerita,"AVGVST: Head of Augustus, bare, left","P CARISI LEG: Victory standing right, placing ...",,Yes
2,ric.1(2).aug.2A,RIC I (second edition) Augustus 2A,25 BCE - 23 BCE,Denarius,Emerita,"IMP CAESAR AVGVST: Head of Augustus, bare, right","P CARISIVS LEG PRO PR: Round shield, spear-hea...",,Yes
3,ric.1(2).aug.2B,RIC I (second edition) Augustus 2B,25 BCE - 23 BCE,Denarius,Emerita,"IMP CAESAR AVGVST: Head of Augustus, bare, left","P CARISIVS LEG PRO PR: Round shield, spear-hea...",,Yes
4,ric.1(2).aug.3,RIC I (second edition) Augustus 3,25 BCE - 23 BCE,Denarius,Emerita,"IMP CAESAR AVGVSTVS: Head of Augustus, bare, r...","P CARISIVS LEG PRO PR: Round shield, spear-hea...",,Yes


In [12]:
#The way the link is structured is as follows:
# The main part of the hyperlink is https://numismatics.org/ocre/id/
# Followed by the record ID in the DataFrame record_ id ric.1(2).aug.1A
# Once there, it provides an option to download the CSV File with links to the images

url2 = "https://numismatics.org/ocre/id/ric.1(2).aug.1A"
page = urlopen(url2)
html_bytes = page.read()
html = html_bytes.decode("utf-8")
soup = BeautifulSoup(html, "html.parser")
# The "Download CSV" Link is in the "col-md-12" a container 
step_1 = soup.find_all("div", {"class": "col-md-12"})

# It is always the third (zero indexed) "col-md-12" container
link = step_1[2].find('h3').find('a')

# Select the href attribute of the 'a' tag
href = link['href']

print(href)


http://nomisma.org/query?query=PREFIX%20rdf%3A%20%20%20%20%20%20%3Chttp%3A%2F%2Fwww.w3.org%2F1999%2F02%2F22-rdf-syntax-ns%23%3E%0APREFIX%20crm%3A%09%3Chttp%3A%2F%2Fwww.cidoc-crm.org%2Fcidoc-crm%2F%3E%0APREFIX%20dcterms%3A%20%20%3Chttp%3A%2F%2Fpurl.org%2Fdc%2Fterms%2F%3E%0APREFIX%20nm%3A%20%20%20%20%20%20%20%3Chttp%3A%2F%2Fnomisma.org%2Fid%2F%3E%0APREFIX%20nmo%3A%09%3Chttp%3A%2F%2Fnomisma.org%2Fontology%23%3E%0APREFIX%20skos%3A%20%20%20%20%20%20%3Chttp%3A%2F%2Fwww.w3.org%2F2004%2F02%2Fskos%2Fcore%23%3E%0APREFIX%20foaf%3A%09%3Chttp%3A%2F%2Fxmlns.com%2Ffoaf%2F0.1%2F%3E%0APREFIX%20rdfs%3A%09%3Chttp%3A%2F%2Fwww.w3.org%2F2000%2F01%2Frdf-schema%23%3E%0APREFIX%20void%3A%09%3Chttp%3A%2F%2Frdfs.org%2Fns%2Fvoid%23%3E%0APREFIX%20geo%3A%09%3Chttp%3A%2F%2Fwww.w3.org%2F2003%2F01%2Fgeo%2Fwgs84_pos%23%3E%0APREFIX%20edm%3A%20%3Chttp%3A%2F%2Fwww.europeana.eu%2Fschemas%2Fedm%2F%3E%0A%0ASELECT%20DISTINCT%20%3Fobject%20%3Ftitle%20%28group_concat%28%3Fidentifier%3B%20separator%3D%22%7C%7C%22%29%20as%20%3Fide

In [23]:
def extract_csv(dataframe,start):
    path2 = "data_files/data_csv_content"
    checkpoint = 0

    for id in tqdm (range(start, len(dataframe["record_id"]))):

        checkpoint =+ 1

        #The record ID is always in the format id/record_id. As a result, we will split the query and take the second element.
        rec_id = str(dataframe["record_id"].iloc[id])
        
        #To open a page, pass url to urlopen
        url = "https://numismatics.org/ocre/id/"+rec_id
        page = urlopen(url)
        html_bytes = page.read()
        html = html_bytes.decode("utf-8")
        soup = BeautifulSoup(html, "html.parser")    
        main_tag = soup.find_all("div", {"class": "col-md-12"})

        try:
            attribute = main_tag[2].find('a') 
        
            href = attribute['href']
        

            #Download file        
            id_csv = pd.read_csv(href)
            id_csv.to_csv(path2+rec_id+'.csv', index=False)
        
        
        except Exception as e:
            #print(e)
            pass

    return checkpoint 
    


In [24]:
# For some links, there won't be any associated CSV files
start= 0
extract_csv(df, start)


100%|██████████| 41700/41700 [23:59:47<00:00,  2.07s/it]   


1

## Combining the collected ref data with contents


In [25]:
# Checking if all CSV files have the same columns. Important for merging
# ref file
ref_file_path ='data_files/data_ref_links/ric.1(2).aug.1A.csv'
ref_file= pd.read_csv(ref_file_path)
column_headers = list(ref_file.columns.values)
print(column_headers)




['object', 'title', 'identifiers', 'findUri', 'findspot', 'hoard', 'collection', 'publisher', 'dataset', 'datasetTitle', 'weight', 'axis', 'diameter', 'obvThumb', 'revThumb', 'obvRef', 'revRef', 'comThumb', 'comRef', 'obvManifest', 'revManifest', 'comManifest', 'model', 'record_id']


In [26]:
# Merging together all the CSV files
# First, reading the path
file_path_ref = "data_files/data_csv_content"


#list all the files from the directory
file_list = os.listdir(file_path_ref)
file_list_ordered = natsorted(file_list)
file_list_ordered

['.DS_Store',
 'ric.1(2).aug.1A.csv',
 'ric.1(2).aug.1B.csv',
 'ric.1(2).aug.2A.csv',
 'ric.1(2).aug.2B.csv',
 'ric.1(2).aug.3.csv',
 'ric.1(2).aug.4A.csv',
 'ric.1(2).aug.4B.csv',
 'ric.1(2).aug.5.csv',
 'ric.1(2).aug.6.csv',
 'ric.1(2).aug.7A.csv',
 'ric.1(2).aug.7B.csv',
 'ric.1(2).aug.8.csv',
 'ric.1(2).aug.9A.csv',
 'ric.1(2).aug.9B.csv',
 'ric.1(2).aug.10.csv',
 'ric.1(2).aug.11A.csv',
 'ric.1(2).aug.11B.csv',
 'ric.1(2).aug.12.csv',
 'ric.1(2).aug.13.csv',
 'ric.1(2).aug.14.csv',
 'ric.1(2).aug.15A.csv',
 'ric.1(2).aug.15B.csv',
 'ric.1(2).aug.16.csv',
 'ric.1(2).aug.18.csv',
 'ric.1(2).aug.20.csv',
 'ric.1(2).aug.21.csv',
 'ric.1(2).aug.22.csv',
 'ric.1(2).aug.23.csv',
 'ric.1(2).aug.24.csv',
 'ric.1(2).aug.25.csv',
 'ric.1(2).aug.26A.csv',
 'ric.1(2).aug.28.csv',
 'ric.1(2).aug.29A.csv',
 'ric.1(2).aug.29B.csv',
 'ric.1(2).aug.31.csv',
 'ric.1(2).aug.33A.csv',
 'ric.1(2).aug.33B.csv',
 'ric.1(2).aug.35.csv',
 'ric.1(2).aug.36A.csv',
 'ric.1(2).aug.36B.csv',
 'ric.1(2).aug.37A.

In [27]:
# define your column names
column_names = column_headers

def column_check(path, file_list):
        abs_paths = []
        for file in range(1,len(file_list)):
                abs_path = path + '/'+file_list[file]
                abs_paths.append(abs_path)

        cols = [pd.read_csv(abs_path, nrows=0).columns for abs_path in abs_paths]

        cols_identical = [all(cols[0] == colx) for colx in cols[1:]]

        all_cols_same = all(cols_identical)
        print(all_cols_same) 

In [28]:
column_check(file_path_ref, file_list_ordered)

True


In [None]:
# All columns are identical. Time to add the reference name to all columns

