# 01. Pull data for core survey
This notebook contains code that accesses Texas A&M's IODP database and pulls metadata, chemical data, and core photos for all existing DSDP, ODP, and IODP cores that come in contact with the sediment water interface. DSDP, ODP, and IODP are three different iterations of a very similar program for international ocean drilling, but the metadata for each era of the international ocean drilling program, so each case must be handled indepenedently. <br><br>
This data will be used to identify which sites have banding at active diagenetic fronts.

## Setup

### Import Modules

In [55]:
import pandas as pd
import requests
from IPython.display import clear_output
import time
import cv2 as cv
import os

### Set Paths

In [59]:
data_path='/Users/danielbabin/GitHub/Green_Bands/Data/'
survey_data='/Users/danielbabin/GitHub/Green_Bands/Data/Survey/Photos/PDFs/'

## Pull data

### Load sites metadata
This data was sourced from "Drilled Holes" at https://www.iodp.org/resources/maps-and-kml-tools

In [5]:
sites=pd.read_csv(survey_data+'all.DSDP.ODP.IODP.sites.csv')
sites['era']=None
sites.loc[sites['leg']<=96,'era']='DSDP'
sites.loc[sites['leg'].between(99,210),'era']='ODP'
id_beginning_IODP=sites.loc[sites['expedition']=='301'].index[0]
id_end_IODP=sites.loc[sites['expedition']=='312'].index[-1]
sites.loc[id_beginning_IODP:id_end_IODP,'era']='Early IODP'
sites.loc[id_end_IODP+1:3964,'era']='Modern IODP'

### DSDP era
#### Site Summaries

In [6]:
dsdp_site_summaries_list=[]
for i in sites.loc[sites['era']=='DSDP'].index:
    clear_output(wait=True)
    leg=str(sites.loc[i,'leg'].astype(int))
    site=sites.loc[i,'site']
    url='https://www.ngdc.noaa.gov/mgg/geology/dsdp/data/'+leg+'/'+site+'/sitesum.txt'
    try:
        df=pd.read_csv(url, sep='\t')
        dsdp_site_summaries_list.append(df)
    except:
        print(f"No table found for URL: {url}")
    print(i,'/',len(sites.loc[sites['era']=='DSDP']))
dsdp_site_summaries = pd.concat(dsdp_site_summaries_list, ignore_index=True)

1057 / 1058


In [8]:
dsdp_site_summaries.to_csv(survey_data+'dsdp_site_summaries.csv',index=False)

#### Age constraints
To get sedimentation rate

In [11]:
dsdp_age_list=[]
for i in sites.loc[sites['era']=='DSDP'].index:
    clear_output(wait=True)
    leg=str(sites.loc[i,'leg'].astype(int))
    site=sites.loc[i,'site']
    url='https://www.ngdc.noaa.gov/mgg/geology/dsdp/data/'+leg+'/'+site+'/ageprof.txt'
    try:
        df=pd.read_csv(url, sep='\t')
        dsdp_age_list.append(df)
    except:
        print(f"No table found for URL: {url}")
    print(i,'/',len(sites.loc[sites['era']=='DSDP']))
dsdp_age = pd.concat(dsdp_age_list, ignore_index=True)

1057 / 1058


In [12]:
dsdp_age.to_csv(survey_data+'dsdp_age.csv',index=False)

#### Core Recovery

In [14]:
dsdp_core_recovery_list=[]
for i in sites.loc[sites['era']=='DSDP'].index:
    clear_output(wait=True)
    leg=str(sites.loc[i,'leg'].astype(int))
    site=sites.loc[i,'site']
    url='https://www.ngdc.noaa.gov/mgg/geology/dsdp/data/'+leg+'/'+site+'/coredep.txt'
    try:
        df=pd.read_csv(url, sep='\t')
        dsdp_core_recovery_list.append(df)
    except:
        print(f"No table found for URL: {url}")
    print(i,'/',len(sites.loc[sites['era']=='DSDP']))
dsdp_core_recovery = pd.concat(dsdp_carbonate_list, ignore_index=True)

1057 / 1058


In [15]:
dsdp_core_recovery.to_csv(survey_data+'dsdp_core_recovery.csv',index=False)

#### Core Photos
##### Download Core Photo metadata

In [16]:
dsdp_legs_sites=dsdp_core_recovery[['leg','site']].drop_duplicates()

In [17]:
dsdp_core_photo_links_list=[]
for i,idx in enumerate(dsdp_legs_sites.index):
    clear_output(wait=True)
    leg=str(dsdp_legs_sites.loc[idx,'leg'])
    site=str(dsdp_legs_sites.loc[idx,'site'])
    url='https://web.iodp.tamu.edu/janusweb/imaging/photo.cgi?leg='+leg+'&site='+site
    try:
        df=pd.read_csv(url, sep='\t',header=18).iloc[:-18]
        dsdp_core_photo_links_list.append(df)
    except:
        print(f"No table found for URL: {url}")
    print(i,'/',len(dsdp_legs_sites))
dsdp_core_photo_links = pd.concat(dsdp_core_photo_links_list, ignore_index=True)
dsdp_core_photo_links['Image Link']=dsdp_core_photo_links['Image Link'].str[9:]
dsdp_core_photo_links['Image Link']=dsdp_core_photo_links['Image Link'].str.split('>',expand=True)[0].str[:-1]
dsdp_core_photo_links['Filename']=dsdp_core_photo_links['Image Link'].str.split('/',expand=True).iloc[:,-1]

529 / 530


In [18]:
dsdp_core_photo_links.to_csv(survey_data+'dsdp_core_photo_links.csv',index=False)

In [19]:
dsdp_core_tops=dsdp_core_photo_links[dsdp_core_photo_links['Depth (mbsf)']==0].copy()
dsdp_core_tops.to_csv(survey_data+'dsdp_core_tops.csv',index=False)

##### Download Core Photos

In [24]:
dsdp_path='/Users/danielbabin/GitHub/Green_Bands/Data/Survey/Photos/PDFs/DSDP/'

In [25]:
for i,idx in enumerate(dsdp_core_tops.index):
    clear_output(wait=True)
    url=dsdp_core_tops.loc[idx,'Image Link']
    filename=dsdp_core_tops.loc[idx,'Filename']
    response = requests.get(url)
    with open(dsdp_path+filename, 'wb') as file:
        file.write(response.content)
    print(i,'/',len(dsdp_core_tops))

488 / 489


### ODP era

In [26]:
odp_era_sites=sites.loc[sites['era']=='ODP'].copy()
odp_era_sites['leg']=odp_era_sites['leg'].astype(int)
odp_legs_sites=odp_era_sites[['leg','site']].drop_duplicates()

#### Site Summaries

In [27]:
odp_site_summaries_list=[]
for i,idx in enumerate(odp_legs_sites.index):
    clear_output(wait=True)
    leg=str(odp_legs_sites.loc[idx,'leg'])
    site=str(odp_legs_sites.loc[idx,'site'])
    url='https://web.iodp.tamu.edu/janusweb/coring_summaries/sitesumm.cgi?leg='+leg+'&site='+site
    try:
        df=pd.read_csv(url, sep='\t',header=19).iloc[:-22]
        odp_site_summaries_list.append(df)
    except:
        print(f"No table found for URL: {url}")
    print(i,'/',len(odp_legs_sites))
odp_site_summaries = pd.concat(odp_site_summaries_list, ignore_index=True)

654 / 655


In [28]:
odp_site_summaries.to_csv(survey_data+'odp_site_summaries.csv',index=False)

#### Age constraints
##### Type: Depth-Age Model

In [31]:
odp_depth_age_list=[]
for i,idx in enumerate(odp_legs_sites.index):
    clear_output(wait=True)
    leg=str(odp_legs_sites.loc[idx,'leg'])
    site=str(odp_legs_sites.loc[idx,'site'])
    url='https://web.iodp.tamu.edu/janusweb/paleo/agemodel.cgi?leg='+leg+'&site='+site
    try:
        df=pd.read_csv(url, sep='\t',header=18)
        odp_depth_age_list.append(df)
    except:
        print(f"No table found for URL: {url}")
    print(i,'/',len(odp_legs_sites))
odp_depth_age=pd.concat(odp_depth_age_list, ignore_index=True)
odp_depth_age=odp_depth_age[odp_depth_age['Site'].isnull()==False].copy()

654 / 655


In [32]:
odp_depth_age.to_csv(survey_data+'odp_depth_age.csv',index=False)

##### Type: Age Profile (datum list)

In [33]:
odp_age_profile_list=[]
for i,idx in enumerate(odp_legs_sites.index):
    clear_output(wait=True)
    leg=str(odp_legs_sites.loc[idx,'leg'])
    site=str(odp_legs_sites.loc[idx,'site'])
    url='https://web.iodp.tamu.edu/janusweb/paleo/ageprofile.cgi?leg='+leg+'&site='+site
    try:
        df=pd.read_csv(url, sep='\t',header=18)
        odp_age_profile_list.append(df)
    except:
        print(f"No table found for URL: {url}")
    print(i,'/',len(odp_legs_sites))
odp_age_profile=pd.concat(odp_age_profile_list, ignore_index=True)
odp_age_profile=odp_age_profile[odp_age_profile['Site'].isnull()==False].copy()

654 / 655


In [34]:
odp_age_profile.to_csv(survey_data+'odp_age_profile.csv',index=False)

#### Core Photos
##### Download Core Photos Metadata

In [35]:
odp_core_photo_links_list=[]
for i,idx in enumerate(odp_legs_sites.index):
    clear_output(wait=True)
    leg=str(odp_legs_sites.loc[idx,'leg'])
    site=str(odp_legs_sites.loc[idx,'site'])
    url='https://web.iodp.tamu.edu/janusweb/imaging/photo.cgi?leg='+leg+'&site='+site
    try:
        df=pd.read_csv(url, sep='\t',header=18)
        odp_core_photo_links_list.append(df)
    except:
        print(f"No table found for URL: {url}")
    print(i,'/',len(odp_legs_sites))
odp_core_photo_links = pd.concat(odp_core_photo_links_list, ignore_index=True)
odp_core_photo_links=odp_core_photo_links[odp_core_photo_links['Site'].isnull()==False].copy()
odp_core_photo_links['Image Link']=odp_core_photo_links['Image Link'].str[9:]
odp_core_photo_links['Image Link']=odp_core_photo_links['Image Link'].str.split('>',expand=True)[0].str[:-1]
odp_core_photo_links['Filename']=odp_core_photo_links['Image Link'].str.split('/').str[-1]
odp_core_photo_links['Site']=odp_core_photo_links['Site'].astype(int)
odp_core_photo_links['Cor']=odp_core_photo_links['Cor'].astype(int)

654 / 655


In [36]:
odp_core_photo_links.to_csv(survey_data+'odp_core_photo_links.csv',index=False)

In [37]:
odp_core_tops=odp_core_photo_links[odp_core_photo_links['Depth (mbsf)']==0].copy()
odp_core_tops=odp_core_tops[odp_core_tops['Filename'].str.endswith('.PDF')].copy()
odp_core_tops.to_csv(survey_data+'odp_core_tops.csv',index=False)

##### Download Core Photos

In [38]:
odp_path='//Users/danielbabin/GitHub/Green_Bands/Data/Survey/Photos/PDFs/ODP/'

In [39]:
for i,idx in enumerate(odp_core_tops.index):
    clear_output(wait=True)
    url=odp_core_tops.loc[idx,'Image Link']
    filename=odp_core_tops.loc[idx,'Filename']
    response = requests.get(url)
    with open(odp_path+filename, 'wb') as file:
        file.write(response.content)
    print(i,'/',len(odp_core_tops))

1256 / 1257


### Early IODP era

In [40]:
iodp_era_sites=sites.loc[sites['era']=='Early IODP'].copy()
iodp_era_sites['leg']=iodp_era_sites['expedition']
iodp_exps_sites=iodp_era_sites[['expedition','site']].drop_duplicates().iloc[1:]

#### Site Summaries

In [41]:
early_iodp_site_summaries_list=[]
for i,idx in enumerate(iodp_exps_sites.index):
    clear_output(wait=True)
    leg=str(iodp_exps_sites.loc[idx,'expedition'])
    site=str(iodp_exps_sites.loc[idx,'site'])[1:]
    url='https://web.iodp.tamu.edu/janusweb/coring_summaries/sitesumm.cgi?leg='+leg+'&site='+site
    try:
        df=pd.read_csv(url, sep='\t',header=19).iloc[:-22]
        early_iodp_site_summaries_list.append(df)
    except:
        print(f"No table found for URL: {url}")
    print(i,'/',len(iodp_exps_sites))
early_iodp_site_summaries=pd.concat(early_iodp_site_summaries_list, ignore_index=True)
early_iodp_site_summaries=early_iodp_site_summaries[early_iodp_site_summaries['Latitude   '].isnull()==False].copy()

58 / 59


In [42]:
early_iodp_site_summaries.to_csv(survey_data+'early_iodp_site_summaries.csv',index=False)

#### Core Photos
##### Download Core Photos Metadata

In [45]:
early_iodp_core_photo_links_list=[]
for i,idx in enumerate(iodp_exps_sites.index):
    clear_output(wait=True)
    leg=str(iodp_exps_sites.loc[idx,'expedition'])
    site=str(iodp_exps_sites.loc[idx,'site'])[1:]
    url='https://web.iodp.tamu.edu/janusweb/imaging/photo.cgi?leg='+leg+'&site='+site
    try:
        df=pd.read_csv(url, sep='\t',header=18)
        early_iodp_core_photo_links_list.append(df)
    except:
        print(f"No table found for URL: {url}")
    print(i,'/',len(iodp_exps_sites))
early_iodp_core_photo_links = pd.concat(early_iodp_core_photo_links_list, ignore_index=True)
early_iodp_core_photo_links=early_iodp_core_photo_links[early_iodp_core_photo_links['Site'].isnull()==False].copy()
early_iodp_core_photo_links['Image Link']=early_iodp_core_photo_links['Image Link'].str[9:]
early_iodp_core_photo_links['Image Link']=early_iodp_core_photo_links['Image Link'].str.split('>',expand=True)[0].str[:-1]
early_iodp_core_photo_links['Filename']=early_iodp_core_photo_links['Image Link'].str.split('/').str[-1]
early_iodp_core_photo_links['Site']=early_iodp_core_photo_links['Site'].astype(int)
early_iodp_core_photo_links['Cor']=early_iodp_core_photo_links['Cor'].astype(int)

58 / 59


In [46]:
early_iodp_core_photo_links.to_csv(survey_data+'early_iodp_core_photo_links.csv',index=False)

In [47]:
early_iodp_core_tops=early_iodp_core_photo_links[early_iodp_core_photo_links['Depth (mbsf)']==0].copy()
early_iodp_core_tops=early_iodp_core_tops[early_iodp_core_tops['Filename'].str.endswith('.PDF')].copy()
early_iodp_core_tops.to_csv(survey_data+'early_iodp_core_tops.csv',index=False)

##### Download Core Photos

In [48]:
early_iodp_path='/Users/danielbabin/GitHub/Green_Bands/Data/Survey/Photos/PDFs/EarlyIODP/'

In [50]:
for i,idx in enumerate(early_iodp_core_tops.index):
    clear_output(wait=True)
    url=early_iodp_core_tops.loc[idx,'Image Link']
    filename=early_iodp_core_tops.loc[idx,'Filename']
    response = requests.get(url)
    with open(early_iodp_path+filename, 'wb') as file:
        file.write(response.content)
    print(i,'/',len(early_iodp_core_tops))

69 / 70


### Modern IODP era
Core photos from the modern IODP era are more easily downloaded in batch form from the Texas A&M database. I downloaded them and am loading them in from an external hard drive here.

In [51]:
in_path='/Volumes/SanDisk.Data/IODPSurvey/'
out_path='/Users/danielbabin/GitHub/Green_Bands/Data/Survey/ModernIODP_Photos/'

the core photo links file below is also downloaded from the IODP website.

In [53]:
modern_iodp_photos=pd.read_csv(survey_data+'modern_iodp_core_photo_links.csv')
modern_iodp_core_tops=modern_iodp_photos[modern_iodp_photos['Top depth CSF-A (m)']==0].copy()

In [56]:
files=os.listdir(in_path)
files=files[1:]

In [57]:
def shrink_rotate_write(filename,shrinkfactor):
    raw = cv.imread(in_path+filename)
    size=raw.shape
    newsize=tuple((int(size[1]/shrinkfactor),int(size[0]/shrinkfactor)))
    sraw=cv.resize(raw,newsize,interpolation = cv.INTER_AREA)
    srotraw=cv.rotate(sraw, cv.ROTATE_90_COUNTERCLOCKWISE)
    cv.imwrite(out_path+filename,srotraw)

In [58]:
start=time.time()
for i,s in enumerate(files):
    clear_output(wait=True)
    shrink_rotate_write(s,6)
    stop=time.time()-start
    if i/len(modern_iodp_core_tops)*100<2:
        expected_time='Calculating...'
    else:
        expected_time=round((stop-0)/(i/len(modern_iodp_core_tops))/60,2)
    print('Current progress: ',round(i/len(modern_iodp_core_tops)*100,2),'%')
    print('Current run time: ',round((stop-0)/60,2),' minutes')
    print('Expected run time: ',expected_time,' minutes')

Current progress:  97.84 %
Current run time:  2.9  minutes
Expected run time:  2.97  minutes
