Include more cores (LV28-44-3, LV29-114-3, and SO178-12-3). Basically, they have higher TOC and lower carbonate measurements. SO178-12-3 only has TOC measurements.

In [1]:
import numpy as np 
import pandas as pd
import glob
import matplotlib.pyplot as plt

# Figure styling shared by every plot in this notebook.
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*' and
# later releases dropped the old names, so use whichever name this installation
# provides (the style is skipped if neither is available).
for _style_name in ('seaborn-v0_8-colorblind', 'seaborn-colorblind'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
plt.style.use('dark_background')

# High-resolution, tightly cropped, transparent figures
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300
plt.rcParams['savefig.bbox'] = 'tight'
plt.rcParams['savefig.transparent'] = True

%matplotlib inline

import datetime
# Date stamp (YYYYMMDD, taken when this cell runs) used in the names of the files written below
date = datetime.datetime.now().strftime('%Y%m%d')

# Read and build spectral datasets

## Cores with the same spe format as the previous cores

In [2]:
# Accumulators shared by all cores: one entry per spe file
file_name = []
spe_all = []
depth_all = []
cps_all = []
core_all = []
s_depth_all = []


def parse_start_depth_mm(depth_tag):
    """Convert the depth tag at the end of an spe file-name stem to millimetres.

    The naming is inconsistent: tags with 5 or more digits are already in mm,
    tags with 3 or 4 digits are in cm and are multiplied by 10.

    Raises:
        ValueError: if the tag is shorter than 3 characters. Previously such a
            tag silently reused the start depth of the preceding file.
    """
    if len(depth_tag) >= 5:
        return int(depth_tag)
    if len(depth_tag) >= 3:
        return int(depth_tag) * 10
    raise ValueError('unexpected depth tag {!r} in spe file name'.format(depth_tag))


# do it core by core
for core in ['data/LV29-114-3', 'data/SO178-12-3']:

    # only read the 10 kV run, which has a better signal for the light elements;
    # sorting makes the order follow the depths in the file names
    spe_dir = sorted(glob.glob('{}/Run 1 at  10kV/*.spe'.format(core)))

    for spe in spe_dir:
        # the 4th whitespace-separated token of the path ends with '_<depth tag>'
        check_depth = spe.split()[3].split('_')[-1]
        start_depth = parse_start_depth_mm(check_depth)

        with open(spe, 'r') as f:
            lines = f.readlines()

        # the channel counts start at line index 49; collect them in one pass
        # instead of growing an array line by line
        content = np.array(
            [int(count) for line in lines[49:] for count in line.split()],
            dtype=int
        )
        # line index 13 is used as the depth within the section (X position)
        section_depth = int(lines[13][:-3])

        # append everything only after the file parsed, so the lists stay in sync
        file_name.append(spe.split('/')[-1])
        spe_all.append(content)
        cps_all.append(int(lines[28]))
        core_all.append(core[5:])  # drop the leading 'data/'
        s_depth_all.append(section_depth)
        depth_all.append(section_depth + start_depth)

    print('core {} is done.'.format(core))

core data/LV29-114-3 is done.
core data/SO178-12-3 is done.


## LV28-44-3
The spe format of core LV28-44-3 differs from that of the previous cores, so the code that extracts the information needs to be modified.<br>
1. No X_Position: this value was used as the section depth. I take the value from the file name instead.
1. No TotalCPS: I simply use 0 as the value. If the CPS is needed in the future (so far we don't use it), the zeros from this core should be flagged, since a real CPS should never be 0.
1. The channel values start from line 22 instead of 49.

In [3]:
core = 'data/LV28-44-3'

# only read the 10 kV run, which has a better signal for the light elements;
# sorting makes the order follow the depths in the file names
spe_dir = sorted(glob.glob('{}/Run 1 at  10kV/*.spe'.format(core)))

for spe in spe_dir:
    # the 4th whitespace-separated token of the path ends with '_<depth tag>'
    check_depth = spe.split()[3].split('_')[-1]

    # the naming is inconsistent, as usual:
    # 5 or 6 digits means the tag is in mm
    if len(check_depth) >= 5:
        start_depth = int(check_depth)
    # 3 or 4 digits means the tag is in cm and must be multiplied to get mm
    elif len(check_depth) >= 3:
        start_depth = int(check_depth) * 10
    else:
        # previously a shorter tag silently reused the preceding file's start depth
        raise ValueError('unexpected depth tag {!r} in {}'.format(check_depth, spe))

    with open(spe, 'r') as f:
        lines = f.readlines()

    # the channel counts start at line index 22 in this format; collect them in
    # one pass instead of growing an array line by line
    content = np.array(
        [int(count) for line in lines[22:] for count in line.split()],
        dtype=int
    )
    # no X position in the file: take the section depth from the '<value>mm'
    # token of the file name
    section_depth = round(float(spe.split()[4][:-2]))

    # append everything only after the file parsed, so the lists stay in sync
    file_name.append(spe.split('/')[-1])
    spe_all.append(content)
    cps_all.append(0)  # no total CPS in this format; 0 is a placeholder
    core_all.append(core[5:])  # drop the leading 'data/'
    s_depth_all.append(section_depth)
    depth_all.append(section_depth + start_depth)

In [4]:
# One row per spe file: 2048 channel columns (named '0'..'2047') followed by the metadata
spe_df = pd.DataFrame(spe_all, columns=list(map(str, range(2048)))).assign(
    cps=cps_all,
    core=core_all,
    composite_depth_mm=depth_all,
    section_depth_mm=s_depth_all,
    filename=file_name,
)

In [5]:
# Preview the combined spectra table
spe_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2043,2044,2045,2046,2047,cps,core,composite_depth_mm,section_depth_mm,filename
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,69009,LV29-114-3,30,30,LV29-114-3_0000 30.0mm 10s 10kV 150uA No-F...
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,78171,LV29-114-3,40,40,LV29-114-3_0000 40.0mm 10s 10kV 150uA No-F...
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,77307,LV29-114-3,50,50,LV29-114-3_0000 50.0mm 10s 10kV 150uA No-F...
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,71940,LV29-114-3,70,70,LV29-114-3_0000 70.0mm 10s 10kV 150uA No-F...
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,73630,LV29-114-3,80,80,LV29-114-3_0000 80.0mm 10s 10kV 150uA No-F...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3501,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,LV28-44-3,11080,550,LV28-44-3_1053 550.0mm 12s 10kV 1200uA F1.spe
3502,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,LV28-44-3,11090,560,LV28-44-3_1053 560.0mm 12s 10kV 1200uA F1.spe
3503,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,LV28-44-3,11100,570,LV28-44-3_1053 570.0mm 12s 10kV 1200uA F1.spe
3504,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,LV28-44-3,11110,580,LV28-44-3_1053 580.0mm 12s 10kV 1200uA F1.spe


In [10]:
# Rows with a missing value in any column (an empty result means none)
spe_df.loc[spe_df.isna().any(axis='columns')]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2043,2044,2045,2046,2047,cps,core,composite_depth_mm,section_depth_mm,filename


## Build composite_id

In [20]:
# Largest composite depth, to check how many digits the zero-padded id needs
spe_df['composite_depth_mm'].max()

17000

In [21]:
# Label each spectrum as '<core>_<composite depth in mm, zero-padded to 5 digits>'
composite_id = [
    '{}_{:05}'.format(core_name, depth_mm)
    for core_name, depth_mm in zip(spe_df['core'], spe_df['composite_depth_mm'])
]

spe_df['composite_id'] = composite_id

## Drop duplicates

In [22]:
# For every repeated composite_id keep only the last spectrum
repeated_earlier = spe_df.duplicated(subset='composite_id', keep='last')
clean_df = spe_df[~repeated_earlier]
clean_df.shape[0]

3501

### Check those duplicates

In [23]:
# Metadata (last six columns) of the rows that were dropped as duplicates
spe_df[spe_df.duplicated('composite_id', keep='last')].iloc[:, -6:]

Unnamed: 0,cps,core,composite_depth_mm,section_depth_mm,filename,composite_id
2019,46462,SO178-12-3,13440,440,SO178-12-3_1300 440.0mm 10s 10kV 150uA No-F...,SO178-12-3_13440
2020,84886,SO178-12-3,13450,450,SO178-12-3_1300 450.0mm 10s 10kV 150uA No-F...,SO178-12-3_13450
2021,85193,SO178-12-3,13460,460,SO178-12-3_1300 460.0mm 10s 10kV 150uA No-F...,SO178-12-3_13460
2025,55317,SO178-12-3,13500,500,SO178-12-3_1300 500.0mm 10s 10kV 150uA No-F...,SO178-12-3_13500
3446,0,LV28-44-3,10540,1000,LV28-44-3_0954 1000.0mm 12s 10kV 1200uA F1.spe,LV28-44-3_10540


These are just overlaps at section edges, so I simply drop the earlier duplicates and keep the last measurement.

## Build section

In [25]:
# Use composite_id as the row index from here on
clean_df = clean_df.set_index('composite_id')

In [26]:
# Spectra measured at a section depth of exactly 0 mm
clean_df.loc[clean_df['section_depth_mm'].eq(0)]

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,2043,2044,2045,2046,2047,cps,core,composite_depth_mm,section_depth_mm,filename
composite_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


No section depth starts from 0.

In [28]:
section_all = []
# Make sure the order follows the core and the composite depth. composite_id starts
# with the core name, so each core forms one contiguous run, in the same order
# as np.unique(core) below.
clean_df = clean_df.sort_values(by='composite_id')

for core in np.unique(clean_df.core):
    # I assume every core is scanned from section 0, so the first section of a core
    # is marked as section 0; the deeper the section, the larger the number.
    # Work on a plain array: integer keys on the string-indexed Series relied on a
    # deprecated positional fallback.
    X = clean_df.loc[clean_df.core == core, 'section_depth_mm'].to_numpy()
    # when the section changes, the section depth resets to a smaller number
    new_section_starts = np.diff(X) < 0
    section_all.extend(np.concatenate(([0], np.cumsum(new_section_starts))).tolist())
    print('bottom of the core {}'.format(core))

clean_df['section'] = section_all

bottom of the core LV28-44-3
bottom of the core LV29-114-3
bottom of the core SO178-12-3


In [29]:
# Preview the cleaned spectra table, now including the section number
clean_df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,2044,2045,2046,2047,cps,core,composite_depth_mm,section_depth_mm,filename,section
composite_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LV28-44-3_00010,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,LV28-44-3,10,10,LV28-44-3_0000 10.0mm 12s 10kV 1200uA F1.spe,0
LV28-44-3_00020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,LV28-44-3,20,20,LV28-44-3_0000 20.0mm 12s 10kV 1200uA F1.spe,0
LV28-44-3_00030,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,LV28-44-3,30,30,LV28-44-3_0000 30.0mm 12s 10kV 1200uA F1.spe,0
LV28-44-3_00040,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,LV28-44-3,40,40,LV28-44-3_0000 40.0mm 12s 10kV 1200uA F1.spe,0
LV28-44-3_00050,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,LV28-44-3,50,50,LV28-44-3_0000 50.0mm 12s 10kV 1200uA F1.spe,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SO178-12-3_16960,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,54952,SO178-12-3,16960,900,SO178-12-3_1606 900.0mm 10s 10kV 150uA No-F...,17
SO178-12-3_16970,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,47476,SO178-12-3,16970,910,SO178-12-3_1606 910.0mm 10s 10kV 150uA No-F...,17
SO178-12-3_16980,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,59149,SO178-12-3,16980,920,SO178-12-3_1606 920.0mm 10s 10kV 150uA No-F...,17
SO178-12-3_16990,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,72548,SO178-12-3,16990,930,SO178-12-3_1606 930.0mm 10s 10kV 150uA No-F...,17


In [30]:
# Save the cleaned spectra table, stamped with the date on which the notebook ran
clean_df.to_csv('data/spe_dataset_{}.csv'.format(date))

## Read bulk chemistry

In [34]:
# Bulk chemistry measurements of core LV28-44-3
bulk_28_df = pd.read_excel('data/Bulk chem/LV28-44-3_TCN.xlsx')
bulk_28_df

Unnamed: 0,Depth,Age (ka),Sedrate (cm/ka),CaCO3 (wt. %),CaCO3 (AR g/cm2*ka),TOC (wt. %),TOC AR [g/cm2*ka],Nitrogen (wt. %),N2 AR [g/cm2*ka],OC/N ratio
0,2.5,1.3612,26.10,3.85,0.306080,1.4230,0.113130,0.1860,0.014787,8.9256
1,7.5,1.5530,26.10,4.99,0.407780,1.4020,0.114570,0.1915,0.015649,8.5413
2,12.5,1.7447,26.10,4.96,0.415940,1.2900,0.108180,0.1805,0.015137,8.3379
3,17.5,1.9364,26.10,2.92,0.270960,1.4595,0.135440,0.1935,0.017956,8.7997
4,22.5,2.1281,26.10,4.43,0.404850,1.3025,0.119030,0.1830,0.016724,8.3037
...,...,...,...,...,...,...,...,...,...,...
218,1087.5,154.4000,4.13,0.40,0.009638,0.5925,0.014276,0.0665,0.001602,10.3950
219,1092.5,155.6100,4.13,0.82,0.019395,0.5175,0.012240,0.0695,0.001644,8.6871
220,1097.5,156.8200,4.13,0.00,0.000000,0.5520,0.013235,0.0585,0.001403,11.0090
221,1102.5,158.0300,4.13,0.39,0.009811,0.5075,0.012767,0.0615,0.001547,9.6274


In [48]:
# Per column: does it contain any missing value?
bulk_28_df.isnull().any(axis=0)

Depth                  False
Age (ka)               False
Sedrate (cm/ka)        False
CaCO3 (wt. %)          False
CaCO3 (AR g/cm2*ka)    False
TOC (wt. %)            False
TOC AR [g/cm2*ka]      False
Nitrogen (wt. %)       False
N2 AR [g/cm2*ka]       False
OC/N ratio             False
dtype: bool

In [36]:
# Bulk chemistry measurements of core LV29-114-3
bulk_29_df = pd.read_excel('data/Bulk chem/LV29 114-3_TOC%_CaCO3%LMax.xls')
bulk_29_df

Unnamed: 0,Age (ka BP),Depth (cm),TOC(%),TC(%),CaCO3 (%)
0,1.245858,0,0.9075,1.06,1.270783
1,1.281507,1,0.8100,1.11,2.499900
2,1.424100,5,0.8100,1.15,2.833220
3,1.602300,10,0.8922,1.07,1.481607
4,1.780600,15,0.9900,1.17,1.499940
...,...,...,...,...,...
79,,580,0.7585,,
80,,590,0.7296,,
81,,600,0.4875,,
82,,590,0.7296,,


In [37]:
# Keep only the samples in which every column has a value
bulk_29_df = bulk_29_df[bulk_29_df.notna().all(axis=1)]
bulk_29_df

Unnamed: 0,Age (ka BP),Depth (cm),TOC(%),TC(%),CaCO3 (%)
0,1.245858,0,0.9075,1.060000,1.270783
1,1.281507,1,0.8100,1.110000,2.499900
2,1.424100,5,0.8100,1.150000,2.833220
3,1.602300,10,0.8922,1.070000,1.481607
4,1.780600,15,0.9900,1.170000,1.499940
...,...,...,...,...,...
66,23.037001,450,0.6454,0.683737,0.319458
67,23.761709,460,0.6732,0.805945,1.106163
68,24.486417,470,0.6321,0.728977,0.807274
70,25.938793,490,0.6148,0.694443,0.663667


Drop the 16 rows that have null values.

In [46]:
# Measurements of core SO178-12-3; the first row is the header and only the
# first three columns are read
bulk_178_df = pd.read_table('data/Bulk chem/SO178-12-3_TOC.txt', header=0, usecols=range(3))
bulk_178_df

Unnamed: 0,Teufe(cm),Kohlenstoff(%),Schwefel(%)
0,0.5,0.83,0.28
1,11.5,0.84,0.26
2,21.5,0.94,0.30
3,31.5,1.00,0.33
4,41.5,1.00,0.34
...,...,...,...
58,591.5,0.73,0.35
59,601.5,0.71,0.28
60,621.5,0.73,0.38
61,641.5,0.76,0.30


In [47]:
# Per column: does it contain any missing value?
bulk_178_df.isnull().any(axis=0)

Teufe(cm)         False
Kohlenstoff(%)    False
Schwefel(%)       False
dtype: bool

## Merge three cores' bulk chemistry
The depths in LV29-114-3 are all integers instead of XX.5 like the previous cores, which use the mid depth. I assume this is a mistake, so I add 0.5 to the depths.

In [68]:
# Stack the three cores in a fixed order: LV28-44-3, LV29-114-3, SO178-12-3.
# The LV29-114-3 depths are shifted by 0.5 cm to turn them into mid depths.
depth = np.hstack((bulk_28_df['Depth'].values, bulk_29_df['Depth (cm)'] + .5, bulk_178_df['Teufe(cm)']))
toc = np.hstack((bulk_28_df['TOC (wt. %)'].values, bulk_29_df['TOC(%)'], bulk_178_df['Kohlenstoff(%)']))
# SO178-12-3 doesn't have CaCO3, so fill it with NaN
# (np.nan rather than the np.NaN alias, which NumPy 2.0 removed)
caco3 = np.hstack((bulk_28_df['CaCO3 (wt. %)'].values, bulk_29_df['CaCO3 (%)'], np.full(len(bulk_178_df), np.nan)))
# One core label per sample, in the same order as the stacked values above
core = np.repeat(
    ['LV28-44-3', 'LV29-114-3', 'SO178-12-3'],
    [len(bulk_28_df), len(bulk_29_df), len(bulk_178_df)]
)

In [69]:
# The four stacked arrays must all have the same length
print(*(len(values) for values in (depth, toc, caco3, core)))

354 354 354 354


In [70]:
# Assemble the bulk table; depths are converted from cm to mm
bulk_columns = {
    'mid_depth_mm': 10 * depth,
    'TOC%': toc,
    'CaCO3%': caco3,
    'core': core,
}
bulk_df = pd.DataFrame(bulk_columns)
bulk_df

Unnamed: 0,mid_depth_mm,TOC%,CaCO3%,core
0,25.0,1.4230,3.85,LV28-44-3
1,75.0,1.4020,4.99,LV28-44-3
2,125.0,1.2900,4.96,LV28-44-3
3,175.0,1.4595,2.92,LV28-44-3
4,225.0,1.3025,4.43,LV28-44-3
...,...,...,...,...
349,5915.0,0.7300,,SO178-12-3
350,6015.0,0.7100,,SO178-12-3
351,6215.0,0.7300,,SO178-12-3
352,6415.0,0.7600,,SO178-12-3


In [73]:
# Save the bulk chemistry table of the three new cores, stamped with the run date
bulk_df.to_csv('data/bulk_dataset_{}.csv'.format(date))

# Combine the dataset to the previous datasets

The new cores all lack TC, and core SO178-12-3 also lacks CaCO3.

In [62]:
# Load the previously built bulk and spectral datasets from their dated CSV exports;
# the first CSV column is used as the row index
bulk_p_df = pd.read_csv('data/bulk_dataset_20201007.csv', index_col=0)
spe_p_df = pd.read_csv('data/spe_dataset_20201008.csv', index_col=0)
# Shapes of the new tables (first line) and of the previous tables (second line)
print(bulk_df.shape, clean_df.shape)
print(bulk_p_df.shape, spe_p_df.shape)

(354, 4) (3501, 2054)
(388, 5) (34322, 2054)


In [71]:
# Stack previous and new rows; columns missing on one side are filled with NaN
bulk_c_df, spe_c_df = (
    pd.concat([previous, new], axis=0, join='outer')
    for previous, new in ((bulk_p_df, bulk_df), (spe_p_df, clean_df))
)
print(bulk_c_df.shape, spe_c_df.shape)

(742, 5) (37823, 2054)


In [72]:
# Preview the combined bulk chemistry table
bulk_c_df

Unnamed: 0,TC%,TOC%,CaCO3%,core,mid_depth_mm
0,2.542079,0.394127,17.898887,SO264-64-1,115.0
1,2.247150,0.611208,13.632300,SO264-64-1,215.0
2,0.710588,0.523402,1.559822,SO264-64-1,305.0
3,0.562171,0.472551,0.746802,SO264-64-1,1015.0
4,0.578167,0.312852,2.210866,SO264-64-1,1815.0
...,...,...,...,...,...
349,,0.730000,,SO178-12-3,5915.0
350,,0.710000,,SO178-12-3,6015.0
351,,0.730000,,SO178-12-3,6215.0
352,,0.760000,,SO178-12-3,6415.0


## Quick view

In [2]:
# Reload the new and the previous spectral tables from their dated CSV exports.
# NOTE(review): this overwrites the in-memory clean_df with the export dated 20201215 —
# confirm that file matches the table built earlier in this notebook.
clean_df = pd.read_csv('data/spe_dataset_20201215.csv', index_col=0)
spe_p_df = pd.read_csv('data/spe_dataset_20201008.csv', index_col=0)

In [None]:
# Mean count per channel for three groups of spectra, drawn in this order
channels = range(2048)
new_spectra = clean_df[clean_df.columns[:2048]]
is_lv28 = clean_df.core == 'LV28-44-3'
averaged_spectra = [
    ('LV28-44-3', new_spectra[is_lv28].mean()),
    ('Other two cores', new_spectra[~is_lv28].mean()),
    ('SO264', spe_p_df[spe_p_df.columns[:2048]].mean()),
]
for group_label, mean_counts in averaged_spectra:
    plt.plot(channels, mean_counts, label=group_label, alpha=.7)

plt.suptitle('Averaged spectrum')
# only the lower channels are shown
plt.xlim(0, 630)
plt.xlabel('Channel')
plt.ylabel('Count')
plt.legend()
plt.savefig('results/spectrum_{}.png'.format(date))

# Merge spe and bulk datasets

In [74]:
mask_c = spe_c_df.columns[:2048]  # only the channels
# Collect one combined record per bulk sample and build the table once at the end.
# The previous version called pd.concat inside the loop (quadratic) and used
# Series.append, which pandas 2.0 removed.
merged_rows = []

for index, row in bulk_c_df.iterrows():
    mid = row['mid_depth_mm']
    core = row['core']

    # spectra of the same core within +/- 5 mm of the sample's mid depth (bounds included)
    mask_r = (spe_c_df.composite_depth_mm >= (mid-5)) & (spe_c_df.composite_depth_mm <= (mid+5)) & (spe_c_df.core == core)
    # channel-wise mean of the matching spectra; all NaN when no spectrum matches
    mean_spectrum = spe_c_df.loc[mask_r, mask_c].mean(axis = 0)
    merged_rows.append(pd.concat([mean_spectrum, row]))

# one row per bulk sample: 2048 averaged channels followed by the bulk columns
if merged_rows:
    merge_df = pd.concat(merged_rows, axis = 1).T.reset_index(drop = True)
else:
    merge_df = pd.DataFrame()

In [75]:
# Preview the merged table (averaged spectra followed by the bulk measurements)
merge_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2043,2044,2045,2046,2047,TC%,TOC%,CaCO3%,core,mid_depth_mm
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2.54208,0.394127,17.8989,SO264-64-1,115
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2.24715,0.611208,13.6323,SO264-64-1,215
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0.710588,0.523402,1.55982,SO264-64-1,305
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0.562171,0.472551,0.746802,SO264-64-1,1015
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0.578167,0.312852,2.21087,SO264-64-1,1815
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
737,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,,0.73,,SO178-12-3,5915
738,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,,0.71,,SO178-12-3,6015
739,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,,0.73,,SO178-12-3,6215
740,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,,0.76,,SO178-12-3,6415


### Check rows having nan in any column 

In [76]:
# Rows with a missing value in any column, spectra or bulk measurements
merge_df.loc[merge_df.isna().any(axis='columns')]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2043,2044,2045,2046,2047,TC%,TOC%,CaCO3%,core,mid_depth_mm
37,,,,,,,,,,,...,,,,,,0.565117,0.385465,1.49704,SO264-66-2,395
149,,,,,,,,,,,...,,,,,,9.91158,0.151331,81.3321,SO264-28-2,7595
150,,,,,,,,,,,...,,,,,,7.79372,0.154413,63.6583,SO264-55-1,25
151,,,,,,,,,,,...,,,,,,7.72836,0.131972,63.3007,SO264-55-1,75
330,,,,,,,,,,,...,,,,,,8.79912,0.123676,72.2924,SO264-15-2,7805
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
737,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,,0.73,,SO178-12-3,5915
738,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,,0.71,,SO178-12-3,6015
739,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,,0.73,,SO178-12-3,6215
740,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,,0.76,,SO178-12-3,6415


### Check rows having NaN in spectra
These are data points that have bulk measurements but no XRF measurement.

In [77]:
# Rows with a missing value in any of the 2048 channel columns
merge_df.loc[merge_df.iloc[:, :2048].isna().any(axis='columns')]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2043,2044,2045,2046,2047,TC%,TOC%,CaCO3%,core,mid_depth_mm
37,,,,,,,,,,,...,,,,,,0.565117,0.385465,1.49704,SO264-66-2,395
149,,,,,,,,,,,...,,,,,,9.91158,0.151331,81.3321,SO264-28-2,7595
150,,,,,,,,,,,...,,,,,,7.79372,0.154413,63.6583,SO264-55-1,25
151,,,,,,,,,,,...,,,,,,7.72836,0.131972,63.3007,SO264-55-1,75
330,,,,,,,,,,,...,,,,,,8.79912,0.123676,72.2924,SO264-15-2,7805
331,,,,,,,,,,,...,,,,,,10.9751,0.0985913,90.6339,SO264-15-2,8005
611,,,,,,,,,,,...,,,,,,,0.9075,1.27078,LV29-114-3,5
612,,,,,,,,,,,...,,,,,,,0.81,2.4999,LV29-114-3,15
619,,,,,,,,,,,...,,,,,,,0.75,3.49986,LV29-114-3,355
620,,,,,,,,,,,...,,,,,,,0.8427,1.95049,LV29-114-3,405


In [78]:
# Rows in which every one of the 2048 channel columns has a value
merge_df[merge_df.iloc[:, :2048].notnull().all(axis = 1)]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2043,2044,2045,2046,2047,TC%,TOC%,CaCO3%,core,mid_depth_mm
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2.54208,0.394127,17.8989,SO264-64-1,115
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2.24715,0.611208,13.6323,SO264-64-1,215
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0.710588,0.523402,1.55982,SO264-64-1,305
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0.562171,0.472551,0.746802,SO264-64-1,1015
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0.578167,0.312852,2.21087,SO264-64-1,1815
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
735,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,,0.71,,SO178-12-3,5715
737,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,,0.73,,SO178-12-3,5915
738,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,,0.71,,SO178-12-3,6015
739,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,,0.73,,SO178-12-3,6215


Compared to the previous merged dataset (382 data points), the updated dataset has 317 more data points.

## Export dataset
This dataset combines the previous and updated merged datasets. Data points without TC or CaCO3 measurements are still kept.

In [79]:
# Export only the rows whose 2048 channel columns are all present
has_full_spectrum = merge_df.iloc[:, :2048].notnull().all(axis = 1)
merge_df[has_full_spectrum].to_csv('data/spe+bulk_dataset_{}.csv'.format(date))