In [1]:
import os
import pandas as pd

In [10]:
# Load the raw spreadsheets

# XRF workbook: only spreadsheet columns A-D are read, the second sheet row
# supplies the header, and the first column becomes the index.
# (Per the original note, this selection leaves out repeats and standards.)
xrf = pd.read_excel(
    'raw/VAS1980.xls',
    header=1,
    index_col=0,
    usecols='A:D',
)

# ICPMS workbook: only the first 54 data rows are read, with the first column
# as the index. (Per the original note, this leaves out repeats and standards.)
icpms = pd.read_excel(
    'raw/vas930.xls',
    index_col=0,
    nrows=54,
)

# Ensure the folder for processed output exists (no error if already present)
os.makedirs('processed', exist_ok=True)

In [16]:
# Build one samples-by-analytes table from the XRF and ICPMS sheets.
# NOTE: the row ranges and label slices below are positional and tied to the
# layout of the raw spreadsheets; re-check them if the input files change.

# Pull normalized major elements from XRF and transpose so samples are the index
majors_norm = xrf.iloc[18:28,:].T

# Remove every space character from the major-element column labels
new_cols = majors_norm.columns.str.replace(' ','')
majors_norm_corr = majors_norm.copy()
majors_norm_corr.columns = new_cols

# Pull XRF trace elements and transpose so samples are the index
xrf_trace = xrf.iloc[31:50,:].T

# Drop the first character of each XRF trace label
# (presumably a leading space in the raw sheet -- verify against the file)
new_cols = xrf_trace.columns.str[1:]
xrf_trace_corr = xrf_trace.copy()
xrf_trace_corr.columns = new_cols

# Drop the last four characters of each ICPMS column label
# (presumably a trailing ' ppm' unit suffix -- verify against the file)
new_cols = icpms.columns.str[:-4]
icpms_corr = icpms.copy()
icpms_corr.columns  = new_cols

# Drop the first four characters of each ICPMS sample label
# (presumably a 'VAS' prefix plus one separator -- verify against the file)
icpms_indices_stripped = icpms_corr.index.str[4:]
icpms_corr.index = icpms_indices_stripped

# Remove XRF trace elements that are also reported by ICPMS
common_cols = xrf_trace_corr.columns.intersection(icpms_corr.columns)
xrf_trace_culled = xrf_trace_corr.drop(common_cols,axis=1)

# Check what is in each table: columns, then index, for all three pieces
print(majors_norm_corr.columns)
# Show the majors index here (previously the XRF trace index was printed twice
# and the majors index was never displayed)
print(majors_norm_corr.index)
print(xrf_trace_culled.columns)
print(xrf_trace_culled.index)
print(icpms_corr.columns)
print(icpms_corr.index)

# Both comparisons should print True: pd.concat(axis=1) aligns rows on the
# index with an outer join, so mismatched labels would add NaN-filled rows
print(majors_norm_corr.index.equals(xrf_trace_culled.index))
print(xrf_trace_culled.index.equals(icpms_corr.index))


# Combine into a single dataframe (majors, then XRF-only traces, then ICPMS)
data_organized = pd.concat([majors_norm_corr,xrf_trace_culled,icpms_corr],axis=1)
print(data_organized.columns)
print(data_organized.index)

Index(['SiO2', 'TiO2', 'Al2O3', 'FeO*', 'MnO', 'MgO', 'CaO', 'Na2O', 'K2O',
       'P2O5'],
      dtype='object')
Index(['C17069B', 'V17043A', '100311-3A'], dtype='object')
Index(['Ni', 'Cr', 'V', 'Ga', 'Cu', 'Zn'], dtype='object')
Index(['C17069B', 'V17043A', '100311-3A'], dtype='object')
Index(['La', 'Ce', 'Pr', 'Nd', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm',
       'Yb', 'Lu', 'Ba', 'Th', 'Nb', 'Y', 'Hf', 'Ta', 'U', 'Pb', 'Rb', 'Cs',
       'Sr', 'Sc', 'Zr'],
      dtype='object')
Index(['C17069B', 'V17043A', '100311-3A'], dtype='object', name='Sample ID')
True
True
Index(['SiO2', 'TiO2', 'Al2O3', 'FeO*', 'MnO', 'MgO', 'CaO', 'Na2O', 'K2O',
       'P2O5', 'Ni', 'Cr', 'V', 'Ga', 'Cu', 'Zn', 'La', 'Ce', 'Pr', 'Nd', 'Sm',
       'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Ba', 'Th', 'Nb',
       'Y', 'Hf', 'Ta', 'U', 'Pb', 'Rb', 'Cs', 'Sr', 'Sc', 'Zr'],
      dtype='object')
Index(['C17069B', 'V17043A', '100311-3A'], dtype='object')


In [17]:
# Load the sample metadata table, using its first column as the index
meta = pd.read_csv('metadata/gchm_qe.csv', index_col=0)

# Metadata fields to carry over into the final table
meta_cols = [
    'Latitude',
    'Longitude',
    'Rock_Type',
    'Period',
    'S_Domain',
]

# Keep only the rows whose labels appear in the geochemistry table's index
# (label-based .loc lookup, restricted to the fields listed above)
meta_trimmed = meta.loc[data_organized.index, meta_cols]

# Append the metadata columns to the geochemistry columns, aligned on index
data_final = data_organized.join(meta_trimmed)

In [18]:
# Save the combined table as CSV in the processed-data folder
# (the sample index is written as the first column)
data_final.to_csv(path_or_buf='processed/data_qe.csv')