In [None]:
import os
import pandas as pd
import numpy as np

In [None]:
# Load the raw spreadsheets for this run.

# XRF: restricting to columns A:BB leaves out the repeats and standards.
# The first column becomes the index.
xrf = pd.read_excel(
    'raw/GAL-DV-21_XRF.xlsx',
    usecols='A:BB',
    index_col=0,
)

# ICPMS: reading only the first 54 rows leaves out the repeats and standards.
# The first column becomes the index.
icpms = pd.read_excel(
    'raw/DV-21-ICPMS.xlsx',
    index_col=0,
    nrows=54,
)

# Output directory for processed tables (no error if it already exists).
os.makedirs('processed', exist_ok=True)

In [None]:
# --- Slice the XRF sheet into its sections ---------------------------------
# Each 2-D slice is transposed so the sheet's columns become the row index
# (the original workflow treats these as the sample IDs).
majors_unnorm = xrf.iloc[5:15, :].T    # unnormalized major elements
majors_norm = xrf.iloc[19:29, :].T     # normalized major elements
xrf_trace = xrf.iloc[32:51, :].T       # XRF trace elements
loi = xrf.iloc[16, :]                  # LOI row; a single row is already a Series

# --- Tidy the column labels ------------------------------------------------
# Major elements: drop embedded spaces; unnormalized values get a suffix so
# they cannot collide with the normalized ones after concatenation.
unnorm_labels = majors_unnorm.columns.str.replace(' ','')
majors_unnorm_corr = majors_unnorm.copy()
majors_unnorm_corr.columns = [label + '_unnorm' for label in unnorm_labels]

norm_labels = majors_norm.columns.str.replace(' ','')
majors_norm_corr = majors_norm.copy()
majors_norm_corr.columns = norm_labels

# XRF trace elements: drop the first character of every label
# (a leading space, per the original note).
xrf_trace_corr = xrf_trace.copy()
xrf_trace_corr.columns = xrf_trace.columns.str[1:]

# ICPMS: drop the last four characters of every label (the 'ppm' unit tag).
icpms_corr = icpms.copy()
icpms_corr.columns = icpms.columns.str[:-4]

# --- Prefer ICPMS where both methods report the same element ---------------
common_cols = xrf_trace_corr.columns.intersection(icpms_corr.columns)
xrf_trace_culled = xrf_trace_corr.drop(columns=common_cols)

# --- Sanity checks: labels of each piece, then whether the indexes agree ---
for report in (
    majors_unnorm_corr.columns,
    majors_norm_corr.columns,
    loi.name,
    xrf_trace_culled.columns,
    icpms_corr.columns,
    majors_norm_corr.index.equals(xrf_trace_culled.index),
    xrf_trace_culled.index.equals(icpms_corr.index),
):
    print(report)

# --- Assemble everything side by side into one table -----------------------
pieces = [majors_unnorm_corr, majors_norm_corr, loi, xrf_trace_culled, icpms_corr]
data_organized = pd.concat(pieces, axis=1)
print(data_organized.columns)

In [None]:
# Load the sample metadata, indexed by its first column
meta = pd.read_csv('metadata/gchm_smps_long.csv', index_col=0)

# Metadata fields to carry into the geochemistry table
meta_cols = ['Latitude','Longitude','Elevation','Rock_Type','Period','S_Domain']

# Temporary zero-filled placeholder rows for samples 184 and 186
for placeholder_id in ('G22184', 'G22186'):
    meta.loc[placeholder_id, :] = 0

# Correct the misspelling 'Khashi' -> 'Khaishi' in S_Domain,
# printing the affected sample IDs first
is_typo = meta['S_Domain'] == 'Khashi'
typo = meta.loc[is_typo].index
print(typo)
meta.loc[typo, 'S_Domain'] = 'Khaishi'

# Restrict to the samples that were actually analysed, in the same order
# as the geochemistry table, keeping only the fields of interest
meta_trimmed = meta.loc[data_organized.index, meta_cols]

# Attach the metadata columns to the geochemistry table (index-aligned join)
data_final = data_organized.join(meta_trimmed)

In [None]:
# Load the previously processed 2019 table and stack it beneath this run's data
data_2019 = pd.read_csv('processed/data_2019.csv', index_col=0)

data_combined = pd.concat([data_final, data_2019], axis=0)

# Show the resulting index and column labels
for labels in (data_combined.index, data_combined.columns):
    print(labels)

In [None]:
# Overwrite Rock_Type with the thin-section lithology for every sample
# listed in the lithology file
lith = pd.read_csv('metadata/lithology.csv', index_col=0)
thin_section_names = lith['Rock Name']
data_combined.loc[thin_section_names.index, 'Rock_Type'] = thin_section_names

In [None]:
# Write to CSV
# Saves the combined table to the processed-data directory; with to_csv's
# defaults the index is written as the first column.
data_combined.to_csv('processed/data.csv')

In [None]:
# Import the master trace-element/isotope compilation sheet from Gale13
data = pd.read_excel(
    'published/galeetal_supplementarytables_final.xlsx',
    sheet_name='5- Master TE-Iso Compilation.',
)

# Isolate back-arc basins: rows whose segment name starts with 'B'.
# The column is cast to str first so non-string entries (e.g. NaN) are
# compared as text instead of yielding NaN in the mask.
is_bab = data['Seg Name'].astype(str).str.startswith('B')

# Cells holding a single space are treated as empty and become NaN.
# replace() is applied without inplace=True: the filtered frame is derived
# from `data`, and mutating it in place can trigger pandas'
# SettingWithCopyWarning. The returned frame has the same content.
bab_data = data[is_bab].replace(' ', np.nan)

# Save modified data
bab_data.to_csv('processed/data_gale.csv')