In [1]:
import re
import os
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
import glob
from pandas.io.pytables import HDFStore

In [2]:
from sphandles.sphandle import sphandle
from sphandles.ML_Train import mltrain

In [3]:
def parse_filename(fname):
    pattern = ".*__(\w+).csv"
    m = re.match(pattern, fname)
    return m.group(1)

def read_csv(fname,dropcols=None, dropzero = None):
    dropcols = dropcols if dropcols else []
    label = parse_filename(fname)
    df = pd.read_csv('./{}'.format(fname),
                    index_col=0)
    df = df[[c for c in df.columns if c.endswith('mass_g') and c not in dropcols]].fillna(0.0)
    df.columns = df.columns.str.replace('_mass_g','')
    if dropzero != None:
        df = df[abs(df).T.sum() > 0].reset_index(drop=True)
        df[df < 0] = 0
        df['label'] = label
        df = df.set_index('label')
    return df, label


#user function: loads a list of variable path names and assigns a label 
def load_and_label(pathname, newlabel, DROPCOLS):
    data1 = pd.DataFrame()
    for i in glob.glob(pathname):
        data, soil_label = read_csv(i,DROPCOLS, 1)
        data1 = pd.concat([data,data1], axis = 0, sort=False)
    data1['newlabel'] = newlabel
    data1 = data1.set_index('newlabel')
    return data1

#Type: Data Frame. From a selected isotope and its particle events, select all other isotopes associated and drop others not associated with it
def isotope_particle(data, isotope):
    obs = data[data[isotope] > 0.0]
    return obs

In [5]:
DROPCOLS = ['23Na_mass_g', '27Al_mass_g', '28Si_mass_g', '31P_mass_g', '32S_mass_g', '34S_mass_g', '39K_mass_g', '40Ca_mass_g', '44Ca_mass_g',
            '78Se_mass_g', '80Se_mass_g', '56Fe_mass_g', '87Rb_mass_g', '238U_mass_g']

#Data set run from 6_07_19
tiobs, ti_label = sphandle.read_csv('sample_data/6_07_19/All_Quantified__67Tiobs16.csv', dropcols=DROPCOLS)
lufa56, lufa_label = sphandle.read_csv('sample_data/6_07_19/All_Quantified__67LufaBlank16.csv', dropcols=DROPCOLS)
Lufa100, lufa100_label = sphandle.read_csv('sample_data/6_07_19/All_Quantified__67Lufa100ppm16.csv', dropcols=DROPCOLS)
Lufa1k, lufa1k_label =  sphandle.read_csv('sample_data/6_07_19/All_Quantified__67Lufa1000ppm16.csv', dropcols=DROPCOLS)
Lufa10k, lufa10k_label = sphandle.read_csv('sample_data/6_07_19/All_Quantified__67Lufa10000ppm16.csv', dropcols=DROPCOLS)

#concatonate all Tiobs samples - from past
Tiobs1, ti_label = sphandle.read_csv('sample_data/6_07_19/All_Quantified__67Tiobs16.csv', dropcols=DROPCOLS)
Tiobs2, ti_label = sphandle.read_csv('sample_data/4_12_19/All_Quantified__Tiobs5.csv', dropcols=DROPCOLS)
Tiobs3, ti_label = sphandle.read_csv('sample_data/4_12_19/All_Quantified__Tiobs3.csv', dropcols=DROPCOLS)
Tiobs = pd.concat([Tiobs1, Tiobs2], axis=0, sort=False)
Tiobs['newlabel'] = 'Engineered'
Tiobs = Tiobs.set_index('newlabel')

#Lufa 2.2 Soil
LufaBlank1, lufa_label = sphandle.read_csv('sample_data/7_02_19/All_Quantified__LufaBlank11.csv', DROPCOLS)
LufaBlank2, lufa_label = sphandle.read_csv('sample_data/7_02_19/All_Quantified__LufaBlank21.csv', DROPCOLS)
LufaBlank3, lufa_label = sphandle.read_csv('sample_data/7_02_19/All_Quantified__LufaBlank31.csv', DROPCOLS)
LufaBlank4, lufa_label = sphandle.read_csv('sample_data/7_02_19/All_Quantified__LufaBlank41.csv', DROPCOLS)
LufaBlank5, lufa_label = sphandle.read_csv('sample_data/7_02_19/All_Quantified__LufaBlank51.csv', DROPCOLS)
LufaBlank6, lufa_label = sphandle.read_csv('sample_data/7_02_19/All_Quantified__LufaBlank12.csv', DROPCOLS)
LufaBlank7, lufa_label = sphandle.read_csv('sample_data/7_02_19/All_Quantified__LufaBlank22.csv', DROPCOLS)
LufaBlank8, lufa_label = sphandle.read_csv('sample_data/7_02_19/All_Quantified__LufaBlank32.csv', DROPCOLS)
LufaBlank9, lufa_label = sphandle.read_csv('sample_data/7_02_19/All_Quantified__LufaBlank42.csv', DROPCOLS)
LufaBlank10, lufa_label = sphandle.read_csv('sample_data/7_02_19/All_Quantified__LufaBlank52.csv', DROPCOLS)
LufaBlank = pd.concat([LufaBlank1, LufaBlank2, LufaBlank3, LufaBlank4, LufaBlank5, LufaBlank6, LufaBlank7, LufaBlank8, LufaBlank9, LufaBlank10], axis=0, sort=False)

LufaBlank['newlabel'] = 'Natural'
LufaBlank = LufaBlank.set_index('newlabel')

#Frsludge
Frsludge1, sl_label = sphandle.read_csv('sample_data/4_30_19/All_Quantified__Frsludge44.csv', DROPCOLS)
Frsludge2, sl_label = sphandle.read_csv('sample_data/6_07_19/All_Quantified__67Frsludge16.csv', DROPCOLS)
Frsludge = pd.concat([Frsludge1, Frsludge2], axis=0, sort=False)
Frsludge['newlabel'] = 'Engineered'
Frsludge = Frsludge.set_index('newlabel')

#Soils
L221, soil_label = sphandle.read_csv('sample_data/8_19_19/All_Quantified__22soil1.csv', DROPCOLS)
L222, soil_label = sphandle.read_csv('sample_data/8_19_19/All_Quantified__22soil2.csv', DROPCOLS)
L223, soil_label = sphandle.read_csv('sample_data/8_19_19/All_Quantified__22soil3.csv', DROPCOLS)
L231, soil_label = sphandle.read_csv('sample_data/8_19_19/All_Quantified__23soilB1.csv', DROPCOLS)
L232, soil_label = sphandle.read_csv('sample_data/8_19_19/All_Quantified__23soilB2.csv', DROPCOLS)
L233, soil_label = sphandle.read_csv('sample_data/8_19_19/All_Quantified__23soilB3.csv', DROPCOLS)
L241, soil_label = sphandle.read_csv('sample_data/8_19_19/All_Quantified__24soilB1.csv', DROPCOLS)
L242, soil_label = sphandle.read_csv('sample_data/8_19_19/All_Quantified__24soilB2.csv', DROPCOLS)
L243, soil_label = sphandle.read_csv('sample_data/8_19_19/All_Quantified__24soilB3.csv', DROPCOLS)
Fr1, soil_label = sphandle.read_csv('sample_data/8_19_19/All_Quantified__Frsoil1.csv', DROPCOLS)
Fr2, soil_label = sphandle.read_csv('sample_data/8_19_19/All_Quantified__Frsoil2.csv', DROPCOLS)
Fr3, soil_label = sphandle.read_csv('sample_data/8_19_19/All_Quantified__Frsoil3.csv', DROPCOLS)
Az1, soil_label = sphandle.read_csv('sample_data/8_19_19/All_Quantified__AzsoilB1.csv', DROPCOLS)
Az2, soil_label = sphandle.read_csv('sample_data/8_19_19/All_Quantified__AzsoilB2.csv', DROPCOLS)
Az3, soil_label = sphandle.read_csv('sample_data/8_19_19/All_Quantified__AzsoilB3.csv', DROPCOLS)

L22 = pd.concat([L221, L222, L223], axis=0, sort = False)
L23 = pd.concat([L231, L232, L233], axis=0, sort = False)
L24 = pd.concat([L241, L242, L243], axis=0, sort = False)
Fr = pd.concat([Fr1,Fr2,Fr3], axis=0, sort=False)
Az = pd.concat([Az1,Az2,Az3], axis=0, sort=False)

L22['newlabel'] = 'Natural'
L23['newlabel'] = 'Natural'
L24['newlabel'] = 'Natural'
Fr['newlabel'] = 'Natural'
Az['newlabel'] = 'Natural'

L22 = L22.hiset_index('newlabel')
L23 = L23.set_index('newlabel')
L24 = L24.set_index('newlabel')
Fr = Fr.set_index('newlabel')
Az = Az.set_index('newlabel')

#Ti30nm
#Since Ti sample size was so big, use only one dataset
#Ti30nm, ti_label = read_csv('../USARMY/8_19_19/All_Quantified__30nm1.csv', DROPCOLS) THIS IS NOT 30nm
Ti30nm, ti_label = sphandle.read_csv('sample_data/8_19_19/All_Quantified__30nm2.csv', DROPCOLS)
Ti30nm2, ti_label = sphandle.read_csv('sample_data/8_19_19/All_Quantified__30nm1.csv', DROPCOLS)

#Ti30nm = pd.concat([Ti30nm1, Ti30nm2], axis = 0, sort = False)
Ti30nm['newlabel'] = 'Engineered'
Ti30nm = Ti30nm.set_hiindex('newlabel')

#With focus on Ti-containing materials only. Need to eliminate 'noise' in TiobsTi
LufaBlankTi = sphandle.isotope_particle(LufaBlank, '48Ti')
TiobsTi = sphandle.isotope_particle(Tiobs, '48Ti')

#08_19_19
Ti30nmTi = sphandle.isotope_particle(Ti30nm, '48Ti')
L22Ti = sphandle.isotope_particle(L22, '48Ti')
L23Ti = sphandle.isotope_particle(L23, '48Ti')
L24Ti = sphandle.isotope_particle(L24, '48Ti')
FrTi = sphandle.isotope_particle(Fr, '48Ti')
AzTi = sphandle.isotope_particle(Az, '48Ti')

#combination of 4/30/19 and 06/07/19
FrslTi = sphandle.isotope_particle(Frsludge, '48Ti')

fullset = [AzTi, FrTi, LufaBlankTi, TiobsTi, FrslTi, Ti30nmTi]
keys = ['Az', 'Frso', 'L22', 'OBS', 'Frsl', 'Ti30nm']

AttributeError: 'DataFrame' object has no attribute 'hiset_index'

In [None]:
store = pd.HDFStore('sample_data/H5 files/alldfsnormalnoAltest.h5')
for i, j in zip(fullset, keys):
    store.put(j,i)
store.close()

# HEH Mode

In [27]:
DROPCOLS = ['23Na_mass_g', '31P_mass_g', 
            '32S_mass_g', '34S_mass_g', '39K_mass_g']

AZ2 = sphandle.load_and_label('sample_data/11_12_19_HEH/All_Quantified__AZso*.csv', 'Natural', DROPCOLS)
Frso2 = sphandle.load_and_label('sample_data/11_12_19_HEH/All_Quantified__Frso*.csv', 'Natural', DROPCOLS) 
L221c2 = sphandle.load_and_label('sample_data/11_12_19_HEH/All_Quantified__L22*.csv', 'Natural', DROPCOLS)
Frsl2 = sphandle.load_and_label('sample_data/11_12_19_HEH/All_Quantified__Frsl*.csv', 'Engineered', DROPCOLS)
obs2 = sphandle.load_and_label('sample_data/11_12_19_HEH/All_Quantified__OBS*.csv', 'Engineered', DROPCOLS)
Ti302 = sphandle.load_and_label('sample_data/11_12_19_HEH/All_Quantified__30nm*.csv', 'Engineered', DROPCOLS)

#Focusing on Ti particles
AZ2Ti = sphandle.isotope_particle(AZ2, '48Ti')
Frsl2Ti = sphandle.isotope_particle(Frsl2, '48Ti')
Frso2Ti = sphandle.isotope_particle(Frso2, '48Ti')
L222Ti = sphandle.isotope_particle(L221c2, '48Ti')
obs2Ti = sphandle.isotope_particle(obs2, '48Ti')
Ti302Ti = sphandle.isotope_particle(Ti302, '48Ti')

fullset = [AZ2Ti, Frsl2Ti, Frso2Ti, L222Ti, obs2Ti, Ti302Ti]
keys = ['Az', 'Frsl', 'Frso', 'L22', 'OBS', 'Ti30nm']

In [28]:
store = pd.HDFStore('sample_data/H5 files/alldfsHEHtest.h5')
for i, j in zip(fullset, keys):
    store.put(j,i)
store.close()