In [10]:
import pandas as pd
import itertools as itt
import numbers as nb
import numpy as np
import statistics as st
import re
import math

from cleaning_utils import pivot_dataset, transform_chemical_data, create_standardise_units_func, drop_chemical_columns, order_cols

In [2]:
idaho_data = pd.read_csv("Datasets/idaho_data.csv")

  idaho_data = pd.read_csv("Datasets/idaho_data.csv")


In [3]:
idaho_data[["SampleNumber", "CharName", "Amount", "UOM", "MinDetectLimit"]]

Unnamed: 0,SampleNumber,CharName,Amount,UOM,MinDetectLimit
0,GWQM-1993-237,Fecal Coliform,<1,col/100 ml,1.000
1,GWQM-1993-237,Alkalinity as (CaCO3),247,mg/l,257.000
2,GWQM-1993-237,Ammonia,0.03,mg/l,0.002
3,GWQM-1993-237,Arsenic,<1,ug/L,0.200
4,GWQM-1993-237,Bicarbonate,302,mg/L,
...,...,...,...,...,...
885839,GWQM-2022-0218,"1H,1H, 2H, 2H-Perfluorohexane sulfonic acid (4...",<0.00200,ug/L,0.002
885840,GWQM-2022-0218,Dissolved Oxygen,4.83,mg/l,0.000
885841,GWQM-2022-0218,pH,6.8,pH,0.000
885842,GWQM-2022-0218,Specific Conductance,760.4,uS/cm,0.000


In [4]:
desired_chemical_names = ["Chloride", "Sulfate", "Hardness", "Sodium", "Potassium", "Magnesium", "Calcium",
                          "Specific Conductance", "Total Dissolved Solids", "Water Temperature", "pH"]

dataset_pivoted = pivot_dataset(
    idaho_data,
    sample_id_columns=["SampleNumber", "SampleDate"],
    per_sample_data=["Latitude", "Longitude"],
    chemical_name_column="CharName",
    values_per_chemical=["Amount", "UOM", "MinDetectLimit"],
    desired_chemical_names=desired_chemical_names,
    # drop_duplicates=True
)
dataset_pivoted

Unnamed: 0_level_0,SampleNumber,SampleDate,Latitude,Longitude,Calcium,Calcium,Calcium,Chloride,Chloride,Chloride,...,Sulfate,Total Dissolved Solids,Total Dissolved Solids,Total Dissolved Solids,Water Temperature,Water Temperature,Water Temperature,pH,pH,pH
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Amount,MinDetectLimit,UOM,Amount,MinDetectLimit,UOM,...,UOM,Amount,MinDetectLimit,UOM,Amount,MinDetectLimit,UOM,Amount,MinDetectLimit,UOM
0,0003ETAN655B,29/03/2000 10:20,43.847830,-112.709326,[65.6],[0.1],[mg/l],,,,...,,,,,,,,,,
1,0003ETAN655C,29/03/2000 10:20,43.847830,-112.709326,,,,[411],[0.9],[mg/l],...,[mg/l],[1320],[1.0],[mg/l],,,,,,
2,0004ENRFIWDB,20/04/2000 10:05,43.651806,-112.919948,[896],[0.1],[mg/l],,,,...,,,,,,,,,,
3,0004ENRFIWDD,20/04/2000 10:05,43.651806,-112.919948,,,,[9713],[0.9],[mg/l],...,[mg/l],[17100],[1.0],[mg/l],,,,,,
4,0004ENRFSEWB,20/04/2000 9:40,43.653774,-112.912298,[8.4],[0.1],[mg/l],,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28690,IDEQ-2017-11-08-6159,8/11/2017 14:32,43.376230,-116.550910,,,,,,,...,,,,,[15.79],[nan],[°C],[7.56],[nan],[pH]
28691,IDEQ-2017-11-14-6160,14/11/2017 10:55,43.717750,-116.998090,,,,,,,...,,,,,[18.94],[nan],[°C],[7.14],[nan],[pH]
28692,IDEQ-2017-11-14-6161,14/11/2017 11:40,43.713400,-116.902030,,,,,,,...,,,,,[15.42],[nan],[°C],[6.95],[nan],[pH]
28693,IDEQ-2017-11-14-6162,14/11/2017 12:17,43.697140,-117.020360,,,,,,,...,,,,,[16.51],[nan],[°C],[7.42],[nan],[pH]


In [5]:
def clean_units(units):
    if isinstance(units, str):
        return ("".join(units.strip().lower().split()),)
    else:
        return (np.nan,)


transform_chemical_data(
    dataset_pivoted,
    desired_chemical_names,
    clean_units,
    ["UOM"],
    ["UOM"],
    split_lists=True
)

dataset_pivoted

Unnamed: 0_level_0,SampleNumber,SampleDate,Latitude,Longitude,Calcium,Calcium,Calcium,Chloride,Chloride,Chloride,...,Sulfate,Total Dissolved Solids,Total Dissolved Solids,Total Dissolved Solids,Water Temperature,Water Temperature,Water Temperature,pH,pH,pH
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Amount,MinDetectLimit,UOM,Amount,MinDetectLimit,UOM,...,UOM,Amount,MinDetectLimit,UOM,Amount,MinDetectLimit,UOM,Amount,MinDetectLimit,UOM
0,0003ETAN655B,29/03/2000 10:20,43.847830,-112.709326,[65.6],[0.1],[mg/l],,,,...,,,,,,,,,,
1,0003ETAN655C,29/03/2000 10:20,43.847830,-112.709326,,,,[411],[0.9],[mg/l],...,[mg/l],[1320],[1.0],[mg/l],,,,,,
2,0004ENRFIWDB,20/04/2000 10:05,43.651806,-112.919948,[896],[0.1],[mg/l],,,,...,,,,,,,,,,
3,0004ENRFIWDD,20/04/2000 10:05,43.651806,-112.919948,,,,[9713],[0.9],[mg/l],...,[mg/l],[17100],[1.0],[mg/l],,,,,,
4,0004ENRFSEWB,20/04/2000 9:40,43.653774,-112.912298,[8.4],[0.1],[mg/l],,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28690,IDEQ-2017-11-08-6159,8/11/2017 14:32,43.376230,-116.550910,,,,,,,...,,,,,[15.79],[nan],[°c],[7.56],[nan],[ph]
28691,IDEQ-2017-11-14-6160,14/11/2017 10:55,43.717750,-116.998090,,,,,,,...,,,,,[18.94],[nan],[°c],[7.14],[nan],[ph]
28692,IDEQ-2017-11-14-6161,14/11/2017 11:40,43.713400,-116.902030,,,,,,,...,,,,,[15.42],[nan],[°c],[6.95],[nan],[ph]
28693,IDEQ-2017-11-14-6162,14/11/2017 12:17,43.697140,-117.020360,,,,,,,...,,,,,[16.51],[nan],[°c],[7.42],[nan],[ph]


In [6]:

def format_amount(erase_invalid: bool = False):

    def format_amount_func(amount, min_detection_limit, uom):
        # if amount is already number
        if isinstance(amount, nb.Number):
            return ('=', float(amount), uom)

        if isinstance(amount, str):
            # remove whitespace
            amount = ''.join(amount.split())
            amount_prefix = None

            # if amount matches prefix-numerical form
            if re.fullmatch(r'([<>=]?)[0-9]*(\.?)[0-9]+', amount):
                if re.match(r'[<>=]', amount):
                    amount_prefix = re.split(r'[<>=]', amount)
                    amount_prefix[1] = float(amount_prefix[1])
                    amount_prefix = (amount_prefix[0], amount_prefix[1], uom)
                else:
                    amount_prefix = ("=", float(amount), uom)
                return amount_prefix

            # if amount is below detection limit
            elif amount == "BDL" or amount == "ND":
                if min_detection_limit is not None:
                    return ("<", min_detection_limit, uom)
                else:
                    return ("=", 0, uom)

        # otherwise amount is invalid
        if erase_invalid:
            return (np.nan, np.nan, np.nan)
        else:
            raise ValueError("Invalid Formatting - " + str(amount))

    return format_amount_func


transform_chemical_data(
    dataset_pivoted,
    desired_chemical_names,
    format_amount(True),
    ["Amount", "MinDetectLimit", "UOM"],
    ["Prefix", "Amount", "UOM"],
    split_lists=True
)

dataset_pivoted

Unnamed: 0_level_0,SampleNumber,SampleDate,Latitude,Longitude,Calcium,Calcium,Calcium,Chloride,Chloride,Chloride,...,Sulfate,Hardness,Sodium,Potassium,Magnesium,Calcium,Specific Conductance,Total Dissolved Solids,Water Temperature,pH
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Amount,MinDetectLimit,UOM,Amount,MinDetectLimit,UOM,...,Prefix,Prefix,Prefix,Prefix,Prefix,Prefix,Prefix,Prefix,Prefix,Prefix
0,0003ETAN655B,29/03/2000 10:20,43.847830,-112.709326,[65.6],[0.1],[mg/l],,,,...,=,=,[=],[=],[=],[=],=,=,=,=
1,0003ETAN655C,29/03/2000 10:20,43.847830,-112.709326,,,,[411.0],[0.9],[mg/l],...,[=],=,=,=,=,=,=,[=],=,=
2,0004ENRFIWDB,20/04/2000 10:05,43.651806,-112.919948,[896.0],[0.1],[mg/l],,,,...,=,=,[=],[=],[=],[=],=,=,=,=
3,0004ENRFIWDD,20/04/2000 10:05,43.651806,-112.919948,,,,[9713.0],[0.9],[mg/l],...,[=],=,=,=,=,=,=,[=],=,=
4,0004ENRFSEWB,20/04/2000 9:40,43.653774,-112.912298,[8.4],[0.1],[mg/l],,,,...,=,=,[=],[=],[=],[=],=,=,=,=
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28690,IDEQ-2017-11-08-6159,8/11/2017 14:32,43.376230,-116.550910,,,,,,,...,=,=,=,=,=,=,[=],=,[=],[=]
28691,IDEQ-2017-11-14-6160,14/11/2017 10:55,43.717750,-116.998090,,,,,,,...,=,=,=,=,=,=,[=],=,[=],[=]
28692,IDEQ-2017-11-14-6161,14/11/2017 11:40,43.713400,-116.902030,,,,,,,...,=,=,=,=,=,=,[=],=,[=],[=]
28693,IDEQ-2017-11-14-6162,14/11/2017 12:17,43.697140,-117.020360,,,,,,,...,=,=,=,=,=,=,[=],=,[=],[=]


In [7]:
convert_to_standard = dict()

# mass/volume concentration g/L
convert_to_standard['g/l'] = lambda x: x
convert_to_standard['mg/l'] = lambda x: x/1e3
convert_to_standard['ug/l'] = lambda x: x/1e6
convert_to_standard['µg/l'] = lambda x: x/1e6
convert_to_standard['ng/l'] = lambda x: x/1e9

# pH
convert_to_standard['ph'] = lambda x: x

# temperature °c
convert_to_standard['°c'] = lambda x: x
convert_to_standard['°f'] = lambda x: 5*(x-32)/9

# conductivity us/cm
convert_to_standard['us/cm'] = lambda x: x
convert_to_standard['µs/cm'] = lambda x: x


# apply conversion to both the amount and the mindetection limit
transform_chemical_data(
    dataset_pivoted,
    desired_chemical_names,
    create_standardise_units_func(convert_to_standard),
    ["Amount", "UOM"],
    ["Amount"],
    split_lists=True
)

# apply conversion
transform_chemical_data(
    dataset_pivoted,
    desired_chemical_names,
    create_standardise_units_func(convert_to_standard),
    ["MinDetectLimit", "UOM"],
    ["MinDetectLimit"],
    split_lists=True
)

dataset_pivoted

Unnamed: 0_level_0,SampleNumber,SampleDate,Latitude,Longitude,Calcium,Calcium,Calcium,Chloride,Chloride,Chloride,...,Sulfate,Hardness,Sodium,Potassium,Magnesium,Calcium,Specific Conductance,Total Dissolved Solids,Water Temperature,pH
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Amount,MinDetectLimit,UOM,Amount,MinDetectLimit,UOM,...,Prefix,Prefix,Prefix,Prefix,Prefix,Prefix,Prefix,Prefix,Prefix,Prefix
0,0003ETAN655B,29/03/2000 10:20,43.847830,-112.709326,[0.06559999999999999],[0.0001],[mg/l],,,,...,=,=,[=],[=],[=],[=],=,=,=,=
1,0003ETAN655C,29/03/2000 10:20,43.847830,-112.709326,,,,[0.411],[0.0009],[mg/l],...,[=],=,=,=,=,=,=,[=],=,=
2,0004ENRFIWDB,20/04/2000 10:05,43.651806,-112.919948,[0.896],[0.0001],[mg/l],,,,...,=,=,[=],[=],[=],[=],=,=,=,=
3,0004ENRFIWDD,20/04/2000 10:05,43.651806,-112.919948,,,,[9.713],[0.0009],[mg/l],...,[=],=,=,=,=,=,=,[=],=,=
4,0004ENRFSEWB,20/04/2000 9:40,43.653774,-112.912298,[0.008400000000000001],[0.0001],[mg/l],,,,...,=,=,[=],[=],[=],[=],=,=,=,=
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28690,IDEQ-2017-11-08-6159,8/11/2017 14:32,43.376230,-116.550910,,,,,,,...,=,=,=,=,=,=,[=],=,[=],[=]
28691,IDEQ-2017-11-14-6160,14/11/2017 10:55,43.717750,-116.998090,,,,,,,...,=,=,=,=,=,=,[=],=,[=],[=]
28692,IDEQ-2017-11-14-6161,14/11/2017 11:40,43.713400,-116.902030,,,,,,,...,=,=,=,=,=,=,[=],=,[=],[=]
28693,IDEQ-2017-11-14-6162,14/11/2017 12:17,43.697140,-117.020360,,,,,,,...,=,=,=,=,=,=,[=],=,[=],[=]


In [8]:
drop_chemical_columns(
    dataset_pivoted,
    desired_chemical_names,
    ["UOM", "MinDetectLimit"]
)

dataset_pivoted = order_cols(dataset_pivoted, desired_chemical_names)

In [11]:
# turn list of measurements into one aggregated measurement

def agg_measurement(amount, prefix):
    if not isinstance(amount, list):
        if math.isnan(amount):
            return (np.nan, np.nan)
        elif isinstance(amount, nb.Number):
            if isinstance(prefix, list):
                raise TypeError(prefix)
            return (amount, prefix)
        else:
            raise TypeError(amount)

    if set(prefix) == set(["<"]):
        return (min(amount), "<")
    else:
        return (st.mean(amount), "=")


transform_chemical_data(
    dataset_pivoted,
    desired_chemical_names,
    agg_measurement,
    ["Amount", "Prefix"],
    ["Amount", "Prefix"]
)

dataset_pivoted

Unnamed: 0_level_0,SampleDate,Longitude,Latitude,SampleNumber,Calcium,Calcium,Chloride,Chloride,Hardness,Hardness,...,Specific Conductance,Specific Conductance,Sulfate,Sulfate,Total Dissolved Solids,Total Dissolved Solids,Water Temperature,Water Temperature,pH,pH
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Amount,Prefix,Amount,Prefix,Amount,Prefix,...,Amount,Prefix,Amount,Prefix,Amount,Prefix,Amount,Prefix,Amount,Prefix
0,29/03/2000 10:20,-112.709326,43.847830,0003ETAN655B,0.0656,=,,,,,...,,,,,,,,,,
1,29/03/2000 10:20,-112.709326,43.847830,0003ETAN655C,,,0.4110,=,,,...,,,0.3020,=,1.32,=,,,,
2,20/04/2000 10:05,-112.919948,43.651806,0004ENRFIWDB,0.8960,=,,,,,...,,,,,,,,,,
3,20/04/2000 10:05,-112.919948,43.651806,0004ENRFIWDD,,,9.7130,=,,,...,,,0.0246,=,17.10,=,,,,
4,20/04/2000 9:40,-112.912298,43.653774,0004ENRFSEWB,0.0084,=,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28690,8/11/2017 14:32,-116.550910,43.376230,IDEQ-2017-11-08-6159,,,,,,,...,817.0,=,,,,,15.79,=,7.56,=
28691,14/11/2017 10:55,-116.998090,43.717750,IDEQ-2017-11-14-6160,,,,,,,...,494.0,=,,,,,18.94,=,7.14,=
28692,14/11/2017 11:40,-116.902030,43.713400,IDEQ-2017-11-14-6161,,,,,,,...,1270.0,=,,,,,15.42,=,6.95,=
28693,14/11/2017 12:17,-117.020360,43.697140,IDEQ-2017-11-14-6162,,,,,,,...,415.0,=,,,,,16.51,=,7.42,=


In [38]:

def count_nas(row):
    return row.isna().sum()


na_threshold = 5
amounts_index = list(itt.product(desired_chemical_names, ["Amount"]))
mask = dataset_pivoted[amounts_index].apply(count_nas, axis=1)
mask = mask[mask > na_threshold].index

dataset_pivoted = dataset_pivoted.loc[mask]
dataset_pivoted = dataset_pivoted.reset_index()
dataset_pivoted = dataset_pivoted.drop(['index'], axis=1, level=0)

dataset_pivoted

Unnamed: 0_level_0,level_0,SampleDate,Longitude,Latitude,SampleNumber,Calcium,Calcium,Chloride,Chloride,Hardness,...,Specific Conductance,Specific Conductance,Sulfate,Sulfate,Total Dissolved Solids,Total Dissolved Solids,Water Temperature,Water Temperature,pH,pH
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Amount,Prefix,Amount,Prefix,Amount,...,Amount,Prefix,Amount,Prefix,Amount,Prefix,Amount,Prefix,Amount,Prefix
0,0,29/03/2000 10:20,-112.709326,43.847830,0003ETAN655B,0.0656,=,,,,...,,,,,,,,,,
1,1,29/03/2000 10:20,-112.709326,43.847830,0003ETAN655C,,,0.411,=,,...,,,0.3020,=,1.32,=,,,,
2,2,20/04/2000 10:05,-112.919948,43.651806,0004ENRFIWDB,0.8960,=,,,,...,,,,,,,,,,
3,3,20/04/2000 10:05,-112.919948,43.651806,0004ENRFIWDD,,,9.713,=,,...,,,0.0246,=,17.10,=,,,,
4,4,20/04/2000 9:40,-112.912298,43.653774,0004ENRFSEWB,0.0084,=,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16470,16470,8/11/2017 11:41,-116.989560,43.704780,IDEQ-2017-11-08-6157,,,,,,...,1300.0,=,,,,,14.55,=,7.25,=
16471,16471,8/11/2017 14:32,-116.550910,43.376230,IDEQ-2017-11-08-6159,,,,,,...,817.0,=,,,,,15.79,=,7.56,=
16472,16472,14/11/2017 10:55,-116.998090,43.717750,IDEQ-2017-11-14-6160,,,,,,...,494.0,=,,,,,18.94,=,7.14,=
16473,16473,14/11/2017 11:40,-116.902030,43.713400,IDEQ-2017-11-14-6161,,,,,,...,1270.0,=,,,,,15.42,=,6.95,=


In [None]:
# calculate averages for each chemical
# for now I naively take the mean regardless of < or =.
amount_avgs = dict()
missing_chemicals = []

for chemical_name in desired_chemical_names:
    amount_avgs[chemical_name] = dataset_pivoted[chemical_name,
                                                 "Amount"].mean()
    if math.isnan(amount_avgs[chemical_name]):
        missing_chemicals.append(chemical_name)

# remove chemicals with all NaNs
desired_chemical_names = list(
    set(desired_chemical_names).difference(set(missing_chemicals)))
dataset_pivoted = dataset_pivoted.drop(missing_chemicals, axis=1)


# fill NaN values with averages
transform_chemical_data(
    dataset_pivoted, desired_chemical_names,
    lambda a, p: (amount_avgs[chemical_name],
                  '=') if math.isnan(a) else (a, p),
    ["Amount", "Prefix"],
    ["Amount", "Prefix"]
)

dataset_pivoted

Unnamed: 0_level_0,Latitude,SampleDate,SampleNumber,Longitude,Calcium,Calcium,Chloride,Chloride,Magnesium,Magnesium,...,Specific Conductance,Specific Conductance,Sulfate,Sulfate,Total Dissolved Solids,Total Dissolved Solids,Water Temperature,Water Temperature,pH,pH
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Amount,Prefix,Amount,Prefix,Amount,Prefix,...,Amount,Prefix,Amount,Prefix,Amount,Prefix,Amount,Prefix,Amount,Prefix
0,43.847830,29/03/2000 10:20,0003ETAN655B,-112.709326,0.065600,=,8.344642,=,0.019800,=,...,8.344642,=,8.344642,=,8.344642,=,8.344642,=,8.344642,=
1,43.847830,29/03/2000 10:20,0003ETAN655C,-112.709326,8.344642,=,0.411000,=,8.344642,=,...,8.344642,=,0.302000,=,1.320000,=,8.344642,=,8.344642,=
2,43.651806,20/04/2000 10:05,0004ENRFIWDB,-112.919948,0.896000,=,8.344642,=,0.289000,=,...,8.344642,=,8.344642,=,8.344642,=,8.344642,=,8.344642,=
3,43.651806,20/04/2000 10:05,0004ENRFIWDD,-112.919948,8.344642,=,9.713000,=,8.344642,=,...,8.344642,=,0.024600,=,17.100000,=,8.344642,=,8.344642,=
4,43.653774,20/04/2000 9:40,0004ENRFSEWB,-112.912298,0.008400,=,8.344642,=,0.002000,=,...,8.344642,=,8.344642,=,8.344642,=,8.344642,=,8.344642,=
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16470,43.704780,8/11/2017 11:41,IDEQ-2017-11-08-6157,-116.989560,8.344642,=,8.344642,=,8.344642,=,...,1300.000000,=,8.344642,=,8.344642,=,14.550000,=,7.250000,=
16471,43.376230,8/11/2017 14:32,IDEQ-2017-11-08-6159,-116.550910,8.344642,=,8.344642,=,8.344642,=,...,817.000000,=,8.344642,=,8.344642,=,15.790000,=,7.560000,=
16472,43.717750,14/11/2017 10:55,IDEQ-2017-11-14-6160,-116.998090,8.344642,=,8.344642,=,8.344642,=,...,494.000000,=,8.344642,=,8.344642,=,18.940000,=,7.140000,=
16473,43.713400,14/11/2017 11:40,IDEQ-2017-11-14-6161,-116.902030,8.344642,=,8.344642,=,8.344642,=,...,1270.000000,=,8.344642,=,8.344642,=,15.420000,=,6.950000,=


In [None]:
drop_chemical_columns(
    dataset_pivoted,
    desired_chemical_names,
    ["Prefix"]
)
dataset_pivoted

Unnamed: 0_level_0,Latitude,SampleDate,SampleNumber,Longitude,Calcium,Chloride,Magnesium,Potassium,Sodium,Specific Conductance,Sulfate,Total Dissolved Solids,Water Temperature,pH
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Amount,Amount,Amount,Amount,Amount,Amount,Amount,Amount,Amount,Amount
0,43.847830,29/03/2000 10:20,0003ETAN655B,-112.709326,0.065600,8.344642,0.019800,0.009900,0.410000,8.344642,8.344642,8.344642,8.344642,8.344642
1,43.847830,29/03/2000 10:20,0003ETAN655C,-112.709326,8.344642,0.411000,8.344642,8.344642,8.344642,8.344642,0.302000,1.320000,8.344642,8.344642
2,43.651806,20/04/2000 10:05,0004ENRFIWDB,-112.919948,0.896000,8.344642,0.289000,0.029000,5.500000,8.344642,8.344642,8.344642,8.344642,8.344642
3,43.651806,20/04/2000 10:05,0004ENRFIWDD,-112.919948,8.344642,9.713000,8.344642,8.344642,8.344642,8.344642,0.024600,17.100000,8.344642,8.344642
4,43.653774,20/04/2000 9:40,0004ENRFSEWB,-112.912298,0.008400,8.344642,0.002000,0.014000,0.230000,8.344642,8.344642,8.344642,8.344642,8.344642
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16470,43.704780,8/11/2017 11:41,IDEQ-2017-11-08-6157,-116.989560,8.344642,8.344642,8.344642,8.344642,8.344642,1300.000000,8.344642,8.344642,14.550000,7.250000
16471,43.376230,8/11/2017 14:32,IDEQ-2017-11-08-6159,-116.550910,8.344642,8.344642,8.344642,8.344642,8.344642,817.000000,8.344642,8.344642,15.790000,7.560000
16472,43.717750,14/11/2017 10:55,IDEQ-2017-11-14-6160,-116.998090,8.344642,8.344642,8.344642,8.344642,8.344642,494.000000,8.344642,8.344642,18.940000,7.140000
16473,43.713400,14/11/2017 11:40,IDEQ-2017-11-14-6161,-116.902030,8.344642,8.344642,8.344642,8.344642,8.344642,1270.000000,8.344642,8.344642,15.420000,6.950000


In [None]:
dataset_pivoted.columns = dataset_pivoted.columns.droplevel(1)
dataset_pivoted

Unnamed: 0,Latitude,SampleDate,SampleNumber,Longitude,Calcium,Chloride,Magnesium,Potassium,Sodium,Specific Conductance,Sulfate,Total Dissolved Solids,Water Temperature,pH
0,43.847830,29/03/2000 10:20,0003ETAN655B,-112.709326,0.065600,8.344642,0.019800,0.009900,0.410000,8.344642,8.344642,8.344642,8.344642,8.344642
1,43.847830,29/03/2000 10:20,0003ETAN655C,-112.709326,8.344642,0.411000,8.344642,8.344642,8.344642,8.344642,0.302000,1.320000,8.344642,8.344642
2,43.651806,20/04/2000 10:05,0004ENRFIWDB,-112.919948,0.896000,8.344642,0.289000,0.029000,5.500000,8.344642,8.344642,8.344642,8.344642,8.344642
3,43.651806,20/04/2000 10:05,0004ENRFIWDD,-112.919948,8.344642,9.713000,8.344642,8.344642,8.344642,8.344642,0.024600,17.100000,8.344642,8.344642
4,43.653774,20/04/2000 9:40,0004ENRFSEWB,-112.912298,0.008400,8.344642,0.002000,0.014000,0.230000,8.344642,8.344642,8.344642,8.344642,8.344642
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16470,43.704780,8/11/2017 11:41,IDEQ-2017-11-08-6157,-116.989560,8.344642,8.344642,8.344642,8.344642,8.344642,1300.000000,8.344642,8.344642,14.550000,7.250000
16471,43.376230,8/11/2017 14:32,IDEQ-2017-11-08-6159,-116.550910,8.344642,8.344642,8.344642,8.344642,8.344642,817.000000,8.344642,8.344642,15.790000,7.560000
16472,43.717750,14/11/2017 10:55,IDEQ-2017-11-14-6160,-116.998090,8.344642,8.344642,8.344642,8.344642,8.344642,494.000000,8.344642,8.344642,18.940000,7.140000
16473,43.713400,14/11/2017 11:40,IDEQ-2017-11-14-6161,-116.902030,8.344642,8.344642,8.344642,8.344642,8.344642,1270.000000,8.344642,8.344642,15.420000,6.950000
