In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import rubisco_data
import seaborn as sns

from matplotlib import pyplot as plt

In [2]:
# Maps each measurement column to the column(s) holding its error estimate.
# Insertion order matters: check_df walks this mapping in order and prints
# one line per column.
measurement2err_cols = {
    # Measurements with a single '<name>_SD' error column.
    **{name: [name + '_SD'] for name in ('KC', 'vC', 'S', 'KO', 'KRuBP')},
    # The reported vO keeps its own, differently named, SD column.
    'vO_reported': ['vO_SD_reported'],
    # Measurements with a '<name>_95CI_low' / '<name>_95CI_high' column pair.
    **{
        name: [name + '_95CI_low', name + '_95CI_high']
        for name in ('vO', 'kon_C', 'kon_O')
    },
}

# Per-column override of the minimum number of non-null values required.
# Error is not very often reported for kcatO (the vO_SD_reported column), so
# only 2 values are demanded there; every column without an entry here is
# held to the default of at least 30 values applied in check_df.
min_measurements = dict(vO_SD_reported=2)

# Helper functions for sanity checking dataframes
def check_col(df, colname, min_vals):
    """Assert that one column holds enough values and that all are finite.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `colname`.
    colname : str
        Name of the (numeric) column to check.
    min_vals : int
        Minimum number of non-null entries the column must contain.

    Raises
    ------
    AssertionError
        If fewer than `min_vals` entries are non-null, or if any non-null
        entry is not finite. The message names the offending column.
    """
    present = df[colname].notnull()
    n_present = int(present.sum())
    assert n_present >= min_vals, (
        "column {!r} has {} non-null values, expected at least {}".format(
            colname, n_present, min_vals))
    # Nulls are already excluded by the mask, so this catches +/-inf.
    assert np.isfinite(df.loc[present, colname]).all(), (
        "column {!r} contains non-finite values".format(colname))
    
def check_df(my_df):
    """Sanity-check every measurement column of `my_df` and its error columns.

    For each entry of the module-level `measurement2err_cols` mapping, the
    measurement column and each of its error columns is passed to
    `check_col`, using the minimum count from `min_measurements` (30 when
    the column has no entry there). Unless the measurement name ends in
    '_reported', it is then asserted that every row with a measurement value
    also has a value in each error column. One line is printed per passed
    check.

    Parameters
    ----------
    my_df : pandas.DataFrame
        Frame containing all columns named in `measurement2err_cols`.

    Raises
    ------
    AssertionError
        If any column check fails, or an error value is missing where the
        measurement is present. The message names the columns involved.
    """
    for col, err_cols in measurement2err_cols.items():
        for c in [col] + err_cols:
            min_vals = min_measurements.get(c, 30)
            check_col(my_df, c, min_vals)
            print(c, "passed column tests")

        if col.endswith('_reported'):
            # next check ensures error is inferred.
            # this does not happen on the vO_reported columns.
            continue

        # Rows that carry a primary value; the same for every error column.
        has_value = my_df[col].notnull()
        for ec in err_cols:
            # should always have an error value when we have a primary value
            # since the data normalization infers error when absent
            n_missing = int(my_df.loc[has_value, ec].isnull().sum())
            assert n_missing == 0, (
                "{!r} is missing for {} row(s) where {!r} has a value".format(
                    ec, n_missing, col))
            print(ec, 'has values when', col, 'does')
    


In [3]:
# Merged dataset is the one we work with in most of the figures.
merged_df = pd.read_csv('../data/DatasetS2_RubiscoKinetics_Merged.csv')

# check_df raises an AssertionError if any measurement or error column fails
# its sanity checks; each check that passes is printed below.
check_df(merged_df)

KC passed column tests
KC_SD passed column tests
KC_SD has values when KC does
vC passed column tests
vC_SD passed column tests
vC_SD has values when vC does
S passed column tests
S_SD passed column tests
S_SD has values when S does
KO passed column tests
KO_SD passed column tests
KO_SD has values when KO does
KRuBP passed column tests
KRuBP_SD passed column tests
KRuBP_SD has values when KRuBP does
vO_reported passed column tests
vO_SD_reported passed column tests
vO passed column tests
vO_95CI_low passed column tests
vO_95CI_high passed column tests
vO_95CI_low has values when vO does
vO_95CI_high has values when vO does
kon_C passed column tests
kon_C_95CI_low passed column tests
kon_C_95CI_high passed column tests
kon_C_95CI_low has values when kon_C does
kon_C_95CI_high has values when kon_C does
kon_O passed column tests
kon_O_95CI_low passed column tests
kon_O_95CI_high passed column tests
kon_O_95CI_low has values when kon_O does
kon_O_95CI_high has values when kon_O does


In [4]:
# Run the same sanity checks on the "Full" merged dataset (DatasetS4). This is
# a different file from the DatasetS2 merged dataset loaded above.
full_df = pd.read_csv('../data/DatasetS4_RubiscoKineticsFull_Merged.csv')

check_df(full_df)

KC passed column tests
KC_SD passed column tests
KC_SD has values when KC does
vC passed column tests
vC_SD passed column tests
vC_SD has values when vC does
S passed column tests
S_SD passed column tests
S_SD has values when S does
KO passed column tests
KO_SD passed column tests
KO_SD has values when KO does
KRuBP passed column tests
KRuBP_SD passed column tests
KRuBP_SD has values when KRuBP does
vO_reported passed column tests
vO_SD_reported passed column tests
vO passed column tests
vO_95CI_low passed column tests
vO_95CI_high passed column tests
vO_95CI_low has values when vO does
vO_95CI_high has values when vO does
kon_C passed column tests
kon_C_95CI_low passed column tests
kon_C_95CI_high passed column tests
kon_C_95CI_low has values when kon_C does
kon_C_95CI_high has values when kon_C does
kon_O passed column tests
kon_O_95CI_low passed column tests
kon_O_95CI_high passed column tests
kon_O_95CI_low has values when kon_O does
kon_O_95CI_high has values when kon_O does


In [5]:
def get_kin_dups(my_df, decimals=1):
    """Return the rows of `my_df` whose kinetic measurements duplicate another row.

    The columns KC, vC, S and KO are rounded to `decimals` decimal places
    (decimal places, not significant figures), and every row whose rounded
    values match those of at least one other row is returned. Missing values
    compare equal to each other here, so rows that lack the same measurements
    and agree on the rest are also reported as duplicates.

    In the merged dataset all the duplicates should be with "Savir 2010",
    since that reference is used as a point of comparison.

    Parameters
    ----------
    my_df : pandas.DataFrame
        Must contain the columns species, identifier, short_ref, primary,
        KC, vC, S and KO.
    decimals : int, default 1
        Number of decimal places to round to before comparing rows.

    Returns
    -------
    pandas.DataFrame
        The metadata and (rounded) measurement columns of every duplicated
        row, with the original index, sorted by KC, vC, S, KO.
    """
    measured_vars = ['KC', 'vC', 'S', 'KO']
    meta_plus_meas = ["species", "identifier", "short_ref", "primary"] + measured_vars
    rounded_df = my_df[meta_plus_meas].round(decimals)
    # keep=False flags every member of a duplicate group, not only the repeats.
    is_dup = rounded_df.duplicated(subset=measured_vars, keep=False)
    return rounded_df.loc[is_dup].sort_values(measured_vars)

In [6]:
# Merged dataset is the one we work with in most of the figures.
# (Same CSV as read in the earlier sanity-check cell; re-read here so this
# cell starts from the file contents.)
merged_df = pd.read_csv('../data/DatasetS2_RubiscoKinetics_Merged.csv')

# Printing duplicates by 4 primary kinetic measurements - KC, vC (kcatC), S and KO.
# Have manually verified that all the duplicates below are fine - i.e. the values are
# truly the same in the primary reference.
# Rows that are missing the same measurement (NaN in both) are still matched
# on the remaining values.
get_kin_dups(merged_df)
# Notice also that the listing includes some data from Savir. This is removed in a later
# filtering step to avoid data duplication when doing regression (see below).

Unnamed: 0,species,identifier,short_ref,primary,KC,vC,S,KO
220,Galdieria sulfuraria,sulfuraria_savir,Savir 2010 (meta-analysis),0,3.3,1.2,166.0,374.0
275,Galdieria sulfuraria,sulfuraria_whitney,Whitney 2001,1,3.3,1.2,166.0,374.0
94,Glycine max,max_jordan81,Jordan 1981,1,9.0,,82.0,430.0
224,Glycine max,max_savir,Savir 2010 (meta-analysis),0,9.0,,82.0,430.0
218,Griffithsia monilis,monilis_savir,Savir 2010 (meta-analysis),0,9.3,2.6,167.0,
274,Griffithsia monilis,monilis_whitney,Whitney 2001,1,9.3,2.6,167.0,
213,Nicotiana tabacum,tabacum_savir,Savir 2010 (meta-analysis),0,10.7,3.4,82.0,295.0
272,Nicotiana tabacum,tabacum_whitney,Whitney 2001,1,10.7,3.4,82.0,295.0
139,Chenopodium alba,alba_kubien,Kubien 2008,1,11.2,2.9,78.7,415.0
217,Chenopodium alba,alba_savir,Savir 2010 (meta-analysis),0,11.2,2.9,78.7,415.0


In [7]:
# Load the data from Excel, remove data points that are not comparable.
# raw_kin_df is everything in the file, kin_df is the subset with all relevant kinetic data.
# NOTE(review): load_rubisco_data lives in the rubisco_data module, which is
# not visible from this notebook - confirm the description above against it.
raw_kin_df, kin_df = rubisco_data.load_rubisco_data()

# Subset the data in various ways for future use
# Presumably: the de-duplicated data, the Savir 2010 rows, and the rows from
# all other references - verify against rubisco_data.filter_data.
deduped_kin_df, savir_df, nonsavir_df = rubisco_data.filter_data(raw_kin_df)

In [8]:
# The helper function "filter_data" should drop the Savir data for which we
# have a matching primary reference containing all the data.
# Duplicates are again detected on KC, vC, S and KO rounded to one decimal place.
get_kin_dups(deduped_kin_df)
# You can see that there is no Savir et al. data in the duplicate listing below.
# The rows that remain all come from primary references (primary == 1) and have
# NaN in KC and KO, so they match on the few measurements they do report.

Unnamed: 0,species,identifier,short_ref,primary,KC,vC,S,KO
1,Methanococcoides burtonii L2,burtonii_L2_coli_alonso,Alonso 2009,1,,1.9,1.1,
2,Methanococcoides burtonii,burtonii_tobacco_coli_alonso,Alonso 2009,1,,1.9,1.1,
68,Urtica atrovirens ssp. bianorii,atrovirens_bianorii_galmes,Galmes 2005,1,,,90.2,
347,Pisum sativum,pea_uemura96,Uemura 1996,1,,,90.2,
345,Nicotiana tabacum,tobacco_uemura96,Uemura 1996,1,,,93.1,
350,Spinacia oleracea,spinach_peg_uemura96,Uemura 1996,1,,,93.1,
63,Lysimachia minoricensis,minoricensis_galmes,Galmes 2005,1,,,93.8,
349,Spinacia oleracea,spinach_uemura96,Uemura 1996,1,,,93.8,
64,Mentha aquatica,aquatica_galmes,Galmes 2005,1,,,97.2,
65,Pistacia lentiscus,lentiscus_galmes,Galmes 2005,1,,,97.2,
