# Confidence Interval (CI) Estimation for Figure 3

Compute the confidence intervals for Figure 3, to be placed in the Supplementary Information (SI).

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from pathlib import Path
import matplotlib.dates as mdates
from matplotlib.ticker import FuncFormatter
import numpy as np
import joblib
import os
from matplotlib.dates import date2num
from datetime import timedelta

mpl.rcParams['mathtext.default'] = 'regular'

sns.set("paper", "ticks", color_codes=True, palette='colorblind', font_scale=1.5)

%matplotlib inline

# Set the default paths
basepath = Path(os.path.dirname(os.path.realpath('__file__')))
datapath = basepath.parent.joinpath('data')

## Load the Data

In [2]:
# Read the NMF time-series results and use the stored "index" column as the index
nmf_path = datapath / "output/tata2017015/nmf-tseries-results.feather"
nmf = pd.read_feather(nmf_path).set_index("index")

## Perform a Bootstrap Analysis on the First Subplot

In [68]:
# Bootstrap the squared Pearson correlation between each NMF factor and the
# comparison variables of the first subplot, then summarise every
# (factor, variable) pair by its 5th / 50th / 95th percentile.
factor_cols = ["Factor 1", "Factor 2", "Factor 3"]
targets_a = ["TotalConc", "PM1_EST", "org", "nh4", "so4", "no3", "chl", "bc"]

# 5-minute means; rows missing a value in any selected column are dropped
nmf_a = nmf[factor_cols + targets_a].resample('5min').mean().dropna()

n_samples = 2500  # number of bootstrap replicates
boot_pct = .15    # fraction of rows drawn (with replacement) per replicate

# Seeded generator so the intervals are reproducible on Restart & Run All.
# A single RandomState object is shared across iterations on purpose: passing
# an integer seed to .sample() would draw the *same* rows every time.
rng = np.random.RandomState(42)

# Collect the per-replicate frames in a list and concatenate once at the end.
# DataFrame.append copied the whole frame on every iteration (quadratic) and
# no longer exists in pandas >= 2.0.
replicates = []

for _ in range(n_samples):
    frame = nmf_a.sample(frac=boot_pct, replace=True, random_state=rng)

    # correlate everything (squared correlation coefficient)
    corr = frame.corr()**2

    # keep only the factor rows and the comparison-variable columns
    corr = corr.loc[factor_cols, targets_a]

    # long format: one row per (factor, variable) pair; undefined values dropped
    replicates.append(corr.reset_index().melt(id_vars=["index"]).dropna())

results = pd.concat(replicates, ignore_index=True)

results.groupby(["index", "variable"]).quantile([0.05, 0.5, 0.95]).unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,value,value,value
Unnamed: 0_level_1,Unnamed: 1_level_1,0.05,0.50,0.95
index,variable,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Factor 1,PM1_EST,0.33001,0.377774,0.425975
Factor 1,TotalConc,0.597985,0.63942,0.680061
Factor 1,bc,0.638003,0.68105,0.722409
Factor 1,chl,0.016945,0.031453,0.050885
Factor 1,nh4,0.002985,0.011541,0.025767
Factor 1,no3,0.03765,0.061929,0.091979
Factor 1,org,0.70826,0.748023,0.783547
Factor 1,so4,9e-06,0.000841,0.00747
Factor 2,PM1_EST,0.574237,0.618717,0.661382
Factor 2,TotalConc,0.014208,0.032445,0.057206


## Perform a Bootstrap Analysis on the Second Subplot

In [70]:
# Bootstrap the squared Pearson correlation between each NMF factor and the
# comparison variables of the second subplot, then summarise every
# (factor, variable) pair by its 5th / 50th / 95th percentile.
factor_cols = ["Factor 1", "Factor 2", "Factor 3"]
targets_b = ["HOA/BBOA", "SVOOA", "LVOOA", "AC"]

# 5-minute means; rows missing a value in any selected column are dropped
nmf_b = nmf[factor_cols + targets_b].resample('5min').mean().dropna()

n_samples = 2500  # number of bootstrap replicates
boot_pct = .15    # fraction of rows drawn (with replacement) per replicate

# Seeded generator so the intervals are reproducible on Restart & Run All.
# A single RandomState object is shared across iterations on purpose: passing
# an integer seed to .sample() would draw the *same* rows every time.
rng = np.random.RandomState(42)

# Collect the per-replicate frames in a list and concatenate once at the end.
# DataFrame.append copied the whole frame on every iteration (quadratic) and
# no longer exists in pandas >= 2.0.
replicates = []

for _ in range(n_samples):
    frame = nmf_b.sample(frac=boot_pct, replace=True, random_state=rng)

    # correlate everything (squared correlation coefficient)
    corr = frame.corr()**2

    # keep only the factor rows and the comparison-variable columns
    corr = corr.loc[factor_cols, targets_b]

    # long format: one row per (factor, variable) pair; undefined values dropped
    replicates.append(corr.reset_index().melt(id_vars=["index"]).dropna())

results = pd.concat(replicates, ignore_index=True)

results.groupby(["index", "variable"]).quantile([0.05, 0.5, 0.95]).unstack()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 3 columns):
index       30000 non-null object
variable    30000 non-null object
value       30000 non-null float64
dtypes: float64(1), object(2)
memory usage: 703.2+ KB
None
