In [90]:
import pandas as pd
from sklearn.decomposition import NMF

## Step 1. Import Data

In [91]:
# Directory that contains the input CSV.
folder = '/Users/laurayang/Dropbox/Dr.Ng Group/Yang/Project2-LCS/B_Data-ES&T Roof/munged/NMF Raw-ARISense-Data/interpolated-raw-gas/'
# Alternative input directory, kept for reference:
#folder = '/Users/laurayang/Dropbox/Dr.Ng Group/Yang/Project2-LCS/B_Data-ES&T Roof/munged/NMF Raw-ARISense-Data/interpolated-raw-gas/log-normalized-weighted/'

# `folder` already ends with '/', so the file name is appended directly.
input_file = f"{folder}5-merged-timeseries-6min.csv"

# Load the merged time series into the working frame.
df = pd.read_csv(filepath_or_buffer=input_file)

### Take a look at the original data 

In [53]:
# Number of factors requested from the NMF decomposition further below.
N_COMPONENTS = 4

# Columns used in the analysis: bin0, bin1, bin2 and the gases co, no2, no, o3.
COLS_TO_INCLUDE = ['bin0','bin1','bin2','co','no2','no','o3']

# Copy that keeps every column of the input, limited to rows with no NaN in
# the analysis columns and renumbered from 0.
df_original = df.dropna(subset=COLS_TO_INCLUDE).reset_index(drop=True)

# Working frame: same row filter, narrowed to the analysis columns.
# The index is not reset here, so rows keep the labels they had after loading.
df = df.dropna(subset=COLS_TO_INCLUDE)[COLS_TO_INCLUDE]
df

Unnamed: 0,bin0,bin1,bin2,co,no2,no,o3
0,12.914967,1.151983,0.323933,67.950000,0.933333,3.750000,29.316667
1,12.583917,1.092117,0.309133,63.916667,0.950000,3.766667,29.283333
2,12.070900,1.110283,0.294350,65.016667,0.916667,3.983333,29.083333
3,11.691667,1.097817,0.309200,66.540000,0.700000,3.940000,29.140000
4,11.416633,1.047700,0.283717,59.042857,0.628571,3.700000,29.042857
...,...,...,...,...,...,...,...
12326,15.742250,1.868767,0.422067,34.483333,0.400000,6.750000,28.900000
12327,15.923467,1.900750,0.395217,36.050000,0.155556,5.633333,29.700000
12328,16.075750,1.932817,0.416300,36.800000,0.094444,5.750000,29.516667
12329,16.161667,1.912083,0.410233,36.333333,0.100000,6.083333,29.183333


## Step 2. Normalize/scale Data Overview
* a. Apply log to data (except for O$_3$ as it has a normal distribution already)
* b. Apply min-max scaler to have all the variables vary from 0 - 1
* c. Apply arbitrary weight for each variable 

### Step 2a. Apply log to data except for O$_3$

In [54]:
import numpy as np
from sklearn.preprocessing import FunctionTransformer

# Columns that receive the log transform; o3 is deliberately left out.
log_columns = ["bin0","bin1","bin2","co","no","no2"]

# Natural logarithm wrapped in a scikit-learn transformer.
transformer = FunctionTransformer(np.log,validate=True,check_inverse=True)

# Fit and transform in one step on the raw values.
Data0 = df[log_columns].values
log_values = transformer.fit_transform(Data0)

# np.log yields -inf for 0 and NaN for negative inputs: turn infinities into
# NaN, then drop every row that has a NaN in any logged column.
log_transformed = (
    pd.DataFrame(log_values, columns=log_columns, index=df.index)
    .replace([np.inf, -np.inf], np.nan)
    .dropna(subset=log_columns)
)

  return func(X, **(kw_args if kw_args else {}))


In [55]:
# Inspect the log-transformed columns.
log_transformed

Unnamed: 0,bin0,bin1,bin2,co,no,no2
0,2.558387,0.141485,-1.127218,4.218772,1.321756,-0.068993
1,2.532420,0.088118,-1.173983,4.157580,1.326190,-0.051293
2,2.490798,0.104615,-1.222986,4.174644,1.382119,-0.087011
3,2.458876,0.093323,-1.173767,4.197803,1.371181,-0.356675
4,2.435071,0.046597,-1.259779,4.078264,1.308333,-0.464306
...,...,...,...,...,...,...
12325,2.761527,0.611178,-0.894652,3.514526,1.783391,-2.708050
12326,2.756348,0.625279,-0.862592,3.540476,1.909543,-0.916291
12327,2.767794,0.642249,-0.928321,3.584907,1.728701,-1.860752
12328,2.777312,0.658978,-0.876349,3.605498,1.749200,-2.359744


#### Create df2 that merges log_transformed data with O$_3$

In [56]:
# Build df2 as a new frame instead of aliasing log_transformed: with
# `df2 = log_transformed` both names point to the same object, so adding the
# 'o3' column would also change log_transformed.
# The o3 values are matched on the row index, so only rows that survived the
# NaN filtering of the log step receive a value.
df2 = log_transformed.assign(o3=df['o3'])

In [57]:
# Inspect the merged frame: logged columns plus the untransformed o3 column.
df2

Unnamed: 0,bin0,bin1,bin2,co,no,no2,o3
0,2.558387,0.141485,-1.127218,4.218772,1.321756,-0.068993,29.316667
1,2.532420,0.088118,-1.173983,4.157580,1.326190,-0.051293,29.283333
2,2.490798,0.104615,-1.222986,4.174644,1.382119,-0.087011,29.083333
3,2.458876,0.093323,-1.173767,4.197803,1.371181,-0.356675,29.140000
4,2.435071,0.046597,-1.259779,4.078264,1.308333,-0.464306,29.042857
...,...,...,...,...,...,...,...
12325,2.761527,0.611178,-0.894652,3.514526,1.783391,-2.708050,29.500000
12326,2.756348,0.625279,-0.862592,3.540476,1.909543,-0.916291,28.900000
12327,2.767794,0.642249,-0.928321,3.584907,1.728701,-1.860752,29.700000
12328,2.777312,0.658978,-0.876349,3.605498,1.749200,-2.359744,29.516667


### 2b. Apply min-max scaler 
All the variables will now range from 0 - 1

In [58]:
# Data normalization with scikit-learn's MinMaxScaler.
from sklearn.preprocessing import MinMaxScaler

# Columns that are rescaled; each one is scaled independently.
scaled_columns = ["bin0","bin1","bin2","co","no","no2","o3"]

# The fitted scaler is kept so the scaling can be inverted later if needed.
scaler = MinMaxScaler()

# Fit and transform in one step, then restore column labels and row index.
Data = df2[scaled_columns].values
min_max_applied = pd.DataFrame(
    data=scaler.fit_transform(Data),
    columns=scaled_columns,
    index=df2.index,
)

# Inverse transformation (reference only, not executed):
#Data2=normalized[["bin0","bin1","bin2","co","no","no2","o3"]].values
#inverse = pd.DataFrame(scaler.inverse_transform(Data2),columns=["bin0","bin1","bin2","co","no","no2","o3"],index=df2.index)

In [59]:
# Inspect the min-max scaled frame.
min_max_applied

Unnamed: 0,bin0,bin1,bin2,co,no,no2,o3
0,0.702089,0.513739,0.470765,0.733398,0.488949,0.771970,0.616112
1,0.696930,0.505557,0.464354,0.726085,0.489818,0.773595,0.615412
2,0.688661,0.508086,0.457636,0.728124,0.500773,0.770315,0.611208
3,0.682319,0.506355,0.464384,0.730892,0.498630,0.745554,0.612399
4,0.677590,0.499191,0.452592,0.716606,0.486320,0.735671,0.610358
...,...,...,...,...,...,...,...
12325,0.742447,0.585751,0.502648,0.649237,0.579371,0.529648,0.619965
12326,0.741418,0.587913,0.507043,0.652338,0.604081,0.694170,0.607356
12327,0.743692,0.590515,0.498032,0.657647,0.568659,0.607448,0.624168
12328,0.745583,0.593080,0.505157,0.660108,0.572674,0.561630,0.620315


### 2c. Apply a weight to each variable to restore the relative variation among species

In [60]:
# Weight applied to each min-max scaled variable. Columns that are not listed
# (bin2, no2) keep their scaled values unchanged.
VARIABLE_WEIGHTS = {'co': 32, 'no': 3, 'o3': 14, 'bin0': 25, 'bin1': 16}

# Start from a copy and always multiply the *unweighted* columns. This keeps
# min_max_applied intact and makes the cell safe to re-run: with a plain
# `normalized = min_max_applied` alias, every re-run would multiply the
# already weighted values again.
normalized = min_max_applied.copy()
for column, weight in VARIABLE_WEIGHTS.items():
    normalized[column] = min_max_applied[column] * weight

In [61]:
# Inspect the weighted frame that is used as NMF input.
normalized

Unnamed: 0,bin0,bin1,bin2,co,no,no2,o3
0,17.552220,8.219824,0.470765,23.468727,1.466848,0.771970,8.625569
1,17.423247,8.088910,0.464354,23.234719,1.469453,0.773595,8.615762
2,17.216521,8.129380,0.457636,23.299973,1.502318,0.770315,8.556918
3,17.057976,8.101680,0.464384,23.388539,1.495891,0.745554,8.573590
4,16.939742,7.987057,0.452592,22.931399,1.458960,0.735671,8.545009
...,...,...,...,...,...,...,...
12325,18.561166,9.372016,0.502648,20.775570,1.738114,0.529648,8.679510
12326,18.535446,9.406607,0.507043,20.874808,1.812243,0.694170,8.502977
12327,18.592294,9.448235,0.498032,21.044718,1.705977,0.607448,8.738354
12328,18.639568,9.489275,0.505157,21.123462,1.718022,0.561630,8.684413


#### As the last step, find the intersection indices between normalized and the original df

In [78]:
# Row labels that still carry data (rows that are not entirely NaN) in each frame.
# The former pd.to_numeric(...) lines were removed: they read normalized_idx and
# df_idx before either name was assigned, so the cell failed on a fresh kernel,
# and their results were overwritten right away in any case.
normalized_idx = normalized.dropna(how='all').index
df_idx = df.dropna(how='all').index

# Labels present in both frames.
idx = df_idx.intersection(normalized_idx)

# Restrict both frames to the shared labels so they stay row-aligned.
df = df.loc[idx]
normalized = normalized.loc[idx]

## Step 3. Run NMF analysis and create a table of bootstrap result
For bootstrap, we randomly select 50% of data, run the NMF analysis and record the result. We repeat this process 100 times to check for repeatability. 

**NOTE on the problem I am stuck on**: 
* The "de-normalization" should be applied at the X' ~ WH step (matrix size: 5976 x 7), because the product WH can be transformed back to the original scale, whereas W and H cannot be transformed back separately. 
* However, the bootstrap result is built from W*H (matrix size: 5976 x 4), not from X' ~ WH. 
    - W*H matrix columns: Factors 1, 2, 3, and 4. 
    - WH matrix columns: the 7 variables (bin0, bin1, bin2, CO, NO, NO$_2$, and O$_3$).
* De-normalizing the W*H matrix seems infeasible because I applied a different weight to each of the 7 variables separately. As noted above, the W*H matrix has the 4 factors as columns, not the 7 variables.  
* Hence, where and how should I apply the "de-normalization" in the code below? 

In [21]:
# Bootstrap settings.
n_iter = 100          # number of bootstrap repetitions
fraction = 0.5        # share of rows drawn for each repetition
BOOTSTRAP_SEED = 0    # base seed, so that re-running the cell reproduces the table

# One long-format result table per repetition; concatenated after the loop.
frames = []

# The loop variable is `iteration` rather than `iter`, so the builtin iter()
# is not shadowed for the rest of the notebook.
for iteration in range(n_iter):

    # Set up the NMF analysis. A different but fixed seed per repetition keeps
    # the repetitions distinct while making the whole run repeatable.
    # NOTE(review): newer scikit-learn releases replaced `alpha` with
    # `alpha_W` / `alpha_H` -- confirm against the installed version.
    nmf = NMF(n_components=N_COMPONENTS, alpha = 0.1, max_iter=15000,
              random_state=BOOTSTRAP_SEED + iteration)

    # Randomly sub-select a portion of the weighted dataset.
    sub = normalized.sample(frac = fraction, random_state=BOOTSTRAP_SEED + iteration)

    # Fit the data. The input is transposed to (variables x samples), so
    # W is (variables x factors) and H is (factors x samples).
    W = nmf.fit_transform(X=sub[COLS_TO_INCLUDE].T)
    H = nmf.components_

    # Factor time series as a dataframe (samples x factors).
    R = pd.DataFrame(H.T, index=sub.index)

    # Set the column names.
    R.columns = ["Factor {}".format(i+1) for i in range(H.T.shape[1])]

    # Factor composition (number of factors x number of variables).
    comp = pd.DataFrame(W.T,index=R.columns,columns=COLS_TO_INCLUDE)

    # For each variable, compute the share of its total that each factor explains.
    rows = []
    for i, col in enumerate(comp.columns):
        # Per-sample contribution of every factor to this variable
        # (samples x factors), summed over the samples. The values are still on
        # the normalized, weighted scale; no de-normalization is applied here.
        by_factor = pd.DataFrame(comp.iloc[:,i].values * H.T).sum()

        # Divide by the total amount of this variable in the sub-sample.
        by_factor /= sub[col].sum()

        rows.append(pd.DataFrame(by_factor, columns = [col]).T)

    res = pd.concat(rows)
    res.columns = R.columns

    # Whatever the factors do not explain is reported as the residual.
    res["Residual"] = 1-res.sum(axis=1)

    # Long format: one row per (variable, factor) pair.
    res= res.reset_index().melt(id_vars=["index"])

    # Tag the rows with the repetition number.
    res["iter"] = iteration
    frames.append(res)

# Combine all repetitions into a single table.
frame = pd.concat(frames,sort=False)


In [22]:
# Inspect the concatenated bootstrap results.
frame

Unnamed: 0,index,variable,value,iter
0,bin2_log_normalized,Factor 1,0.138415,0
1,bin0_log_normalized_weighted,Factor 1,0.502455,0
2,bin1_log_normalized_weighted,Factor 1,0.203777,0
3,co_log_normalized_weighted,Factor 1,0.506499,0
4,no_log_normalized_weighted,Factor 1,0.423260,0
...,...,...,...,...
30,bin1_log_normalized_weighted,Residual,0.000050,99
31,co_log_normalized_weighted,Residual,0.000013,99
32,no_log_normalized_weighted,Residual,0.006711,99
33,no2_log_normalized,Residual,0.000827,99


In [25]:
# Save the bootstrap table.
# Destination directory; `folder` is re-pointed from the input location to the
# bootstrap output location here.
folder = '/Users/laurayang/Dropbox/Dr.Ng Group/Yang/Project2-LCS/B_Data-ES&T Roof/bootstrap/NMF Raw-ARISense-Data/interpolated-raw-gas/log-normalized-weighted/'
output_file = folder + '5b-6min-4-factors.csv'

# reset_index() adds the current row index as a regular column before writing.
bootstrap_table = frame.reset_index()
bootstrap_table.to_csv(path_or_buf=output_file)