In [None]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import os
from scipy import stats
import matplotlib.dates as mdates

import seaborn as sns
from pandas.plotting import scatter_matrix
from mpl_toolkits.mplot3d import Axes3D

%matplotlib inline 

# PCA functions

## PCA on covariance matrix

In [None]:
def PCA_hommade(df, colorbar, path=r'PCAfigure', savepath=None):
    '''Hand-made PCA on the covariance matrix of range-scaled data.

    Every column of ``df`` is mean-centred and divided by its range
    (max - min). The covariance matrix of the scaled data is eigendecomposed
    and two figures are drawn: the explained variance per principal component
    (individual and cumulative) and a scatter of the observations projected
    on PC1/PC2.

    df: the DataFrame to be analysed, one column per variable
    colorbar: vector of values used to colour the scatter points
    path: unused, kept only for backward compatibility
    savepath: None (does nothing) or path ending with name of figure
              (saves the explained-variance figure)

    Returns None. Raises ValueError when ``df`` has fewer than two columns.
    '''
    X_std = (df - df.mean()) / (df.max() - df.min())

    n_comp = X_std.shape[1]
    if n_comp < 2:
        raise ValueError('PCA_hommade needs at least two variables (columns)')

    # Compute the covariance matrix
    cov_mat = np.cov(X_std.T)

    # Eigendecomposition of the covariance matrix. The matrix is symmetric,
    # so eigh is used: it returns real eigenvalues/eigenvectors. The sign of
    # each eigenvector is arbitrary, as with any PCA.
    eig_val_cov, eig_vec_cov = np.linalg.eigh(cov_mat)

    # Order eigenvalues and their eigenvectors together, from high to low.
    # argsort avoids comparing eigenvector arrays when two eigenvalues tie.
    order = np.argsort(np.abs(eig_val_cov))[::-1]
    eig_val_cov = eig_val_cov[order]
    eig_vec_cov = eig_vec_cov[:, order]

    var_exp = eig_val_cov / eig_val_cov.sum() * 100
    cum_var_exp = np.cumsum(var_exp)
    positions = np.arange(n_comp)

    # plot explained variance by each PC (eigenvalues)
    with plt.style.context("seaborn-paper"):
        with plt.style.context("seaborn-white"):
            plt.figure(figsize=(4, 3))
            plt.bar(positions,
                    var_exp,
                    alpha=0.5,
                    align='center',
                    label='Individual explained variance',
                    color='grey')
            plt.step(positions,
                     cum_var_exp, where='mid',
                     label='Cumulative explained variance',
                     color='k')
            # x-limits leave room for every bar, including the last one
            plt.axis([-1, n_comp, 0, 100])
            plt.ylabel('Explained variance (%)')
            # one tick per component, labelled 1..n (no hard-coded count)
            plt.xticks(positions, [str(i + 1) for i in positions])
            plt.xlabel('Principal components')
            plt.legend(loc='best')

    if not pd.isnull(savepath):
        plt.savefig(savepath, dpi=300, bbox_inches='tight')
    plt.show()

    # Construct the transformation matrix W from the eigenvectors that
    # correspond to the k largest eigenvalues (at most 3, as before)
    k = min(3, n_comp)
    matrix_w_cov = eig_vec_cov[:, :k]

    # Transform the data using matrix W: one row per observation, one column per PC
    X_std_transf = np.asarray(X_std).dot(matrix_w_cov)

    # Plot the data on the first two principal components
    plt.subplots(figsize=(8, 6))
    cmap = plt.get_cmap('coolwarm')
    plt.scatter(X_std_transf[:, 0],
                X_std_transf[:, 1],
                c=colorbar,
                cmap=cmap,
                s=50)
    clb = plt.colorbar()
    clb.set_label('$N_2 O$')
    plt.xlabel('PC1')
    plt.ylabel('PC2')

## PCA from sklearn and visualization with vectors

In [None]:
def PCAskl(df, colorbar, path=r'PCAfigure', cbar='$N_2O$', marksize=50, savepath=None):
    '''PCAskl(df, colorbar)

    PCA with scikit-learn on range-scaled data, drawn as a biplot: the
    observations projected on PC1/PC2 plus one red arrow per input variable
    showing its loading on those two components.

    df: the DataFrame to be analysed, one column per variable
    colorbar: vector of values used to colour the points
    path: unused, kept only for backward compatibility
    cbar: text for colorbar name
    marksize: scalar or vector with the sizes of the circles
    savepath: None (does nothing) or path ending with name of figure (saves the figure)

    Returns (xs, ys): the coordinates of the observations on PC1 and PC2.
    Raises ValueError when ``df`` has fewer than two columns.
    '''

    # centre every column and scale it by its range (max - min)
    dat = (df - df.mean()) / (df.max() - df.min())

    n = len(dat.columns)
    if n < 2:
        raise ValueError('PCAskl needs at least two variables (columns)')

    # number of PCs equals the number of columns (ie number of features),
    # but can be set to any integer less than or equal to that value
    pca = PCA(n_components=n)
    pca.fit(dat)

    ## project data into PC space
    # 0,1 denote PC1 and PC2; change values for other PCs
    xvector = pca.components_[0]  # PC1 # see 'prcomp(my_data)$rotation' in R
    yvector = pca.components_[1]  # PC2

    # transform once and slice, instead of transforming once per axis
    scores = pca.transform(dat)  # see 'prcomp(my_data)$x' in R
    xs = scores[:, 0]
    ys = scores[:, 1]

    # arrows are stretched to the largest score on each axis;
    # computed once because they do not change inside the loop
    x_scale = xs.max()
    y_scale = ys.max()

    # visualize projections
    # Note: scale values for arrows and text are a bit inelegant as of now,
    #       so feel free to play around with them
    with plt.style.context("seaborn-paper"):
        with plt.style.context("seaborn-white"):
            fig, ax1 = plt.subplots(figsize=(3.3, 3))

            # arrows project features (ie columns of df) as vectors onto PC axes
            for name, x_load, y_load in zip(dat.columns.values, xvector, yvector):
                ax1.arrow(0, 0, x_load * x_scale, y_load * y_scale,
                          color='r',
                          width=0.0005,
                          head_width=0.0025)
                ax1.text(x_load * x_scale * 1.2, y_load * y_scale * 1.2,
                         name,
                         color='r')

            points = ax1.scatter(xs, ys,
                                 c=colorbar,
                                 cmap=plt.get_cmap('coolwarm'),
                                 s=marksize)
            ax1.set_xlabel('PC1')
            ax1.set_ylabel('PC2')

            clb = fig.colorbar(points, ax=ax1)
            clb.set_label(cbar)
            ax1.spines['top'].set_visible(False)
            ax1.spines['right'].set_visible(False)
    if not pd.isnull(savepath):
        plt.savefig(savepath, dpi=300, bbox_inches='tight')
    plt.show()

    return (xs, ys)

# Load data

In [None]:
# Data loading from pickle
# NOTE(review): despite its .txt extension this file is read with pd.read_pickle.
# Unpickling can execute arbitrary code, so only load it from a trusted source.
df = pd.read_pickle(r'../data/data_PCA.txt') 

<font size=5>Time series of all variables over the whole period</font>

In [None]:
# One panel per column of df, each with its own y label.
# NOTE(review): the labels assume the column order N2O, CO2, DO, NH_4, NO3, Qair_meas — confirm.
with plt.style.context("seaborn-paper"):
    with plt.style.context("seaborn-white"):

        ax = df.plot(subplots=True, figsize=(16, 10), style='o', sharex=True, legend=False, markersize=2)
        ax[0].set_ylabel('$N_2O$ $(ppm)$')
        ax[1].set_ylabel('$CO_2$ $(ppm)$')
        ax[2].set_ylabel('$DO$ $(mg/l)$')
        ax[3].set_ylabel('$NH_4$ $(mg/l)$')
        ax[4].set_ylabel('$NO_3$ $(mg/l)$')
        # braces are required: '$Q_air$' would subscript only the letter 'a'
        ax[5].set_ylabel('$Q_{air}$ $(m^3/h)$')
        for a in ax:
            a.spines['top'].set_visible(False)
            a.spines['right'].set_visible(False)

<font size=5>Cleaned period</font>

In [None]:
# Keep rows with NH_4 < 6 and CO2 > 15000, restricted to 21-25 August 2012.
# Both conditions are combined into a single mask: chaining df[mask1][mask2] applies a mask
# built on the full frame to an already filtered frame, which pandas has to reindex (with a warning).
ax = (df[(df['NH_4'] < 6) & (df['CO2'] > 15000)]
      .loc['2012/08/21':'2012/08/25']
      .plot(subplots=True, figsize=(16, 10), style='o', legend=False))
ax[0].set_ylabel('$N_2O$ $(ppm)$')
ax[1].set_ylabel('$CO_2$ $(ppm)$')
ax[2].set_ylabel('$DO$ $(mg/l)$')
ax[3].set_ylabel('$NH_4$ $(mg/l)$')
ax[4].set_ylabel('$NO_3$ $(mg/l)$')
# braces are required: '$Q_air$' would subscript only the letter 'a'
ax[5].set_ylabel('$Q_{air}$ $(m^3/h)$');

In [None]:
# Rows with NH_4 < 6 and CO2 > 15000, selected with one combined boolean mask
# (chained df[mask1][mask2] makes pandas reindex the second mask and emit a warning).
df_clean = df[(df['NH_4'] < 6) & (df['CO2'] > 15000)]

In [None]:
# Same filter (NH_4 < 6 and CO2 > 15000) restricted to 21-25 August 2012,
# using one combined mask instead of chained boolean indexing.
df_clean_end = df[(df['NH_4'] < 6) & (df['CO2'] > 15000)].loc['2012/08/21':'2012/08/25']

# Data preparation

## scatterplot with all the data available and no filtering

In [None]:
# Pairwise scatter plots of every variable in df_clean_end, with KDE curves on the diagonal
scatter_matrix(
    df_clean_end,
    alpha=1,
    figsize=(12, 12),
    diagonal='kde',
);
# plt.savefig('allVar_endAug12_scatter')

## outlier removal with 95th percentile on the whole dataset grouping by each minute of the day

In [None]:
# Pair plot of the 95th percentile computed for every (hour, minute) of the day
sns.pairplot(
    df_clean_end.groupby([df_clean_end.index.hour,
                          df_clean_end.index.minute]).quantile(0.95),
    diag_kind='kde',
)
# plt.savefig('allVar_q95_scatter', dpi=200)

In [None]:
# 95th percentile of air flow and ammonium for every (hour, minute) of the day.
# Each quantile series is computed once and reused for the axes and the colour index.
minute_of_day = [df_clean_end.index.hour, df_clean_end.index.minute]
qair_q95 = df_clean_end['Qair_meas'].groupby(minute_of_day).quantile(0.95)
nh4_q95 = df_clean_end['NH_4'].groupby(minute_of_day).quantile(0.95)

# points are coloured by their position in the (hour, minute) sequence
ax1 = plt.scatter(qair_q95,
                  nh4_q95,
                  c=range(len(qair_q95)),
                  linewidth=0)
ax1.axes.set_ylabel('$NH_4$ $mg/l$')
# braces are required: '$Q_air$' would subscript only the letter 'a'
ax1.axes.set_xlabel('$Q_{air}$ $m^3/h$')
clb = plt.colorbar()
clb.set_label('minutes of the day')

# plt.savefig('scatterQairNH4', dpi=200)

## visualization of 95th percentile results for each minute of the day

In [None]:
# 95th percentile of every variable for each (hour, minute) of the day, one panel per variable
ax = (df_clean_end
      .groupby([df_clean_end.index.hour, df_clean_end.index.minute])
      .quantile(0.95)
      .plot(figsize=(12, 8), subplots=True, legend=False))
ax[0].set_ylabel('$N_2O$ $(ppm)$')
ax[1].set_ylabel('$CO_2$ $(ppm)$')
ax[2].set_ylabel('$DO$ $(mg/l)$')
ax[3].set_ylabel('$NH_4$ $(mg/l)$')
ax[4].set_ylabel('$NO_3$ $(mg/l)$')
# braces are required: '$Q_air$' would subscript only the letter 'a'
ax[5].set_ylabel('$Q_{air}$ $(m^3/h)$')
ax[5].set_xlabel('$(hours, minutes)$');

# plt.savefig('q95min', dpi=200)

## per minute of the hour

In [None]:
# 95th percentile of every variable for each minute of the hour, one panel per variable
ax = (df_clean_end
      .groupby(df_clean_end.index.minute)
      .quantile(q=0.95)
      .plot(figsize=(12, 8), subplots=True, legend=False))

# (y-axis unit, legend entry, legend location) for each panel, in column order;
# loc=None lets matplotlib use its default legend location
panels = [('$ppm$', '$N_2O$', 4),
          ('$ppm$', '$CO_2$', None),
          ('$mg/l$', '$DO$', 4),
          ('$mg/l$', '$NH_4$', 4),
          ('$mg/l$', '$NO_3$', 4),
          ('$m^3/h$', '$Q_{air}$', 4)]
for a, (unit, name, loc) in zip(ax, panels):
    a.set_ylabel(unit)
    a.legend([name], loc=loc)

# the data are grouped by index.minute, so the x axis is the minute of the hour
ax[5].set_xlabel('$minutes$');
# plt.savefig('allVar_q95')

## per hour of the day

<div class="alert alert-success">
    <b> EXERCISE: </b> group data and plot it per hour of the day
</div>

<font size=5>Define <code>df_q95</code></font>

In [None]:
# Alternative groupings (swap in one of these to change the time resolution):
#   typical hour, per minute:
#     df_q95 = df_clean_end.groupby(df_clean_end.index.minute).quantile(q=0.95)
#   typical day, per (hour, minute):
#     df_q95 = df_clean_end.groupby([df_clean_end.index.hour, df_clean_end.index.minute]).quantile(q=0.95)
# Active choice: typical day in hours (95th percentile for each hour of the day)
df_q95 = (
    df_clean_end
    .groupby(by=df_clean_end.index.hour)
    .quantile(q=0.95)
)

<font size=5>Define a DataFrame without the $N_2O$ variable, to search for a PCA-based model</font>

In [None]:
# Predictor variables only: drop the N2O column.
# `columns=` replaces the positional axis argument (drop('N2O', 1)), which pandas 2.0 no longer accepts.
df_q95_NOn2o = df_q95.drop(columns='N2O')

# PCA on the covariance matrix for `df_q95` without $N_2O$

In [None]:
# Summary statistics of the variables that go into the PCA
df_q95_NOn2o.describe()

In [None]:
# Hand-made PCA on the predictors; points are coloured by the N2O values
PCA_hommade(df_q95_NOn2o, colorbar=df_q95['N2O'])

In [None]:
# scikit-learn PCA on the same predictors; points are coloured by the N2O values
PCAskl(df_q95_NOn2o, colorbar=df_q95['N2O'])

<div class="alert alert-success">
    <b> EXERCISE: </b> make all the plots appear with the same size
</div>

<font size=5>In fact, it seems that just 2 PCs are enough, since the low-emission points are not distinguished by the 3rd PC either</font>

## additional visualization

In [None]:
# Grid of kernel density estimates for df_q95:
# 1-D densities on the diagonal, 2-D density contours everywhere else
g = sns.PairGrid(data=df_q95)
g.map_diag(sns.kdeplot)
g.map_offdiag(
    sns.kdeplot,
    cmap="Blues_d",
    n_levels=6,
);

In [None]:
# PCA without CO2 and air flow; circle size is proportional to NH_4 (smallest value -> size 50).
# `columns=` replaces the positional axis argument (drop([...], 1)), which pandas 2.0 no longer accepts.
xs, ys = PCAskl(df_q95.drop(columns=['CO2', 'Qair_meas']),
                df_q95['N2O'],
                marksize=(df_q95['NH_4'] / df_q95['NH_4'].min()) * 50)

# Clustering

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.cluster as cluster
import time

In [None]:
# Two-column array (PC1, PC2) with one row per observation, used as clustering input
data = np.column_stack([xs, ys])

In [None]:
# K-means with the default number of clusters; random_state is fixed so the
# labels are the same on every run (centroid initialisation is random otherwise)
cluster.KMeans(random_state=0).fit_predict(data)

In [None]:
# scatter styling shared by every cluster plot
plot_kwds = {'alpha' : 1, 's' : 50, 'linewidths':0}
def plot_clusters(data, algorithm, args, kwds):
    '''Cluster ``data`` and draw the points coloured by cluster label.

    data: 2-D array; the first column is plotted as PC1 and the second as PC2
    algorithm: clustering class, instantiated as ``algorithm(*args, **kwds)``;
               the instance must provide ``fit_predict``
    args: positional arguments for ``algorithm``
    kwds: keyword arguments for ``algorithm``

    Points with a negative label are drawn in black. Returns None.
    '''
    labels = algorithm(*args, **kwds).fit_predict(data)

    with plt.style.context("seaborn-paper"):
        with plt.style.context("seaborn-white"):
            # Keep an explicit handle on the Axes that holds the scatter:
            # plt.axes() would add a new, empty Axes on top of it.
            fig, ax = plt.subplots(figsize=(3, 2.5))

            # one colour per non-negative label (at least one, so the palette is never empty)
            n_colors = max(int(np.max(labels)) + 1, 1)
            palette = sns.color_palette('muted', n_colors)
            colors = [palette[x] if x >= 0 else (0.0, 0.0, 0.0) for x in labels]

            ax.scatter(data.T[0], data.T[1], c=colors, **plot_kwds)
            ax.spines['top'].set_visible(False)
            ax.spines['right'].set_visible(False)
            ax.set_ylabel('PC2')
            ax.set_xlabel('PC1')

In [None]:
# PCA without CO2 and air flow, shown again for comparison with the clusters below;
# circle size is proportional to NH_4 (smallest value -> size 50).
# `columns=` replaces the positional axis argument (drop([...], 1)), which pandas 2.0 no longer accepts.
xs, ys = PCAskl(df_q95.drop(columns=['CO2', 'Qair_meas']),
                df_q95['N2O'],
                marksize=(df_q95['NH_4'] / df_q95['NH_4'].min()) * 50)

In [None]:
# K-means with 3 clusters on the (PC1, PC2) scores; random_state is fixed so the
# cluster assignment (and therefore the colours) is the same on every run
plot_clusters(data, cluster.KMeans, (), {'n_clusters': 3, 'random_state': 0})

<div class="alert alert-success">
    <b> EXERCISE: </b> try another clustering method
</div>