# Importing and exploring environmental data

In [1]:
def import_csv(file):
    """Load a CSV source into a pandas DataFrame.

    Parameters
    ----------
    file
        Anything ``pandas.read_csv`` accepts as its first argument
        (for example a file path); it is passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        The table parsed from ``file`` using the default ``read_csv`` options.
    """
    # pandas is imported locally, so this helper does not rely on a
    # module-level import having been executed beforehand.
    from pandas import read_csv

    return read_csv(file)

In [9]:
# Relative location of the raw CSV file to load
path = "../data/Terminos_lagoon_TA_DIC__2023_RawData.csv"

# Read the CSV into a DataFrame with the import_csv helper
CO2Data = import_csv(file=path)

## Exploring the DataFrame


In [3]:
# Report the DataFrame dimensions as a (rows, columns) tuple
table_dimensions = CO2Data.shape
print(table_dimensions)

(106, 21)


In [4]:
# Preview the first rows of the DataFrame
first_rows = CO2Data.head()
print(first_rows)

   Sample      Date     Estuary   Area Station Layer_depth Season  \
0  CDL01S  5/3/2020  Candelaria  River   CDL01     Surface    Dry   
1  CDL01F  5/3/2020  Candelaria  River   CDL01      Bottom    Dry   
2  CDL02S  5/3/2020  Candelaria  River   CDL02     Surface    Dry   
3  CDL02F  5/3/2020  Candelaria  River   CDL02      Bottom    Dry   
4  CDL03S  5/3/2020  Candelaria  River   CDL03     Surface    Dry   

   Chlorophy_microg_L  Cond_microsiemens_cm  Depth_m  ...  DO_mg_L  Sal_psu  \
0                0.36                7015.4    0.464  ...     7.12     3.56   
1                4.19               29886.1    7.792  ...     4.90    16.97   
2                0.92               16691.1    0.453  ...     6.99     8.94   
3                2.23               24847.4    1.261  ...     6.52    13.87   
4                0.58               46341.6    0.465  ...     6.24    28.06   

   Sp_cond_microsiemens_cm  Turbidity_fnu  Temp_C  latitude  longitude  \
0                   6547.7          

- Print information about a DataFrame including the index dtype and columns, non-null values and memory usage

In [5]:
# Summarise the DataFrame: index dtype, columns, non-null counts and memory usage.
# DataFrame.info() writes its report directly to stdout and returns None, so it is
# called on its own; wrapping it in print() would append a stray "None" line.
CO2Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106 entries, 0 to 105
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Sample                   106 non-null    object 
 1   Date                     106 non-null    object 
 2   Estuary                  106 non-null    object 
 3   Area                     106 non-null    object 
 4   Station                  106 non-null    object 
 5   Layer_depth              106 non-null    object 
 6   Season                   106 non-null    object 
 7   Chlorophy_microg_L       106 non-null    float64
 8   Cond_microsiemens_cm     106 non-null    float64
 9   Depth_m                  106 non-null    float64
 10  DO_percent_sat           106 non-null    float64
 11  DO_mg_L                  106 non-null    float64
 12  Sal_psu                  106 non-null    float64
 13  Sp_cond_microsiemens_cm  106 non-null    float64
 14  Turbidity_fnu            1

In [6]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column
summary_statistics = CO2Data.describe()
print(summary_statistics)

       Chlorophy_microg_L  Cond_microsiemens_cm     Depth_m  DO_percent_sat  \
count          106.000000            106.000000  106.000000      106.000000   
mean             6.545472          27895.183962    1.830160       89.515094   
std             14.941262          20931.232513    2.038739       29.772291   
min              0.360000             13.800000    0.105000        1.700000   
25%              2.555000           1778.025000    0.428750       84.575000   
50%              3.705000          33202.600000    0.638500       97.100000   
75%              5.925000          47046.650000    2.883250      105.300000   
max            150.900000          59988.600000    8.558000      174.100000   

          DO_mg_L     Sal_psu  Sp_cond_microsiemens_cm  Turbidity_fnu  \
count  106.000000  106.000000               106.000000     106.000000   
mean     6.474340   17.331981             27126.980189     100.429623   
std      2.104254   13.578980             20527.530804     290.290113

-  Group the data by a categorical column, and calculate the mean and standard deviation of a numerical column

In [7]:
# Mean and standard deviation of TA and DIC for each season
TA_DIC_season = (
    CO2Data
    .groupby("Season")[['TA_micromol_kg', 'DIC_micromol_kg']]
    .agg(['mean', 'std'])
)

print(TA_DIC_season)

       TA_micromol_kg             DIC_micromol_kg            
                 mean         std            mean         std
Season                                                       
Dry       3092.333333  452.211935     2942.625000  521.669731
Rainy     2532.970588  228.041262     2491.676471  259.152198


In [8]:

# Mean and standard deviation of TA and DIC for each (season, area) combination
TA_DIC_Season_Area = (
    CO2Data
    .groupby(["Season", "Area"])[['TA_micromol_kg', 'DIC_micromol_kg']]
    .agg(['mean', 'std'])
)

print(TA_DIC_Season_Area)

             TA_micromol_kg             DIC_micromol_kg            
                       mean         std            mean         std
Season Area                                                        
Dry    Coast    2859.125000  335.087440     2562.208333  134.178654
       Plume    2908.291667  194.581039     2751.208333  258.145894
       River    3509.583333  455.370171     3514.458333  474.327165
Rainy  Coast    2497.000000  317.024232     2290.250000   68.637553
       Plume    2449.636364   80.818652     2393.272727  133.077489
       River    2655.545455  169.717626     2809.818182  167.515264


-  Export the grouped data to an Excel file

In [17]:
# Write the Season/Area summary table to an Excel workbook (relative path)
TA_DIC_Season_Area.to_excel(excel_writer='TA_DIC_Season_Areas.xlsx')

# Missing values in each column.

In [13]:
# Flag every missing cell with isnull(), then total the flags per column
missing_mask = CO2Data.isnull()
print(missing_mask.sum())

Sample                     0
Date                       0
Estuary                    0
Area                       0
Station                    0
Layer_depth                0
Season                     0
Chlorophy_microg_L         0
Cond_microsiemens_cm       0
Depth_m                    0
DO_percent_sat             0
DO_mg_L                    0
Sal_psu                    0
Sp_cond_microsiemens_cm    0
Turbidity_fnu              0
Temp_C                     0
latitude                   0
longitude                  0
DIC_micromol_kg            0
TA_micromol_kg             0
dummy_data                 7
dtype: int64


### Methods for filling missing values (holes) in a Series or DataFrame:

- ffill: propagate last valid observation forward to next valid.

- bfill: use next valid observation to fill gap.

In [14]:
# Work on a copy so the original CO2Data is left untouched
CO2Data_fill = CO2Data.copy()

# Forward-fill: propagate the last valid observation into the gaps that follow it.
# DataFrame.ffill() is used instead of fillna(method="ffill") because the `method`
# argument of fillna is deprecated in recent pandas releases.
# Note: a gap at the very start of a column has no previous value and stays missing.
CO2Data_fill = CO2Data_fill.ffill()

# Show how many missing values remain in each column
print(CO2Data_fill.isnull().sum())

Sample                     0
Date                       0
Estuary                    0
Area                       0
Station                    0
Layer_depth                0
Season                     0
Chlorophy_microg_L         0
Cond_microsiemens_cm       0
Depth_m                    0
DO_percent_sat             0
DO_mg_L                    0
Sal_psu                    0
Sp_cond_microsiemens_cm    0
Turbidity_fnu              0
Temp_C                     0
latitude                   0
longitude                  0
DIC_micromol_kg            0
TA_micromol_kg             0
dummy_data                 0
dtype: int64


#### Interpolation for filling missing values 

In [15]:
# Work on a copy so the original CO2Data is left untouched
CO2Data_fill_linear = CO2Data.copy()

# Linear interpolation is only defined for numeric data. Calling interpolate() on a
# frame that also holds object (text) columns is deprecated in recent pandas, so
# only the numeric columns are interpolated and all other columns are left as-is.
numeric_columns = CO2Data_fill_linear.select_dtypes(include="number").columns
CO2Data_fill_linear[numeric_columns] = (
    CO2Data_fill_linear[numeric_columns].interpolate(method='linear')
)

# Show how many missing values remain in each column
print(CO2Data_fill_linear.isnull().sum())


Sample                     0
Date                       0
Estuary                    0
Area                       0
Station                    0
Layer_depth                0
Season                     0
Chlorophy_microg_L         0
Cond_microsiemens_cm       0
Depth_m                    0
DO_percent_sat             0
DO_mg_L                    0
Sal_psu                    0
Sp_cond_microsiemens_cm    0
Turbidity_fnu              0
Temp_C                     0
latitude                   0
longitude                  0
DIC_micromol_kg            0
TA_micromol_kg             0
dummy_data                 0
dtype: int64
