#  Importing and explore environmental data

In [None]:
def import_csv(file):
    # Read the file into a DataFrame: df
    import pandas as pd
    return pd.read_csv(file)

In [None]:
# Path to the file to be imported
path = "../data/Terminos_lagoon_TA_DIC__2023_RawData.csv"

# Import the file
CO2Data = import_csv(path)

## Exploring  DataFrame


In [None]:
# Print the number of rows and columns in the DataFrame
print(CO2Data.shape)

In [None]:
# Print the head of the DataFrame
print(CO2Data.head())

- Print information about a DataFrame including the index dtype and columns, non-null values and memory usage

In [None]:
# Print information about the DataFrame
print(CO2Data.info())

In [None]:
# Print descriptive statistics
print(CO2Data.describe())

-  Group the data by a categorical column, and calculate the mean and standard deviation of a numerical column

In [None]:
TA_DIC_season  = CO2Data.groupby("season")[['ta_micromol_kg', 'dic_micromol_kg']].agg(['mean', 'std'])

print(TA_DIC_season)

In [13]:

TA_DIC_Season_Area  = CO2Data.groupby(["season", "area"])[['ta_micromol_kg', 'dic_micromol_kg']].agg(['mean', 'std'])

print(TA_DIC_Season_Area)

             ta_micromol_kg             dic_micromol_kg            
                       mean         std            mean         std
season area                                                        
Dry    Coast    2859.125000  335.087440     2562.208333  134.178654
       Plume    2908.291667  194.581039     2751.208333  258.145894
       River    3509.583333  455.370171     3514.458333  474.327165
Rainy  Coast    2497.000000  317.024232     2290.250000   68.637553
       Plume    2449.636364   80.818652     2393.272727  133.077489
       River    2655.545455  169.717626     2809.818182  167.515264


# Missing values in each column.

In [18]:
# Use the Pandas isnull() function to identify the number of missing values in each column.
print(CO2Data.isnull().sum())

sample                     0
date                       0
estuary                    0
area                       0
station                    0
layer_depth                0
season                     0
chlorophy_microg_L         0
cond_microsiemens_cm       0
depth_m                    0
do_percent_sat             0
do_mg_l                    0
sal_psu                    0
sp_cond_microsiemens_cm    0
turbidity_fnu              0
temp_c                     0
latitude                   0
longitude                  0
dic_micromol_kg            0
ta_micromol_kg             0
dummy_data                 7
dtype: int64


### Method to use for filling holes in reindexed Series:

- ffill: propagate last valid observation forward to next valid.

- bfill: use next valid observation to fill gap.

In [19]:
# Create new data frame 
CO2Data_fill = CO2Data.copy()

# Use fill method 
CO2Data_fill = CO2Data_fill.fillna(method="ffill")

print(CO2Data_fill.isnull().sum())

sample                     0
date                       0
estuary                    0
area                       0
station                    0
layer_depth                0
season                     0
chlorophy_microg_L         0
cond_microsiemens_cm       0
depth_m                    0
do_percent_sat             0
do_mg_l                    0
sal_psu                    0
sp_cond_microsiemens_cm    0
turbidity_fnu              0
temp_c                     0
latitude                   0
longitude                  0
dic_micromol_kg            0
ta_micromol_kg             0
dummy_data                 0
dtype: int64


  CO2Data_fill = CO2Data_fill.fillna(method="ffill")


#### Interpolation for filling missing values 

In [20]:
CO2Data_fill_linear = CO2Data.copy()

CO2Data_fill_linear = CO2Data_fill_linear.interpolate(method='linear')

print(CO2Data_fill_linear.isnull().sum())


sample                     0
date                       0
estuary                    0
area                       0
station                    0
layer_depth                0
season                     0
chlorophy_microg_L         0
cond_microsiemens_cm       0
depth_m                    0
do_percent_sat             0
do_mg_l                    0
sal_psu                    0
sp_cond_microsiemens_cm    0
turbidity_fnu              0
temp_c                     0
latitude                   0
longitude                  0
dic_micromol_kg            0
ta_micromol_kg             0
dummy_data                 0
dtype: int64


  CO2Data_fill_linear = CO2Data_fill_linear.interpolate(method='linear')


### Homework:  
    - Create a new column called "TA_DIC_ratio" that is the ratio of TA to DIC
    - Calculate the mean and standard deviation of the "TA_DIC_ratio" for each season
    - Calculate the mean and standard deviation of the "TA_DIC_ratio" for each season and area
    - Save the results to an Excel file called "TA_DIC_Season_Areas.xlsx"
