# Importing and exploring environmental data using Pandas

In [3]:
import pandas as pd

# Load the raw TA/DIC measurements from the CSV file into a DataFrame.
# The path is relative, so the file must sit in the notebook's working directory.
RAW_DATA_CSV = "MartinezTrejo_etal_2023_TA_DIC_RawData.csv"

CO2Data = pd.read_csv(RAW_DATA_CSV)

## Explore a Pandas DataFrame in Python


- Get the number of rows and columns in the DataFrame

In [4]:
# .shape is a (number_of_rows, number_of_columns) tuple
table_dimensions = CO2Data.shape
print(table_dimensions)

(106, 21)


- Print the first few rows of the DataFrame

In [5]:
# head() returns the first five rows when called without an argument
first_rows = CO2Data.head()
print(first_rows)

   Sample        Date     Estuary  ... DIC_micromol_kg TA_micromol_kg dummy_data
0  CDL01S  05/03/2020  Candelaria  ...            3915           3863     3863.0
1  CDL01F  05/03/2020  Candelaria  ...            3698           3685     3685.0
2  CDL02S  05/03/2020  Candelaria  ...            3724           3708     3708.0
3  CDL02F  05/03/2020  Candelaria  ...            3667           3992     3992.0
4  CDL03S  05/03/2020  Candelaria  ...            2928           3023     3023.0

[5 rows x 21 columns]


- Get information about the columns in the DataFrame using the `info()` method

In [6]:
# info() writes its report (index range, column names, non-null counts, dtypes)
# directly to standard output and returns None, so it is called on its own:
# wrapping it in print() would add a stray "None" line after the report.
CO2Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106 entries, 0 to 105
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Sample                   106 non-null    object 
 1   Date                     106 non-null    object 
 2   Estuary                  106 non-null    object 
 3   Area                     106 non-null    object 
 4   Station                  106 non-null    object 
 5   Layer_depth              106 non-null    object 
 6   Season                   106 non-null    object 
 7   Chlorophy_microg_L       106 non-null    float64
 8   Cond_microsiemens_cm     106 non-null    float64
 9   Depth_m                  106 non-null    float64
 10  DO_percent_sat           106 non-null    float64
 11  DO_mg_L                  106 non-null    float64
 12  Sal_psu                  106 non-null    float64
 13  Sp_cond_microsiemens_cm  106 non-null    float64
 14  Turbidity_fnu            1

- Summarize the statistics of each numeric column in the DataFrame

In [7]:
# describe() reports count, mean, std, min, quartiles and max per column
summary_statistics = CO2Data.describe()
print(summary_statistics)

       Chlorophy_microg_L  Cond_microsiemens_cm  ...  TA_micromol_kg   dummy_data
count          106.000000            106.000000  ...      106.000000   100.000000
mean             6.545472          27895.183962  ...     2912.915094  2912.490000
std             14.941262          20931.232513  ...      472.694346   480.980553
min              0.360000             13.800000  ...     2357.000000  2357.000000
25%              2.555000           1778.025000  ...     2585.500000  2571.250000
50%              3.705000          33202.600000  ...     2823.000000  2816.000000
75%              5.925000          47046.650000  ...     3053.750000  3048.750000
max            150.900000          59988.600000  ...     4307.000000  4307.000000

[8 rows x 14 columns]


- Group the data by a categorical column, and calculate the mean and standard deviation of numerical columns

In [8]:
# Mean and standard deviation of the TA and DIC columns for each season
carbonate_columns = ['TA_micromol_kg', 'DIC_micromol_kg']
summary_functions = ['mean', 'std']

grouped_data = (
    CO2Data
    .groupby("Season")[carbonate_columns]
    .agg(summary_functions)
)
print(grouped_data)

       TA_micromol_kg             DIC_micromol_kg            
                 mean         std            mean         std
Season                                                       
Dry       3092.333333  452.211935     2942.625000  521.669731
Rainy     2532.970588  228.041262     2491.676471  259.152198


In [9]:
# Same summary as the per-season table, now split by season and estuary.
# Grouping by two keys gives the result a two-level (Season, Estuary) row index.
group_keys = ["Season", "Estuary"]
carbonate_columns = ['TA_micromol_kg', 'DIC_micromol_kg']
summary_functions = ['mean', 'std']

grouped_data = (
    CO2Data
    .groupby(group_keys)[carbonate_columns]
    .agg(summary_functions)
)
print(grouped_data)

                  TA_micromol_kg             DIC_micromol_kg            
                            mean         std            mean         std
Season Estuary                                                          
Dry    Candelaria    3076.777778  556.680114     2917.055556  630.585893
       Palizada      3107.888889  323.177154     2968.194444  391.242257
Rainy  Candelaria    2537.062500  192.848635     2474.125000  306.407980
       Palizada      2529.333333  260.962923     2507.277778  216.751092


- Count the missing values in each column.

In [10]:
# isnull() marks every missing cell as True; summing those flags column by
# column gives the number of missing values per column.
missing_flags = CO2Data.isnull()
missing_per_column = missing_flags.sum()

print(missing_per_column)

Sample                     0
Date                       0
Estuary                    0
Area                       0
Station                    0
Layer_depth                0
Season                     0
Chlorophy_microg_L         0
Cond_microsiemens_cm       0
Depth_m                    0
DO_percent_sat             0
DO_mg_L                    0
Sal_psu                    0
Sp_cond_microsiemens_cm    0
Turbidity_fnu              0
Temp_C                     0
latitude                   0
longitude                  0
DIC_micromol_kg            0
TA_micromol_kg             0
dummy_data                 6
dtype: int64


In [11]:
# Replace the missing values in the 'dummy_data' column with the median value of the column.

# Work on an independent copy: a plain `CO2Data_fixed = CO2Data` only creates a
# second name for the same DataFrame, so filling it would also overwrite the
# raw data that the earlier cells displayed.
CO2Data_fixed = CO2Data.copy()

# median() skips missing values by default, so it uses only the observed entries.
median_dummy_data = CO2Data['dummy_data'].median()

# Assign the filled column back explicitly. Calling fillna(..., inplace=True) on
# a column selection is chained assignment: it raises a FutureWarning in
# pandas 2.x and no longer updates the parent DataFrame under Copy-on-Write.
CO2Data_fixed['dummy_data'] = CO2Data_fixed['dummy_data'].fillna(median_dummy_data)

# Confirm that no column of the cleaned copy still has missing values.
print(CO2Data_fixed.isnull().sum())

Sample                     0
Date                       0
Estuary                    0
Area                       0
Station                    0
Layer_depth                0
Season                     0
Chlorophy_microg_L         0
Cond_microsiemens_cm       0
Depth_m                    0
DO_percent_sat             0
DO_mg_L                    0
Sal_psu                    0
Sp_cond_microsiemens_cm    0
Turbidity_fnu              0
Temp_C                     0
latitude                   0
longitude                  0
DIC_micromol_kg            0
TA_micromol_kg             0
dummy_data                 0
dtype: int64
