# **Data Exploration**

## **Import Dependencies**

In [179]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from glob import glob
from tqdm import tqdm

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas warnings raised by later cells; consider a narrower filter.
warnings.filterwarnings('ignore')

# Define the directories.
# The project root can be overridden with the STOMATA2TRANSPIRENET_DIR environment
# variable; when it is not set, the original hard-coded location is used.
work_dir = os.environ.get('STOMATA2TRANSPIRENET_DIR', r'd:\GITHUB\MSM-Research\Stomata2TranspireNet')
os.chdir(work_dir) # change the current working directory so the relative paths below resolve
# Build the relative paths with os.path.join so the separator matches the running OS
# (a literal backslash is only a path separator on Windows).
data_dir = os.path.join('data', 'raw')
out_data_dir = os.path.join('data', 'processed')

## **Read the Datasets**

In [180]:
# Load the image information workbook from the 'csv' folder of the raw data directory
info_xlsx_path = os.path.join(data_dir, 'csv', 'information.xlsx')
image_info_df = pd.read_excel(info_xlsx_path)

# Report the (rows, columns) of the table, then preview its first rows
print(image_info_df.shape)
image_info_df.head()

(428, 10)


Unnamed: 0,ID,Photo_name,Zoom,Date,Time,gsw,gbw,gtw,E,VPDleaf
0,—,,,YYYY-MM-DD,HH-MM,mol m-2 s-1,,,,KPa
1,167,0348.jpg,x2,2024-08-10 00:00:00,15-30,0.224415,2.914877,0.208372,2.571935,1.174389
2,168,0349.jpg,x2,2024-08-10 00:00:00,15-30,0.20035,2.915647,0.187468,2.051684,1.041221
3,169,0350.jpg,x2,2024-08-10 00:00:00,15-30,0.173362,2.915294,0.163631,1.996701,1.15891
4,170,0351.jpg,x2,2024-08-10 00:00:00,15-30,0.065583,2.915425,0.06414,0.845858,1.251926


In [181]:
# Read the environmental data.
# The workbook is read sheet by sheet, using the sheets named '6' ... '11'.
# Passing a list of sheet names lets read_excel open the file once and return
# a dict that maps each sheet name to its DataFrame.
weather_path = os.path.join(data_dir, 'csv', 'weather_data_yancheng.xls')
month_sheets = [str(month) for month in range(6, 12)]
sheets = pd.read_excel(weather_path, sheet_name=month_sheets)

# Discard the first and the last row of every sheet, then concatenate all
# sheets in one call instead of growing the frame inside a loop.
# NOTE(review): presumably those two rows are non-data rows (units / summary) - confirm.
monthly_frames = [sheets[name].iloc[1:-1] for name in month_sheets]
enviromental_df = pd.concat(monthly_frames, axis=0, ignore_index=True)

# Column headers may carry surrounding whitespace; normalise them
enviromental_df.columns = [col.strip() for col in enviromental_df.columns]
print(enviromental_df.shape)
enviromental_df.head()

(7121, 35)


Unnamed: 0,Date,Ambient Temperature,Dew Point,Relative Humidity,Soil Moisture,Water Potential,Salinity,Water Level,PM2.5,CO2,...,Ultraviolet Radiation,Heat Flux,30-minute Global Radiation,30-minute Diffuse Radiation,30-minute Direct Radiation,30-minute Net Radiation,30-minute Photosynthetically Active Radiation,30-minute Ultraviolet Radiation,30-minute Heat Flux,Sunshine Duration
0,2024-06-01 00:00,20.0,19.49,96.9,70.5,-2,10.2,0.25,69,587.8,...,0,937,0.0,0.452,0.0,-0.118,0.0,0.0,1.687,0.0
1,2024-06-01 00:30,20.5,19.71,95.2,70.5,-2,10.2,0.25,56,571.3,...,0,937,0.0,0.465,0.0,-0.124,0.0,0.0,1.687,0.0
2,2024-06-01 01:00,20.3,19.49,95.1,70.5,-2,10.21,0.25,47,571.3,...,0,937,0.0,0.463,0.0,-0.135,0.0,0.0,1.687,0.0
3,2024-06-01 01:30,20.4,19.69,95.7,70.5,-2,10.22,0.25,48,568.9,...,0,937,0.0,0.458,0.0,-0.125,0.0,0.0,1.687,0.0
4,2024-06-01 02:00,20.5,19.79,95.7,70.5,-2,10.23,1.19,47,573.2,...,0,937,0.0,0.464,0.0,-0.117,0.0,0.0,1.687,0.0


In [182]:
# Read the image paths.
# glob returns matches in arbitrary order, so sort them for a reproducible listing.
image_paths = sorted(glob(os.path.join(data_dir, 'image', '*.jpg')))
# os.path.basename uses the separator of the running OS, whereas splitting on '\\'
# only isolates the file name on Windows.
image_names = [os.path.basename(path) for path in image_paths]
print('Number of images:', len(image_paths))

Number of images: 457


## **Data Cleaning**

### **Image Information**

In [183]:
# Clean the image information table without mutating the loaded frame in place.
# Row 0 of the sheet carries unit / format hints (e.g. 'YYYY-MM-DD', 'HH-MM', 'KPa'),
# not a measurement, so it is removed first.
image_info_clean = image_info_df.drop(index=0)

# Merge the 'Date' and 'Time' columns into a single 'DateTime' column.
# 'Time' is stored as 'HH-MM'; only the date part of 'Date' is kept.
date_part = image_info_clean['Date'].astype(str).str.split().str[0]
time_part = image_info_clean['Time'].str.replace('-', ':')
image_info_clean = image_info_clean.assign(DateTime=pd.to_datetime(date_part + ' ' + time_part))

# Reorder the columns and cast in one step; astype returns a new frame, so no
# column is assigned onto a slice of another DataFrame.
# NOTE(review): 'gbw', 'gtw' and 'E' are left uncast, as before - confirm this is intended.
image_info_df = (
    image_info_clean[['ID', 'Photo_name', 'Zoom', 'DateTime', 'gsw', 'gbw', 'gtw', 'VPDleaf', 'E']]
    .astype({'gsw': float, 'VPDleaf': float})
)

print(image_info_df.shape)
image_info_df.head()

(427, 9)


Unnamed: 0,ID,Photo_name,Zoom,DateTime,gsw,gbw,gtw,VPDleaf,E
1,167,0348.jpg,x2,2024-08-10 15:30:00,0.224415,2.914877,0.208372,1.174389,2.571935
2,168,0349.jpg,x2,2024-08-10 15:30:00,0.20035,2.915647,0.187468,1.041221,2.051684
3,169,0350.jpg,x2,2024-08-10 15:30:00,0.173362,2.915294,0.163631,1.15891,1.996701
4,170,0351.jpg,x2,2024-08-10 15:30:00,0.065583,2.915425,0.06414,1.251926,0.845858
5,171,0352.jpg,x2,2024-08-10 15:30:00,0.095736,2.916392,0.092693,1.354014,1.322427


In [184]:
# Save the cleaned image information dataframe (disabled: uncomment the next line to write the CSV)
# image_info_df.to_csv(os.path.join(out_data_dir, 'information.csv'), index=False)

### **Environmental Data**

In [185]:
# Build the final environmental table: a parsed 'DateTime' column followed by
# every measurement column cast to float, sorted chronologically.
enviromental_df['DateTime'] = pd.to_datetime(enviromental_df['Date'])

# Select the measurement columns by name rather than by position, so the result
# does not depend on where 'Date' / 'DateTime' sit in the frame.
value_cols = [col for col in enviromental_df.columns if col not in ('Date', 'DateTime')]

# Cast all measurements at once and assemble a new frame; this avoids assigning
# columns one by one onto a slice of enviromental_df.
enviromental_df_final = pd.concat(
    [enviromental_df[['DateTime']], enviromental_df[value_cols].astype(float)],
    axis=1,
)

# Sort by time without mutating in place; the original index labels are kept
enviromental_df_final = enviromental_df_final.sort_values('DateTime')
print(enviromental_df_final.shape)
enviromental_df_final.head()

(7121, 35)


Unnamed: 0,DateTime,Ambient Temperature,Dew Point,Relative Humidity,Soil Moisture,Water Potential,Salinity,Water Level,PM2.5,CO2,...,Ultraviolet Radiation,Heat Flux,30-minute Global Radiation,30-minute Diffuse Radiation,30-minute Direct Radiation,30-minute Net Radiation,30-minute Photosynthetically Active Radiation,30-minute Ultraviolet Radiation,30-minute Heat Flux,Sunshine Duration
0,2024-06-01 00:00:00,20.0,19.49,96.9,70.5,-2.0,10.2,0.25,69.0,587.8,...,0.0,937.0,0.0,0.452,0.0,-0.118,0.0,0.0,1.687,0.0
1,2024-06-01 00:30:00,20.5,19.71,95.2,70.5,-2.0,10.2,0.25,56.0,571.3,...,0.0,937.0,0.0,0.465,0.0,-0.124,0.0,0.0,1.687,0.0
2,2024-06-01 01:00:00,20.3,19.49,95.1,70.5,-2.0,10.21,0.25,47.0,571.3,...,0.0,937.0,0.0,0.463,0.0,-0.135,0.0,0.0,1.687,0.0
3,2024-06-01 01:30:00,20.4,19.69,95.7,70.5,-2.0,10.22,0.25,48.0,568.9,...,0.0,937.0,0.0,0.458,0.0,-0.125,0.0,0.0,1.687,0.0
4,2024-06-01 02:00:00,20.5,19.79,95.7,70.5,-2.0,10.23,1.19,47.0,573.2,...,0.0,937.0,0.0,0.464,0.0,-0.117,0.0,0.0,1.687,0.0


In [189]:
# Save the cleaned environmental dataframe (disabled: uncomment the next line to write the CSV)
# enviromental_df_final.to_csv(os.path.join(out_data_dir, 'weather_data.csv'), index=False)