In [1]:
import pandas as pd
from matplotlib import pyplot as plt
from pathlib import Path, PurePath
from ISMN_data_cleaning_utils import soil_temp_col_names

# Data Understanding and Preparation of the ISMN Data

The goal is to generate descriptive statistics, create plots, and impute missing values for the <10 cm depth soil temperature variable for all ISMN sensors in `data/raw`.

#### Define variables to be used in later cells

In [2]:
# key variables
# max_depth is passed to collect_data() as `depth`: files whose depth field is
# greater than or equal to this value are skipped.
max_depth = 0.10 # in m, exclusive
# short_variable is compared against the variable field of each data file name;
# long_variable becomes the name of the measurement column in the loaded frames.
short_variable = 'ts' # soil temperature
long_variable = 'soil_temp'

# path variables
# Both paths are relative, so they resolve against the kernel's working directory.
# raw_data_path is joined with a station directory name when loading data.
raw_data_path = Path('data/raw')
# NOTE(review): cleaned_data_path is not used in the cells shown here; presumably
# the output location for processed data -- confirm.
cleaned_data_path = Path('data/cleaned')

# define data directory names
# One directory per station, each joined onto raw_data_path before loading.
dir_aberdeen = Path('Aberdeen-35-WNW')
dir_jamestown = Path('Jamestown-38-WSW')
dir_gobblers_knob = Path('GobblersKnob')
dir_nenana = Path('Nenana')
dir_L23 = Path('L23')
dir_L38 = Path('L38')
dir_NST_07 = Path('NST-07')
dir_NST_09 = Path('NST-09')
dir_SOD012 = Path('SOD012')
dir_SOD103 = Path('SOD103')

# # find all files
# for obj in data_dir.iterdir():
#     if obj.is_dir():  # data files stored in directories
#         for file_path in obj.iterdir():
#             print(PurePath(file_path).name)
#             file_name = PurePath(file_path).name.split('_')

#### Define functions to be used in later cells

In [7]:
def collect_data(path: Path, depth: float, short_feature: str, long_feature: str) -> pd.DataFrame:
    """
    Collect data for a station into a list then merge into a single df.

    Only files whose name ends in ``.stm`` are considered. Each file name is split on
    ``_``: the fourth field must equal ``short_feature`` and the fifth field, read as a
    float, must be smaller than ``depth``. Files whose names do not follow that layout
    are skipped. Matching files are read in sorted order so that the row order of the
    result is reproducible. The first line of every file is skipped and the remaining
    lines are parsed as space-separated values.

    :param path: path to directory for a station
    :param depth: max depth in meters, exclusive
    :param short_feature: abbreviated variable name, as it appears in the file name
    :param long_feature: full variable name, used as the name of the measurement column
    :return: combined_df with columns UTC_date, UTC_time, <long_feature>,
        ISMN_data_quality and provider_data_quality, indexed 0..n-1
    :raises ValueError: if no file in ``path`` matches the variable and depth
    """
    col_names = ['UTC_date', 'UTC_time', long_feature, 'ISMN_data_quality', 'provider_data_quality']

    dfs = []
    # iterdir() yields entries in arbitrary order; sort for a deterministic result
    for file in sorted(path.iterdir()):
        filename = file.name

        # skip if file extension is not .stm
        if not filename.endswith('.stm'):
            continue

        # skip files that do not have the variable and depth fields in their name
        filename_split = filename.split('_')
        if len(filename_split) < 5:
            continue
        try:
            file_depth = float(filename_split[4])
        except ValueError:
            continue

        # skip if file contains wrong variable or soil depth
        if filename_split[3] != short_feature or file_depth >= depth:
            continue

        df = pd.read_csv(file, sep=' ', header=None, skiprows=1, names=col_names)
        dfs.append(df)

    # pd.concat() on an empty list raises an unhelpful "No objects to concatenate"
    if not dfs:
        raise ValueError(
            f"No '.stm' files for variable '{short_feature}' with depth < {depth} m found in '{path}'"
        )

    combined_df = pd.concat(dfs, axis=0, ignore_index=True)

    return combined_df

def create_timestamp_col(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build a single timestamp column from the separate date and time columns.
    ISMN has the date and time in separate columns.
    :param df: frame with 'UTC_date' and 'UTC_time' columns; it is not modified
    :return: new df with 'UTC_timestamp' added and 'UTC_date' / 'UTC_time' removed
    """
    date_text = df['UTC_date'].astype(str)
    time_text = df['UTC_time'].astype(str)
    timestamps = pd.to_datetime(date_text + ' ' + time_text, format='%Y/%m/%d %H:%M')

    # drop() and assign() both return new frames, so the caller's df stays untouched
    return df.drop(columns=['UTC_date', 'UTC_time']).assign(UTC_timestamp=timestamps)

### Aberdeen-35-WNW

In [8]:
# Load every matching soil temperature file for this station, then merge the
# separate date and time columns into a single UTC timestamp column.
df_aberdeen = collect_data(raw_data_path / dir_aberdeen, max_depth, short_variable, long_variable)
df_aberdeen = create_timestamp_col(df_aberdeen)
# bare last expression so the notebook renders the preview as a table
df_aberdeen.head()

   soil_temp ISMN_data_quality provider_data_quality       UTC_timestamp
0        6.1                 G                     M 2009-10-28 02:00:00
1        6.1                 G                     M 2009-10-28 03:00:00
2        6.0                 G                     M 2009-10-28 04:00:00
3        6.0                 G                     M 2009-10-28 05:00:00
4        5.9                 G                     M 2009-10-28 06:00:00


### Nenana

In [5]:
df_nenana = collect_data(raw_data_path / dir_nenana, max_depth, short_variable, long_variable)