# Imports

In [16]:
import numpy as np
import pandas as pd

In [13]:
DATA_PATH = "../data/raw/jena_climate_2017_2024.csv"
df = pd.read_csv(DATA_PATH)
df["Date Time"] = pd.to_datetime(df["Date Time"])
df.set_index("Date Time", inplace=True)
df.head()

Unnamed: 0_level_0,p (mbar),T (degC),Tpot (K),Tdew (degC),rh (%),VPmax (mbar),VPact (mbar),VPdef (mbar),sh (g/kg),H2OC (mmol/mol),...,wv (m/s),max. wv (m/s),wd (deg),rain (mm),raining (s),SWDR (W/m²),PAR (µmol/m²/s),max. PAR (µmol/m²/s),Tlog (degC),CO2 (ppm)
Date Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-01 00:10:00,999.77,-4.91,268.27,-8.41,76.3,4.24,3.23,1.0,2.01,3.23,...,0.78,1.56,184.0,0.0,0.0,0.0,0.0,0.0,7.1,434.3
2017-01-01 00:20:00,999.63,-5.05,268.13,-8.37,77.4,4.19,3.24,0.95,2.02,3.24,...,1.52,1.92,202.6,0.0,0.0,0.0,0.0,0.0,7.72,434.1
2017-01-01 00:30:00,999.54,-4.98,268.21,-8.38,76.9,4.21,3.24,0.97,2.02,3.24,...,0.98,1.78,227.4,0.0,0.0,0.0,0.0,0.0,8.77,430.4
2017-01-01 00:40:00,999.4,-4.88,268.33,-8.56,75.2,4.25,3.19,1.05,1.99,3.2,...,1.16,1.8,212.5,0.0,0.0,0.0,0.0,0.0,9.36,430.6
2017-01-01 00:50:00,999.17,-5.17,268.06,-8.74,75.8,4.15,3.15,1.01,1.96,3.15,...,1.5,2.64,222.1,0.0,0.0,0.0,0.0,0.0,9.45,429.5


In [14]:
df.shape

(420782, 21)

# Removing Duplicates

In [15]:
df = df[~df.index.duplicated(keep='first')]
df.shape

(420631, 21)

# Missing Values

## Converting -9999 values to NaN

In [17]:
df = df.replace(-9999, np.nan)
df.shape

(420631, 21)

## Converting Erroneously Low CO2 Concentration to NaN

In [18]:
df.loc[df['CO2 (ppm)'] <= 375, 'CO2 (ppm)'] = np.nan
df.shape

(420631, 21)

In [19]:
df.isna().sum()

p (mbar)                  0
T (degC)                  0
Tpot (K)                  0
Tdew (degC)               0
rh (%)                    0
VPmax (mbar)              0
VPact (mbar)              0
VPdef (mbar)              0
sh (g/kg)                 0
H2OC (mmol/mol)           0
rho (g/m**3)              0
wv (m/s)                  1
max. wv (m/s)             0
wd (deg)                  0
rain (mm)                 0
raining (s)               0
SWDR (W/m²)               1
PAR (µmol/m²/s)           0
max. PAR (µmol/m²/s)    128
Tlog (degC)               0
CO2 (ppm)               822
dtype: int64

In [20]:
# Percentage of missing values

(df.isna().sum() / len(df))*100

p (mbar)                0.000000
T (degC)                0.000000
Tpot (K)                0.000000
Tdew (degC)             0.000000
rh (%)                  0.000000
VPmax (mbar)            0.000000
VPact (mbar)            0.000000
VPdef (mbar)            0.000000
sh (g/kg)               0.000000
H2OC (mmol/mol)         0.000000
rho (g/m**3)            0.000000
wv (m/s)                0.000238
max. wv (m/s)           0.000000
wd (deg)                0.000000
rain (mm)               0.000000
raining (s)             0.000000
SWDR (W/m²)             0.000238
PAR (µmol/m²/s)         0.000000
max. PAR (µmol/m²/s)    0.030430
Tlog (degC)             0.000000
CO2 (ppm)               0.195421
dtype: float64

## Imputing Missing Values using Interpolation

In [21]:
df.interpolate(method="time", inplace=True)
df.isna().sum()

p (mbar)                0
T (degC)                0
Tpot (K)                0
Tdew (degC)             0
rh (%)                  0
VPmax (mbar)            0
VPact (mbar)            0
VPdef (mbar)            0
sh (g/kg)               0
H2OC (mmol/mol)         0
rho (g/m**3)            0
wv (m/s)                0
max. wv (m/s)           0
wd (deg)                0
rain (mm)               0
raining (s)             0
SWDR (W/m²)             0
PAR (µmol/m²/s)         0
max. PAR (µmol/m²/s)    0
Tlog (degC)             0
CO2 (ppm)               0
dtype: int64

# Converting to Daily Resolution

In [22]:
df = df.resample("D").mean()
df.shape

(2923, 21)

# Saving the Data

In [23]:
DAILY_DATA_PATH = "../data/interim/daily_data.csv"

# df.to_csv(DAILY_DATA_PATH, index=True)