# Time Series Analysis & Forecasting

This notebook explores and applies several time series analysis and forecasting (TSA/F) methods.

It requires a `./data` directory containing one CSV file per year of data to be used, named `<year>_BME680SensorData.csv` (e.g. `./data/2018_BME680SensorData.csv`).

In [6]:
# Imports
import pandas as pd
from statsmodels.tsa.ar_model import AutoReg
from sklearn.model_selection import TimeSeriesSplit
import matplotlib.pyplot as plt

In [7]:
# Read in data
# One CSV per calendar year lives in ./data; 2018 is the first file and `years`
# lists the ones that follow it.
years = ["2019", "2020", "2021", "2022", "2023", "2024"]
columns = ["ISO Time", "Temperature (C)", "Humidity (%)", "Pressure (kPa)", "Air Quality (Ohms)"]

# Read every yearly file first and concatenate once. Calling pd.concat inside
# the loop re-copies the whole accumulated frame on each iteration.
yearly_frames = [
    pd.read_csv(f"./data/{year}_BME680SensorData.csv", usecols=columns)
    for year in ["2018", *years]
]
all_data = pd.concat(yearly_frames, ignore_index=True)

print("Data Collected")
all_data.head()

Data Collected


Unnamed: 0,ISO Time,Temperature (C),Humidity (%),Pressure (kPa),Air Quality (Ohms)
0,2018-01-01 00:00:02,19.6,44.49,99.436,12946860.0
1,2018-01-01 00:01:02,19.61,44.46,99.435,12946860.0
2,2018-01-01 00:02:01,19.62,44.46,99.433,12946860.0
3,2018-01-01 00:03:02,19.63,44.39,99.425,12946860.0
4,2018-01-01 00:04:02,19.62,44.39,99.433,12946860.0


### General Preprocessing

In [8]:
# Preprocessing

# Parse the timestamp strings into datetimes so the `.dt` accessor is available.
all_data['ISO Time'] = pd.to_datetime(all_data['ISO Time'])
# 'ISO Time' intentionally stays a regular column rather than becoming the index:
# the checks below and the daily grouping step read it as `all_data['ISO Time']`.
# (`DataFrame.set_index` returns a new frame, so calling it without using the
# result has no effect.)

# Assurance that there are datapoints from each year -- This may throw an error as more is added to the code base
expected_years = [2018, 2019, 2020, 2021, 2022, 2023, 2024]
found_years = list(all_data['ISO Time'].dt.year.unique())
assert found_years == expected_years, (
    f"Unexpected years in data: found {found_years}, expected {expected_years}"
)

# Number of observations (a bare expression in the middle of a cell is not
# displayed, so print it explicitly)
print("Number of observations", len(all_data))

print(all_data.head())

             ISO Time  Temperature (C)  Humidity (%)  Pressure (kPa)  \
0 2018-01-01 00:00:02            19.60         44.49          99.436   
1 2018-01-01 00:01:02            19.61         44.46          99.435   
2 2018-01-01 00:02:01            19.62         44.46          99.433   
3 2018-01-01 00:03:02            19.63         44.39          99.425   
4 2018-01-01 00:04:02            19.62         44.39          99.433   

   Air Quality (Ohms)  
0          12946860.0  
1          12946860.0  
2          12946860.0  
3          12946860.0  
4          12946860.0  


### Feature Expansion -- Computation of daily descriptive statistics

In [11]:
def daily_descriptive_stats(grouped, column):
    """Compute per-group descriptive statistics for one sensor column.

    Parameters
    ----------
    grouped : pandas DataFrameGroupBy
        The sensor readings grouped by calendar date.
    column : str
        Name of the column to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per group with columns ``min``, ``max``, ``mean``, ``median``
        and ``std``, indexed by the group keys converted with
        ``pd.to_datetime``.
    """
    stats = grouped[column].agg(
        min='min',
        max='max',
        mean='mean',
        median='median',
        std='std',
    )
    # The group keys come from `.dt.date`; converting them back to datetimes
    # lets later cells use datetime index attributes such as `.index.year`.
    stats.index = pd.to_datetime(stats.index)
    return stats


def report_stats(header, stats):
    """Print a header, the number of rows and the first rows of ``stats``."""
    print(header)
    print("Number of observations", len(stats))
    print(stats.head())


grouped_data = all_data.groupby(all_data['ISO Time'].dt.date)

# Temperature Celsius
temperature_stats = daily_descriptive_stats(grouped_data, 'Temperature (C)')
report_stats("Temperature Stats\n", temperature_stats)

# Humidity % moisture
humidity_stats = daily_descriptive_stats(grouped_data, 'Humidity (%)')
report_stats("\nHumidity Stats\n", humidity_stats)

# Pressure kPa
pressure_stats = daily_descriptive_stats(grouped_data, 'Pressure (kPa)')
report_stats("\nPressure Stats\n", pressure_stats)

# Air Quality Ohms
airq_stats = daily_descriptive_stats(grouped_data, 'Air Quality (Ohms)')
report_stats("\nAir Quality Stats\n", airq_stats)

list_of_dataframes = [temperature_stats, humidity_stats, pressure_stats, airq_stats]

Temperature Stats

Number of observations 2498
              min    max       mean  median       std
ISO Time                                             
2018-01-01  16.99  19.66  18.389903   18.34  0.685289
2018-01-02  14.66  17.01  15.835549   15.88  0.580612
2018-01-03  13.56  15.80  14.646667   14.65  0.507629
2018-01-04  13.97  16.85  15.515403   15.58  0.918085
2018-01-05  15.82  18.20  16.903662   17.00  0.367914

Humidity Stats

Number of observations 2498
              min    max       mean  median       std
ISO Time                                             
2018-01-01  32.13  44.49  36.907361  35.135  3.784848
2018-01-02  34.18  46.11  42.534833  43.950  3.395228
2018-01-03  39.87  56.14  48.947486  50.325  5.883896
2018-01-04  48.01  55.65  52.431951  53.550  2.577857
2018-01-05  38.92  50.26  42.455638  40.735  3.551891

Pressure Stats

Number of observations 2498
               min      max        mean    median       std
ISO Time                                       

# Time Series Forecasting

In [26]:
# Train - Test Split

def ts_train_test_split(df):
    """Split time-indexed data into train and test sets by calendar year.

    The most recent calendar year present in the index is held out as the
    test set; every earlier observation forms the training set.

    Parameters
    ----------
    df : pandas.Series or pandas.DataFrame
        Data whose index exposes a ``.year`` attribute (e.g. a
        ``DatetimeIndex``).

    Returns
    -------
    tuple
        ``(train, test)`` where ``train`` holds the rows from years before the
        final year and ``test`` holds the rows from the final year.

    Raises
    ------
    AttributeError
        If ``df.index`` has no ``.year`` attribute.
    """
    index_years = df.index.year
    final_year = index_years.max()
    # Boolean-mask selection keeps the original row order in both splits.
    train = df[index_years < final_year]
    test = df[index_years == final_year]
    return train, test

# Daily median temperature (C): hold out the most recent calendar year as the test set
temp_medians = temperature_stats['median']
last_year = temp_medians.index.year.max()
temperature_medians_train = temp_medians.loc[lambda s: s.index.year < last_year]
temperature_medians_test = temp_medians.loc[lambda s: s.index.year == last_year]

# Show the date range of each split to confirm the test set is the final year
print(f"Training data period: {temperature_medians_train.index.min()} to {temperature_medians_train.index.max()}")
print(f"Test data period: {temperature_medians_test.index.min()} to {temperature_medians_test.index.max()}")




Training data period: 2018-01-01 00:00:00 to 2023-12-31 00:00:00
Test data period: 2024-01-01 00:00:00 to 2024-12-31 00:00:00


### Auto Regression (AR)