In [1]:
import numpy as  np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import os
import glob
import itertools
from tqdm import tqdm


In [3]:
# Get list of sheets within excel file
def get_sheet_names(file_path):
    xl = pd.ExcelFile(file_path)
    return xl.sheet_names

get_sheet_names(r'Final Master Data.xlsx')

['Sampling Cycles',
 'Meterological Data',
 'PM Data',
 'LAMPS FT Raw Data',
 'LAMPS DN Raw Data',
 'CampSci Reference Panel',
 'Day Fraction',
 'LAMPS Calculated Soiling Loss',
 'Additional Data']

# LAMPS FT

In [15]:
# Read sheet name 'LAMPS FT Raw Data'
df = pd.read_excel(r'Final Master Data.xlsx', sheet_name='LAMPS FT Raw Data', skiprows=0)

# Drop rows with all NaN values
df.dropna(axis=0, how='all', inplace=True)

# Strip whitespace from column names
df.columns = df.columns.str.strip()

# Drop column 'millis'
df.drop('millis', axis=1, inplace=True)

# Set 'Date' column as datetime
df.drop('Date', axis=1, inplace=True)
df['Date'] = pd.to_datetime(np.load('FT_Dates.npy'))

# # Merge Date and Time columns
df['DateTime'] = pd.to_datetime(df['Date'].astype('str') + ' ' + df['Time'].astype('str'))

# # Drop Date and Time columns
df.drop(['Date', 'Time'], axis=1, inplace=True)

# Only keep rows with DateTime values between 2022-01-10 and 2022-04-19 (inclusive)
df = df[(df['DateTime'] >= '2022-01-10T00:00:00') & (df['DateTime'] <= '2022-04-20T00:00:00')]

# Resample to minute frequency and interpolate missing values
# df = df.resample('T', on='DateTime').mean().interpolate()

df

Unnamed: 0,voltage1,voltage2,DateTime
57986,0.02,0.02,2022-01-10 00:00:41
57987,0.02,0.02,2022-01-10 00:01:40
57988,0.02,0.02,2022-01-10 00:02:39
57989,0.02,0.02,2022-01-10 00:03:38
57990,0.02,0.02,2022-01-10 00:04:37
...,...,...,...
203393,0.02,0.02,2022-04-19 23:55:37
203394,0.02,0.02,2022-04-19 23:56:36
203395,0.02,0.02,2022-04-19 23:57:35
203396,0.02,0.02,2022-04-19 23:58:34


In [25]:
total_dates = pd.date_range(start='2022-01-10T00:00:00', end='2022-04-20T00:00:00', freq='T')
available_dates = df['DateTime'].dt.floor('T')

In [26]:
# Find missing minutes in available dates compared to total dates
missing_dates = total_dates[~total_dates.isin(available_dates)]
missing_dates

DatetimeIndex(['2022-01-12 20:38:00', '2022-01-20 17:56:00',
               '2022-01-20 17:57:00', '2022-01-20 17:58:00',
               '2022-01-20 17:59:00', '2022-01-20 18:00:00',
               '2022-01-20 18:01:00', '2022-01-20 18:02:00',
               '2022-01-20 18:03:00', '2022-01-31 18:09:00',
               ...
               '2022-04-08 18:41:00', '2022-04-08 18:42:00',
               '2022-04-08 18:43:00', '2022-04-08 18:44:00',
               '2022-04-08 18:45:00', '2022-04-08 18:46:00',
               '2022-04-08 18:47:00', '2022-04-08 18:48:00',
               '2022-04-18 03:10:00', '2022-04-20 00:00:00'],
              dtype='datetime64[ns]', length=1007, freq=None)

In [16]:
df.resample('T', on='DateTime').mean().interpolate()

Unnamed: 0_level_0,voltage1,voltage2
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-01-10 00:00:00,0.02,0.02
2022-01-10 00:01:00,0.02,0.02
2022-01-10 00:02:00,0.02,0.02
2022-01-10 00:03:00,0.02,0.02
2022-01-10 00:04:00,0.02,0.02
...,...,...
2022-04-19 23:55:00,0.02,0.02
2022-04-19 23:56:00,0.02,0.02
2022-04-19 23:57:00,0.02,0.02
2022-04-19 23:58:00,0.02,0.02


# LAMPS DN

In [27]:
# Read sheet name 'LAMPS FT Raw Data'
df = pd.read_excel(r'Final Master Data.xlsx', sheet_name='LAMPS DN Raw Data', skiprows=0)

# Drop rows with all NaN values
df.dropna(axis=0, how='all', inplace=True)

# Strip whitespace from column names
df.columns = df.columns.str.strip()

# Drop column 'millis'
df.drop('millis', axis=1, inplace=True)

# Set 'Date' column as datetime
df.drop('Date', axis=1, inplace=True)
df['Date'] = pd.to_datetime(np.load('DN_Dates.npy'))

# # Merge Date and Time columns
df['DateTime'] = pd.to_datetime(df['Date'].astype('str') + ' ' + df['Time'].astype('str'))

# # Drop Date and Time columns
df.drop(['Date', 'Time'], axis=1, inplace=True)

# Only keep rows with DateTime values between 2022-01-10 and 2022-04-19 (inclusive)
df = df[(df['DateTime'] >= '2022-01-10T00:00:00') & (df['DateTime'] <= '2022-04-20T00:00:00')]

# Resample to minute frequency and interpolate missing values
# df = df.resample('T', on='DateTime').mean().interpolate()

df

Unnamed: 0,voltage1,voltage2,DateTime
58920,0.00,0.00,2022-01-10 00:00:42
58921,0.00,0.00,2022-01-10 00:01:41
58922,0.00,0.00,2022-01-10 00:02:40
58923,0.00,0.00,2022-01-10 00:03:39
58924,0.00,0.00,2022-01-10 00:04:38
...,...,...,...
204323,0.02,0.02,2022-04-19 23:56:01
204324,0.02,0.02,2022-04-19 23:57:00
204325,0.02,0.02,2022-04-19 23:57:59
204326,0.02,0.02,2022-04-19 23:58:58


In [28]:
total_dates = pd.date_range(start='2022-01-10T00:00:00', end='2022-04-20T00:00:00', freq='T')
available_dates = df['DateTime'].dt.floor('T')

In [29]:
# Find missing minutes in available dates compared to total dates
missing_dates = total_dates[~total_dates.isin(available_dates)]
missing_dates

DatetimeIndex(['2022-01-16 01:52:00', '2022-01-16 01:54:00',
               '2022-01-17 13:31:00', '2022-01-20 17:55:00',
               '2022-01-20 17:56:00', '2022-01-20 17:57:00',
               '2022-01-20 17:58:00', '2022-01-20 17:59:00',
               '2022-01-20 18:00:00', '2022-01-20 18:01:00',
               ...
               '2022-04-08 18:39:00', '2022-04-08 18:40:00',
               '2022-04-08 18:41:00', '2022-04-08 18:42:00',
               '2022-04-08 18:43:00', '2022-04-08 18:44:00',
               '2022-04-08 18:45:00', '2022-04-08 18:46:00',
               '2022-04-08 18:47:00', '2022-04-20 00:00:00'],
              dtype='datetime64[ns]', length=1015, freq=None)