## Data Processing

#### Loading the cloud sensor data

In [112]:
import pandas as pd
import numpy as np
import os

# Directories holding the raw data files for the two observing sites (Sodankyla and Kiruna).
# NOTE(review): these are absolute paths on a single machine, so the notebook will not
# run elsewhere as-is — consider deriving them from one configurable base directory.
sodankyla_dir = r"/Users/elliotdable/Documents/UCL/research/fabry_perot_ml/sodankyla_fpi/"
kiruna_dir = r"/Users/elliotdable/Documents/UCL/research/fabry_perot_ml/kiruna_fpi/"

In [150]:
# Load every cloud sensor CSV file in a directory and concatenate them into one DataFrame
def cloud_sensor_file_loader(directory):
    """Read all cloud sensor ``.csv`` files in ``directory`` into a single DataFrame.

    Files are read in sorted filename order so the row order is reproducible
    (``os.listdir`` returns entries in arbitrary order).

    Parameters
    ----------
    directory : str
        Folder containing the cloud sensor ``.csv`` files. Other files are ignored.

    Returns
    -------
    pandas.DataFrame
        Columns ``datetime``, ``sensorTemp``, ``skyTemp``, ``clarity``, ``light``
        and ``rain``. The index restarts at 0 for each source file, so index
        labels are NOT unique across the result; call ``reset_index(drop=True)``
        if unique labels are needed.

    Raises
    ------
    FileNotFoundError
        If ``directory`` contains no ``.csv`` files.
    """
    csv_files = sorted(name for name in os.listdir(directory) if name.endswith('.csv'))
    if not csv_files:
        raise FileNotFoundError(f"No .csv cloud sensor files found in {directory!r}")

    # Collect the per-file frames and concatenate once at the end; calling
    # pd.concat inside the loop re-copies everything read so far on every pass.
    frames = []
    for name in csv_files:
        df = pd.read_csv(os.path.join(directory, name), delimiter=',')

        # The files do not read in cleanly: the values land one column to the left
        # of their headers, with the first field consumed as the index. Shifting
        # one column to the right re-aligns them, and the index is then discarded.
        # The sensor ID is lost in the process, but it is dropped below anyway.
        # NOTE(review): pd.read_csv(..., index_col=False) may avoid this workaround —
        # confirm against a raw file before changing it.
        df = df.shift(axis=1).reset_index(drop=True)
        frames.append(df)

    dataframe = pd.concat(frames)

    # Drop the identifier columns, plus the separate date/time text columns that
    # are made redundant by the unified datetime column created below.
    dataframe = dataframe.drop(['Sensor ID', 'HardwareID', 'Date', 'Time'], axis=1)

    # Convert the unix timestamp (seconds) to datetime and rename columns to camelCase
    dataframe['UnixTimestamp'] = pd.to_datetime(dataframe['UnixTimestamp'], unit='s')
    dataframe = dataframe.rename(columns={
        "UnixTimestamp": "datetime",
        "SensorT": "sensorTemp",
        "SkyT": "skyTemp",
        "Clarity": "clarity",
        "Light": "light",
        "Rain": "rain",
    })

    return dataframe

In [151]:
# Load every Sodankyla cloud sensor CSV into one DataFrame and display it
sodankyla_cloud_data = cloud_sensor_file_loader(sodankyla_dir)
sodankyla_cloud_data

Unnamed: 0,datetime,sensorTemp,skyTemp,clarity,light,rain
0,2020-02-01 00:00:03,2.8,-12.9,15.6,0.0,-0.8
1,2020-02-01 00:01:04,2.8,-12.7,15.5,0.0,-0.1
2,2020-02-01 00:02:05,2.9,-14.1,17.0,0.0,-0.8
3,2020-02-01 00:03:06,3.0,-12.9,15.9,0.0,-0.5
4,2020-02-01 00:04:00,3.1,-14.7,17.8,0.0,-0.2
...,...,...,...,...,...,...
33044,2023-02-28 23:55:06,7.3,-23.6,30.9,0.0,-0.1
33045,2023-02-28 23:56:00,7.2,-22.3,29.5,0.0,-0.1
33046,2023-02-28 23:57:00,7.1,-21.3,28.4,0.0,-0.1
33047,2023-02-28 23:58:01,7.1,-20.2,27.3,0.0,-0.1


In [152]:
# Load every Kiruna cloud sensor CSV into one DataFrame and display it
kiruna_cloud_data = cloud_sensor_file_loader(kiruna_dir)
kiruna_cloud_data

Unnamed: 0,datetime,sensorTemp,skyTemp,clarity,light,rain
0,2020-02-01 00:00:05,3.5,-11.4,14.9,0.0,6.5
1,2020-02-01 00:01:05,3.4,-12.6,16.0,0.0,5.7
2,2020-02-01 00:02:05,3.4,-11.4,14.7,0.0,5.5
3,2020-02-01 00:03:06,3.4,-12.0,15.3,0.0,5.0
4,2020-02-01 00:04:00,3.4,-11.5,14.9,0.0,4.7
...,...,...,...,...,...,...
40315,2023-02-28 23:55:02,8.1,-5.8,13.8,0.0,0.0
40316,2023-02-28 23:56:03,8.1,-5.3,13.4,0.0,-0.1
40317,2023-02-28 23:57:04,8.1,-5.0,13.1,0.0,0.0
40318,2023-02-28 23:58:04,8.1,-4.2,12.3,0.0,0.0


#### Loading in the Fabry-Perot data

In [189]:
from datetime import datetime, timedelta

# Combine a row's decimal-hour 'time' with its file 'date' into one real timestamp.
# Each .dat file covers more than one calendar day but is labelled with a single date,
# so decimal hours of 24 or more belong to the following day(s).
def adjust_date(row):
    """Replace ``row['date']`` with the full observation timestamp.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row passed by ``DataFrame.apply(axis=1)``)
        Must provide ``'time'`` (decimal hours counted from midnight of the file
        date, as a number or numeric string) and ``'date'`` (an object with
        ``year``, ``month`` and ``day`` attributes).

    Returns
    -------
    The same ``row`` object, modified in place: ``'date'`` becomes a
    ``datetime`` at one-second resolution and ``'time'`` becomes the matching
    ``"HH:MM:SS"`` clock string.
    """
    # The values are parsed from text, so cast to float before doing arithmetic
    decimal_hours = float(row['time'])

    # Convert decimal hours to whole seconds
    seconds = round(decimal_hours * 3600)

    # Add the offset to midnight of the file date and let datetime arithmetic carry
    # any overflow into the following day(s). This is also correct for exactly
    # 24.0 hours and for values that round up to a full day (e.g. 23.99999),
    # which a plain "> 24" check leaves on the wrong date.
    file_date = row['date']
    midnight = datetime(file_date.year, file_date.month, file_date.day)
    datetime_combined = midnight + timedelta(seconds=seconds)

    row['time'] = datetime_combined.strftime("%H:%M:%S")

    # assign the unified datetime value to the entry in the date column
    row['date'] = datetime_combined
    return row


# Load every Fabry-Perot .dat file in a directory and concatenate them into one DataFrame
def fp_file_loader(directory, header_lines=13):
    """Read all Fabry-Perot ``.dat`` files in ``directory`` into a single DataFrame.

    The observation date is taken from each filename: characters 1-2 are the
    two-digit year (20xx), character 3 is the month as a single hexadecimal
    digit, and characters 4-5 are the day. The first data column holds decimal
    hours counted from midnight of that date; values of 24 or more roll over
    into the following day.

    Parameters
    ----------
    directory : str
        Folder containing the ``.dat`` files. Other files are ignored.
    header_lines : int, default 13
        Number of leading non-tabular lines to skip in each file.

    Returns
    -------
    pandas.DataFrame
        Columns ``mirror`` (int), ``intensity``, ``intensityError``,
        ``windSpeed``, ``windSpeedError``, ``temp``, ``tempError``,
        ``chiSquared``, ``sigToNoiseRatio`` (float) and ``date`` (full
        timestamp), sorted by ``date`` ascending with a fresh 0..n-1 index.

    Raises
    ------
    FileNotFoundError
        If ``directory`` contains no ``.dat`` files.
    ValueError
        If none of the ``.dat`` files contains any data rows.
    """
    # Filter only .dat files; sorted so the read order is reproducible
    dat_files = sorted(name for name in os.listdir(directory) if name.endswith('.dat'))
    if not dat_files:
        raise FileNotFoundError(f"No .dat files found in {directory!r}")

    column_names = {0: "time", 1: "mirror", 2: "intensity", 3: "intensityError",
                    4: "windSpeed", 5: "windSpeedError", 6: "temp", 7: "tempError",
                    8: "chiSquared", 9: "sigToNoiseRatio", 10: "peak"}

    # Collect the per-file frames and concatenate once at the end; calling
    # pd.concat inside the loop re-copies everything read so far on every pass.
    frames = []
    for filename in dat_files:
        # extract the date from the .dat filename (month is a hex digit, e.g. 'C' = 12)
        date_string = '20' + filename[1:3] + '-' + str(int(filename[3], 16)) + '-' + filename[4:6]
        date_object = datetime.strptime(date_string, "%Y-%m-%d")

        # The lines above the table are free-form text, so skip them
        # (open a raw FP .dat file to see the layout)
        with open(os.path.join(directory, filename), 'r') as handle:
            lines = handle.readlines()[header_lines:]

        # Split on whitespace; blank lines carry no data and are skipped
        rows = [line.split() for line in lines if line.strip()]
        if not rows:
            continue

        df = pd.DataFrame(rows).rename(columns=column_names)

        # Build the full timestamp: file date + decimal hours rounded to whole
        # seconds. Hours >= 24 carry over into the next day automatically.
        seconds = (df['time'].astype(float) * 3600).round()
        df['date'] = pd.Timestamp(date_object) + pd.to_timedelta(seconds, unit='s')

        frames.append(df)

    if not frames:
        raise ValueError(f"No data rows found in the .dat files under {directory!r}")

    dataframe = pd.concat(frames)

    # 'peak' is irrelevant and 'time' is now folded into the 'date' timestamp
    dataframe = dataframe.drop(['peak', 'time'], axis=1)

    # the values were read as text, so set the correct datatypes
    dataframe = dataframe.astype({
        'mirror': int,
        'intensity': float,
        'intensityError': float,
        'windSpeed': float,
        'windSpeedError': float,
        'temp': float,
        'tempError': float,
        'chiSquared': float,
        'sigToNoiseRatio': float,
    })

    # sort the dataframe by date (ascending) and reset the index for a clean dataset
    dataframe = dataframe.sort_values(by='date').reset_index(drop=True)
    return dataframe

In [190]:
# Load every Fabry-Perot .dat file found directly in the Sodankyla directory and display it.
# NOTE(review): the Kiruna load reads from a 'Kiruna_Red/' subfolder whereas this reads the
# site root — confirm the Sodankyla root contains only the red-line .dat files.
sodankyla_red_fpi_data = fp_file_loader(sodankyla_dir)
sodankyla_red_fpi_data

Unnamed: 0,mirror,intensity,intensityError,windSpeed,windSpeedError,temp,tempError,chiSquared,sigToNoiseRatio,date
0,8,266.13,29.46,438.81,82.61,536.20,174.33,0.12,58.0,2003-02-03 01:32:32
1,8,179.75,17.91,-834.91,90.01,1495.74,414.75,0.12,64.0,2003-02-03 03:02:19
2,7,805.21,99.31,227.00,73.92,500.46,153.06,0.09,42.0,2003-02-03 03:16:02
3,3,876.37,120.74,299.93,76.72,401.64,135.78,0.09,33.0,2003-02-03 03:17:18
4,1,164.10,20.90,895.31,78.07,582.49,185.62,0.08,31.0,2003-02-03 05:15:44
...,...,...,...,...,...,...,...,...,...,...
73092,4,11350.52,140.89,2.41,9.34,2133.92,77.00,0.12,6100.0,2023-02-14 05:07:32
73093,14,12052.06,142.05,25.17,8.96,2094.00,71.97,0.14,6595.0,2023-02-14 05:08:40
73094,1,13253.51,150.48,37.98,8.38,2200.38,72.17,0.19,7616.0,2023-02-14 05:09:47
73095,8,13009.21,72.20,-124.14,4.54,1606.46,26.12,0.58,23378.0,2023-02-14 05:10:16


In [142]:
# Load every Fabry-Perot .dat file from the Kiruna red-line subfolder and display it.
# NOTE(review): the saved output of this cell still shows a 'time' column, date-only values
# and an unsorted index, none of which fp_file_loader produces any more; the cell was last
# executed (142) before the current definition (189). Re-run it so the output is current.
kiruna_red_fpi_data = fp_file_loader(kiruna_dir + 'Kiruna_Red/')
kiruna_red_fpi_data

Unnamed: 0,time,mirror,intensity,intensityError,windSpeed,windSpeedError,temp,tempError,chiSquared,sigToNoiseRatio,date
604,5.1631,1,2142.78,16.95,-9.16,7.10,1328.28,27.19,0.09,9395.0,2003-02-01
778,15.6997,2,3549.01,25.45,-14.82,6.54,1379.60,25.70,0.07,11564.0,2003-02-01
779,15.7081,8,2430.23,31.92,-75.93,6.93,421.86,13.42,3.16,4741.0,2003-02-01
780,15.7164,12,3852.96,28.55,23.25,6.80,1401.83,27.03,0.08,10892.0,2003-02-01
781,15.7242,3,4015.64,22.38,18.07,5.15,1426.75,20.73,0.12,19292.0,2003-02-01
...,...,...,...,...,...,...,...,...,...,...,...
768,25.4539,1,2370.73,10.54,40.87,4.04,1429.53,16.93,0.13,31017.0,2023-03-01
767,25.4422,4,4558.50,17.31,3.75,3.42,1391.43,14.02,0.07,42334.0,2023-03-01
766,25.4319,7,3178.26,12.21,53.23,3.48,1416.64,14.48,0.10,41518.0,2023-03-01
764,25.4122,12,2722.06,10.84,160.97,3.64,1445.41,15.35,0.11,38792.0,2023-03-01
