## Data Processing

#### Loading in the Cloud sensor data 

In [222]:
import pandas as pd
import numpy as np
import os

#specifiying directories for the data from the two sites
sodankyla_dir = r"/Users/elliotdable/Documents/UCL/research/fabry_perot_ml/sodankyla_fpi/"
kiruna_dir = r"/Users/elliotdable/Documents/UCL/research/fabry_perot_ml/kiruna_fpi/"

In [223]:
#create a function to load in all of the cloud sesnor csv files and concatenate them simultaneously
def cloud_sensor_file_loader(directory):
    files = os.listdir(directory)

    # Filter only CSV files
    csv_files = [file for file in files if file.startswith('Cloud') and file.endswith('.csv')]

    # Initialize an empty DataFrame to store the concatenated data
    dataframe = pd.DataFrame()

    # Iterate over each CSV file and append it to the end of the concatenated DataFrame
    for file in csv_files:
        filepath = os.path.join(directory, file)
        df = pd.read_csv(filepath, delimiter=',')

        #having to shift the columns and reset index as the csv file wont read in properly
        #this means ulitmately dropping the sensor ID column, which is useless anyway 
        df = df.shift(axis=1)
        df = df.reset_index(drop=True)

        dataframe = pd.concat([dataframe, df])
    
    #dropping sensor ID column
    dataframe = dataframe.drop(['Sensor ID', 'HardwareID'], axis=1)
    
    #converting the unix timestamp to datetime and renaming the columns in camel-case for consistency
    dataframe['UnixTimestamp'] = pd.to_datetime(dataframe['UnixTimestamp'], unit='s')
    dataframe.rename(columns={"UnixTimestamp": "datetime", "Date": "date", "Time": "time", "SensorT":"sensorTemp", "SkyT": "skyTemp", "Clarity": "clarity", "Light":"light", "Rain":"rain"}, inplace=True)

    # dropping date and time columns as we no have a unified datetime column which is more flexible to work with 
    dataframe = dataframe.drop(['date', 'time'], axis=1)

    return dataframe

In [224]:
sodankyla_cloud_data = cloud_sensor_file_loader(sodankyla_dir)
sodankyla_cloud_data.to_csv(sodankyla_dir + "sodanklya_cloud_data.csv")
sodankyla_cloud_data

Unnamed: 0,datetime,sensorTemp,skyTemp,clarity,light,rain
0,2020-02-01 00:00:03,2.8,-12.9,15.6,0.0,-0.8
1,2020-02-01 00:01:04,2.8,-12.7,15.5,0.0,-0.1
2,2020-02-01 00:02:05,2.9,-14.1,17.0,0.0,-0.8
3,2020-02-01 00:03:06,3.0,-12.9,15.9,0.0,-0.5
4,2020-02-01 00:04:00,3.1,-14.7,17.8,0.0,-0.2
...,...,...,...,...,...,...
33044,2023-02-28 23:55:06,7.3,-23.6,30.9,0.0,-0.1
33045,2023-02-28 23:56:00,7.2,-22.3,29.5,0.0,-0.1
33046,2023-02-28 23:57:00,7.1,-21.3,28.4,0.0,-0.1
33047,2023-02-28 23:58:01,7.1,-20.2,27.3,0.0,-0.1


In [225]:
kiruna_cloud_data = cloud_sensor_file_loader(kiruna_dir)
kiruna_cloud_data.to_csv(kiruna_dir + "kiruna_cloud_data.csv")
kiruna_cloud_data

Unnamed: 0,datetime,sensorTemp,skyTemp,clarity,light,rain
0,2020-02-01 00:00:05,3.5,-11.4,14.9,0.0,6.5
1,2020-02-01 00:01:05,3.4,-12.6,16.0,0.0,5.7
2,2020-02-01 00:02:05,3.4,-11.4,14.7,0.0,5.5
3,2020-02-01 00:03:06,3.4,-12.0,15.3,0.0,5.0
4,2020-02-01 00:04:00,3.4,-11.5,14.9,0.0,4.7
...,...,...,...,...,...,...
40315,2023-02-28 23:55:02,8.1,-5.8,13.8,0.0,0.0
40316,2023-02-28 23:56:03,8.1,-5.3,13.4,0.0,-0.1
40317,2023-02-28 23:57:04,8.1,-5.0,13.1,0.0,0.0
40318,2023-02-28 23:58:04,8.1,-4.2,12.3,0.0,0.0


#### Loading in the Fabry-Perot data

In [226]:
from datetime import datetime, timedelta

# Function to add one day to the date if above 24, as the date from each .dat file is over two days but only written down as one
def adjust_date(row):
    new_time = float(row['time'])

    #have to set as a float becuase when used this 
    if float(row['time']) > 24:
        row['date'] += timedelta(days=1)
        new_time = float(row['time']) - 24 

    # Convert decimal hours to seconds
    seconds = round(new_time * 3600)

    # Create a timedelta object
    time_delta = timedelta(seconds=seconds)
    
    # Use a starting date to add the timedelta and extract the time
    start_date = datetime(1900, 1, 1)
    result_time = (start_date + time_delta).time()
    row['time'] = str(result_time)

    # need to create a unified datetime object with the date and time objects 
    time_object = datetime.strptime(str(result_time), "%H:%M:%S")
    datetime_combined = datetime(row['date'].year, row['date'].month, row['date'].day,
                             time_object.hour, time_object.minute, time_object.second)
    
    # assign the datetime value to the entry in the date column
    row['date'] = datetime_combined 
    return row


#create a function to load in all of the csv files and concatenate them simultaneously
def fpi_file_loader(directory):
    files = os.listdir(directory)

    # Filter only CSV files
    dat_files = [file for file in files if file.endswith('.dat')]

    # Initialize an empty DataFrame to store the concatenated data
    dataframe = pd.DataFrame()

    # Iterate over each CSV file and append it to the end of the concatenated DataFrame
    for file in dat_files:
        filepath = os.path.join(directory, file)

        #extract date string from .dat filename
        date_string = '20' + str(file[1:3]) + '-' + str(int(file[3],16)) + '-' + str(file[4:6])

        # Define the format of the date string
        date_format = "%Y-%m-%d"

        # Convert the string to a datetime object
        date_object = datetime.strptime(date_string, date_format)

        # Read lines from the file starting from line 14 as the data above isn't tabular it is in written form so unusable if unspecified (check a FP .dat file to see what I mean)
        with open(filepath, 'r') as file:
            lines = file.readlines()[13:]

        # Create a DataFrame from the lines and add the date column
        df = pd.DataFrame([line.strip().split() for line in lines])
        df['date'] = date_object
        df.rename(columns={0: "time", 1: "mirror", 2: "intensity", 3: "intensityError", 4:"windSpeed", 5: "windSpeedError", 6: "temp", 7:"tempError", 8:"chiSquared", 9:"sigToNoiseRatio", 10:"peak"}, inplace=True)

        # Apply the function based on the condition of if more than 24 , change the date to the net day
        df = df.apply(adjust_date, axis=1)
        df.rename(columns={'date': 'datetime'}, inplace=True)

        #concatenate new df to master dataframe
        dataframe = pd.concat([dataframe, df])
    
    # setting the correct datatypes and dropping 'peak' column as it is irrelevant
    dataframe = dataframe.drop(['peak', 'time'], axis=1)
    
    dataframe['mirror'] = dataframe['mirror'].astype(int)
    dataframe['intensity'] = dataframe['intensity'].astype(float)
    dataframe['intensityError'] = dataframe['intensityError'].astype(float)
    dataframe['windSpeed'] = dataframe['windSpeed'].astype(float)
    dataframe['windSpeedError'] = dataframe['windSpeedError'].astype(float)
    dataframe['temp'] = dataframe['temp'].astype(float)
    dataframe['tempError'] = dataframe['tempError'].astype(float)
    dataframe['chiSquared'] = dataframe['chiSquared'].astype(float)
    dataframe['sigToNoiseRatio'] = dataframe['sigToNoiseRatio'].astype(float)

    # sort the dataframe by date descedning and reset the idnex for a clean dataset
    dataframe = dataframe.sort_values(by='datetime')
    dataframe = dataframe.reset_index(drop=True)
    return dataframe

In [227]:
sodankyla_red_fpi_data = fpi_file_loader(sodankyla_dir)
sodankyla_red_fpi_data.to_csv(sodankyla_dir + 'sodanklya_red_fpi_data.csv', index=False)
sodankyla_red_fpi_data

In [None]:
kiruna_red_fpi_data = fpi_file_loader(kiruna_dir + 'Kiruna_Red/')
kiruna_red_fpi_data.to_csv(kiruna_dir + 'Kiruna_Red/' + 'kiruna_red_fpi_data.csv', index=False)
kiruna_red_fpi_data

Unnamed: 0,mirror,intensity,intensityError,windSpeed,windSpeedError,temp,tempError,chiSquared,sigToNoiseRatio,date
0,1,10061.11,30.36,-176.30,1.76,322.93,2.51,0.11,64516.0,2003-02-01 00:00:30
1,11,7920.98,26.61,-129.59,3.00,1317.83,11.44,0.13,52114.0,2003-02-01 00:01:01
2,2,6211.83,20.62,-12.20,2.94,1288.63,11.03,0.15,53247.0,2003-02-01 00:01:30
3,8,2557.50,37.31,-57.17,7.79,437.18,15.46,2.92,4510.0,2003-02-01 00:02:01
4,12,3327.00,17.80,87.44,4.84,1351.17,18.74,0.10,20568.0,2003-02-01 00:02:30
...,...,...,...,...,...,...,...,...,...,...
401244,11,2359.47,15.68,36.63,6.17,1506.20,26.86,0.11,13974.0,2023-03-01 04:26:09
401245,2,2134.76,9.79,42.63,4.29,1534.23,18.94,0.24,29493.0,2023-03-01 04:26:45
401246,8,10003.92,20.56,3.70,1.43,768.21,3.96,0.96,133738.0,2023-03-01 04:27:24
401247,12,1737.01,10.82,49.97,5.80,1523.40,25.51,0.11,15919.0,2023-03-01 04:28:02
