## Data Processing

#### Loading in the Cloud sensor data 

In [1]:
import pandas as pd
import numpy as np
import os

#specifiying directories for the data from the two sites
sodankyla_dir = r"/Users/elliotdable/Documents/UCL/research/fabry_perot_ml/data/sodankyla_fpi/"
kiruna_dir = r"/Users/elliotdable/Documents/UCL/research/fabry_perot_ml/data/kiruna_fpi/"

In [2]:
def cloud_sensor_file_loader(directory):
    '''This function loads in all of the cloud sensor csv files in a specfied directory and concatenates them simultaneously.

    Inputs:
        directory: the directory of cloud sensor csv files
        
    Returns:
        dataframe: a concatenated pandas dataframe of all of the csv files containing the cloud data
    '''
    files = os.listdir(directory)

    # Filter only CSV files
    csv_files = [file for file in files if file.startswith('Cloud') and file.endswith('.csv')]

    # Initialize an empty DataFrame to store the concatenated data
    dataframe = pd.DataFrame()

    # Iterate over each CSV file and append it to the end of the concatenated DataFrame
    for file in csv_files:
        filepath = os.path.join(directory, file)
        df = pd.read_csv(filepath, delimiter=',')

        #having to shift the columns and reset index as the csv file wont read in properly
        #this means ulitmately dropping the sensor ID column, which is useless anyway 
        df = df.shift(axis=1)
        df = df.reset_index(drop=True)

        dataframe = pd.concat([dataframe, df])
    
    #dropping sensor ID column
    dataframe = dataframe.drop(['Sensor ID', 'HardwareID'], axis=1)
    
    #converting the unix timestamp to datetime and renaming the columns in camel-case for consistency
    dataframe['UnixTimestamp'] = pd.to_datetime(dataframe['UnixTimestamp'], unit='s')
    dataframe.rename(columns={"UnixTimestamp": "datetime", "Date": "date", "Time": "time", "SensorT":"sensorTemp", "SkyT": "skyTemp", "Clarity": "clarity", "Light":"light", "Rain":"rain"}, inplace=True)

    # dropping date and time columns as we now have a unified datetime column which is more flexible to work with 
    dataframe = dataframe.drop(['date', 'time'], axis=1)
    
    # sort the dataframe by date descedning and reset the idnex for a clean dataset
    dataframe = dataframe.sort_values(by='datetime')
    dataframe = dataframe.reset_index(drop=True)

    return dataframe

In [3]:
sodankyla_cloud_data = cloud_sensor_file_loader(sodankyla_dir)
sodankyla_cloud_data.to_csv(sodankyla_dir + "sodankyla_cloud_data.csv", index=False)
sodankyla_cloud_data

Unnamed: 0,datetime,sensorTemp,skyTemp,clarity,light,rain
0,2017-02-01 00:00:01,5.4,-45.7,51.1,0.0,0.0
1,2017-02-01 00:01:01,5.4,-45.8,51.2,0.0,-0.1
2,2017-02-01 00:02:02,5.4,-45.4,50.8,0.0,0.0
3,2017-02-01 00:03:03,5.4,-46.3,51.7,0.0,0.0
4,2017-02-01 00:04:03,5.5,-45.2,50.7,0.0,0.0
...,...,...,...,...,...,...
286020,2024-02-08 19:15:05,-3.2,-54.3,51.1,0.0,0.0
286021,2024-02-08 19:16:05,-3.2,-54.7,51.5,0.0,-0.1
286022,2024-02-08 19:17:06,-3.2,-53.2,50.0,0.0,-0.1
286023,2024-02-08 19:18:06,-3.1,-54.8,51.8,0.0,-0.1


In [4]:
kiruna_cloud_data = cloud_sensor_file_loader(kiruna_dir)
kiruna_cloud_data.to_csv(kiruna_dir + "kiruna_cloud_data.csv", index=False)
kiruna_cloud_data

Unnamed: 0,datetime,sensorTemp,skyTemp,clarity,light,rain
0,2019-02-01 00:00:03,-0.3,-4.4,4.1,0.0,0.4
1,2019-02-01 00:01:03,-0.3,-4.5,4.2,0.0,0.5
2,2019-02-01 00:02:03,-0.3,-4.4,4.1,0.0,0.4
3,2019-02-01 00:03:04,-0.3,-4.6,4.3,0.0,0.4
4,2019-02-01 00:04:04,-0.3,-4.6,4.3,0.0,0.4
...,...,...,...,...,...,...
214193,2024-02-08 19:17:00,-2.3,-10.6,8.2,0.0,0.5
214194,2024-02-08 19:18:00,-2.3,-11.1,8.8,0.0,0.7
214195,2024-02-08 19:19:01,-2.3,-10.6,8.3,0.0,0.5
214196,2024-02-08 19:20:01,-2.3,-10.6,8.3,0.0,0.7


#### Loading in the Fabry-Perot data

In [42]:
from datetime import datetime, timedelta

def adjust_date(row):
    '''This function adds one day to the date if above 24, as the date from each .dat file is over two days but only written down as one

    Inputs:
        row: the row of the dataframe for the date to be adjusted

    Returns:
        row: the row with the adjusted datetime
    '''
    new_time = float(row['time'])

    #have to set as a float becuase when used this 
    if float(row['time']) > 24:
        row['date'] += timedelta(days=1)
        new_time = float(row['time']) - 24 

    # Convert decimal hours to seconds
    seconds = round(new_time * 3600)

    # Create a timedelta object
    time_delta = timedelta(seconds=seconds)
    
    # Use a starting date to add the timedelta and extract the time
    start_date = datetime(1900, 1, 1)
    result_time = (start_date + time_delta).time()
    row['time'] = str(result_time)

    # need to create a unified datetime object with the date and time objects 
    time_object = datetime.strptime(str(result_time), "%H:%M:%S")
    datetime_combined = datetime(row['date'].year, row['date'].month, row['date'].day,
                             time_object.hour, time_object.minute, time_object.second)
    
    # assign the datetime value to the entry in the date column
    row['date'] = datetime_combined 
    return row

def fpi_file_loader(directory, site):
    '''This function loads in all of the .dat files and concatenate them simultaneously, performing some data trabformations along the way for ease of use later on

    Inputs:
        directory: the directory of fpi .dat files
        site: site of the FPI (ie Kiruna)
        
    Returns:
        dataframe: a concatenated pandas dataframe of all of the .dat files containing the fpi data
    '''
    files = os.listdir(directory)

    # Filter only CSV files
    dat_files = [file for file in files if file.endswith('.dat')]

    # Initialize an empty DataFrame to store the concatenated data
    dataframe = pd.DataFrame()

    # Iterate over each CSV file and append it to the end of the concatenated DataFrame
    for file in dat_files:
        filepath = os.path.join(directory, file)

        #extract date string from .dat filename
        date_string = '20' + str(file[1:3]) + '-' + str(int(file[3],16)) + '-' + str(file[4:6])

        # Convert the string to a datetime object
        date_object = datetime.strptime(date_string, "%Y-%m-%d")

        # Read lines from the file starting from line 14 as the data above isn't tabular it is in written form so unusable if unspecified (check a FP .dat file to see what I mean)
        with open(filepath, 'r') as file:
            lines = file.readlines()[13:]

        # Create a DataFrame from the lines and add the date column
        df = pd.DataFrame([line.strip().split() for line in lines])
        df['date'] = date_object
        df.rename(columns={0: "time", 1: "mirror", 2: "intensity", 3: "intensityError", 4:"windSpeed", 5: "windSpeedError", 6: "temp", 7:"tempError", 8:"chiSquared", 9:"sigToNoiseRatio", 10:"peak"}, inplace=True)

        # Apply the function based on the condition of if more than 24 , change the date to the net day
        df = df.apply(adjust_date, axis=1)
        df.rename(columns={'date': 'datetime'}, inplace=True)
        
        #concatenate new df to master dataframe
        dataframe = pd.concat([dataframe, df])

    dataframe['datetime'] = pd.to_datetime(dataframe['datetime'])
    dataframe['intensity'] = dataframe['intensity'].astype(float)
    dataframe['intensityError'] = dataframe['intensityError'].astype(float)
    dataframe['windSpeed'] = dataframe['windSpeed'].astype(float)
    dataframe['windSpeedError'] = dataframe['windSpeedError'].astype(float)
    dataframe['temp'] = dataframe['temp'].astype(float)
    dataframe['tempError'] = dataframe['tempError'].astype(float)
    dataframe['chiSquared'] = dataframe['chiSquared'].astype(float)
    dataframe['sigToNoiseRatio'] = dataframe['sigToNoiseRatio'].astype(float)

    # mapping the mirror codes to their look directions
    mirror_map = {'1': 'N','2': 'E','3': 'S','4': 'W','5': 'NW','6': 'NE','7': 'Zen','8': 'Cal','9': 'SW','10': 'SE','11': 'Kir A','12': 'Kir B','14': 'Sod A','15': 'Sod B', '16':'EISCAT C'}

    dataframe['lookDirection'] = dataframe['mirror'].map(mirror_map)

    # need a function for mapping azimuth angles, have only assigned angles to the 8 compass directions, the rest are assigned None values
    def angle_mapping(row, site):
        if site == 'Sodankyla':
            mapping = {
                '1': 8, '2': 98, '3': 188, '4': 278, '5': 323, '6': 53,'7': None,'8': None,'9': 233,'10': 143,'11': None,'12': None,'14': 335, '15': 220, '16': 296
            }
            return mapping[row['mirror']]
        elif site == 'Kiruna':
            # kiruna changed direction in 2009, need to account for this changed mirror map
            if row['datetime'].date() < datetime.strptime('2009-01-01', "%Y-%m-%d").date():
                mapping = {
                '1': 348,'2': 78,'3': 168,'4': 258,'5': 303,'6': 33,'7': None,'8': None,'9': 213,'10': 123,'11': 43,'12': 158,'14': None,'15': None, '16': None
            }
                return mapping[row['mirror']]
            else:
                mapping = {
                '1': 0,'2': 90,'3': 180,'4': 270,'5': 315,'6': 45,'7': None,'8': None,'9': 225,'10': 135,'11': 44,'12': 157,'14': None,'15': None, '16': 330
            }
                return mapping[row['mirror']]
        else: 
            return None
        
    dataframe['azimuthAngle'] = dataframe.apply(angle_mapping, axis=1, args=(site,))
    dataframe['windDirection'] = dataframe['azimuthAngle']

    dataframe['windSpeed'] = dataframe['windSpeed'].astype(float)
    dataframe['azimuthAngle'] = dataframe['azimuthAngle'].astype(float)

    # Identify the rows where column 'A' values are less than 0
    mask = dataframe['windSpeed'] < 0

    # Multiply the negative values in column 'A' by -1
    dataframe.loc[mask, 'windSpeed'] = dataframe.loc[mask, 'windSpeed'] * -1

    # Adjust the values in column 'C' by adding 180 and then taking modulo 360
    dataframe.loc[mask, 'windDirection'] = (dataframe.loc[mask, 'windDirection'] + 180) % 360

    # setting the correct datatypes and dropping 'peak' column as it is irrelevant
    dataframe = dataframe.drop(['peak', 'time'], axis=1)

    dataframe['lookDirection'] = dataframe['lookDirection'].astype(str)
    dataframe['mirror'] = dataframe['mirror'].astype(int)
    dataframe['azimuthAngle'] = dataframe['azimuthAngle'].astype(float)

    # sort the dataframe by date descedning and reset the idnex for a clean dataset
    dataframe = dataframe.sort_values(by='datetime')
    dataframe = dataframe.reset_index(drop=True)

    # reordering the columns
    dataframe = dataframe[['datetime', 'mirror', 'lookDirection', 'azimuthAngle', 'intensity', 'intensityError', 'windSpeed', 'windSpeedError', 'windDirection',
       'temp', 'tempError', 'chiSquared', 'sigToNoiseRatio']]
    
    return dataframe

In [43]:
# Mapping dictionary for the mirrors, easier to inderstand which direction they're looking in 
sodankyla_red_fpi_data = fpi_file_loader(sodankyla_dir, 'Sodankyla')
#sodankyla_red_fpi_data.to_csv(sodankyla_dir + 'sodankyla_red_fpi_data.csv', index=False)
sodankyla_red_fpi_data.head(5)

Unnamed: 0,datetime,mirror,lookDirection,azimuthAngle,intensity,intensityError,windSpeed,windSpeedError,windDirection,temp,tempError,chiSquared,sigToNoiseRatio
0,2003-02-03 01:32:32,8,Cal,,266.13,29.46,438.81,82.61,,536.2,174.33,0.12,58.0
1,2003-02-03 03:02:19,8,Cal,,179.75,17.91,834.91,90.01,,1495.74,414.75,0.12,64.0
2,2003-02-03 03:16:02,7,Zen,,805.21,99.31,227.0,73.92,,500.46,153.06,0.09,42.0
3,2003-02-03 03:17:18,3,S,188.0,876.37,120.74,299.93,76.72,188.0,401.64,135.78,0.09,33.0
4,2003-02-03 05:15:44,1,N,8.0,164.1,20.9,895.31,78.07,8.0,582.49,185.62,0.08,31.0


In [45]:
kiruna_red_fpi_data = fpi_file_loader(kiruna_dir + 'Kiruna_Red/', 'Kiruna')
#kiruna_red_fpi_data.to_csv(kiruna_dir + 'kiruna_red_fpi_data.csv', index=False)
kiruna_red_fpi_data.head(5)

Unnamed: 0,datetime,mirror,lookDirection,azimuthAngle,intensity,intensityError,windSpeed,windSpeedError,windDirection,temp,tempError,chiSquared,sigToNoiseRatio
0,2003-02-01 00:00:30,1,N,348.0,10061.11,30.36,176.3,1.76,168.0,322.93,2.51,0.11,64516.0
1,2003-02-01 00:01:01,11,Kir A,43.0,7920.98,26.61,129.59,3.0,223.0,1317.83,11.44,0.13,52114.0
2,2003-02-01 00:01:30,2,E,78.0,6211.83,20.62,12.2,2.94,258.0,1288.63,11.03,0.15,53247.0
3,2003-02-01 00:02:01,8,Cal,,2557.5,37.31,57.17,7.79,,437.18,15.46,2.92,4510.0
4,2003-02-01 00:02:30,12,Kir B,158.0,3327.0,17.8,87.44,4.84,158.0,1351.17,18.74,0.1,20568.0


#### Geomagnetic Data

In [46]:
data = []

with open('/Users/elliotdable/Documents/UCL/research/fabry_perot_ml/data/geomagnetic_data.txt', 'r') as file:
    for line in file:
        # Custom logic to parse the line
        parsed_line = line.strip().split()  # Example: split by whitespace
        data.append(parsed_line)

# Assuming all rows have the same number of columns
columns = ['year', 'month', 'day', 'days_since_start_date', 'days_since_start_date_at_midday', 'bartell_srn', 'bartell_day_number', 'Kp_0000', 'Kp_0300', 'Kp_0600', 'Kp_0900', 'Kp_1200', 'Kp_1500', 'Kp_1800', 'Kp_2100',
           'ap_0000', 'ap_0300', 'ap_0600', 'ap_0900', 'ap_1200', 'ap_1500', 'ap_1800', 'ap_2100', 'Ap', 'sunspot_number', 'observed_F10.7', 'adjusted_F10.7', 'def_or_prelim']  # Replace with your actual column names
geomagnetic_data = pd.DataFrame(data, columns=columns)
geomagnetic_data['date'] = pd.to_datetime(geomagnetic_data[['year', 'month', 'day']]).dt.date
geomagnetic_data.head()

Unnamed: 0,year,month,day,days_since_start_date,days_since_start_date_at_midday,bartell_srn,bartell_day_number,Kp_0000,Kp_0300,Kp_0600,...,ap_1200,ap_1500,ap_1800,ap_2100,Ap,sunspot_number,observed_F10.7,adjusted_F10.7,def_or_prelim,date
0,2003,1,1,25933,25933.5,2312,23,1.0,2.333,2.667,...,7,5,6,9,7,52,115.0,111.2,2,2003-01-01
1,2003,1,2,25934,25934.5,2312,24,2.333,1.0,1.0,...,6,6,6,6,6,45,118.3,114.4,2,2003-01-02
2,2003,1,3,25935,25935.5,2312,25,2.667,0.667,2.0,...,12,48,27,32,18,110,137.6,133.1,2,2003-01-03
3,2003,1,4,25936,25936.5,2312,26,4.667,4.333,2.333,...,9,9,15,12,16,109,143.0,138.2,2,2003-01-04
4,2003,1,5,25937,25937.5,2312,27,1.667,1.667,1.667,...,7,7,7,12,7,114,148.1,143.2,2,2003-01-05


#### Merging the datasets 

Going to have to find an approximate method to match up the times in each dataset, as they are not always mergable. Will find the cloud data which is closest in time to each entry in the FPI dataset, and then merge these values for each row to make a new dataset.

We're going to say that if the datetime in the FPI data is within an hour of any data in the cloud data, then we can use it, if not then we can discard it (this can be changed depednant on if it is accurate engough ie. smaller timescale)

In [47]:
import pandas as pd
from tqdm import tqdm

def find_closest_datetime_with_values(fpi_df, cloud_df, min_range=False, hours=0.1):
    '''This function finds the row in the cloud_df, which has the closesst datetime value to the row in the fpi_df, anbd merges this row to the fpi_df

    Inputs:
       fpi_df: the fabry perot dataframe (created from the functions above)
       cloud_df: the cloud_sensor dataframe (created from the functions above)
       (Optional) min_range: whether or not the user wants to specify a minimum range for the closest datetime value, (default False)
       (Optional) hours: the time difference in hours to be used to find the nearest datetime value, only used when miN_range is true(default 0.1)

    Returns:
        dataframe: a pandas dataframe of the merged fpi and cloud data
    '''
    # Ensure 'datetime' columns are of datetime type
    fpi_df['datetime'] = pd.to_datetime(fpi_df['datetime'])
    cloud_df['datetime'] = pd.to_datetime(cloud_df['datetime'])
    
    # Sort the second dataframe by 'datetime' for efficient searching
    cloud_df = cloud_df.sort_values(by='datetime').reset_index(drop=True)

    # the columns which are going to be merged onto the fpi_df from the cloud_df
    additional_columns = cloud_df.keys()[1:].to_list()
    
    def get_closest(row):
        '''This function finds a row in the cloud_df which is closest in datetime to a row in the fpi_df, to be used to megre the two dataframes in the parent function

        Inputs:
            row: the row in the fpi_df to match with the cloud_df

        Returns:
            closest_row: the row in cloud_df with the closest datatime value to the row in the fpi_df
        '''
        dt = row['datetime']
        # Find the index of the closest datetime in cloud_df
        time_diffs = (cloud_df['datetime'] - dt).abs()
        min_time_diff = time_diffs.min()
        if min_range and min_time_diff > pd.Timedelta(hours=hours):
            # If the closest time difference is more <hours>, return None values
            return pd.Series([pd.NaT] + [None]*len(additional_columns), index=['datetime'] + additional_columns)
        idx = time_diffs.idxmin()
        # Retrieve the closest datetime and additional columns
        closest_row = cloud_df.loc[idx,  additional_columns]
        return closest_row
    
    # Initialize a DataFrame to store results
    result_df = pd.DataFrame(columns= additional_columns, index=fpi_df.index)
    
    # Iterate through each row in fpi_df with tqdm
    for index, row in tqdm(fpi_df.iterrows(), total=fpi_df.shape[0], desc="Processing rows"):
        closest_row = get_closest(row)
        result_df.loc[index] = closest_row
    
    # Concatenate the original fpi_df with the closest rows
    result_df = pd.concat([fpi_df.reset_index(drop=True), result_df.reset_index(drop=True)], axis=1)
    dropped = result_df.dropna()

    #reset the index
    dataframe = dropped.reset_index(drop=True)
    
    return dataframe

In [48]:
# specify the closeness of the datetime values to be merged in hours (change this accordingly, if the <hours> paramter is too small, it will return a small dataframe
# as it returns None values and drops them if there are no values within that specified time difference)
# these values are only used if min_range is set as true in the arguments for find_closest_datetime_values
seconds = 30
hours = seconds/3600
# Find closest date time with values
sodankyla_red_fpi_cs = find_closest_datetime_with_values(sodankyla_red_fpi_data, sodankyla_cloud_data)

Processing rows: 100%|██████████| 73097/73097 [04:49<00:00, 252.82it/s]


In [49]:
kiruna_red_fpi_cs = find_closest_datetime_with_values(kiruna_red_fpi_data, kiruna_cloud_data, hours)

Processing rows: 100%|██████████| 401249/401249 [14:11<00:00, 471.20it/s]


In [50]:
kiruna_red_fpi_cs

Unnamed: 0,datetime,mirror,lookDirection,azimuthAngle,intensity,intensityError,windSpeed,windSpeedError,windDirection,temp,tempError,chiSquared,sigToNoiseRatio,sensorTemp,skyTemp,clarity,light,rain
0,2019-02-25 16:23:56,1,N,0.0,769.36,5.33,180.13,2.97,180.0,276.20,4.61,0.39,8573.0,7.5,-7.3,14.8,0.0,-0.2
1,2023-02-01 15:41:37,12,Kir B,157.0,2231.23,16.65,78.91,6.68,157.0,1370.20,27.07,0.19,10958.0,2.8,-20.1,22.9,0.0,0.0
2,2023-02-01 15:42:13,3,S,180.0,2443.07,15.30,61.65,5.55,180.0,1335.83,22.09,0.19,15473.0,2.8,-20.1,22.9,0.0,0.0
3,2023-02-01 15:43:25,4,W,270.0,2993.80,13.26,40.32,3.92,270.0,1331.86,15.57,0.21,30896.0,2.8,-21.6,24.4,0.0,0.0
4,2023-02-01 15:44:08,1,N,0.0,2008.05,10.23,103.54,4.55,0.0,1366.46,18.42,0.13,23561.0,2.8,-22.4,25.2,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21660,2023-03-01 00:00:59,1,N,0.0,2140.86,13.39,16.70,5.72,0.0,1445.86,24.13,0.08,15710.0,8.2,-4.5,12.6,0.0,0.0
21661,2023-03-01 00:01:35,11,Kir A,44.0,2252.61,9.97,62.59,4.04,44.0,1440.19,17.00,0.16,31290.0,8.2,-4.5,12.6,0.0,0.0
21662,2023-03-01 00:03:04,2,E,90.0,2929.48,12.49,138.31,3.82,90.0,1377.23,15.57,0.12,33577.0,8.2,-4.5,12.6,0.0,0.0
21663,2023-03-01 00:04:20,12,Kir B,157.0,2534.86,9.84,141.08,3.54,157.0,1440.30,14.91,0.13,40700.0,8.2,-4.5,12.6,0.0,0.0


In [51]:
# all datetime columns need to be converted back to datetime from string as pandas cant save them in this format
sodankyla_red_fpi_cs['date'] = sodankyla_red_fpi_cs['datetime'].dt.date
sodankyla_full = pd.merge(sodankyla_red_fpi_cs, geomagnetic_data, on='date', how='left')
sodankyla_full

Unnamed: 0,datetime,mirror,lookDirection,azimuthAngle,intensity,intensityError,windSpeed,windSpeedError,windDirection,temp,...,ap_0900,ap_1200,ap_1500,ap_1800,ap_2100,Ap,sunspot_number,observed_F10.7,adjusted_F10.7,def_or_prelim
0,2003-02-03 03:17:18,3,S,188.0,876.37,120.74,299.93,76.72,188.0,401.64,...,15,18,22,27,22,22,59,132.5,128.7,2
1,2003-02-03 05:15:44,1,N,8.0,164.10,20.90,895.31,78.07,8.0,582.49,...,15,18,22,27,22,22,59,132.5,128.7,2
2,2003-02-03 05:18:14,2,E,98.0,152.51,21.06,634.69,104.13,98.0,890.95,...,15,18,22,27,22,22,59,132.5,128.7,2
3,2003-02-03 06:05:39,3,S,188.0,132.69,23.45,903.72,116.67,8.0,700.24,...,15,18,22,27,22,22,59,132.5,128.7,2
4,2003-02-03 16:57:16,3,S,188.0,7.92,0.62,607.43,114.32,188.0,535.20,...,15,18,22,27,22,22,59,132.5,128.7,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53655,2023-02-14 05:05:19,3,S,188.0,11191.33,117.69,22.08,8.18,188.0,1994.99,...,7,4,5,3,9,6,159,179.7,175.3,2
53656,2023-02-14 05:06:25,15,Sod B,220.0,11006.17,99.95,9.77,7.07,220.0,1989.67,...,7,4,5,3,9,6,159,179.7,175.3,2
53657,2023-02-14 05:07:32,4,W,278.0,11350.52,140.89,2.41,9.34,278.0,2133.92,...,7,4,5,3,9,6,159,179.7,175.3,2
53658,2023-02-14 05:08:40,14,Sod A,335.0,12052.06,142.05,25.17,8.96,335.0,2094.00,...,7,4,5,3,9,6,159,179.7,175.3,2


In [52]:
kiruna_red_fpi_cs['date'] = kiruna_red_fpi_cs['datetime'].dt.date
kiruna_full = pd.merge(kiruna_red_fpi_cs, geomagnetic_data, on='date', how='left')
kiruna_full

Unnamed: 0,datetime,mirror,lookDirection,azimuthAngle,intensity,intensityError,windSpeed,windSpeedError,windDirection,temp,...,ap_0900,ap_1200,ap_1500,ap_1800,ap_2100,Ap,sunspot_number,observed_F10.7,adjusted_F10.7,def_or_prelim
0,2019-02-25 16:23:56,1,N,0.0,769.36,5.33,180.13,2.97,180.0,276.20,...,2,0,2,0,3,1,0,70.4,68.9,2
1,2023-02-01 15:41:37,12,Kir B,157.0,2231.23,16.65,78.91,6.68,157.0,1370.20,...,2,4,5,12,9,5,72,133.5,129.6,2
2,2023-02-01 15:42:13,3,S,180.0,2443.07,15.30,61.65,5.55,180.0,1335.83,...,2,4,5,12,9,5,72,133.5,129.6,2
3,2023-02-01 15:43:25,4,W,270.0,2993.80,13.26,40.32,3.92,270.0,1331.86,...,2,4,5,12,9,5,72,133.5,129.6,2
4,2023-02-01 15:44:08,1,N,0.0,2008.05,10.23,103.54,4.55,0.0,1366.46,...,2,4,5,12,9,5,72,133.5,129.6,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21660,2023-03-01 00:00:59,1,N,0.0,2140.86,13.39,16.70,5.72,0.0,1445.86,...,12,3,5,7,7,7,93,162.0,159.0,2
21661,2023-03-01 00:01:35,11,Kir A,44.0,2252.61,9.97,62.59,4.04,44.0,1440.19,...,12,3,5,7,7,7,93,162.0,159.0,2
21662,2023-03-01 00:03:04,2,E,90.0,2929.48,12.49,138.31,3.82,90.0,1377.23,...,12,3,5,7,7,7,93,162.0,159.0,2
21663,2023-03-01 00:04:20,12,Kir B,157.0,2534.86,9.84,141.08,3.54,157.0,1440.30,...,12,3,5,7,7,7,93,162.0,159.0,2


### Getting rid of other Kp index columns

In [53]:
# work out the keys needed so that we can adapt the ap and Kp columns so that we only capture thre relevant column when we store the new value, this will be based off of th enarest time 
sodankyla_full.keys()

Index(['datetime', 'mirror', 'lookDirection', 'azimuthAngle', 'intensity',
       'intensityError', 'windSpeed', 'windSpeedError', 'windDirection',
       'temp', 'tempError', 'chiSquared', 'sigToNoiseRatio', 'sensorTemp',
       'skyTemp', 'clarity', 'light', 'rain', 'date', 'year', 'month', 'day',
       'days_since_start_date', 'days_since_start_date_at_midday',
       'bartell_srn', 'bartell_day_number', 'Kp_0000', 'Kp_0300', 'Kp_0600',
       'Kp_0900', 'Kp_1200', 'Kp_1500', 'Kp_1800', 'Kp_2100', 'ap_0000',
       'ap_0300', 'ap_0600', 'ap_0900', 'ap_1200', 'ap_1500', 'ap_1800',
       'ap_2100', 'Ap', 'sunspot_number', 'observed_F10.7', 'adjusted_F10.7',
       'def_or_prelim'],
      dtype='object')

In [54]:
def nearest_Kp_value(df):
    '''This function ibtans the time inn the datetime column, and finds the closest column from the geomagnetic data in time and makes this the Kp or ap value, as currently we have a lot of unneccessary columns

    Inputs:
        df: the dataframe we want to extarct the kp and ap indices from 

    Returns:
        df: the new dataframe 
    '''
    df['time'] = df['datetime'].dt.time

    # Calculate the minutes from midnight
    df['minutes_from_midnight'] = df['datetime'].dt.hour * 60 + df['datetime'].dt.minute

    # Define 3-hour intervals in minutes
    intervals = {
        '0000': 0,
        '0300': 180,
        '0600': 360,
        '0900': 540,
        '1200': 720,
        '1500': 900,
        '1800': 1080,
        '2100': 1260
    }

    # Calculate the absolute differences from each interval
    for key, value in intervals.items():
        df[f'diff_{key}'] = abs(df['minutes_from_midnight'] - value)

    #two functions below get closest interval and then extarct the value from that interval column, then adding this new value to a new column (either Kp or ap index)
    def get_closest_interval(row):
        min_diff = float('inf')
        closest_interval = None
        for key in intervals.keys():
            if row[f'diff_{key}'] < min_diff:
                min_diff = row[f'diff_{key}']
                closest_interval = key
        return closest_interval

    def get_closest_value(row, prefix):
        closest_interval = get_closest_interval(row)
        return row[f'{prefix}{closest_interval}']

    #apply the functions
    df['closest_interval'] = df.apply(get_closest_interval, axis=1)
    df['Kp_value'] = df.apply(get_closest_value, axis=1, args=('Kp_',))
    df['ap_value'] = df.apply(get_closest_value, axis=1, args=('ap_',))

    return df[['datetime', 'date', 'year', 'month', 'day', 'time', 'mirror', 'lookDirection', 'azimuthAngle', 'intensity',
       'intensityError', 'windSpeed', 'windSpeedError', 'temp', 'tempError',
       'chiSquared', 'sigToNoiseRatio', 'sensorTemp', 'skyTemp', 'clarity', 'light', 'rain',
       'bartell_srn', 'bartell_day_number', 'Kp_value', 'ap_value', 'Ap', 'sunspot_number', 'observed_F10.7', 'adjusted_F10.7',]]

In [55]:
sodankyla_final = nearest_Kp_value(sodankyla_full)
sodankyla_final

Unnamed: 0,datetime,date,year,month,day,time,mirror,lookDirection,azimuthAngle,intensity,...,light,rain,bartell_srn,bartell_day_number,Kp_value,ap_value,Ap,sunspot_number,observed_F10.7,adjusted_F10.7
0,2003-02-03 03:17:18,2003-02-03,2003,02,03,03:17:18,3,S,188.0,876.37,...,0.0,0.0,2314,2,3.000,15,22,59,132.5,128.7
1,2003-02-03 05:15:44,2003-02-03,2003,02,03,05:15:44,1,N,8.0,164.10,...,0.0,0.0,2314,2,2.667,12,22,59,132.5,128.7
2,2003-02-03 05:18:14,2003-02-03,2003,02,03,05:18:14,2,E,98.0,152.51,...,0.0,0.0,2314,2,2.667,12,22,59,132.5,128.7
3,2003-02-03 06:05:39,2003-02-03,2003,02,03,06:05:39,3,S,188.0,132.69,...,0.0,0.0,2314,2,2.667,12,22,59,132.5,128.7
4,2003-02-03 16:57:16,2003-02-03,2003,02,03,16:57:16,3,S,188.0,7.92,...,0.0,0.0,2314,2,4.000,27,22,59,132.5,128.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53655,2023-02-14 05:05:19,2023-02-14,2023,02,14,05:05:19,3,S,188.0,11191.33,...,0.0,-0.1,2585,1,2.333,9,6,159,179.7,175.3
53656,2023-02-14 05:06:25,2023-02-14,2023,02,14,05:06:25,15,Sod B,220.0,11006.17,...,0.0,0.0,2585,1,2.333,9,6,159,179.7,175.3
53657,2023-02-14 05:07:32,2023-02-14,2023,02,14,05:07:32,4,W,278.0,11350.52,...,0.0,-0.1,2585,1,2.333,9,6,159,179.7,175.3
53658,2023-02-14 05:08:40,2023-02-14,2023,02,14,05:08:40,14,Sod A,335.0,12052.06,...,0.0,-0.1,2585,1,2.333,9,6,159,179.7,175.3


In [56]:
kiruna_final = nearest_Kp_value(kiruna_full)
kiruna_final

Unnamed: 0,datetime,date,year,month,day,time,mirror,lookDirection,azimuthAngle,intensity,...,light,rain,bartell_srn,bartell_day_number,Kp_value,ap_value,Ap,sunspot_number,observed_F10.7,adjusted_F10.7
0,2019-02-25 16:23:56,2019-02-25,2019,02,25,16:23:56,1,N,0.0,769.36,...,0.0,-0.2,2531,9,0.333,2,1,0,70.4,68.9
1,2023-02-01 15:41:37,2023-02-01,2023,02,01,15:41:37,12,Kir B,157.0,2231.23,...,0.0,0.0,2584,15,1.333,5,5,72,133.5,129.6
2,2023-02-01 15:42:13,2023-02-01,2023,02,01,15:42:13,3,S,180.0,2443.07,...,0.0,0.0,2584,15,1.333,5,5,72,133.5,129.6
3,2023-02-01 15:43:25,2023-02-01,2023,02,01,15:43:25,4,W,270.0,2993.80,...,0.0,0.0,2584,15,1.333,5,5,72,133.5,129.6
4,2023-02-01 15:44:08,2023-02-01,2023,02,01,15:44:08,1,N,0.0,2008.05,...,0.0,0.0,2584,15,1.333,5,5,72,133.5,129.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21660,2023-03-01 00:00:59,2023-03-01,2023,03,01,00:00:59,1,N,0.0,2140.86,...,0.0,0.0,2585,16,3.000,15,7,93,162.0,159.0
21661,2023-03-01 00:01:35,2023-03-01,2023,03,01,00:01:35,11,Kir A,44.0,2252.61,...,0.0,0.0,2585,16,3.000,15,7,93,162.0,159.0
21662,2023-03-01 00:03:04,2023-03-01,2023,03,01,00:03:04,2,E,90.0,2929.48,...,0.0,0.0,2585,16,3.000,15,7,93,162.0,159.0
21663,2023-03-01 00:04:20,2023-03-01,2023,03,01,00:04:20,12,Kir B,157.0,2534.86,...,0.0,0.0,2585,16,3.000,15,7,93,162.0,159.0


In [57]:
#save to csvs
kiruna_final.to_csv(r"/Users/elliotdable/Documents/UCL/research/fabry_perot_ml/data/kiruna_fpi/kiruna_red_full_merged.csv", index=False)
sodankyla_final.to_csv(r"/Users/elliotdable/Documents/UCL/research/fabry_perot_ml/data/sodankyla_fpi/sodankyla_full_merged.csv", index=False)