This notebook gathers all the seismic data processed through BasicPreprocessing-Batch1.py - BasicPreprocessing-Batch12.py, to create one coherent data set which occupies minimum memory.

BasicPreprocessing-Batch1.py - BasicPreprocessing-Batch12.py created all_data_batch1.pickle - all_data_batch12.pickle and contain sea data and RMS data for all the Italian stations, with four months' worth of data stored in each file for all the stattions. The sea data is incorrect, and as such can be ignored. The error was rectified and the data was processed in a separate script and saved in separate files.

# Import Libraries

In [1]:
import pickle
import pandas as pd
import os
from glob import glob

# Path Definitions

In [2]:
#Define paths to data and find the files

data_path = "Processed Data"
file_pattern = "all_data_batch*.pickle"

file_list = sorted(glob(os.path.join(data_path, file_pattern)))

In [3]:
combined_data = []

# Process Files

In [4]:
#Process each pickle file

def process_pickle(file_path):
    with open(file_path, 'rb') as file:
        data = pickle.load(file)

    #Data is a dictionary with MultiIndex DataFrames (hierarchical structure of heading)
    processed_frames = {}
    df = data.copy()
    #Remove timezones from datetime to facilitate processing
    df = df.apply(lambda col: col.dt.tz_localize(None) if pd.api.types.is_datetime64tz_dtype(col) else col)

    #Reset index to facilitate merging and have the datecolumn as the index
    df.reset_index(inplace=True)
    
    #Store the data
    processed_frames = {'data': df}
    
    return processed_frames

In [5]:
#Process files and combine data

def combine_pickle_files(file_list):
    combined_data = []
    for file in file_list:
        print(f"Processing: {file}")
        file_data = process_pickle(file)
        combined_data.append(file_data)
    
    #Merge the data across files. for each key, extract the corresponding data and use pd.concat to merge. reindex the result.
    final_combined_data = {}
    for key in combined_data[0].keys():
        final_combined_data[key] = pd.concat([data[key] for data in combined_data], ignore_index=True)
    
    return final_combined_data

In [6]:
final_data = combine_pickle_files(file_list)

Processing: Processed Data\all_data_batch1.pickle


  df = df.apply(lambda col: col.dt.tz_localize(None) if pd.api.types.is_datetime64tz_dtype(col) else col)


Processing: Processed Data\all_data_batch10.pickle


  df = df.apply(lambda col: col.dt.tz_localize(None) if pd.api.types.is_datetime64tz_dtype(col) else col)


Processing: Processed Data\all_data_batch11.pickle


  df = df.apply(lambda col: col.dt.tz_localize(None) if pd.api.types.is_datetime64tz_dtype(col) else col)


Processing: Processed Data\all_data_batch12.pickle


  df = df.apply(lambda col: col.dt.tz_localize(None) if pd.api.types.is_datetime64tz_dtype(col) else col)


Processing: Processed Data\all_data_batch2.pickle


  df = df.apply(lambda col: col.dt.tz_localize(None) if pd.api.types.is_datetime64tz_dtype(col) else col)


Processing: Processed Data\all_data_batch3.pickle


  df = df.apply(lambda col: col.dt.tz_localize(None) if pd.api.types.is_datetime64tz_dtype(col) else col)


Processing: Processed Data\all_data_batch4.pickle


  df = df.apply(lambda col: col.dt.tz_localize(None) if pd.api.types.is_datetime64tz_dtype(col) else col)


Processing: Processed Data\all_data_batch5.pickle


  df = df.apply(lambda col: col.dt.tz_localize(None) if pd.api.types.is_datetime64tz_dtype(col) else col)


Processing: Processed Data\all_data_batch6.pickle


  df = df.apply(lambda col: col.dt.tz_localize(None) if pd.api.types.is_datetime64tz_dtype(col) else col)


Processing: Processed Data\all_data_batch7.pickle


  df = df.apply(lambda col: col.dt.tz_localize(None) if pd.api.types.is_datetime64tz_dtype(col) else col)


Processing: Processed Data\all_data_batch8.pickle


  df = df.apply(lambda col: col.dt.tz_localize(None) if pd.api.types.is_datetime64tz_dtype(col) else col)


Processing: Processed Data\all_data_batch9.pickle


  df = df.apply(lambda col: col.dt.tz_localize(None) if pd.api.types.is_datetime64tz_dtype(col) else col)


In [7]:
 print(final_data)

{'data': station                       index          CAVT                              \
component                                       Z                               
freq                                     0.05-0.2      0.2-0.35      0.35-0.5   
0         2018-01-01 00:00:00+00:00  8.815104e-08  3.130537e-07  3.858233e-07   
1         2018-01-01 01:00:00+00:00  9.075963e-08  2.701870e-07  3.621845e-07   
2         2018-01-01 02:00:00+00:00  8.520208e-08  2.473702e-07  3.362727e-07   
3         2018-01-01 03:00:00+00:00  9.375001e-08  2.212960e-07  2.984257e-07   
4         2018-01-01 04:00:00+00:00  8.792392e-08  2.167006e-07  2.813728e-07   
...                             ...           ...           ...           ...   
36520     2020-12-31 20:00:00+00:00  8.805159e-08  5.173107e-07  4.951484e-07   
36521     2020-12-31 21:00:00+00:00  8.122070e-08  5.236413e-07  5.062911e-07   
36522     2020-12-31 22:00:00+00:00  7.401806e-08  4.866576e-07  4.634472e-07   
36523     2020-12-3

In [8]:
#Print column headers
for key, df in final_data.items():
    print(f"Columns for key '{key}':")
    for col in df.columns.to_list():
        print(col)

Columns for key 'data':
('index', '', '')
('CAVT', 'Z', '0.05-0.2')
('CAVT', 'Z', '0.2-0.35')
('CAVT', 'Z', '0.35-0.5')
('CAVT', 'Z', '0.5-0.65')
('CAVT', 'Z', '0.65-0.8')
('CAVT', 'Z', '0.8-0.95')
('CAVT', 'Z', '0.95-1.1')
('CAVT', 'Z', '1.1-1.25')
('CAVT', 'Z', '1.25-1.4')
('CAVT', 'Z', '1.4-1.55')
('CAVT', 'Z', '1.55-1.7')
('CAVT', 'Z', '1.7-1.85')
('CAVT', 'Z', '1.85-2.0')
('MMGO', 'Z', '0.05-0.2')
('MMGO', 'Z', '0.2-0.35')
('MMGO', 'Z', '0.35-0.5')
('MMGO', 'Z', '0.5-0.65')
('MMGO', 'Z', '0.65-0.8')
('MMGO', 'Z', '0.8-0.95')
('MMGO', 'Z', '0.95-1.1')
('MMGO', 'Z', '1.1-1.25')
('MMGO', 'Z', '1.25-1.4')
('MMGO', 'Z', '1.4-1.55')
('MMGO', 'Z', '1.55-1.7')
('MMGO', 'Z', '1.7-1.85')
('MMGO', 'Z', '1.85-2.0')
('HPAC', 'Z', '0.05-0.2')
('HPAC', 'Z', '0.2-0.35')
('HPAC', 'Z', '0.35-0.5')
('HPAC', 'Z', '0.5-0.65')
('HPAC', 'Z', '0.65-0.8')
('HPAC', 'Z', '0.8-0.95')
('HPAC', 'Z', '0.95-1.1')
('HPAC', 'Z', '1.1-1.25')
('HPAC', 'Z', '1.25-1.4')
('HPAC', 'Z', '1.4-1.55')
('HPAC', 'Z', '1.55-1.

In [9]:
df.columns = ['/'.join(col) for col in df.columns]

In [10]:
print(df.columns)

Index(['index//', 'CAVT/Z/0.05-0.2', 'CAVT/Z/0.2-0.35', 'CAVT/Z/0.35-0.5',
       'CAVT/Z/0.5-0.65', 'CAVT/Z/0.65-0.8', 'CAVT/Z/0.8-0.95',
       'CAVT/Z/0.95-1.1', 'CAVT/Z/1.1-1.25', 'CAVT/Z/1.25-1.4',
       ...
       'PZIN/E/0.5-0.65', 'PZIN/E/0.65-0.8', 'PZIN/E/0.8-0.95',
       'PZIN/E/0.95-1.1', 'PZIN/E/1.1-1.25', 'PZIN/E/1.25-1.4',
       'PZIN/E/1.4-1.55', 'PZIN/E/1.55-1.7', 'PZIN/E/1.7-1.85',
       'PZIN/E/1.85-2.0'],
      dtype='object', length=547)


In [11]:
print(df)

                        index//  CAVT/Z/0.05-0.2  CAVT/Z/0.2-0.35  \
0     2018-01-01 00:00:00+00:00     8.815104e-08     3.130537e-07   
1     2018-01-01 01:00:00+00:00     9.075963e-08     2.701870e-07   
2     2018-01-01 02:00:00+00:00     8.520208e-08     2.473702e-07   
3     2018-01-01 03:00:00+00:00     9.375001e-08     2.212960e-07   
4     2018-01-01 04:00:00+00:00     8.792392e-08     2.167006e-07   
...                         ...              ...              ...   
36520 2020-12-31 20:00:00+00:00     8.805159e-08     5.173107e-07   
36521 2020-12-31 21:00:00+00:00     8.122070e-08     5.236413e-07   
36522 2020-12-31 22:00:00+00:00     7.401806e-08     4.866576e-07   
36523 2020-12-31 23:00:00+00:00     7.629750e-08     5.090513e-07   
36524 2021-01-01 00:00:00+00:00     4.380724e-08     3.421890e-07   

       CAVT/Z/0.35-0.5  CAVT/Z/0.5-0.65  CAVT/Z/0.65-0.8  CAVT/Z/0.8-0.95  \
0         3.858233e-07     1.872133e-07     8.724746e-08     5.200551e-08   
1         3.62184

# Clean Data

In [12]:
#The original scripts generate 25 data points per day, with midnight occuring twice.
#Since the scripts take over 3 weeks to run in completion, the extra data point will be removed at this point
#The second midnight will be removed, i.e. the one that comes after 2300hrs of any day

datetimecol = df.columns[0]
#print(df[datetimecol])

df['date'] = df[datetimecol].dt.date  #extract date
df['time'] = df[datetimecol].dt.time  #eextract time

rows_to_keep = []

for date in df['date'].unique():
    daily_df = df[df['date'] == date]
    mask_midnight = daily_df['time'] == pd.Timestamp("00:00:00").time() #boolean mask to identify midnights
    
    #mark rows which are midnight and second occurences as TRUE and remove them with bitwise NOT (~)
    if mask_midnight.sum() > 1: #more than one midnight
        #keep the first midnight and remove the second
        daily_df = daily_df[~(mask_midnight & daily_df.duplicated(subset=['date', 'time'], keep='first'))]

    rows_to_keep.append(daily_df)
    
df_cleaned = pd.concat(rows_to_keep)

df_cleaned.drop(columns=['date', 'time'], inplace=True) #drop the columns that were used to help extract the right dates

df_cleaned.reset_index(drop=True, inplace=True)

  df['date'] = df[datetimecol].dt.date  #extract date
  df['time'] = df[datetimecol].dt.time  #eextract time


In [13]:
print(df_cleaned)

                        index//  CAVT/Z/0.05-0.2  CAVT/Z/0.2-0.35  \
0     2018-01-01 00:00:00+00:00     8.815104e-08     3.130537e-07   
1     2018-01-01 01:00:00+00:00     9.075963e-08     2.701870e-07   
2     2018-01-01 02:00:00+00:00     8.520208e-08     2.473702e-07   
3     2018-01-01 03:00:00+00:00     9.375001e-08     2.212960e-07   
4     2018-01-01 04:00:00+00:00     8.792392e-08     2.167006e-07   
...                         ...              ...              ...   
35060 2020-12-31 19:00:00+00:00     8.597725e-08     5.494857e-07   
35061 2020-12-31 20:00:00+00:00     8.805159e-08     5.173107e-07   
35062 2020-12-31 21:00:00+00:00     8.122070e-08     5.236413e-07   
35063 2020-12-31 22:00:00+00:00     7.401806e-08     4.866576e-07   
35064 2020-12-31 23:00:00+00:00     7.629750e-08     5.090513e-07   

       CAVT/Z/0.35-0.5  CAVT/Z/0.5-0.65  CAVT/Z/0.65-0.8  CAVT/Z/0.8-0.95  \
0         3.858233e-07     1.872133e-07     8.724746e-08     5.200551e-08   
1         3.62184

In [14]:
df_cleaned = df_cleaned.sort_values(by=df_cleaned.columns[0],ascending=True) #sort chronologically
df_cleaned = df_cleaned.drop(df_cleaned.index[-1]) #drop the last row since it contains out of range data
df_cleaned.reset_index(drop=True, inplace=True) #reindex and replace idx column with new

In [15]:
print(df_cleaned)

                        index//  CAVT/Z/0.05-0.2  CAVT/Z/0.2-0.35  \
0     2018-01-01 00:00:00+00:00     8.815104e-08     3.130537e-07   
1     2018-01-01 01:00:00+00:00     9.075963e-08     2.701870e-07   
2     2018-01-01 02:00:00+00:00     8.520208e-08     2.473702e-07   
3     2018-01-01 03:00:00+00:00     9.375001e-08     2.212960e-07   
4     2018-01-01 04:00:00+00:00     8.792392e-08     2.167006e-07   
...                         ...              ...              ...   
35059 2021-12-31 19:00:00+00:00     1.602309e-08     5.449989e-08   
35060 2021-12-31 20:00:00+00:00     1.644139e-08     5.304811e-08   
35061 2021-12-31 21:00:00+00:00     1.666800e-08     5.008145e-08   
35062 2021-12-31 22:00:00+00:00     1.740354e-08     4.722499e-08   
35063 2021-12-31 23:00:00+00:00     1.748172e-08     4.882969e-08   

       CAVT/Z/0.35-0.5  CAVT/Z/0.5-0.65  CAVT/Z/0.65-0.8  CAVT/Z/0.8-0.95  \
0         3.858233e-07     1.872133e-07     8.724746e-08     5.200551e-08   
1         3.62184

# Save Data

In [16]:
#Export dataset to parquet format - suitable for large data

df_cleaned.to_parquet('Processed Data/RMS_data.parquet', engine='pyarrow', compression='snappy')

print("Data saved to Processed Data/RMS_data.parquet")

Data saved to Processed Data/RMS_data.parquet
