In [75]:
import sys
import os
from astropy.io import ascii
from astropy.table import unique, Table
import re

In [76]:
# Root directory of the ACE data files that later cells read (ace.archive, longterm/ace_data.txt).
ACE_DATA = "/data/mta4/Space_Weather/ACE/Data"

In [77]:
#Doesn't function very well; not a good use case.
def remove_repeats(ifile, start = 0, formatting = None, delim_neg = True):
    """
    removes repeated rows from a data archive text file and saves the result as
    clean_<basename of ifile> in the current working directory, reusing the
    column spacing found on the first data line
    input: ifile --- data archive text file to clean
           start --- predetermined starting line of data past possible file header;
                     the first `start` lines are copied to the output unchanged
           formatting -- predetermined formatting for use in rewriting text file following astropy convention
                         (one format string per column, in column order)
           delim_neg --- If there are negative values, allow the negative sign to 
                         enter into the delimination space
    output: none returned; the cleaned file is written to the current working directory
    Notes: * the full file is read into memory.
           * rows are written in the order returned by astropy.table.unique,
             which is not necessarily the order of the input file.
           * the spacing is taken from the first data line only, which is assumed
             not to start with whitespace.
    """
    import io   #local import so that the function does not rely on an extra top-level import

    #Note that this function assumes we can read the full file into memory.
    with open(ifile) as f:
        header_lines = [f.readline() for _ in range(start)]
        first_data_line = f.readline()
    first_data_list = first_data_line.split()

    #uses the first data line to determine the custom string spacing delimination in use:
    #every run of whitespace (including the line ending) is kept as one delimiter
    delimiters = re.findall(r'\s+', first_data_line)
    #a lone data line without a trailing newline has no delimiter after its last column
    if len(delimiters) < len(first_data_list):
        delimiters.append("\n")

    #check the first data line for negative values to adjust delimiters if necessary:
    #the minus sign occupies one character of the preceding delimiter
    if delim_neg:
        for i in range(1, len(first_data_list)):
            if first_data_list[i][0] == "-":
                delimiters[i-1] = delimiters[i-1] + " "

    #uses astropy to clean the data
    data = ascii.read(ifile, data_start = start)
    data = unique(data)
    if formatting:
        for i, col in enumerate(data.colnames):
            data[col].format = formatting[i]

    #Because we need to retain our custom delimination but need to make use of format specification
    #we let astropy render the formatted rows into an in-memory buffer (instead of a fixed
    #'tmp.dat' scratch file in cwd) and then read the rows back as strings
    buffer = io.StringIO()
    ascii.write(data, buffer, format = 'no_header', comment = False)
    data_lines = [x.split() for x in buffer.getvalue().splitlines()]

    #records a clean version of the text file in cwd
    file_name = f"clean_{os.path.basename(ifile)}"
    with open(file_name, 'w') as f:
        #write the stored header
        f.writelines(header_lines)

        for line_list in data_lines:
            append_line = ''
            for j, value in enumerate(line_list):
                #a negative value takes back the one space reserved for its sign
                if delim_neg and value[0] == "-":
                    append_line = append_line[:-1]
                append_line += value + delimiters[j]
            f.write(append_line)

In [78]:
# One astropy format string per column of the 12 hour archive, in column order.
formatting = [
    "%04d", "%02d", "%02d", "%04d", "%05d", "%05d", "%01d",
    "%.2e", "%.2e", "%01d",
    "%.2e", "%.2e", "%.2e", "%.2e", "%.2e", "%.2f",
]
remove_repeats(
    '/data/mta4/Space_Weather/ACE/Data/ace_12h_archive',
    start=0,
    formatting=formatting,
)

In [79]:
# One astropy format string per column of the 7 day archive, in column order.
formatting = [
    "%04d", "%02d", "%02d", "%04d", "%05d", "%05d", "%01d",
    "%.2e", "%.2e", "%01d",
    "%.2e", "%.2e", "%.2e", "%.2e", "%.2e", "%.2f",
]
remove_repeats(
    '/data/mta4/Space_Weather/ACE/Data/ace_7day_archive',
    start=0,
    formatting=formatting,
)

In [None]:
#For the ace_12h_archive and ace_7day_archive, the archiving script assumes that there 
#are no repeated lines in the text files. Once the archive files are cleaned, the script functions normally.
#Changes to these secondary archive functions to include a check for repeated lines can be handled in a subsequent PR.

In [80]:
#make use of header and new formatting functionality for the ace.archive file as well.
#From the demonstration below, the ace.archive file does not have repeated entries,
#therefore cleaning the file is not required, but it is possible nonetheless
#formatting = ["%04d", "%02d", "%02d", "%04d", "%05d", "%5d", "%01d", "%.2e", "%.2e", "%01d", "%.2e", "%.2e", "%.2e", "%.2e", "%.2e", "%.2f", "%.2e", "%.2e"]
#remove_repeats("/data/mta4/Space_Weather/ACE/Data/ace.archive", 17, formatting)

In [81]:
# Parse the main ace.archive file into an astropy Table, using astropy's default reader settings.
archive_table = ascii.read(f"{ACE_DATA}/ace.archive")

In [82]:
# Display the parsed table as read from the file.
archive_table

col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14,col15,col16,col17,col18
int64,int64,int64,int64,int64,int64,int64,float64,float64,int64,float64,float64,float64,float64,float64,float64,float64,float64
2024,2,16,1745,60356,63900,-1,-100000.0,-100000.0,-1,-100000.0,-100000.0,-100000.0,-100000.0,-100000.0,-1.0,360.0,131000000.0
2024,2,16,1740,60356,63600,0,14600.0,489.0,0,7170.0,360.0,58.1,24.3,8.74,-1.0,360.0,131000000.0
2024,2,16,1735,60356,63300,0,14500.0,472.0,0,7070.0,347.0,56.9,22.5,8.83,-1.0,347.0,131000000.0
2024,2,16,1730,60356,63000,0,13400.0,459.0,0,6690.0,338.0,56.9,23.4,9.12,-1.0,338.0,131000000.0
2024,2,16,1725,60356,62700,0,13600.0,471.0,0,6850.0,355.0,55.4,22.9,8.58,-1.0,355.0,131000000.0
2024,2,16,1720,60356,62400,0,14000.0,450.0,0,6970.0,368.0,59.2,23.6,8.02,-1.0,368.0,131000000.0
2024,2,16,1715,60356,62100,0,13600.0,444.0,0,6910.0,362.0,58.1,22.6,8.22,-1.0,362.0,131000000.0
2024,2,16,1710,60356,61800,0,13100.0,451.0,0,6770.0,361.0,57.4,22.8,8.95,-1.0,361.0,131000000.0
2024,2,16,1705,60356,61500,0,12700.0,464.0,0,7040.0,345.0,57.3,23.1,9.12,-1.0,345.0,130000000.0
2024,2,16,1700,60356,61200,0,13300.0,459.0,0,6990.0,348.0,54.6,22.7,8.54,-1.0,348.0,130000000.0


In [83]:
# Same table reduced to its unique rows; compared with archive_table to look for repeated entries.
unique_archive_table = unique(archive_table)

In [84]:
# Display the unique-row table for comparison with the table shown above.
unique_archive_table

col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14,col15,col16,col17,col18
int64,int64,int64,int64,int64,int64,int64,float64,float64,int64,float64,float64,float64,float64,float64,float64,float64,float64
2020,3,9,1535,58917,56100,0,950.0,31.9,0,3200.0,20.9,2.45,0.89,0.0875,-1.0,20.9,5540000.0
2020,3,9,1540,58917,56400,0,954.0,31.8,0,3180.0,19.8,2.63,0.742,0.212,-1.0,19.8,5550000.0
2020,3,9,1545,58917,56700,0,923.0,33.2,0,3040.0,22.4,2.5,0.696,0.139,-1.0,22.4,5550000.0
2020,3,9,1550,58917,57000,0,1170.0,31.4,0,3260.0,18.8,2.74,0.545,0.213,-1.0,18.8,5560000.0
2020,3,9,1555,58917,57300,0,968.0,34.8,0,3260.0,20.7,2.74,0.822,0.117,-1.0,20.7,5560000.0
2020,3,9,1600,58917,57600,0,1010.0,26.4,0,3040.0,20.2,2.86,0.515,0.21,-1.0,20.2,5570000.0
2020,3,9,1605,58917,57900,0,1080.0,32.3,0,3250.0,23.7,2.6,0.756,0.238,-1.0,23.7,5580000.0
2020,3,9,1610,58917,58200,0,921.0,32.2,0,3200.0,19.0,2.54,0.463,0.223,-1.0,19.0,5580000.0
2020,3,9,1615,58917,58500,0,916.0,36.7,0,3200.0,23.1,2.35,0.708,0.226,-1.0,23.1,5590000.0
2020,3,9,1620,58917,58800,0,944.0,32.3,0,3080.0,22.0,2.13,0.598,0.186,-1.0,22.0,5600000.0


In [85]:
#Since the number of data entries contained in the ace.archive file is equal to the number
#in the unique astropy table version, we know that there are no repeated data entries.

In [31]:
#Cleaning the longterm/ace_data.txt file line by line,
#since reading the full file into memory isn't possible/ideal
import subprocess
import Chandra.Time
import time

In [19]:
# Path of an ace_data.txt copy located in the current working directory.
filename = f"{os.getcwd()}/ace_data.txt"

In [24]:
#Find the last time entry more efficiently than reading the full file.
#The command is passed as an argument list (no shell), so the path is never
#re-parsed by a shell and the cell does not depend on /bin/csh being installed.
cmd = ["tail", "-n", "1", filename]
output = subprocess.check_output(cmd)

In [25]:
# check_output returns bytes; decode to text to inspect the last line.
output.decode()

'2024 02 16  1600   60356   57600  0  1.60e+04  5.05e+02  0  7.20e+03  4.17e+02  6.68e+01  2.75e+01  9.19e+00  -1.00\n'

In [34]:
# Last line of the long-term archive, read with `tail` so the file is not loaded into memory.
# The command is an argument list (no shell, no dependency on /bin/csh).
dfile = ACE_DATA + '/longterm/ace_data.txt'
last_line = subprocess.check_output(["tail", "-n", "1", dfile]).decode()
# Raw-string pattern avoids the invalid-escape warning; the trailing newline
# still produces a final empty element, as before.
atemp = re.split(r'\s+', last_line)

In [35]:
# Inspect the whitespace-separated fields of the last archived line.
atemp

['2024',
 '02',
 '16',
 '1605',
 '60356',
 '57900',
 '0',
 '1.58e+04',
 '5.20e+02',
 '0',
 '7.01e+03',
 '3.93e+02',
 '6.53e+01',
 '2.58e+01',
 '8.81e+00',
 '-1.00',
 '']

In [36]:
# Build "YYYY:MM:DD:HH:MM:00" from the first four fields (the fourth holds hhmm),
# convert it to year:day-of-year form, then to integer seconds via Chandra.Time.
hhmm = atemp[3]
ltime = f"{atemp[0]}:{atemp[1]}:{atemp[2]}:{hhmm[0]}{hhmm[1]}:{hhmm[2]}{hhmm[3]}:00"
parsed = time.strptime(ltime, '%Y:%m:%d:%H:%M:%S')
ltime = time.strftime('%Y:%j:%H:%M:%S', parsed)
stime = int(Chandra.Time.DateTime(ltime).secs)

In [37]:
# Time of the last archived entry in integer seconds; used as the cutoff in the loops below.
stime

824486769

In [39]:
#Append the git branch's Scripts directory to the module search path so that
#update_ace_data_files can be imported in the next cell.
#NOTE(review): this is a hard-coded, user-specific absolute path -- adjust when running elsewhere.
sys.path.append("/data/mta4/waaron/git/Space_Weather_New/ACE/Scripts")

In [40]:
import update_ace_data_files as upd

In [48]:
# The reader returns a pair; the first item is the column-wise data used below.
current_ace_data, chead = upd.read_current_ace_data()

In [49]:
# Alias under the name used by the append loops below.
ndata = current_ace_data

In [50]:
# Number of entries, taken from ndata[0] (the values compared against stime below).
dlen  = len(ndata[0])

In [51]:
# Show how many entries the current data set holds.
dlen

24

In [52]:
# Show the per-entry times; compare with stime to see which entries are newer than the archive.
ndata[0]

[824481069,
 824481369,
 824481669,
 824481969,
 824482269,
 824482569,
 824482869,
 824483169,
 824483469,
 824483769,
 824484069,
 824484369,
 824484669,
 824484969,
 824485269,
 824485569,
 824485869,
 824486169,
 824486469,
 824486769,
 824487069,
 824487369,
 824487669,
 824487969]

In [54]:
#Current method of appending to the longterm archive
#NOTE: this cell intentionally reproduces the indentation problem being demonstrated.
#The formatting statements sit at the same level as the `if ndata[0][m] > stime` test
#rather than inside it, so they also run for entries that are NOT newer than stime.
#The `continue` only skips entries that are newer than stime and fail the good-data test.
line = ''
for m in range(0, dlen):
        if ndata[0][m] > stime:
#
#--- record only good data
#
            if ndata[2][m] != 0 or ndata[5][m] != 0:
                continue

        # executed for every entry that reaches this point, regardless of its time
        line = line + ndata[1][m]
        line = line + '%3d'   % ndata[2][m]
        line = line + upd.line_adjust(ndata[3][m])
        line = line + upd.line_adjust(ndata[4][m])
        line = line + '%3d'   % ndata[5][m]
        line = line + upd.line_adjust(ndata[6][m])
        line = line + upd.line_adjust(ndata[7][m])
        line = line + upd.line_adjust(ndata[8][m])
        line = line + upd.line_adjust(ndata[9][m])
        line = line + upd.line_adjust(ndata[10][m])
        line = line + '%7.2f' % ndata[11][m]
        line = line + '\n'

In [57]:
# Compare the last archived line with the text the loop above would append.
print(f"last line entry of the file: {last_line}")
print(line)

last line entry of the file: 2024 02 16  1605   60356   57900  0  1.58e+04  5.20e+02  0  7.01e+03  3.93e+02  6.53e+01  2.58e+01  8.81e+00  -1.00

2024 02 16  1430   60356   52200  02024 02 16  1430   60356   52200  0  1.63e+04  5.37e+02  0  7.18e+03  4.06e+02  6.78e+01  2.72e+01  9.57e+00  -1.00
2024 02 16  1435   60356   52500  0  1.63e+04  5.27e+02  0  7.16e+03  4.02e+02  6.47e+01  2.76e+01  9.46e+00  -1.00
2024 02 16  1440   60356   52800  0  1.64e+04  5.31e+02  0  7.07e+03  3.93e+02  6.57e+01  2.76e+01  9.07e+00  -1.00
2024 02 16  1445   60356   53100  0  1.64e+04  5.06e+02  0  7.35e+03  4.03e+02  6.84e+01  2.72e+01  9.26e+00  -1.00
2024 02 16  1450   60356   53400  0  1.66e+04  5.28e+02  0  7.14e+03  4.25e+02  7.02e+01  2.73e+01  9.64e+00  -1.00
2024 02 16  1455   60356   53700  0  1.58e+04  5.01e+02  0  6.80e+03  4.09e+02  6.92e+01  2.82e+01  9.13e+00  -1.00
2024 02 16  1500   60356   54000  0  1.63e+04  5.05e+02  0  7.25e+03  4.18e+02  6.76e+01  2.88e+01  9.79e+00  -1.00
2024 02

In [58]:
#From the above data formatting, you can see that the text lines intended to be appended to the longterm file
#will be appended regardless of whether or not the time of the data entry is later than the last time entry,
#as determined by stime.

#By adjusting the indentation, we ensure that the data lines are only written if the time entry is past stime

In [59]:
# Build the text to append: one formatted row per entry that is newer than stime
# and passes the good-data test.
line = ''
for m in range(dlen):
    # entries that are not newer than the last archived time are left out
    if not ndata[0][m] > stime:
        continue
    #
    #--- record only good data (ndata[2] and ndata[5] must both be zero)
    #
    if ndata[2][m] != 0 or ndata[5][m] != 0:
        continue

    fields = [
        ndata[1][m],
        '%3d' % ndata[2][m],
        upd.line_adjust(ndata[3][m]),
        upd.line_adjust(ndata[4][m]),
        '%3d' % ndata[5][m],
        upd.line_adjust(ndata[6][m]),
        upd.line_adjust(ndata[7][m]),
        upd.line_adjust(ndata[8][m]),
        upd.line_adjust(ndata[9][m]),
        upd.line_adjust(ndata[10][m]),
        '%7.2f' % ndata[11][m],
        '\n',
    ]
    line = line + ''.join(fields)

In [60]:
# Compare the last archived line with the text produced by the corrected loop.
print(f"last line entry of the file: {last_line}")
print(line)

last line entry of the file: 2024 02 16  1605   60356   57900  0  1.58e+04  5.20e+02  0  7.01e+03  3.93e+02  6.53e+01  2.58e+01  8.81e+00  -1.00

2024 02 16  1610   60356   58200  0  1.58e+04  4.96e+02  0  7.36e+03  3.90e+02  5.95e+01  2.65e+01  8.78e+00  -1.00
2024 02 16  1615   60356   58500  0  1.53e+04  4.90e+02  0  7.26e+03  4.02e+02  6.51e+01  2.63e+01  9.25e+00  -1.00
2024 02 16  1620   60356   58800  0  1.56e+04  4.73e+02  0  6.87e+03  4.05e+02  6.35e+01  2.60e+01  8.74e+00  -1.00



In [67]:
#Use of a smaller ace_data.txt example to show that the following function operates correctly.
#The example file is expected in the current working directory.
filename = f"{os.getcwd()}/ace_data.txt"

In [68]:
#Simple function to clean the longterm ace data archive. Note that since the original
#uncleaned archive file is around 1.2G, cleaning the full file down to 200M will take some time.
def longterm_clean(filename):
    """
    Stream a long-term ACE data file line by line and write a cleaned copy named
    clean_<basename of filename> in the current working directory.
    input: filename --- path of the data file to clean; each data line is expected to
                        start with the fields: year, month, day, hhmm
    output: none returned; the input file is not modified
    A line is kept only if its time is later than the time of the last line kept,
    so repeated and out-of-order entries are dropped. Blank lines are skipped.
    Note: the comparison starts from 0, so a line whose converted time is not
    greater than 0 is never written.
    """
    new_file = f"clean_{os.path.basename(filename)}"
    last_time = 0
    #both files are closed even if a line fails to parse
    with open(filename) as f, open(new_file, 'w') as g:
        for f_line in f:
            #split() ignores leading/trailing whitespace, so indented lines keep their field order
            atemp = f_line.split()
            if not atemp:
                #blank line: nothing to parse and nothing worth copying
                continue

            #defines the time of the data entry.
            hhmm = atemp[3]
            ltime = f"{atemp[0]}:{atemp[1]}:{atemp[2]}:{hhmm[0]}{hhmm[1]}:{hhmm[2]}{hhmm[3]}:00"
            ltime = time.strftime('%Y:%j:%H:%M:%S', time.strptime(ltime, '%Y:%m:%d:%H:%M:%S'))
            stime = int(Chandra.Time.DateTime(ltime).secs)

            #if the entry is past the time recorded from the previous kept entry,
            #then write it to the clean file version.
            if stime > last_time:
                last_time = stime
                g.write(f_line)

In [69]:
# Clean the small example file; the result is written as clean_ace_data.txt in the current working directory.
longterm_clean(filename)

In [72]:
# Full long-term archive under the ACE data directory.
live_file = "/data/mta4/Space_Weather/ACE/Data/longterm/ace_data.txt"

In [73]:
# Clean the full archive. The input file itself is not modified: the cleaned copy is written
# as clean_ace_data.txt in the current working directory. Because the example file has the
# same base name, this overwrites the example's cleaned output if the directory is unchanged.
longterm_clean(live_file)