In [148]:
'''
socrates-scrapper
--------------------
Scrapes the SOCRATES website for upcoming satellite collision data.
https://celestrak.com/SOCRATES/

SOCRATES - Satellite Orbital Conjunction Reports Assessing Threatening Encounters in Space
SOCRATES uses Satellite Tool Kit's Conjunction Analysis Tools (STK/CAT) and the NORAD SGP4 propagator implemented in STK

Author Nicholas Miller
Date   9 Dec 2020
'''

import pandas as pd
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
from datetime import datetime
from datetime import timedelta
from os import listdir
from os.path import isfile, join
import re

def get_last_save_date(path):
    '''
    Get the date on the most recent file

    Only regular files named ``socrates_<14 digits>.csv`` or
    ``socrates_<14 digits>.csv.gz`` are considered.  The 14 digits are read
    as a ``%Y%m%d%H%M%S`` timestamp; a file whose digits do not form a valid
    timestamp is ignored rather than hiding the valid files next to it.

    Parameters:
    -----------
    path : str
        Relative file path

    Returns
    -------
    file : str
        The most recent filename ('' when no matching file exists)

    date : datetime
        Contains the most recent date (datetime.min when no matching file exists)
    '''

    # Raw string: `\.` must reach the regex engine as an escaped dot and not be
    # interpreted (and warned about) as a string escape sequence.
    pattern = re.compile(r'^socrates_([0-9]{14})\.csv(\.gz)?$')

    candidates = []
    for f in listdir(path):
        if not isfile(join(path, f)):
            continue
        match = pattern.search(f)
        if match is None:
            continue
        try:
            stamp = datetime.strptime(match[1], '%Y%m%d%H%M%S')
        except ValueError:
            # 14 digits that are not a real date/time (e.g. month 99): skip this file only.
            continue
        candidates.append((stamp, f))

    if not candidates:
        return '', datetime.min

    # Newest timestamp wins; on a tie the filename breaks it ('.csv.gz' > '.csv').
    date, file = max(candidates)
    return file, date

def scrape_socrates(num_of_records, min_hours, data_file_path, sort_list, timeout=60):
    '''
    Scrape the SOCRATES website for upcoming close flybys

    One web request is made per entry in sort_list.  The rows parsed from
    every response are tagged with the sort order and the (UTC) extract time
    and combined into a single DataFrame.  The combined data is written to
    ``socrates_<YYYYmmddHHMMSS>.csv.gz`` inside data_file_path unless a
    file newer than min_hours already exists there.

    Parameters:
    -----------
    num_of_records : int
        Number of records to request from SOCRATES

    min_hours : int
        Minimum number of hours between file saves

    data_file_path: str
        Relative file path

    sort_list : list(str)
        Each sort order to download

    timeout : float, optional
        Seconds to wait for each web request before giving up (default 60)

    Returns
    -------
    concat_df : pandas.DataFrame
        All scraped rows.  Index labels restart at 0 for every sort order.

    Raises
    ------
    requests.HTTPError
        If SOCRATES answers a request with an HTTP error status
    '''

    # Position of each wanted <td> within a result <form> -> column name
    cidx_map = {1: 'sat1_norad', 2: 'sat1_name', 3: 'sat1_days_epoch', 4: 'max_prob', 5: 'dil_thr_km', 6: 'min_rng_km',
                7: 'rel_velo_kms', 8: 'sat2_norad', 9: 'sat2_name', 10: 'sat2_days_epoch', 11: 'start_time',
                12: 'tca_time', 13: 'stop_time'}

    base_url = 'https://celestrak.com/SOCRATES/search-results.php'

    # Save the datetime this was scraped
    extract_date = datetime.utcnow()
    frames = []

    for sort in sort_list:
        # Scrape data
        print(f'Making {sort} web request...')
        # Let requests build and encode the query string instead of concatenating it by hand.
        params = {'IDENT': 'NAME', 'NAME_TEXT1': '', 'NAME_TEXT2': '', 'CATNR_TEXT1': '', 'CATNR_TEXT2': '',
                  'ORDER': sort, 'MAX': num_of_records, 'B1': 'Submit'}
        response = requests.get(base_url, params=params, timeout=timeout)
        # Fail here with a clear HTTP error rather than later while parsing an error page.
        response.raise_for_status()
        print('Request complete.  Begin Parsing...')

        # Parse Data
        soup = BeautifulSoup(response.text, "html.parser")
        # The results are taken from the fourth <table> on the page; each record is a <form> in it.
        table = soup.find_all('table')[3]
        rows = []

        for record in table.find_all('form'):
            row = {}
            for idx, cell in enumerate(record.find_all('td')):
                if idx in cidx_map:
                    row[cidx_map[idx]] = cell.text
            rows.append(row)
        print('Parsing complete.')

        # Convert the data into a Pandas Dataframe
        df = pd.DataFrame(rows)
        df['extract_sort'] = sort
        df['extract_date'] = extract_date
        frames.append(df)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead of
    # growing a frame inside the loop.  Index labels are kept as they were.
    concat_df = pd.concat(frames) if frames else pd.DataFrame()

    # Save the file if none newer than the min_hours exists
    recent_file, recent_date = get_last_save_date(data_file_path)
    time_dif = extract_date - recent_date
    if time_dif > timedelta(hours=min_hours):
        filename = 'socrates_' + extract_date.strftime('%Y%m%d%H%M%S') + '.csv.gz'
        # join() keeps the target inside data_file_path even without a trailing slash.
        concat_df.to_csv(join(data_file_path, filename), index=False)
        print(f'Saving of file \'{filename}\' complete.  Please be sure to commit new file!')
    else:
        print(f'Not saving file since a file was created {time_dif} ago: {recent_file}')

    return concat_df
    

# Run configuration
# -----------------
num_of_records = 25                        # records requested per sort order
min_hours = 6                              # minimum age (hours) of the newest file before saving again
data_file_path = '../data/socrates/'       # folder holding the socrates_* extract files
sort = ['MAXPROB', 'MINRANGE', 'TIMEIN']   # one web request is made per sort order

df = scrape_socrates(
    num_of_records=num_of_records,
    min_hours=min_hours,
    data_file_path=data_file_path,
    sort_list=sort,
)
df.head()

Making MAXPROB web request...
Request complete.  Begin Parsing...
Parsing complete.
Making MINRANGE web request...
Request complete.  Begin Parsing...
Parsing complete.
Making TIMEIN web request...
Request complete.  Begin Parsing...
Parsing complete.
Not saving file since a file was created 2:25:45.825658 ago: socrates_20201209042013.csv.gz


Unnamed: 0,sat1_norad,sat1_name,sat1_days_epoch,max_prob,dil_thr_km,min_rng_km,rel_velo_kms,sat2_norad,sat2_name,sat2_days_epoch,start_time,tca_time,stop_time,extract_sort,extract_date
0,44421,COSMOS 2535 [+],0.842,0.04708,0.002,0.009,0.0,44424,COSMOS 2536 [+],0.842,2020 Dec 09 00:00:00.000,2020 Dec 09 00:03:06.880,2020 Dec 16 00:00:00.000,MAXPROB,2020-12-09 06:45:58.825658
1,14452,METEOR 2-10 [?],3.946,0.005832,0.038,0.055,14.803,41302,NOAA 16 DEB [-],3.78,2020 Dec 12 03:23:20.510,2020 Dec 12 03:23:20.848,2020 Dec 12 03:23:21.186,MAXPROB,2020-12-09 06:45:58.825658
2,26113,IMAGE [P],9.65,0.00385,2.112,4.014,16.806,45381,STARLINK-1258 [+],7.375,2020 Dec 15 20:01:51.336,2020 Dec 15 20:01:51.513,2020 Dec 15 20:01:51.690,MAXPROB,2020-12-09 06:45:58.825658
3,14452,METEOR 2-10 [?],3.876,0.002003,0.066,0.093,14.804,41302,NOAA 16 DEB [-],3.71,2020 Dec 12 01:41:42.891,2020 Dec 12 01:41:43.228,2020 Dec 12 01:41:43.566,MAXPROB,2020-12-09 06:45:58.825658
4,40935,LEMUR-2-PETER [+],5.177,0.001938,0.012,0.044,8.494,39842,SL-16 DEB [-],7.27,2020 Dec 13 13:05:04.536,2020 Dec 13 13:05:05.125,2020 Dec 13 13:05:05.713,MAXPROB,2020-12-09 06:45:58.825658


In [151]:
# Load a previously saved extract for inspection.  Note that this rebinds `df`,
# replacing the frame returned by scrape_socrates in the cell above.
df = pd.read_csv('../data/socrates/socrates_20201209042013.csv.gz')
df

Unnamed: 0,sat1_norad,sat1_name,sat1_days_epoch,max_prob,dil_thr_km,min_rng_km,rel_velo_kms,sat2_norad,sat2_name,sat2_days_epoch,start_time,tca_time,stop_time,extract_sort,extract_date
0,44421,COSMOS 2535 [+],0.842,4.708000e-02,0.002,0.009,0.000,44424,COSMOS 2536 [+],0.842,2020 Dec 09 00:00:00.000,2020 Dec 09 00:03:06.880,2020 Dec 16 00:00:00.000,MAXPROB,2020-12-09 04:20:13.634745
1,14452,METEOR 2-10 [?],3.946,5.832000e-03,0.038,0.055,14.803,41302,NOAA 16 DEB [-],3.780,2020 Dec 12 03:23:20.510,2020 Dec 12 03:23:20.848,2020 Dec 12 03:23:21.186,MAXPROB,2020-12-09 04:20:13.634745
2,26113,IMAGE [P],9.650,3.850000e-03,2.112,4.014,16.806,45381,STARLINK-1258 [+],7.375,2020 Dec 15 20:01:51.336,2020 Dec 15 20:01:51.513,2020 Dec 15 20:01:51.690,MAXPROB,2020-12-09 04:20:13.634745
3,14452,METEOR 2-10 [?],3.876,2.003000e-03,0.066,0.093,14.804,41302,NOAA 16 DEB [-],3.710,2020 Dec 12 01:41:42.891,2020 Dec 12 01:41:43.228,2020 Dec 12 01:41:43.566,MAXPROB,2020-12-09 04:20:13.634745
4,40935,LEMUR-2-PETER [+],5.177,1.938000e-03,0.012,0.044,8.494,39842,SL-16 DEB [-],7.270,2020 Dec 13 13:05:04.536,2020 Dec 13 13:05:05.125,2020 Dec 13 13:05:05.713,MAXPROB,2020-12-09 04:20:13.634745
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,14819,COSMOS 1544 [?],0.933,1.092000e-07,2.576,3.699,15.213,28429,PSLV DEB [-],1.200,2020 Dec 09 04:32:17.243,2020 Dec 09 04:32:17.464,2020 Dec 09 04:32:17.685,TIMEIN,2020-12-09 04:20:13.634745
2996,26931,PCSAT (NO-44) [P],1.009,2.917000e-06,1.013,4.127,4.445,30310,FENGYUN 1C DEB [-],1.100,2020 Dec 09 04:33:04.542,2020 Dec 09 04:33:05.177,2020 Dec 09 04:33:05.813,TIMEIN,2020-12-09 04:20:13.634745
2997,45759,STARLINK-1493 [+],1.105,2.549000e-07,1.085,3.761,9.276,46054,STARLINK-1548 [+],0.720,2020 Dec 09 04:33:25.762,2020 Dec 09 04:33:26.117,2020 Dec 09 04:33:26.472,TIMEIN,2020-12-09 04:20:13.634745
2998,28895,CUBESAT XI-V [+],1.288,1.365000e-07,2.082,3.661,14.533,39629,CZ-2C DEB [-],1.030,2020 Dec 09 04:33:26.704,2020 Dec 09 04:33:26.938,2020 Dec 09 04:33:27.172,TIMEIN,2020-12-09 04:20:13.634745
