-
Notifications
You must be signed in to change notification settings - Fork 6
/
categorize.py
197 lines (171 loc) · 6.06 KB
/
categorize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
import os
import time
import urllib
import urllib.request  # explicit: `import urllib` alone does not guarantee the submodule
from threading import Thread

import mysql.connector
import requests
from bs4 import BeautifulSoup
from bs4 import element
from mysql.connector import errorcode
# Credentials are read from ../../login.txt when present; otherwise the
# TRUSAT_DATABASE_* environment variables are used as a fallback.
# (Previously a missing/short login.txt raised FileNotFoundError/IndexError
# before the env-var fallback below could ever apply.)
try:
    with open('../../login.txt', 'r') as f:
        lines = f.readlines()
except OSError:
    lines = []


def _login_line(idx):
    # Stripped line `idx` of login.txt, or None when the file was missing/short.
    return lines[idx].strip() if idx < len(lines) else None


dbname = _login_line(0)
dbtype = _login_line(1)       # kept for compatibility; not used in this script
dbhostname = _login_line(2)
dbusername = _login_line(3)
dbpassword = _login_line(4)
# --- CONSTANTS ---
dbname = dbname or os.getenv('TRUSAT_DATABASE_NAME', None)
dbhostname = dbhostname or os.getenv('TRUSAT_DATABASE_HOST', None)
dbusername = dbusername or os.getenv('TRUSAT_DATABASE_USER', None)
dbpassword = dbpassword or os.getenv('TRUSAT_DATABASE_PASSWORD', "")
dbname or print("No database name specified")
dbhostname or print("No database host specified")
dbusername or print("No database user specified")
TABLE_create_query = """CREATE TABLE IF NOT EXISTS `categories` (
  `obj_no` MEDIUMINT(5) UNSIGNED NOT NULL,
  `name` varchar(32) NOT NULL,
  `sub_category` varchar(120) NOT NULL,
  `description` varchar(120) NOT NULL,
  KEY `categories_obj_no_idx` (`obj_no`) USING BTREE
) CHARSET=utf8 ENGINE=Aria"""
# --- VARIABLES ---
buffer = []    # (obj_no, name, sub_category, description) rows filled by worker threads
tot_proc = 0   # number of TLE files fully processed so far
# --- UTILITY FUNCTIONS ---
def create_database(cursor):
    """Create the configured database with the utf8 character set.

    Uses the module-level ``dbname``.  On any MySQL error the failure is
    reported and the whole process is terminated, since nothing else in
    this script can proceed without a database.
    """
    ddl = f"CREATE DATABASE {dbname} DEFAULT CHARACTER SET 'utf8'"
    try:
        cursor.execute(ddl)
    except mysql.connector.Error as err:
        print(f'Failed creating database: {err}')
        # Hard exit (no cleanup): matches the script's other fatal paths.
        os._exit(1)
def process_file(url, name, sub_cat, description):
    """Download one TLE .txt file and buffer a category row per satellite.

    url         -- full URL of the TLE file
    name        -- file name ending in '.txt' (suffix stripped before storing)
    sub_cat     -- sub-category label scraped from the link text
    description -- main-category heading text

    Appends (obj_no, name, sub_category, description) tuples to the
    module-level ``buffer`` and increments ``tot_proc`` once the whole file
    is done.  Runs in a worker thread; list.append is atomic under the GIL.
    """
    global buffer, tot_proc
    # TODO: Update this to use TLE class
    # `with` ensures the HTTP response is closed (the original leaked it).
    with urllib.request.urlopen(url) as _file:
        line_no = 0
        for raw in _file:
            # TLE files repeat: name line, line 1, line 2 -> the catalog
            # number is the second field of every third line.
            line_no += 1
            if line_no == 3:
                # Decode the bytes properly; the original sliced str(bytes),
                # which left escape-sequence residue (e.g. a literal '\r').
                # split() (no arg) also tolerates runs of spaces.
                fields = raw.decode('ascii', errors='replace').split()
                obj_id = fields[1]
                # populate data in memory first and batch process after
                buffer.append((obj_id, name[:-4], sub_cat.strip(),
                               description.strip()))
                line_no = 0
    tot_proc += 1
# --- INIT ---
def main():
# connect to server
try:
cnx = mysql.connector.connect(
host=dbhostname,
user=dbusername,
passwd=dbpassword,
db=dbname,
charset='utf8',
use_unicode=True
)
cursor = cnx.cursor()
except mysql.connector.Error as err:
if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
print('Something is wrong with your user name or password')
else:
print(err)
print('Script is shutting down.')
os._exit(1)
# load DB and initialize table
try:
cursor.execute(f'USE {dbname}')
except mysql.connector.Error as err:
print(f'Database {dbname} does not exist.')
if err.errno == errorcode.ER_BAD_DB_ERROR:
create_database(cursor)
print(f'Database {dbname} created successfully.')
cnx.database = dbname
else:
print(err)
os._exit(1)
cursor.execute(TABLE_create_query)
# scrape main page
URL = 'https://celestrak.com/NORAD/elements/'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
tables = soup.find_all('table', class_='striped-odd')
tot_files = 0
for table in tables:
# get main category
header = table.find('tr', class_='header')
main_cat = header.next.next
# find all links within main category
links = header.find_next_siblings()
for link in links:
_tmp_link = link.next.next
if type(_tmp_link) != element.Tag:
continue
if 'href' in _tmp_link.attrs:
name = _tmp_link['href']
if name[-4:] == '.txt':
# start processing file in new thread
sub_cat = _tmp_link.get_text()
_url = URL + name
Thread(target=process_file, args=(_url, name, sub_cat, main_cat)).start()
tot_files += 1
# Skip the supplemental for now
if (False):
# scrape supplemental page
URL_SUP = 'https://celestrak.com/NORAD/elements/supplemental/'
page = requests.get(URL_SUP)
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find('table', class_='center outline')
# get main category
header = table.find('tr', class_='header')
main_cat = header.next.next.next
# find all links within main category
links = header.find_next_siblings()
for link in links:
_tmp_link = link.next.next.next
name = _tmp_link['href']
if name[-4:] == '.txt':
# start processing file in new thread
_url = URL_SUP + name
Thread(target=process_file, args=(_url, name, sub_cat, main_cat)).start()
tot_files += 1
# wait for all threads to finish while displaying progress
while True:
if tot_proc == tot_files: break
print(f'Processed {tot_proc}/{tot_files} files', end='\r')
time.sleep(0.25)
print(f'{tot_proc} categories loaded successfully, with {len(buffer)} '
'entries in total.\nSaving to database...', end='')
# clear current records
clear_table = ("TRUNCATE TABLE categories")
cursor.execute(clear_table)
# save to DB
add_entry_query = """INSERT INTO categories
(obj_no, name, sub_category, description)
VALUES (%s, %s, %s, %s)"""
i = 0
entry_list = []
for _x in buffer:
if (i<1000):
entry_list.append(_x)
i+=1
else:
cursor.executemany(add_entry_query, entry_list)
entry_list = []
i = 0
# Commit the remaining batch < 1000
if (len(entry_list) > 0):
cursor.executemany(add_entry_query, entry_list)
cnx.commit()
print('done')
cursor.close()
cnx.close()
print('All satellites successfully saved to database!')
if __name__ == '__main__':
main()