forked from scroberts/Library
-
Notifications
You must be signed in to change notification settings - Fork 0
/
DISC.py
executable file
·403 lines (316 loc) · 14.2 KB
/
DISC.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
#!/usr/bin/env python3
# reference: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
# External modules
import requests
import string
from bs4 import BeautifulSoup
import openpyxl
from openpyxl.styles import Font, Style, Alignment
from openpyxl.styles.colors import BLUE
from openpyxl import load_workbook
from datetime import datetime
# My modules
import Config as CF
import DCC
# Set Excel Styles
# Excel hyperlink style is calibri 11, underline blue
font_url_style = Font(color = BLUE, underline = 'single')
bold_style = Font(bold = True)
align_hv_cen_style = Alignment(horizontal = 'center', vertical = 'center')
align_ver_cen_style = Alignment(vertical = 'center')
def get_bb_prop(s, handle):
# Get properties from a bulletin board entry
# Inputs:
# s - requests session object
# handle - bb identifier string of the form 'bulletin-XXXX'
# Returns:
# list containing title, description and keywords strings
url = CF.dcc_url + "/dsweb/PROPFIND/" + handle
headers = {"DocuShare-Version":"5.0", "Content-Type":"text/xml", "Accept":"text/xml","Depth":"0"}
xml = """<?xml version="1.0" ?>
<propfind>
<prop>
<title/><description/><keywords/>
</prop>
</propfind>"""
r = s.post(url,data=xml,headers=headers)
r.raise_for_status()
# print('Status code:', r.status_code)
# Retrieve the results
dom = BeautifulSoup(r.text)
title = dom.title.text.strip()
keywords = dom.keywords.text.strip()
# Have to further process the description
dom = BeautifulSoup(dom.description.text)
description = ''
for string in dom.strings:
description = description.strip() + ' ' + string.strip()
description = clean_string(description.strip())
# print('title:', title)
# print('description:', description)
# print('keywords:', keywords)
return([title, description, keywords])
def set_bb_prop(s, title, description, keywords, handle):
url = CF.dcc_url + "/dsweb/PROPPATCH/" + handle
headers = {"DocuShare-Version":"5.0", "Content-Type":"text/xml", "Accept":"text/xml"}
xml = """<?xml version="1.0" ?>
<propertyupdate>
<set><prop>
<title><![CDATA[""" + title + """]]></title>
<description><![CDATA[""" + description + """]]></description>
<keywords><![CDATA[""" + keywords + """]]></keywords>
</prop></set>
</propertyupdate>"""
r = s.post(url,data=xml,headers=headers)
r.raise_for_status()
# print('Status code:', r.status_code)
def create_bb_post(s, title, description, keywords, handle):
url = CF.dcc_url + "/dsweb/MKRES/" + handle + "/Bulletin"
headers = {"DocuShare-Version":"5.0", "Content-Type":"text/xml", "Accept":"text/xml"}
xml = """<?xml version="1.0" ?>
<propertyupdate>
<set><prop>
<title><![CDATA[""" + title + """]]></title>
<description><![CDATA[""" + description + """]]></description>
<keywords><![CDATA[""" + keywords + """]]></keywords>
</prop></set>
</propertyupdate>"""
r = s.post(url,data=xml,headers=headers)
r.raise_for_status()
# print('Status code:', r.status_code)
def clean_string(str):
# Split text on newlines and add them to the list
# Reference: http://stackoverflow.com/questions/29976234/openpyxl-unicode-values
try:
outstr = ''
for str in str.replace('_x000D_','').splitlines():
outstr = outstr + ' ' + str
except:
outstr = ''
print('clean_str: exception encountered for string', str)
outstr = outstr.strip()
return(outstr)
def get_postentry_author_info( entry ):
pa = entry.find("td", class_= "postentry_author")
author = pa("input")[0].attrs['value']
auth_url = 'https://docushare.tmt.org/' + pa("a")[0].attrs['href']
# The above author = line is the same as:
# td_author = postentry.find("td", class_="postentry_author")
# input_author = td_author("input")
# author = input_author[0].attrs['value']
# Find the date and time of the original entry
dt = pa("span")[0]("span")[0].string.strip()
# Format it as a string
# time format: 02/11/15 11:18 AM
dt = datetime.strptime(dt, "%m/%d/%y %I:%M %p")
ent = entry.find("div", class_ = "postentry_header").find("a")
url = 'https://docushare.tmt.org/' + ent.attrs["href"]
title = ent.text
# If there are carriage returns in the entry there will be multiple "p" instances.
try:
ent_str = entry.find("div", class_ = "postdescription").text.strip()
except:
pentries = entry.find("div", class_ = "postdescription").find_all("p")
ent_str = ''
for ent in pentries:
ent_str = ent_str + '\t' + ent.text
return([title, url, author, auth_url, dt, ent_str])
def set_ss_headings(ws, rownum, col):
ws.cell(row = rownum, column = col).value = "ID"
ws.cell(row = rownum, column = col).alignment = align_hv_cen_style
col += 1
ws.cell(row = rownum, column = col).value = "Title"
ws.cell(row = rownum, column = col).alignment = align_ver_cen_style
col += 1
ws.cell(row = rownum, column = col).value = "Posting"
ws.cell(row = rownum, column = col).alignment = align_ver_cen_style
col += 1
ws.cell(row = rownum, column = col).value = "Author"
ws.cell(row = rownum, column = col).alignment = align_ver_cen_style
col += 1
ws.cell(row = rownum, column = col).value = "Post Date"
ws.cell(row = rownum, column = col).alignment = align_hv_cen_style
col += 1
ws.cell(row = rownum, column = col).value = "# Replies"
ws.cell(row = rownum, column = col).alignment = align_hv_cen_style
col += 1
ws.cell(row = rownum, column = col).value = "Replies By:"
ws.cell(row = rownum, column = col).alignment = align_hv_cen_style
col += 1
ws.cell(row = rownum, column = col).value = "Latest Reply Date"
ws.cell(row = rownum, column = col).alignment = align_hv_cen_style
col += 1
ws.cell (row = rownum, column = col).value = "Latest Reply"
ws.cell (row = rownum, column = col).alignment = align_hv_cen_style
col += 1
ws.cell (row = rownum, column = col).value = "Disposition"
ws.cell (row = rownum, column = col).alignment = align_hv_cen_style
col += 1
ws.cell (row = rownum, column = col).value = "Action"
ws.cell (row = rownum, column = col).alignment = align_hv_cen_style
colmax = col + 1
for col in range(1, colmax):
ws.cell(row = rownum, column = col).font = bold_style
def get_discussion_rowcol(url, htmlfile, xlfile, ssrow, sscol):
# Login to DCC and save the discussion as an html file
s = DCC.login(CF.dcc_url + CF.dcc_login)
res = s.get(url)
res.raise_for_status()
webfile = open(CF.dccfilepath + htmlfile,'wb')
for chunk in res.iter_content(100000):
webfile.write(chunk)
webfile.close
# Get the HTML into the Beautiful Soup object
dcc=open(CF.dccfilepath + htmlfile,'r',encoding='utf-8').read()
dom = BeautifulSoup(dcc, "html.parser")
# Open the spreadsheet
try:
wb = load_workbook(xlfile)
print('Opened existing file :', xlfile)
existing_doc = True
except:
wb = openpyxl.Workbook()
print('Created new file :', xlfile)
existing_doc = False
ws = wb.worksheets[0]
# Write and format the headings in Excel if this is a new document
set_ss_headings(ws, ssrow, sscol)
# Find the Parent of all the original posts (that may have replies)
# This is <form name="ToolbarMulti" method="post" action="/docushare/dsweb/ProcessMultipleCommand">
form_tag = dom.find("form", {"name":"ToolbarMulti"})
rowoff = ssrow
ssrow = ssrow + 1
# Now find all the children that are Post Entries
for idx, postentry in enumerate(form_tag.find_all("div", class_ = "postentry", recursive = False)):
[title, url, author, auth_url, dt, post] = get_postentry_author_info( postentry )
# post is the text of the original posting.
# Look for replies to posts
# The replyposts class is always the next sibling
replies = postentry.find_next_sibling()
# count the number of replies
times = 0
reps = []
r_dates = [dt]
r_latest = ''
r_title = ''
r_disposition = ''
r_action = ''
for replyentry in replies.find_all("div", class_ = "postentry"):
[r_title, r_url, r_author, r_auth_url, r_dt, r_latest] = get_postentry_author_info( replyentry )
# Put results into a list with line breaks
reps.append(r_author + ': ' + r_dt.strftime('%y-%m-%d %H:%M'))
r_dates.append(r_dt)
if '_DISPOSITION' in r_title.upper():
# print('r_disposition is :', r_latest)
r_disposition = r_latest
if '_ACTION' in r_title.upper():
# print('r_action is :', r_latest)
r_action = r_latest
times += 1
# Turn the reps list into a single string
repstr = ';\n'.join(reps)
# Find the latest modified date
r_dates.sort()
r_date_latest = r_dates[-1]
r_date_latest_str = r_date_latest.strftime('%y-%m-%d %H:%M')
# Print output
print('\nEntry:', idx)
print('title:', title)
print('url:', url)
print('posting:', post)
print('author:', author)
print('date:',dt.strftime('%y-%m-%d %H:%M'))
if repstr != '':
print('latest update:', r_date_latest_str)
print('replies by:', repstr)
print('latest reply title:', r_title)
print('latest reply:', r_latest)
if r_disposition != '':
print('Disposition:',r_disposition)
if r_action != '':
print('Action:', r_action)
# Column 1: ID
col = sscol
ws.cell(row = ssrow, column = col).value = ssrow-rowoff
ws.cell(row = ssrow, column = col).alignment = align_hv_cen_style
# Column 2: Title
col += 1
ws.cell(row = ssrow, column = col).value = title
ws.cell(row = ssrow, column = col).font = font_url_style
ws.cell(row = ssrow, column = col).hyperlink = url
ws.cell(row = ssrow, column = col).alignment = Alignment(wrap_text = True, vertical = 'center')
# Column 3: Posting
col += 1
ws.cell(row = ssrow, column = col).value = clean_string(post)
ws.cell(row = ssrow, column = col).alignment = Alignment(wrap_text = True, vertical = 'center')
# Column 4: Author
col += 1
ws.cell(row = ssrow, column = col).value = author
ws.cell(row = ssrow, column = col).hyperlink = auth_url
ws.cell(row = ssrow, column = col).alignment = align_ver_cen_style
ws.cell(row = ssrow, column = col).font = font_url_style
# Column 5: Post Date
col += 1
ws.cell(row = ssrow, column = col).value = dt
ws.cell(row = ssrow, column = col).number_format = 'YY-MM-DD HH:MM'
ws.cell(row = ssrow, column = col).alignment = align_hv_cen_style
# Column 6: # Replies
col += 1
ws.cell(row = ssrow, column = col).value = times
ws.cell(row = ssrow, column = col).alignment = align_hv_cen_style
# Column 7: Replies By
col += 1
ws.cell(row = ssrow, column = col).value = repstr
ws.cell(row = ssrow, column = col).alignment = Alignment(wrap_text = True, vertical = 'center')
# Column 8: Latest Reply Date
col += 1
ws.cell(row = ssrow, column = col).value = r_date_latest_str
ws.cell(row = ssrow, column = col).alignment = align_hv_cen_style
#Column 9: Latest Reply Text
col += 1
ws.cell(row = ssrow, column = col).value = clean_string(r_latest)
ws.cell(row = ssrow, column = col).alignment = Alignment(wrap_text = True, vertical = 'center')
#Column 10: Disposition Text
col += 1
ws.cell(row = ssrow, column = col).value = clean_string(r_disposition)
ws.cell(row = ssrow, column = col).alignment = Alignment(wrap_text = True, vertical = 'center')
#Column 11: Action Text
col += 1
ws.cell(row = ssrow, column = col).value = clean_string(r_action)
ws.cell(row = ssrow, column = col).alignment = Alignment(wrap_text = True, vertical = 'center')
ssrow += 1
# Set column widths
ws.column_dimensions["A"].width = 5.0
ws.column_dimensions["B"].width = 40.0
ws.column_dimensions["C"].width = 60.0
ws.column_dimensions["D"].width = 13.0
ws.column_dimensions["E"].width = 13.0
ws.column_dimensions["F"].width = 13.0
ws.column_dimensions["G"].width = 25.0
ws.column_dimensions["H"].width = 17.0
ws.column_dimensions["I"].width = 60.0
ws.column_dimensions["J"].width = 60.0
ws.column_dimensions["K"].width = 60.0
# Save the spreadsheet
wb.save(xlfile)
def get_discussion(url, htmlfile, xlfile):
get_discussion_rowcol(url, htmlfile, xlfile, 1, 1)
def test_bulletin_post():
# Login to DCC
s = DCC.login(CF.dcc_url + CF.dcc_login)
handle = 'Bulletin-6185'
title = '_DISPOSITION'
description = 'This item is open. MELCO made no reply.'
# description = 'This can be closed with a tracked action'
# description = "The reply doesn't answer directly to the question of the RIX. The reply seems to say that DP04S-A isn't a document to be reviewed. Is DP04S-A a review item of FDRP2? Need more clear reply."
keywords = 'Reviewer Disposition'
create_bb_post(s, title, description, keywords, handle)
title = '_ACTION'
description = 'ACTION: TBD (To Be Determined) after issue is addressed by MELCO.'
# description = 'ACTION: MELCO to study what is the probability of the scenario.'
# description = "ACTION: MELCO to add all the safety related drives to the Table 2.1-1."
keywords = 'Reviewer ACTION'
create_bb_post(s, title, description, keywords, handle)
# if __name__ == '__main__':
# print("Running module test code for",__file__)
# test_bulletin_post()