# Permalink
# Switch branches/tags
# Nothing to show
# Find file Copy path
# Fetching contributors…
# Cannot retrieve contributors at this time
# 284 lines (269 sloc) 11.7 KB
import re, textwrap, json, sys
from collections import OrderedDict
from Queue import *
from pprint import pprint
from lxml import etree
from nltk.corpus import stopwords
# Maximum line width used when wrapping section text.
width = 80
# Base URL that illustration file names are appended to.
content_url = 'http://www.projectaon.org/en/xhtml/lw/02fotw/'

# Ordered (compiled pattern, replacement) steps applied by processPara().
# The order is significant and mirrors the original sequence of substitutions:
# structural tags first, then the special-character elements (matched
# literally, hence re.escape), then the remaining inline tags.
_MARKUP_STEPS = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r'</?p/?>', ''),
        (r'</?choice( idref=".*?")?/?>', ''),
        (r'</?link-text/?>', ''),
        (r'</?cite/?>', ''),
        (re.escape('<ch.apos/>'), "'"),
        (re.escape('<ch.endash/>'), "-"),
        (re.escape('<ch.emdash/>'), "-"),
        (re.escape('<ch.ellips/>'), "..."),
        (re.escape('<ch.thinspace/>'), ""),
        (re.escape('<ch.frac12/>'), "1/2"),
        (r'</?quote/?>', "\""),
        (r'</?onomatopoeia>', ''),
        (r'<footref.*?/>', ''),
        (r'</?signpost>', ''),
        (r'</?description>', ''),
        (r'</?blockquote>', ''),
        (r'</?bookref.*?>', ''),
    )
)


def processPara(para, wrap_width=None):
    """Strip the known inline markup from *para* and wrap the remaining text.

    Args:
        para: serialized XML fragment (a string) for one paragraph-like
            element.
        wrap_width: maximum line length; when omitted, the module-level
            ``width`` is used, which is the original behaviour.

    Returns:
        A list of wrapped lines as produced by ``textwrap.wrap``. The list is
        empty when no text is left after stripping, so callers must not index
        it blindly.
    """
    for pattern, replacement in _MARKUP_STEPS:
        para = pattern.sub(replacement, para)
    if wrap_width is None:
        # Looked up at call time so a later change to ``width`` is honoured.
        wrap_width = width
    return textwrap.wrap(para, wrap_width)
# Parsed sections, keyed by section title text, in document order.
sections = OrderedDict()

# Hand-written overrides and extra data that get merged into the parsed book.
# A context manager is used so the file handle is closed deterministically.
with open('fotw_custom.json') as custom_file:
    custom = json.load(custom_file)

# Entities are left unresolved while parsing the book's XML source.
parser = etree.XMLParser(resolve_entities=False)
tree = etree.parse('fotw.xml', parser=parser)
root = tree.getroot()

# NOTE: this deliberately rebinds the imported ``stopwords`` corpus object to
# a plain set of words; the section loop below tests membership against it.
stopwords = set(stopwords.words('english'))
# Extra words that are filtered out of the choice keywords as well.
stopwords.update(['turn', 'wish', 'want', 'turning', 'rather', 'would', 'along', 'upon', 'another'])
# Build one JSON-ready entry per numbered section: wrapped text, choices,
# combat data, then any overrides from the hand-written custom file.
# The first match is skipped; NOTE(review): presumably it is the element that
# encloses the numbered sections rather than a story section -- confirm
# against fotw.xml.
# Debug aid, process a single section instead:
#for sect_elem in root.findall('.//section[@id="sect%s"]' % 181):
for sect_elem in root.findall('.//section[@class="numbered"]')[1:]:
    sect_id = sect_elem.find('.//title').text
    sect_paras = []  # each entry is a list of already-wrapped lines
    choices = []
    combat = {}
    enemies = []
    # Flags raised while scanning the section's markup.
    rnt_found = False
    ac_found = False
    stats_found = False
    undead_found = False
    sommerswerd_found = False
    immune_to_mindblast_found = False
    illustration_found = False
    must_eat = False
    list_found = False
    footref_found = False
    mindforce_found = False

    for item in sect_elem.find('data'):
        s = etree.tostring(item).strip()
        if any(subitem.tag == 'footref' for subitem in item):
            footref_found = True

        # Flatten known links / attribute markup to plain text and remember
        # that they were present.
        if '<a idref="random">Random Number Table</a>' in s:
            rnt_found = True
            s = s.replace('<a idref="random">Random Number Table</a>', 'Random Number Table')
        for a in ['COMBAT SKILL', 'ENDURANCE']:
            typ_markup = '<typ class="attribute">%s</typ>' % a
            if typ_markup in s:
                s = s.replace(typ_markup, a)
                stats_found = True
        if '<a idref="action">Action Chart</a>' in s:
            ac_found = True
            s = s.replace('<a idref="action">Action Chart</a>', 'Action Chart')

        # Keyword sniffing on the serialized element.
        lowered = s.lower()
        if 'undead' in lowered:
            undead_found = True
        if 'sommerswerd' in lowered:
            sommerswerd_found = True
        if 'immune' in lowered and 'mindblast' in lowered:
            immune_to_mindblast_found = True
        if 'Meal' in s and 'must' in s:
            must_eat = True
        if 'Mindforce' in s:
            mindforce_found = True

        if item.tag == 'p':
            sect_paras.append(processPara(s))
        elif item.tag == 'signpost':
            # Centre the (first line of the) signpost within the page width.
            # processPara() returns [] for empty text, so do not index blindly.
            signpost_lines = processPara(s)
            s = signpost_lines[0] if signpost_lines else ''
            spacer = ' ' * ((width - len(s)) // 2)
            sect_paras.append([spacer + s])
        elif item.tag == 'ul':
            for li in item:
                t = etree.tostring(li).strip()
                for a in ['COMBAT SKILL', 'ENDURANCE']:
                    typ_markup = '<typ class="attribute">%s</typ>' % a
                    if typ_markup in t:
                        t = t.replace(typ_markup, a)
                        stats_found = True
                # Strip the <li> wrapper before wrapping and keep every
                # wrapped line: taking only element [0] silently dropped the
                # tail of long items and crashed on empty ones.
                li_lines = processPara(re.sub(r'</?li>', '', t).strip())
                if li_lines:
                    sect_paras.append(['* %s' % li_lines[0]] +
                                      ['  %s' % line for line in li_lines[1:]])
            list_found = True
        elif item.tag == 'combat':
            # Search *within* this <combat> element ('.//'). The previous
            # '..//' climbed to the parent first, so every enemy of a
            # multi-enemy section received the first enemy's attributes.
            e = {'name': item.find('.//enemy').text,
                 'combat_skill': int(item.find('.//enemy-attribute[@class="combatskill"]').text),
                 'endurance': int(item.find('.//enemy-attribute[@class="endurance"]').text)}
            enemies.append(e)
            sect_paras.append(['%s: COMBAT SKILL %d, ENDURANCE %d' % (e['name'], e['combat_skill'], e['endurance'])])
        elif item.tag == 'choice':
            choice_sect_id = re.search(r'idref="sect(\d+)"', s).group(1)
            choices.append({'section': choice_sect_id, 'text': '\n'.join(processPara(s))})
        elif item.tag == 'illustration':
            illustration_found = True
            desc = ''
            src = ''
            for sub in item:
                if sub.tag == 'meta' and len(sub) > 1 and sub[1].tag == 'description':
                    desc = ' ' + etree.tostring(sub[1]).rstrip()
                if sub.get('class') == 'html':
                    src = sub.get('src')
                if sub.get('class') == 'text':
                    for subsub in sub:
                        # Collapse all runs of whitespace to single spaces.
                        desc += ' '.join(etree.tostring(subsub).split())
            desc = '\n'.join(processPara(desc)).strip()
            desc = ': %s' % desc if desc else ''
            sect_paras.append(['%s/%s[Illustration]%s' % (content_url, src, desc)])

    sect_text = '\n\n'.join(['\n'.join(p) for p in sect_paras])

    # A section is a "random pick" when it refers to the Random Number Table
    # and its first choice contains a digit range such as "0-4". The
    # ``choices`` guard avoids an IndexError on sections without choices.
    is_random_pick = bool(rnt_found and choices and re.search(r'\d-\d', choices[0]['text']))
    if is_random_pick:
        for choice in choices:
            range_match = re.search(r'(\d+)-(\d+)', choice['text'])
            if range_match:
                choice['range'] = [int(bound) for bound in range_match.groups()]
            else:
                # No range: the first digit found is used as a one-value range.
                n = int(re.search(r'(\d)', choice['text']).group(1))
                choice['range'] = [n, n]
    else:  # if len(choices) > 1:
        for choice in choices:
            words = []
            for w in re.split(r"[^A-Za-z0-9'-]+", choice['text']):  # every nonalpha except "'" and "-"
                w = w.lower()
                if (len(w) < 3 and w != 'go') or w in stopwords or re.match(r'\d+', w):
                    continue
                words.append(w)
            if words:
                choice['words'] = words

    if enemies:
        combat['enemies'] = enemies
        for i, choice in enumerate(choices):
            if 'win' in choice['text'] or 'if you kill' in choice['text'].lower():
                combat['win'] = {'choice': i}
            elif 'evade' in choice['text']:
                combat['evasion'] = {'choice': i}
                if re.search('at any time|stage', choice['text']):
                    combat['evasion']['n_rounds'] = 0
                elif 'after two rounds' in choice['text']:
                    combat['evasion']['n_rounds'] = 2

    section = {'text': sect_text, 'choices': choices}
    if combat:
        # The section-wide keyword flags can only be attributed to an enemy
        # when the section has exactly one.
        if undead_found or sommerswerd_found:
            assert len(combat['enemies']) == 1, 'section %s: expected a single enemy' % sect_id
            combat['enemies'][0]['double_damage'] = True
        if immune_to_mindblast_found:
            assert len(combat['enemies']) == 1, 'section %s: expected a single enemy' % sect_id
            combat['enemies'][0]['immune'] = "Mindblast"
        if mindforce_found:
            assert len(combat['enemies']) == 1, 'section %s: expected a single enemy' % sect_id
            combat['enemies'][0]['has_mindforce'] = True
        if 'win' not in combat and len(choices) == 1:
            combat['win'] = {'choice': 0}
        section['combat'] = combat
    if is_random_pick:
        section['is_random_pick'] = True
    if must_eat:
        section['must_eat'] = True

    # merge custom content
    if sect_id in custom['sections']:
        cust_sect = custom['sections'][sect_id]
        for key in ('alternate_choices', 'no_ambiguity', 'is_special'):
            if key in cust_sect:
                section[key] = True
        # if 'chain_choices' in cust_sect:
        #     section['chain_choices'] = True
        for key in ('items', 'endurance'):
            if key in cust_sect:
                section[key] = cust_sect[key]
        if 'combat' in cust_sect:
            # setdefault: tolerate a custom combat block on a section where
            # no enemy was parsed (previously a KeyError).
            section.setdefault('combat', {}).update(cust_sect['combat'])
        if 'options' in cust_sect:
            section['options'] = cust_sect['options']
        if 'must_eat' in cust_sect:
            if cust_sect['must_eat']:
                section['must_eat'] = cust_sect['must_eat']  # possibly an int
            else:
                # pop: the flag may never have been set for this section.
                section.pop('must_eat', None)
        for key in ('text', 'n_items_to_pick'):
            if key in cust_sect:
                section[key] = cust_sect[key]
        for key in ('trim_choices', 'reduce_choices'):
            if key in cust_sect:
                section[key] = True

        custom_choices = cust_sect.get('choices', [])

        # Custom choices are matched by target section, so targets must be unique.
        seen_targets = set()
        for custom_choice in custom_choices:
            if custom_choice['section'] in seen_targets:
                sys.exit('Error in custom section %s: duplicate choice sections' % sect_id)
            seen_targets.add(custom_choice['section'])

        for custom_choice in custom_choices:
            # no key to match here, so we got to match using choice.section (thus the need to search)
            #print custom_choice['section']
            found = False
            for choice in section['choices']:
                if choice.get('section') == custom_choice['section'] and 'is_artificial' not in custom_choice:
                    choice.update(custom_choice)
                    # '+words' / '-words' are edit instructions, not data:
                    # apply them to the keyword list and drop the keys again.
                    if '+words' in custom_choice:
                        choice.setdefault('words', []).extend(custom_choice['+words'])
                        del choice['+words']
                    if '-words' in custom_choice:
                        if 'words' in choice:
                            choice['words'] = [w for w in choice['words'] if w not in custom_choice['-words']]
                        del choice['-words']
                    found = True
                    break
            if 'is_artificial' in custom_choice:
                # ``choices`` is the same list object as section['choices'].
                choices.append(custom_choice)
            elif not found:
                sys.exit('Error in custom section %s: choice for section %s cannot be found' % (sect_id, custom_choice['section']))

    sections[sect_id] = section

    # special case reporting
    report = []
    if ac_found:
        report.append('ac')
    if rnt_found and not is_random_pick:
        report.append('rnt')
    if stats_found:
        report.append('stats')
    if footref_found:
        report.append('footref')
    # if illustration_found:
    #     report.append('illustration')
    # if must_eat:
    #     report.append('must eat')
    if list_found:
        report.append('list')
    if report and sect_id not in custom['sections']:  #True:
        # Single-argument call form: valid as a Python 2 print statement too.
        print('%s: %s' % (sect_id, ', '.join(report)))
# Re-emit the sections in numeric order, "1" through "350". A missing section
# raises KeyError here, which stops the script before anything is written.
section_od = OrderedDict()
for sect_number in range(1, 351):
    sect_key = str(sect_number)
    section_od[sect_key] = sections[sect_key]

# Disabled alternative: order the sections by breadth-first traversal of the
# choices starting from section '1'.
# q = Queue()
# visited = set()
# q.put('1')
# section_od = OrderedDict()
# to_set = []
# while not q.empty():
#     sect_id = q.get()
#     to_set.append(sect_id)
#     section_od[sect_id] = sections[sect_id]
#     # if sect_id == '197': continue
#     for choice in sections[sect_id]['choices']:
#         if choice['section'] not in visited:
#             q.put(choice['section'])
#             visited.add(choice['section'])

# Assemble the final document with a fixed top-level and setup key order.
result_od = OrderedDict()
setup_od = OrderedDict()
for f in ['sequence', 'disciplines', 'weapons', 'equipment']:
    setup_od[f] = custom['setup'][f]
custom['setup'] = setup_od
for f in ['prompt', 'intro_sequence', 'setup', 'synonyms']:
    result_od[f] = custom[f]
result_od['sections'] = section_od

#json.dump(result_od, open('fotw_generated.json', 'w'), indent=4)
# Serialize first, then strip trailing whitespace from every line of the
# indented JSON. The context manager guarantees the output is flushed and
# closed even if the write fails part-way.
json_lines = [line.rstrip() for line in json.dumps(result_od, indent=4).split('\n')]
with open('fotw.json', 'w') as out_file:
    out_file.write('\n'.join(json_lines))

# Single-argument call form: valid as a Python 2 print statement too.
print('produced %d sections' % len(result_od['sections']))