Skip to content

Commit

Permalink
WIP #2 - Simplify page data parser
Browse files Browse the repository at this point in the history
  • Loading branch information
emibcn committed Nov 11, 2018
1 parent d46967c commit 60dbd90
Showing 1 changed file with 15 additions and 12 deletions.
27 changes: 15 additions & 12 deletions Rac1.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,16 +350,22 @@ def parse_rac1_list_page(cls, data, discard_pages=False):

# Parse response:
# - Filter lines containing data-audio-id or data-audioteca-search-page
# - Decode from binary utf-8 to string
# - Only get values for data-* HTML attributes, without quotes
data_list = [re.sub(my_re, r'\1=\2', line) \
for line in data.split(u'\n')
if u'data-audio-id' in line \
or (not discard_pages and u'data-audioteca-search-page' in line)]
data_list = [
re.sub(my_re, r'\1=\2', line).split(u'=')
for line in data.split(u'\n')
if u'data-audio-id' in line \
or (not discard_pages and u'data-audioteca-search-page' in line)]

# Filter results by type
audio_uuid_list = [line for line in data_list if u'data-audio-id' in line]
pages_list = [] if discard_pages else [line for line in data_list if u'data-audioteca-search-page' in line]
audio_uuid_list = [
line[1]
for line in data_list
if line[0] == u'data-audio-id']
pages_list = [] if discard_pages else [
line[1]
for line in data_list
if line[0] == u'data-audioteca-search-page']

# Deduplicate
audio_uuid_list_dedups = []
Expand All @@ -384,11 +390,8 @@ def get_audio_uuids(self):
# [1:] : remove first page, as it has already been downloaded
for page in pages_list[1:]:

# Get page number (discard variable name)
_, page_number = page.split(u'=')

# Download page uuids
data = self.get_rac1_list_page(page_number)
data = self.get_rac1_list_page(page)

# Parse page data (discard pages list, as we already have it)
audio_uuid_list_page, _ = self.parse_rac1_list_page(data, discard_pages=True)
Expand All @@ -399,7 +402,7 @@ def get_audio_uuids(self):
audio_uuid_list.append(uuid)

# Return only each audio's UUID (discard variable name)
return [varval.split(u'=')[1] for varval in audio_uuid_list]
return audio_uuid_list


def get_podcast_data(self, uuid):
Expand Down

0 comments on commit 60dbd90

Please sign in to comment.