1,584 changes: 0 additions & 1,584 deletions crawlers/ucsc/pisa_index.json

This file was deleted.

200 changes: 200 additions & 0 deletions crawlers/ucsc/ucsc/architecture.py
@@ -0,0 +1,200 @@
import scrapy
import re
''' Crappy initial implementation. Minimum required to run / pass. Can add useful features later. '''


class BaseCrawler:
pass

class SelectorWrapper:
def __init__ (self, value):
self.value = value

def xpath_require_one (self, selection):
if self.value:
result = self.value.xpath(selection)
if result is None or len(result) > 1:
raise Exception("Expected single selection with '%s', got '%s', prev selection:\n%s"%(
selection, result, self.value.extract()))
return SelectorWrapper(result)
return self

def xpath_require_many (self, selection):
if self.value:
result = self.value.xpath(selection)
if result is None:
raise Exception("Expected 1+ selection(s) with '%s', got '%s', prev selection:\n%s"%(
selection, result, self.value.extract()))
return SelectorWrapper(result)
return self

def map_async (self, callback):
if not self.value:
callback(self)
else:
for entry in self.value:
callback(SelectorWrapper(entry))

    def xpath_stripped_text (self, selection=None, strip=None):
        if self.value:
            selection = '%s/text()'%selection if selection else 'text()'

            result = self.value.xpath(selection)
            result = result.extract() if result else result
            if not result:  # also catches an empty selection, which would otherwise raise IndexError below
                raise Exception("Expected text(), in selection '%s', got '%s' in:\n%s"%(
                    selection, result, self.value.extract()))
            return SelectorWrapper(result[0].strip(strip) if strip else result[0].strip())
        return self

def xpath_attrib (self, selection, strip=None):
if self.value:
result = self.value.xpath(selection)
result = result.extract() if result else result
if result is None or len(result) != 1:
raise Exception("Expected attrib '%s', got '%s' in:\n%s"%(
selection, result, self.value.extract()))
return SelectorWrapper(result[0].strip(strip) if strip else result[0].strip())
return self



    def bind (self, result, attrib):
        if self.value:
            value = self.value if type(self.value) == str or type(self.value) == unicode or type(self.value) == int \
                else self.value.extract()[0]
            if type(attrib) == str or type(attrib) == unicode:
                result[attrib] = value
            elif type(attrib) == tuple:
                for k in attrib:
                    result[k] = value
            else:
                raise Exception("Invalid argument passed to %s.bind(): %s %s"%(
                    type(self), type(attrib), attrib))
        else:
            result[attrib] = None
            print("Failed to assign attrib '%s' to %s in %s"%(
                attrib, type(result[attrib]), type(result)))

def equals (self, other):
# if (type(self.value) == str or type(self.value) == unicode) == (type(other) == str or type(other) == unicode):
# pass
# if type(self.value) != type(other):
# raise Exception("%s.equals() attempting to compare conflicting types: %s and %s"%(
# type(self), type(self.value), type(other)))
return self.value == other

    def matches_re (self, regex):
        if not self.value:
            raise Exception("Attempting to do regex match on null result")

        if type(self.value) == str or type(self.value) == unicode:
            return re.match(regex, self.value) is not None
        return bool(self.value.re(regex))  # Selector.re() returns a list, never None

    def contains (self, other):
        if type(self.value) == str or type(self.value) == unicode:
            return other in self.value
        # Selector objects have no contains(); test against the extracted text instead
        return other in self.value.extract()[0]

    def bind_re (self, regex, result, attrib):
        if self.value:
            try:
                value = self.value.extract()[0]
            except AttributeError:
                value = self.value

            match = re.match(regex, value)
            if not match:
                raise Exception("Failed to match regex '%s' against input %s"%(
                    regex, value))

            if type(attrib) == str or type(attrib) == unicode:
                result[attrib] = match.group(1)
            elif type(attrib) == tuple:
                for i, k in enumerate(attrib):
                    result[k] = match.group(i+1)
            else:
                raise Exception("Invalid argument passed to %s.bind_re(): %s %s"%(
                    type(self), type(attrib), attrib))
        else:
            result[attrib] = None
            print("Failed to assign attrib '%s' to %s in %s"%(
                attrib, type(result[attrib]), type(result)))

    def bind_re_map (self, regex, result, attrib, transform):
        if self.value:
            value = self.value if type(self.value) == str or type(self.value) == int or type(self.value) == unicode \
                else self.value.extract()[0]

            match = re.match(regex, value)
            if not match:
                raise Exception("Failed to match regex '%s' against input %s"%(
                    regex, value))

            if type(attrib) == str or type(attrib) == unicode:
                result[attrib] = transform(match.group(1))
            elif type(attrib) == tuple:
                for i, (k, f) in enumerate(zip(attrib, transform)):
                    result[k] = f(match.group(i+1))
            else:
                raise Exception("Invalid argument passed to %s.bind_re_map(): %s %s"%(
                    type(self), type(attrib), attrib))
        else:
            result[attrib] = None
            print("Failed to assign attrib '%s' to %s in %s"%(
                attrib, type(result[attrib]), type(result)))

def to_int (self):
if self.value:
return SelectorWrapper(int(self.value))
return self

def request_async_crawl (self, crawler=None, url=None):
assert(crawler is not None and url is not None)


def map_sequential_cases (self, selection=None, check='maybe', cases=None):
assert(check in set(('yes', 'no', 'maybe')))
assert(cases is not None)
assert(type(cases) == tuple)
assert(type(cases[0]) == tuple)
assert(type(cases[0][0]) == str)

do_check = check != 'no'
if not self.value:
for req, test, applicator in cases:
applicator(self)
else:
results = self.value.xpath(selection) if selection else self.value
i = 0
for item in results:
result = SelectorWrapper(item)
                if i >= len(cases):
                    print("More items than cases; ignoring the rest")
return
if do_check and not cases[i][1](result):
if cases[i][0] == 'required':
raise Exception("Failed map_sequential_cases case test (%d):\n%s"%(
i, result))
else:
cases[i][2](result)
i += 1
            if i < len(cases):
                print("Did not visit all cases")


def item_producer (Item):
    def decorator (fcn):
        def wrapper (self, request):
            result = Item()
            fcn(self, request, result)
            return result  # return the populated item so scrapy can collect it
        return wrapper
    return decorator

def parser_entrypoint (fcn):
def wrapper (self, request):
return fcn(self, SelectorWrapper(request))
return wrapper
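
To make the intended call pattern concrete, here is a minimal, hypothetical usage sketch (not part of this PR): a spider parse method chaining SelectorWrapper helpers through the item_producer and parser_entrypoint decorators. The spider name, start URL, and XPath expressions are illustrative assumptions, and the sketch presumes the Python 2 environment implied by the module's unicode checks.

# Hypothetical usage sketch -- assumed XPaths, spider name, and URL; for illustration only.
import scrapy
from ucsc.architecture import item_producer, parser_entrypoint
from ucsc.items import PisaCourseItem

class ExampleCourseSpider(scrapy.Spider):
    name = 'example_course'
    start_urls = ['https://pisa.ucsc.edu/class_search/']

    @parser_entrypoint              # hands parse() a SelectorWrapper instead of the raw response
    @item_producer(PisaCourseItem)  # creates the item and passes it in as 'result'
    def parse(self, response, result):
        # Each bind either stores a value or, on a missing selection, records None and prints a warning.
        response.xpath_stripped_text('//h2[@class="title"]').bind(result, 'course_title')
        response.xpath_stripped_text('//div[@id="enroll_max"]').to_int().bind(result, 'enroll_max')
        # bind_re splits a heading like "AMS 03 - 01" into two fields via capture groups.
        response.xpath_stripped_text('//div[@class="heading"]') \
            .bind_re(r'(\w+ \d+)\s*-\s*(\d+)', result, ('course_name', 'course_section'))

Stacked this way, the decorators keep item construction and response wrapping out of every parse method, so the method body is just a sequence of selections and binds.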
28 changes: 27 additions & 1 deletion crawlers/ucsc/ucsc/items.py
@@ -28,4 +28,30 @@ class PisaIndexItem(scrapy.Item):

class PisaCourseItem(scrapy.Item):
""" Encapsulates all the data visible from a class page; TBD """
raw_content = Field() # raw HTML; todo: parse this properly using xpath + regexes
url = Field() # url of class page, eg. "https://pisa.ucsc.edu/class_search/index.php/index.php?action=detail&class_data=YToyOntzOjU6IjpTVFJNIjtzOjQ6IjIxODgiO3M6MTA6IjpDTEFTU19OQlIiO3M6NToiMjE3MjMiO30%3D"
course_name = Field() # string, eg. "AMS 03"
course_title = Field() # string, eg. "Precalculus"
course_section = Field() # string, eg. "01"
class_number = Field() # int, eg. 21723
lecture_number = Field() # int, class_number of lecture component (or class_number)
instructor = Field() # string, eg. "Garaud,P."
class_type = Field() # "LEC", "LAB", or "SEM" (or "DISC"...?)
class_type_pretty = Field() # "Lecture", ...
location = Field() # string, eg. "Soc Sci 2 075"
meet_times = Field() # string, eg. "MWF 10:40AM-11:45AM"
enroll_max = Field() # int
enroll_current = Field() # int
materials_url = Field() # link to materials page, eg. "http://ucsc.verbacompare.com/comparison?id=FL18__AMS__003__01"
term = Field() # eg. "Fall 2018"
term_id = Field() # integer id used when searching via form
career_type = Field()
grading_options = Field()
credits = Field()
gen_ed_categories = Field()
waitlist_max = Field()
waitlist_current = Field()

course_description = Field() # Description text
enrollment_reqs = Field() # Enrollment text
class_notes = Field() # Class notes text
class_dates = Field()
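
As a readability aid, a PisaCourseItem populated with the example values quoted in the field comments above might look like the following sketch (illustrative only; the ucsc.items import path is assumed from this diff):

# Illustrative only: value shapes taken from the field comments above.
from ucsc.items import PisaCourseItem

item = PisaCourseItem(
    course_name='AMS 03',
    course_title='Precalculus',
    course_section='01',
    class_number=21723,
    instructor='Garaud,P.',
    class_type='LEC',
    class_type_pretty='Lecture',
    location='Soc Sci 2 075',
    meet_times='MWF 10:40AM-11:45AM',
    term='Fall 2018',
)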
10 changes: 6 additions & 4 deletions crawlers/ucsc/ucsc/settings.py
@@ -70,16 +70,18 @@

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
AUTOTHROTTLE_DEBUG = False
#CONCURRENT_REQUESTS_PER_IP
CONCURRENT_REQUESTS_PER_IP = 5

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings