1,584 changes: 0 additions & 1,584 deletions crawlers/ucsc/pisa_index.json

This file was deleted.

200 changes: 200 additions & 0 deletions crawlers/ucsc/ucsc/architecture.py
@@ -0,0 +1,200 @@
import scrapy
import re
''' Crappy initial implementation. Minimum required to run / pass. Can add useful features later. '''


class BaseCrawler:
pass

class SelectorWrapper:
def __init__ (self, value):
self.value = value

def xpath_require_one (self, selection):
if self.value:
result = self.value.xpath(selection)
if result is None or len(result) > 1:
raise Exception("Expected single selection with '%s', got '%s', prev selection:\n%s"%(
selection, result, self.value.extract()))
return SelectorWrapper(result)
return self

def xpath_require_many (self, selection):
if self.value:
result = self.value.xpath(selection)
if result is None:
raise Exception("Expected 1+ selection(s) with '%s', got '%s', prev selection:\n%s"%(
selection, result, self.value.extract()))
return SelectorWrapper(result)
return self

def map_async (self, callback):
if not self.value:
callback(self)
else:
for entry in self.value:
callback(SelectorWrapper(entry))

    def xpath_stripped_text (self, selection=None, strip=None):
        if self.value:
            selection = '%s/text()'%selection if selection else 'text()'

            result = self.value.xpath(selection)
            result = result.extract() if result else result
            if not result:  # also catches an empty selection, which would otherwise raise IndexError below
                raise Exception("Expected text(), in selection '%s', got '%s' in:\n%s"%(
                    selection, result, self.value.extract()))
            return SelectorWrapper(result[0].strip(strip) if strip else result[0].strip())
        return self

def xpath_attrib (self, selection, strip=None):
if self.value:
result = self.value.xpath(selection)
result = result.extract() if result else result
if result is None or len(result) != 1:
raise Exception("Expected attrib '%s', got '%s' in:\n%s"%(
selection, result, self.value.extract()))
return SelectorWrapper(result[0].strip(strip) if strip else result[0].strip())
return self



    def bind (self, result, attrib):
        if self.value:
            value = self.value if type(self.value) == str or type(self.value) == unicode or type(self.value) == int \
                else self.value.extract()[0]
            if type(attrib) == str or type(attrib) == unicode:
                result[attrib] = value
            elif type(attrib) == tuple:
                for k in attrib:
                    result[k] = value
            else:
                raise Exception("Invalid argument passed to %s.bind(): %s %s"%(
                    type(self), type(attrib), attrib))
        else:
            result[attrib] = None
            print("Failed to assign attrib '%s' to %s in %s"%(
                attrib, type(result[attrib]), type(result)))

def equals (self, other):
# if (type(self.value) == str or type(self.value) == unicode) == (type(other) == str or type(other) == unicode):
# pass
# if type(self.value) != type(other):
# raise Exception("%s.equals() attempting to compare conflicting types: %s and %s"%(
# type(self), type(self.value), type(other)))
return self.value == other

    def matches_re (self, regex):
        if not self.value:
            raise Exception("Attempting to do regex match on null result")

        if type(self.value) == str or type(self.value) == unicode:
            return re.match(regex, self.value) is not None
        return bool(self.value.re(regex))  # Selector.re() returns a list, never None

    def contains (self, other):
        if type(self.value) == str or type(self.value) == unicode:
            return other in self.value
        # Selector objects have no contains(); test against the extracted text instead
        return other in self.value.extract()[0]

    def bind_re (self, regex, result, attrib):
        if self.value:
            try:
                value = self.value.extract()[0]
            except AttributeError:
                value = self.value

            match = re.match(regex, value)
            if not match:
                raise Exception("Failed to match regex '%s' against input %s"%(
                    regex, value))

            if type(attrib) == str or type(attrib) == unicode:
                result[attrib] = match.group(1)
            elif type(attrib) == tuple:
                for i, k in enumerate(attrib):
                    result[k] = match.group(i+1)
            else:
                raise Exception("Invalid argument passed to %s.bind_re(): %s %s"%(
                    type(self), type(attrib), attrib))
        else:
            result[attrib] = None
            print("Failed to assign attrib '%s' to %s in %s"%(
                attrib, type(result[attrib]), type(result)))

    def bind_re_map (self, regex, result, attrib, transform):
        if self.value:
            value = self.value if type(self.value) == str or type(self.value) == int or type(self.value) == unicode \
                else self.value.extract()[0]

            match = re.match(regex, value)
            if not match:
                raise Exception("Failed to match regex '%s' against input %s"%(
                    regex, value))

            if type(attrib) == str or type(attrib) == unicode:
                result[attrib] = transform(match.group(1))
            elif type(attrib) == tuple:
                for i, (k, f) in enumerate(zip(attrib, transform)):
                    result[k] = f(match.group(i+1))
            else:
                raise Exception("Invalid argument passed to %s.bind_re_map(): %s %s"%(
                    type(self), type(attrib), attrib))
        else:
            result[attrib] = None
            print("Failed to assign attrib '%s' to %s in %s"%(
                attrib, type(result[attrib]), type(result)))

def to_int (self):
if self.value:
return SelectorWrapper(int(self.value))
return self

def request_async_crawl (self, crawler=None, url=None):
assert(crawler is not None and url is not None)


def map_sequential_cases (self, selection=None, check='maybe', cases=None):
assert(check in set(('yes', 'no', 'maybe')))
assert(cases is not None)
assert(type(cases) == tuple)
assert(type(cases[0]) == tuple)
assert(type(cases[0][0]) == str)

do_check = check != 'no'
if not self.value:
for req, test, applicator in cases:
applicator(self)
else:
results = self.value.xpath(selection) if selection else self.value
i = 0
for item in results:
result = SelectorWrapper(item)
                if i >= len(cases):
                    print("More items than cases; ignoring the rest")
return
if do_check and not cases[i][1](result):
if cases[i][0] == 'required':
raise Exception("Failed map_sequential_cases case test (%d):\n%s"%(
i, result))
else:
cases[i][2](result)
i += 1
            if i < len(cases):
                print("Did not visit all cases")


def item_producer (Item):
    def decorator (fcn):
        def wrapper (self, request):
            result = Item()
            fcn(self, request, result)
            return result  # return the populated item so scrapy can collect it
        return wrapper
    return decorator

def parser_entrypoint (fcn):
def wrapper (self, request):
return fcn(self, SelectorWrapper(request))
return wrapper
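
To make the intended call pattern concrete, here is a minimal, hypothetical usage sketch (not part of this PR): a spider parse method chaining SelectorWrapper helpers through the item_producer and parser_entrypoint decorators. The spider name, start URL, and XPath expressions are illustrative assumptions, and the sketch presumes the Python 2 environment implied by the module's unicode checks.

# Hypothetical usage sketch -- assumed XPaths, spider name, and URL; for illustration only.
import scrapy
from ucsc.architecture import item_producer, parser_entrypoint
from ucsc.items import PisaCourseItem

class ExampleCourseSpider(scrapy.Spider):
    name = 'example_course'
    start_urls = ['https://pisa.ucsc.edu/class_search/']

    @parser_entrypoint              # hands parse() a SelectorWrapper instead of the raw response
    @item_producer(PisaCourseItem)  # creates the item and passes it in as 'result'
    def parse(self, response, result):
        # Each bind either stores a value or, on a missing selection, records None and prints a warning.
        response.xpath_stripped_text('//h2[@class="title"]').bind(result, 'course_title')
        response.xpath_stripped_text('//div[@id="enroll_max"]').to_int().bind(result, 'enroll_max')
        # bind_re splits a heading like "AMS 03 - 01" into two fields via capture groups.
        response.xpath_stripped_text('//div[@class="heading"]') \
            .bind_re(r'(\w+ \d+)\s*-\s*(\d+)', result, ('course_name', 'course_section'))

Stacked this way, the decorators keep item construction and response wrapping out of every parse method, so the method body is just a sequence of selections and binds.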
28 changes: 27 additions & 1 deletion crawlers/ucsc/ucsc/items.py
@@ -28,4 +28,30 @@ class PisaIndexItem(scrapy.Item):

class PisaCourseItem(scrapy.Item):
""" Encapsulates all the data visible from a class page; TBD """
raw_content = Field() # raw HTML; todo: parse this properly using xpath + regexes
url = Field() # url of class page, eg. "https://pisa.ucsc.edu/class_search/index.php/index.php?action=detail&class_data=YToyOntzOjU6IjpTVFJNIjtzOjQ6IjIxODgiO3M6MTA6IjpDTEFTU19OQlIiO3M6NToiMjE3MjMiO30%3D"
course_name = Field() # string, eg. "AMS 03"
course_title = Field() # string, eg. "Precalculus"
course_section = Field() # string, eg. "01"
class_number = Field() # int, eg. 21723
lecture_number = Field() # int, class_number of lecture component (or class_number)
instructor = Field() # string, eg. "Garaud,P."
class_type = Field() # "LEC", "LAB", or "SEM" (or "DISC"...?)
class_type_pretty = Field() # "Lecture", ...
location = Field() # string, eg. "Soc Sci 2 075"
meet_times = Field() # string, eg. "MWF 10:40AM-11:45AM"
enroll_max = Field() # int
enroll_current = Field() # int
materials_url = Field() # link to materials page, eg. "http://ucsc.verbacompare.com/comparison?id=FL18__AMS__003__01"
term = Field() # eg. "Fall 2018"
term_id = Field() # integer id used when searching via form
career_type = Field()
grading_options = Field()
credits = Field()
gen_ed_categories = Field()
waitlist_max = Field()
waitlist_current = Field()

course_description = Field() # Description text
enrollment_reqs = Field() # Enrollment text
class_notes = Field() # Class notes text
class_dates = Field()
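
As a readability aid, a PisaCourseItem populated with the example values quoted in the field comments above might look like the following sketch (illustrative only; the ucsc.items import path is assumed from this diff):

# Illustrative only: value shapes taken from the field comments above.
from ucsc.items import PisaCourseItem

item = PisaCourseItem(
    course_name='AMS 03',
    course_title='Precalculus',
    course_section='01',
    class_number=21723,
    instructor='Garaud,P.',
    class_type='LEC',
    class_type_pretty='Lecture',
    location='Soc Sci 2 075',
    meet_times='MWF 10:40AM-11:45AM',
    term='Fall 2018',
)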
10 changes: 6 additions & 4 deletions crawlers/ucsc/ucsc/settings.py
@@ -70,16 +70,18 @@

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
AUTOTHROTTLE_DEBUG = False
#CONCURRENT_REQUESTS_PER_IP
CONCURRENT_REQUESTS_PER_IP = 5

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings