
Initial commit

ejamesc committed Nov 27, 2011
0 parents commit 1fc93b5a2fe377560d040d60018f9209f5714f87
Showing with 1,920 additions and 0 deletions.
  1. +3 −0 .gitignore
  2. 0 cors/__init__.py
  3. +18 −0 cors/items.py
  4. +8 −0 cors/pipelines.py
  5. +15 −0 cors/settings.py
  6. +8 −0 cors/spiders/__init__.py
  7. +35 −0 cors/spiders/cors_spider.py
  8. +1,822 −0 cors_first_run.json
  9. +11 −0 scrapy.cfg
.gitignore
@@ -0,0 +1,3 @@
+.DS_Store
+*.pyc
+
cors/__init__.py
No changes.
cors/items.py
@@ -0,0 +1,18 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/topics/items.html
+
+from scrapy.item import Item, Field
+
+class CorsItem(Item):
+    # define the fields for your item here like:
+    code = Field()
+    desc = Field()
+    name = Field()
+    mc = Field()
+    lecture_time_table = Field()
+    exam = Field()
+    prerequisite = Field()
+    preclusion = Field()
+    workload = Field()
cors/pipelines.py
@@ -0,0 +1,8 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/topics/item-pipeline.html
+
+class CorsPipeline(object):
+    def process_item(self, item, spider):
+        return item
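
Note that the stub above simply passes items through, and the comment is a reminder that a pipeline only runs once it is listed in the ITEM_PIPELINES setting. That registration is not part of this commit; a minimal sketch, assuming the list-style setting used by Scrapy releases of this vintage, would be a single line in cors/settings.py:

ITEM_PIPELINES = ['cors.pipelines.CorsPipeline']  # not in this commit; hypothetical registration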
cors/settings.py
@@ -0,0 +1,15 @@
+# Scrapy settings for cors project
+#
+# For simplicity, this file contains only the most important settings by
+# default. All the other settings are documented here:
+#
+# http://doc.scrapy.org/topics/settings.html
+#
+
+BOT_NAME = 'cors'
+BOT_VERSION = '1.0'
+
+SPIDER_MODULES = ['cors.spiders']
+NEWSPIDER_MODULE = 'cors.spiders'
+USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
+
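
As a small worked example of the last assignment above: with BOT_NAME = 'cors' and BOT_VERSION = '1.0', the interpolation '%s/%s' % (BOT_NAME, BOT_VERSION) evaluates to 'cors/1.0', so the setting is equivalent to:

USER_AGENT = 'cors/1.0'  # what the string formatting above resolves to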
cors/spiders/__init__.py
@@ -0,0 +1,8 @@
+# This package will contain the spiders of your Scrapy project
+#
+# To create the first spider for your project use this command:
+#
+# scrapy genspider myspider myspider-domain.com
+#
+# For more info see:
+# http://doc.scrapy.org/topics/spiders.html
cors/spiders/cors_spider.py
@@ -0,0 +1,35 @@
+from scrapy.contrib.spiders import CrawlSpider, Rule
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.selector import HtmlXPathSelector
+
+from cors.items import CorsItem
+
+class CorsSpider(CrawlSpider):
+    name = "cors"
+    allowed_domains = ["nus.edu.sg"]
+    start_urls = [
+        "https://aces01.nus.edu.sg/cors/jsp/report/ModuleInfoListing.jsp",
+        "https://aces01.nus.edu.sg/cors/jsp/report/GEMInfoListing.jsp",
+        "https://aces01.nus.edu.sg/cors/jsp/report/SSMInfoListing.jsp"
+    ]
+
+    # follow links to individual module detail pages and parse each one
+    rules = (
+        Rule(SgmlLinkExtractor(allow=('ModuleDetailedInfo\.jsp', )), callback='parse_module'),
+    )
+
+    def parse_module(self, response):
+        hxs = HtmlXPathSelector(response)
+        # table holding the module details inside the page's "wrapper" element
+        module = hxs.select('id("wrapper")/table/tr[2]/td/table[1]/tr[3]/td/table')
+
+        item = CorsItem()
+
+        item['code'] = module.select('tr[position()=2]/td[position()=2]/text()').extract()
+        item['name'] = module.select('tr[position()=3]/td[position()=2]/text()').extract()
+        item['desc'] = module.select('tr[position()=4]/td[position()=2]/text()').extract()
+        item['mc'] = module.select('tr[position()=7]/td[position()=2]/text()').extract()
+        item['lecture_time_table'] = module.select('tr[position()=2]/td/div/table/tr/td/text()').extract()
+        item['exam'] = module.select('tr[position()=6]/td[position()=2]/text()').extract()
+        item['prerequisite'] = module.select('tr[position()=8]/td[position()=2]/text()').extract()
+        item['preclusion'] = module.select('tr[position()=9]/td[position()=2]/text()').extract()
+        item['workload'] = module.select('tr[position()=10]/td[position()=2]/text()').extract()
+        return item
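
The spider returns one CorsItem per module page, and the checked-in cors_first_run.json looks like the output of Scrapy's JSON feed export. The actual command isn't recorded anywhere in the commit; assuming the feed-export CLI of that era, it would have been something along the lines of:

scrapy crawl cors -o cors_first_run.json -t json  # hypothetical; the real invocation isn't in the commit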

cors_first_run.json
Large diffs are not rendered by default.

scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# http://doc.scrapy.org/topics/scrapyd.html
+
+[settings]
+default = cors.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = cors
