Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions crawlers/d-crawler/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
.dub
docs.json
__dummy.html
docs/
crawler-demo.so
crawler-demo.dylib
crawler-demo.dll
crawler-demo.a
crawler-demo.lib
crawler-demo-test-*
*.exe
*.o
*.obj
*.lst
1 change: 1 addition & 0 deletions crawlers/d-crawler/course_list.json

Large diffs are not rendered by default.

71 changes: 71 additions & 0 deletions crawlers/d-crawler/course_regex_replacements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
// Detect unmatched text:
\n[^"][^\n]+\n

// Move a period that sits before closing parentheses/quotes to after them
\.([\)"]+)
$1\.

// Detect abbreviations
([A-Z][a-z]*\.\s*)+[a-z]
// then remove w/ result.replace(".","")

// Simple abbreviation remover:
([A-Z][a-z]*\.\s*)+([a-z])
$2

https?://[\w\d\-\/\.]+/([a-z]+).html
"dept"="$1"

\n(?:\[Return to top\])
N/A

\*\s+Not offered in \d+\-\d+\n
N/A

\nRevised:\s+(\d+/\d+/\d+)\n
\n"revision_date" = "$1"

\n\s*(\d+\-\d+\s+(?:General\s+)?Catalog)
\n"catalog_version" = "$1"

\n([A-Za-z\-\s]+)\s+Courses\n
\n"division"="$1"\n

\n(?:Department of (?:the\s+)?([A-Za-z]+(?:\s+[A-Za-z]+)*))?\s*([^\n]+)\n(?:(?:Faculty|Program Statement|[\s\w]*Courses?[\s\w]+)|\s*\|\s*)+\n
\n"department_title" = "$1"\n"contact_info"="$2"\n

\n(\d+[A-Z]?)\.\s+
\n"course_id" = "$1"\n

"\n([^\.]+)\.\s+
"\n"course_title" = "$1"\n

"\n([FWS](?:,[FWS])*|\*)\s+
"\n"course_terms" = "$1"\n

\s*((?:[A-Z](?:\.|[a-z]+,?)(?:\s+|\-))+[A-Z][a-z]*)\n
\n"course_instructor" = "$1"\n

\s*\(General Education Code\(s\):\s+([A-Z\-,\s]+)\)\.\n
\n"ge" = "$1"\n

\s*Offered in alternate academic years.\n
\n"offered_in_alt_academic_years" = "true"\n

\s*May be repeated for credit.\n
\n"is_repeatable_for_credit" = "true"\n

\s+Enrollment limited to (\d+)\.\n
\n"enroll_limit" = $1\n

\s*Enrollment (?:is )?restricted to ([^\.]+)\.\n
\n"enroll_restrict" = "$1"\n

Concurrent enrollment in ([^\.(?:is)]+)\s+(?:is required|required)\.\n
\n"concurrent_req" = "$1"\n

\s*Prerequisite\(?s?\)?:\s+([^\.]+)\.\s*\n
\n"prereqs" = "$1"\n

\n([^"][^\n]+)\n
\n"course_description" = "$1"\n
Binary file added crawlers/d-crawler/crawler-demo
Binary file not shown.
56,659 changes: 56,659 additions & 0 deletions crawlers/d-crawler/data.json

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions crawlers/d-crawler/dub.sdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
name "crawler-demo"
description "Prototyping a distributed crawler built on vibe.d"
authors "Seiji Emery"
copyright "Copyright © 2018, Seiji Emery"
license "MIT"
#dependency "vibe-d" version="~>0.8.4"
dependency "arsd-official:dom" version="~>2.1.1"
dependency "arsd-official:htmltotext" version="~>2.1.1"
dependency "jsonizer" version="~>0.7.6"
20 changes: 20 additions & 0 deletions crawlers/d-crawler/dub.selections.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"fileVersion": 1,
"versions": {
"arsd-official": "2.1.1",
"botan": "1.12.10",
"botan-math": "1.0.3",
"diet-ng": "1.5.0",
"eventcore": "0.8.35",
"jsonizer": "0.7.6",
"libasync": "0.8.3",
"libevent": "2.0.2+2.0.16",
"memutils": "0.4.11",
"mir-linux-kernel": "1.0.0",
"openssl": "1.1.6+1.0.1g",
"stdx-allocator": "2.77.2",
"taggedalgebraic": "0.10.11",
"vibe-core": "1.4.1",
"vibe-d": "0.8.4"
}
}
56,659 changes: 56,659 additions & 0 deletions crawlers/d-crawler/fubar.json

Large diffs are not rendered by default.

31,381 changes: 31,381 additions & 0 deletions crawlers/d-crawler/hand_processed_file.txt

Large diffs are not rendered by default.

3,151 changes: 3,151 additions & 0 deletions crawlers/d-crawler/raw_courses_html.txt

Large diffs are not rendered by default.

17,699 changes: 17,699 additions & 0 deletions crawlers/d-crawler/raw_courses_text.txt

Large diffs are not rendered by default.

72 changes: 72 additions & 0 deletions crawlers/d-crawler/source/app.d
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import std.stdio;
import std.net.curl: get, CurlException;
import std.format: format;
import std.exception: enforce;
import std.parallelism: parallel, defaultPoolThreads;
import std.getopt: getopt;
import std.string: toUpper, strip;
import std.regex: matchFirst, ctRegex;
import std.conv: parse;
import arsd.dom;
import arsd.htmltotext: htmlToText;
import course_data: CourseEntry;
import department_info: fetchDepartment, DepartmentInfo;
import core.sync.mutex;
import jsonizer;

// Lock taken by submit() around every write to `data`.
__gshared Mutex mutex;
// Collected department records, keyed by `departmentId` (filled by submit(),
// serialized at the end of main()). `__gshared` makes this a single
// process-wide table rather than a thread-local one.
__gshared DepartmentInfo[string] data;
// Module constructor: allocates the mutex once for the whole program.
shared static this () { mutex = new Mutex(); }

/// Records `dept` in the global `data` table under its `departmentId`.
/// The write happens while holding `mutex`; an existing entry with the
/// same id is overwritten.
void submit (DepartmentInfo dept) {
    auto key = dept.departmentId;
    synchronized (mutex) data[key] = dept;
}

// regex: \n\s+(\d+\w?)\.\s+([\w+\s+\-:,/\'\"]+)(?:\s+\((\d+)\s+credits?|no credit\))?\.(?:\s+([FWS\*,]+))?\s+(.+)
// replace: {\n\t"course_id": "$1",\n\t"course_title": "$2",\n\t"credit(s)": "$3",\n\t"offered term(s)": "$4",\n\t"description": "$5"\n},\n

/// Fetches one department from the 17-18 registrar catalog archive, logs how
/// many courses and faculty members were found, and hands the result to
/// submit().
///
/// Params:
///   dept = department identifier passed through to fetchDepartment
void processRegistrarCoursePage (string dept) {
    enum catalogRoot = "https://registrar.ucsc.edu/catalog/archive/17-18/programs-courses";

    writefln("Fetching data for dept '%s'", dept);
    auto info = fetchDepartment(catalogRoot, dept);

    auto courseCount = info.courses.length;
    auto facultyCount = info.faculty.length;
    writefln("%s course(s), %s faculty member(s)", courseCount, facultyCount);

    submit(info);
}

/// Entry point: crawls every known department page and writes the collected
/// data as JSON.
///
/// Command line options:
///   --parallel      fetch departments concurrently via std.parallelism
///   --nthreads=N    worker thread count used with --parallel (default 32)
///   -o FILE         output path (default "data.json")
void main(string[] args)
{
    import std.conv: to;
    import std.file: write;

    bool runParallel = false;
    // Previously this option was parsed but ignored (the pool size was
    // hard-coded to 32); the default is 32 so runs without --nthreads behave
    // exactly as before.
    size_t numThreads = 32;
    string outputFile = "data.json";
    args.getopt(
        "parallel", &runParallel,
        "nthreads", &numThreads,
        "o", &outputFile);

    // Start from empty raw dumps; they are appended to during the crawl.
    // NOTE(review): with the file-level imports this presumably resolves to
    // C's remove() re-exported by std.stdio, so a missing file is not an
    // error -- confirm before switching to std.file.remove, which throws.
    remove("raw_courses_html.txt");
    remove("raw_courses_text.txt");

    string[] depts = [
        "acen", "anth", "aplx", "art", "artg", "havc", "arts", "astr", "bioc", "eeb", "mcdb", "mcdb", "chem", "chin", "clst", "cogs", "clni", "clte", "cmmu", "cowl", "cres", "crwn", "danm", "eart", "east", "econ", "educ", "ams", "beng", "bme", "cmpm", "cmpe", "cmps", "ee", "engr", "tim", "envs", "fmst", "film", "fren", "germ", "gmst", "gree", "hebr", "his", "havc", "hisc", "humn", "ital", "itst", "japn", "jwst", "krsg", "laal", "lnst", "latn", "lals", "lgst", "ling", "lit", "ocea", "math", "merr", "metx", "musc", "oaks", "ocea", "phil", "pbs", "phye", "phys", "poli", "prtr", "port", "psyc", "punj", "qsex", "crsn", "reli", "russ", "scic", "sced", "socd", "socs", "socy", "sphs", "spst", "stev", "sust", "thea", "ucdc", "writ", "yidd"
    ];

    // The list above repeats a few ids ("mcdb", "havc", "ocea"). Results are
    // keyed by department id, so a repeat only costs a redundant download and
    // a duplicated chunk in the raw dump files. Drop repeats, keeping the
    // first occurrence so the crawl order is unchanged.
    bool[string] seen;
    string[] uniqueDepts;
    foreach (dept; depts) {
        if (dept !in seen) {
            seen[dept] = true;
            uniqueDepts ~= dept;
        }
    }

    if (runParallel) {
        // Must be set before the default task pool is first used.
        defaultPoolThreads = numThreads.to!uint;
        foreach (dept; parallel(uniqueDepts)) {
            processRegistrarCoursePage(dept);
        }
    } else {
        foreach (dept; uniqueDepts) {
            processRegistrarCoursePage(dept);
        }
    }

    write(outputFile, data.toJSONString);
}
51 changes: 51 additions & 0 deletions crawlers/d-crawler/source/course_data.d
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
module course_data;
import std.format: format;

/// Flat record describing a single course.
struct CourseEntry {
    string name;
    string title;
    int credits;
    string quartersOffered;
    string departmentTitle;
    string division;
    string rawDescription;
    string description;
    string instructor;
    string prereqs;
    string coreqs;
    bool gradOnly = false;
    bool requiresInstructorPermission = false;
    bool mayBeRepeatedForCredit = false;
    bool satisfiesAmericanHistoryReq = false;
    string enrollmentRestrictions;
    string geCategories;
    string courseAlias;
    int enrollLimit = 0;

    /**
     * Renders the entry as a JSON object, preceded by a newline and followed
     * by a comma so that several entries can be concatenated into a list.
     *
     * String fields are JSON-escaped (quotes, backslashes, control
     * characters), and no comma follows the last member, so the text between
     * the braces is valid JSON. As before, `credits` and the boolean flags
     * are written as quoted strings, `enroll_limit` as a bare number, and
     * `gradOnly` is not emitted.
     *
     * Returns: the JSON text for this entry.
     */
    string toString () {
        import std.json: JSONValue, JSONOptions;

        // Produces a quoted, escaped JSON string literal for `s`.
        // Slashes are left alone so URLs and dates stay readable.
        static string quoted (string s) {
            return JSONValue(s).toString(JSONOptions.doNotEscapeSlashes);
        }

        return format(`
{
"course_name": %s,
"course_title": %s,
"department": %s,
"credits": "%d",
"terms": %s,
"division": %s,
"instructor": %s,
"description": %s,
"prereqs": %s,
"coreqs": %s,
"enrollment_restrictions": %s,
"requires_instructor_permission": "%s",
"repeatable_for_credit": "%s",
"satisfies_american_history_and_institutions_req": "%s",
"alias": %s,
"ge_categories": %s,
"enroll_limit": %d,
"raw_description": %s
},`, quoted(name), quoted(title), quoted(departmentTitle), credits, quoted(quartersOffered),
            quoted(division), quoted(instructor), quoted(description),
            quoted(prereqs), quoted(coreqs), quoted(enrollmentRestrictions), requiresInstructorPermission,
            mayBeRepeatedForCredit, satisfiesAmericanHistoryReq, quoted(courseAlias), quoted(geCategories),
            enrollLimit, quoted(rawDescription));
    }
}
126 changes: 126 additions & 0 deletions crawlers/d-crawler/source/department_info/fetch_courses.d
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
module department_info.fetch_courses;
import department_info.model;
import department_info.parse_utils;
import util.fetch_html: fetchHtml;
import util.search_utils: childRange, regexMatch;
import std.stdio;
import std.regex;
import std.exception: enforce;
import std.string: strip, toLower;
import std.array: replace;
import arsd.dom;

/**
 * Consumes an optional "terms offered" marker from the front of `text`.
 *
 * The marker is either a comma-separated list of the letters F, W, S or a
 * single `*`, and it only counts when followed by whitespace or the end of
 * the text. Without that check, a description starting with e.g. "Students"
 * would lose its first letter to a bogus term "S". Any whitespace after the
 * marker (or leading whitespace when there is no marker) is consumed too.
 *
 * Params:
 *   text = remaining course text; advanced past whatever was consumed
 * Returns: the marker with commas removed ("F,W" -> "FW"), or an empty
 *          string when no marker is present.
 */
private string takeLeadingTerms (ref string text) {
    // Every part of the pattern is optional, so it always matches at offset 0.
    auto m = matchFirst(text, ctRegex!`^(?:([FWS](?:,[FWS])*|\*)(?=\s|$))?\s*`);
    text = m.post;
    return m[1].replace(",", "");
}

/**
 * Scrapes the course listing page at `dept.coursesUrl` and fills in
 * `dept.departmentName`, `dept.lastCourseRevisionDate` and `dept.courses`.
 *
 * The page content is split into sections by header; each section whose
 * header looks like "<Something> Courses" contributes its items as courses.
 * Every item is peeled apart front to back (course number, title, terms)
 * and then back to front (GE codes, instructor); whatever remains is stored
 * as the course description. Items that cannot be parsed are logged and
 * skipped.
 *
 * As a side effect the raw HTML and text of the content area are appended to
 * "raw_courses_html.txt" and "raw_courses_text.txt" in the working directory.
 * NOTE(review): these appends are not synchronized; if this runs on several
 * threads at once the dumps may interleave -- confirm whether that matters.
 *
 * Params:
 *   dept = department record providing `coursesUrl` / `departmentId`
 * Returns: the updated department record.
 * Throws: via `enforce` when the trailing instructor cannot be matched;
 *         NOTE(review): whether fetchHtml catches this and routes it to
 *         `dept.error` is not visible here -- confirm.
 */
DepartmentInfo fetchCourses (DepartmentInfo dept) {
    fetchHtml(dept.coursesUrl, dept.error, (Document document) {
        import std.file: append;

        auto main = document
            .requireSelector("body")
            .requireSelector("div[id=wrap]")
            .requireSelector("div[id=container]")
            .requireSelector("div[id=sprflt]")
            .requireSelector("div[id=main]");

        dept.departmentName = main
            .requireSelector("h1[id=title]")
            .innerText;

        auto content = main.requireSelector("div[class~=content]");

        // Keep raw copies of what was scraped, for offline inspection.
        append("raw_courses_html.txt", format("\n%s\n%s\n", dept.coursesUrl, content.innerHTML));
        append("raw_courses_text.txt", format("\n%s\n%s\n", dept.coursesUrl, content.innerText));

        auto sections = content.childRange
            .splitSectionsByHeaders;

        foreach (section, items; sections) {
            // Only "<Division> Courses" headers introduce course listings.
            // Each further word may be more than one character long (the
            // previous pattern only allowed single-character words there, so
            // "Upper Division Courses" was recorded as just "division").
            auto header = matchFirst(section, ctRegex!`([\w\-]+(?:\s+[\w\-]+)*)\s+Courses`);
            if (!header) {
                writefln("Non-matching section: '%s'", section);
                continue;
            }
            auto division = header[1].toLower;

            foreach (item; items) {
                auto text = item.innerText.strip();

                // Skip blanks and boilerplate lines.
                if (text == "" || matchFirst(text, ctRegex!`(\* Not offered in|\[Return to top\])`)) { continue; }

                // The "Revised: <date>" footer is page metadata, not a course.
                if (auto revised = matchFirst(text, ctRegex!`Revised:\s+([^\n]+)`)) {
                    dept.lastCourseRevisionDate = revised[1];
                    continue;
                }

                // Course number, e.g. "101." or "12A."
                auto courseNumber = matchFirst(text, ctRegex!`(\d+[A-Z]?)\.(?:\s+|$)`);
                if (!courseNumber) {
                    writefln("Could not match course number in '%s'", text);
                    continue;
                }

                string name = dept.departmentId ~ " " ~ courseNumber[1];
                text = courseNumber.post;

                // "U.S." would otherwise be mistaken for sentence ends by the
                // period-delimited patterns below.
                text = text.replace("U.S.", "US");

                string title, terms;
                if (text.length) {
                    // Title: everything up to the first period that is followed
                    // by whitespace or the end of the text.
                    // NOTE(review): the optional "(N units)" group can never
                    // take part in a match because the greedy `[^\.]+` before
                    // it already swallows that text, so any unit count stays
                    // inside the title. Left as is to keep titles unchanged.
                    if (auto titleMatch = matchFirst(text, ctRegex!`([^\.]+)(?:\s+\((\d+)\s+units?\))?\.(?:\s+|$)`)) {
                        title = titleMatch[1];
                        text = titleMatch.post;
                    }
                    // With or without a title, an optional terms marker follows.
                    terms = takeLeadingTerms(text);
                }

                // Cut "(General Education Code(s): ...)" out of the text.
                string geCodes = null;
                if (auto ge = matchFirst(text, ctRegex!(`\s+\(General Education Code\(s\):\s+([^\.\)]+)[\.\)]+`, "g"))) {
                    geCodes = ge[1];
                    text = ge.pre ~ ge.post;
                }

                string instructor = null;
                if (text.length) {
                    // The instructor is the last period-free run of text. The
                    // `\.["\)]?` prefix copes with sentences that close with a
                    // quote or parenthesis after the period, e.g. `(fubar.)`
                    // or `"Baz."`.
                    auto instructorMatch = matchFirst(text, ctRegex!`(?:\.["\)]?\s+|^)([^\.]+)\.?\s*$`);
                    enforce(instructorMatch, format("Could not match instructor in '%s'", text));
                    instructor = instructorMatch[1];
                    text = instructorMatch.pre;
                }

                // A repeated course id overwrites the earlier listing; log it
                // so the collision is visible.
                if (name in dept.courses) {
                    writefln("'%s' already exists in dept.courses!", name);
                }
                dept.courses[name] = DepartmentInfo.CourseListing(
                    name, title, division, terms, instructor, text, geCodes
                );
            }
        }
    });
    return dept;
}
Loading