Permalink
Browse files

first commit

  • Loading branch information...
0 parents commit beeef0ccc7cdd34f1b2f4b3a2f2a06f30551b0bb @g000001 committed May 13, 2013
Showing with 144,779 additions and 0 deletions.
  1. +1 −0 .gitignore
  2. +15 −0 COPYRIGHT
  3. +2 −0 README
  4. +260 −0 analysis/analysis-protocol.lisp
  5. +275 −0 analysis/class-guesser.lisp
  6. +331 −0 analysis/fsa-tokenizer.lisp
  7. +122 −0 analysis/lexicon-protocol.lisp
  8. +225 −0 analysis/tag-analysis.lisp
  9. +97 −0 analysis/tag-basics.lisp
  10. +296 −0 analysis/tag-brown.lisp
  11. +76 −0 analysis/tag-english.lisp
  12. +299 −0 analysis/tag-trainer.lisp
  13. +358 −0 analysis/vector-lexicon.lisp
  14. +15 −0 cltl1-compat.lisp
  15. +208 −0 common-lisp/common-lisp-package.lisp
  16. +11 −0 common-lisp/common-lisp-precom.lisp
  17. +652 −0 common-lisp/common-lisp.lisp
  18. +826 −0 common-lisp/conditions.lisp
  19. +48 −0 common-lisp/ensure-pcl-loaded.lisp
  20. +46 −0 common-lisp/lucid-condition-patches.lisp
  21. +116 −0 corpus/corpus-protocol.lisp
  22. +1 −0 data/brown/brown.classes
  23. BIN data/brown/brown.hmm
  24. +58,683 −0 data/brown/lexicon.txt
  25. +536 −0 data/brown/suffix.trie
  26. +1 −0 data/moby-dick/brown.classes
  27. BIN data/moby-dick/brown.hmm
  28. +485 −0 fsa/fsa-basics.lisp
  29. +187 −0 fsa/fsa-calculus.lisp
  30. +27 −0 fsa/fsa-standard.lisp
  31. +74 −0 fsa/fsa-test.lisp
  32. +139 −0 fsa/list-sets.lisp
  33. +40 −0 fsa/skip-list-relations.lisp
  34. +17 −0 fsa/standard-states.lisp
  35. +90 −0 fsa/standard-symbols.lisp
  36. +1 −0 orig/data/brown/brown.classes
  37. BIN orig/data/brown/brown.hmm
  38. +58,683 −0 orig/data/brown/lexicon.txt
  39. +536 −0 orig/data/brown/suffix.trie
  40. +1 −0 orig/data/moby-dick/brown.classes
  41. BIN orig/data/moby-dick/brown.hmm
  42. BIN orig/doc/anlp92.ps
  43. BIN orig/doc/riao91.ps
  44. BIN orig/doc/tagger.ps
  45. BIN orig/src/.rsrc/analysis
  46. BIN orig/src/.rsrc/common-lisp
  47. BIN orig/src/.rsrc/corpus
  48. BIN orig/src/.rsrc/fsa
  49. BIN orig/src/.rsrc/pdefsys.doc
  50. BIN orig/src/.rsrc/pdefsys.lisp
  51. BIN orig/src/.rsrc/sysdcl
  52. BIN orig/src/.rsrc/util
  53. +260 −0 orig/src/analysis/analysis-protocol.lisp
  54. +275 −0 orig/src/analysis/class-guesser.lisp
  55. +330 −0 orig/src/analysis/fsa-tokenizer.lisp
  56. +117 −0 orig/src/analysis/lexicon-protocol.lisp
  57. +225 −0 orig/src/analysis/tag-analysis.lisp
  58. +97 −0 orig/src/analysis/tag-basics.lisp
  59. +296 −0 orig/src/analysis/tag-brown.lisp
  60. +76 −0 orig/src/analysis/tag-english.lisp
  61. +299 −0 orig/src/analysis/tag-trainer.lisp
  62. +358 −0 orig/src/analysis/vector-lexicon.lisp
  63. +208 −0 orig/src/common-lisp/common-lisp-package.lisp
  64. +11 −0 orig/src/common-lisp/common-lisp-precom.lisp
  65. +652 −0 orig/src/common-lisp/common-lisp.lisp
  66. +826 −0 orig/src/common-lisp/conditions.lisp
  67. +48 −0 orig/src/common-lisp/ensure-pcl-loaded.lisp
  68. +46 −0 orig/src/common-lisp/lucid-condition-patches.lisp
  69. +116 −0 orig/src/corpus/corpus-protocol.lisp
  70. +482 −0 orig/src/fsa/fsa-basics.lisp
  71. +187 −0 orig/src/fsa/fsa-calculus.lisp
  72. +27 −0 orig/src/fsa/fsa-standard.lisp
  73. +74 −0 orig/src/fsa/fsa-test.lisp
  74. +139 −0 orig/src/fsa/list-sets.lisp
  75. +40 −0 orig/src/fsa/skip-list-relations.lisp
  76. +17 −0 orig/src/fsa/standard-states.lisp
  77. +90 −0 orig/src/fsa/standard-symbols.lisp
  78. +303 −0 orig/src/pdefsys.doc
  79. +1,970 −0 orig/src/pdefsys.lisp
  80. +120 −0 orig/src/sysdcl/analysis-sysdcl.lisp
  81. +61 −0 orig/src/sysdcl/common-lisp-sysdcl.lisp
  82. +154 −0 orig/src/sysdcl/corpus-sysdcl.lisp
  83. +49 −0 orig/src/sysdcl/fsa-sysdcl.lisp
  84. +8 −0 orig/src/sysdcl/skip-list-sysdcl.lisp
  85. +74 −0 orig/src/sysdcl/tag-analysis-sysdcl.lisp
  86. +136 −0 orig/src/sysdcl/tdb-sysdcl-sysdcl.lisp
  87. +156 −0 orig/src/sysdcl/util-sysdcl.lisp
  88. +484 −0 orig/src/util/cl-extensions.lisp
  89. +129 −0 orig/src/util/cons-resource.lisp
  90. +60 −0 orig/src/util/float-vector.lisp
  91. +333 −0 orig/src/util/hmm-test.lisp
  92. +591 −0 orig/src/util/hmm.lisp
  93. +435 −0 orig/src/util/io-builtin.lisp
  94. +116 −0 orig/src/util/io-byte8-c.lisp
  95. +231 −0 orig/src/util/io-byte8.c
  96. +231 −0 orig/src/util/io-byte8.lisp
  97. +273 −0 orig/src/util/io-structs.lisp
  98. +71 −0 orig/src/util/skip-list-test.lisp
  99. +508 −0 orig/src/util/skip-list.lisp
  100. +108 −0 orig/src/util/ssb.lisp
  101. +86 −0 orig/src/util/string-resource.lisp
  102. +69 −0 orig/src/util/sv-resource.lisp
  103. +96 −0 orig/src/util/svb.lisp
  104. +261 −0 orig/src/util/trie.lisp
  105. +218 −0 orig/src/util/variable-storage.lisp
  106. +310 −0 orig/src/util/vector-resource.lisp
  107. +311 −0 package.lisp
  108. +303 −0 pdefsys.doc
  109. +1,970 −0 pdefsys.lisp
  110. +120 −0 sysdcl/analysis-sysdcl.lisp
  111. +61 −0 sysdcl/common-lisp-sysdcl.lisp
  112. +154 −0 sysdcl/corpus-sysdcl.lisp
  113. +49 −0 sysdcl/fsa-sysdcl.lisp
  114. +8 −0 sysdcl/skip-list-sysdcl.lisp
  115. +74 −0 sysdcl/tag-analysis-sysdcl.lisp
  116. +142 −0 sysdcl/tdb-sysdcl-sysdcl.lisp
  117. +158 −0 sysdcl/util-sysdcl.lisp
  118. +59 −0 tagger.asd
  119. +502 −0 util/cl-extensions.lisp
  120. +129 −0 util/cons-resource.lisp
  121. +60 −0 util/float-vector.lisp
  122. +333 −0 util/hmm-test.lisp
  123. +593 −0 util/hmm.lisp
  124. +435 −0 util/io-builtin.lisp
  125. +116 −0 util/io-byte8-c.lisp
  126. +231 −0 util/io-byte8.c
  127. +233 −0 util/io-byte8.lisp
  128. +274 −0 util/io-structs.lisp
  129. +71 −0 util/skip-list-test.lisp
  130. +513 −0 util/skip-list.lisp
  131. +108 −0 util/ssb.lisp
  132. +86 −0 util/string-resource.lisp
  133. +69 −0 util/sv-resource.lisp
  134. +96 −0 util/svb.lisp
  135. +261 −0 util/trie.lisp
  136. +233 −0 util/variable-storage.lisp
  137. +310 −0 util/vector-resource.lisp
@@ -0,0 +1 @@
+*.*~
@@ -0,0 +1,15 @@
+Copyright (c) 1991, 1992, 1993 Xerox Corporation. All Rights Reserved.
+
+Use, reproduction, and distribution of this software is permitted, but
+only for non-commercial research or educational purposes. Any copy of
+this software must include both the above copyright notice of Xerox
+Corporation and this paragraph. Any distribution of this software
+must comply with all applicable United States export control laws.
+This software is made available AS IS, and XEROX CORPORATION DISCLAIMS
+ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE, AND NOTWITHSTANDING ANY OTHER PROVISION CONTAINED HEREIN, ANY
+LIABILITY FOR DAMAGES RESULTING FROM THE SOFTWARE OR ITS USE IS
+EXPRESSLY DISCLAIMED, WHETHER ARISING IN CONTRACT, TORT (INCLUDING
+NEGLIGENCE) OR STRICT LIABILITY, EVEN IF XEROX CORPORATION IS ADVISED
+OF THE POSSIBILITY OF SUCH DAMAGES.
2 README
@@ -0,0 +1,2 @@
+This directory contains release 0.9 of the Xerox Part-of-Speech tagger.
+For more information, print the file doc/tagger/tagger.ps.
@@ -0,0 +1,260 @@
+;;;-*- Package: TDB; Syntax: Common-Lisp; Mode: Lisp; Base: 10 -*-
+
+;;; Copyright (c) 1988, 1989, 1990, 1991 by Xerox Corporation
+
+
+;;;; Token Protocol
+
+(cl:in-package :tdb)
+
+#|(eval-when (compile eval load)
+
+ (use-package :cl-extensions)
+ (use-package :ssb)
+ (use-package :string-resource)
+
+ (export '(analysis *analysis-directory*
+
+ ts char-ts char-stream ts-char-stream ts-doc-id
+ make-ts
+
+ token-filter
+ next-token token-start-pos token-end-pos
+ sentence-start-pos paragraph-start-pos
+ get-text-string ts-types
+
+ do-tokens
+
+ clean-text-string print-paragraph)))|#
+
+
+
+;;;; basics
+
+(defvar *analysis-directory* (tdb-dirpath "data" "english"))
+
+(defclass analysis () ())
+
+(defclass ts ()
+ ((doc-id :accessor ts-doc-id :initarg :doc-id)))
+
+(defclass char-ts (ts)
+ ((char-stream :accessor ts-char-stream :initarg :char-stream)))
+
+(defmethod shared-initialize :around ((stream char-ts) slots &rest rest
+ &key char-stream start end
+ &allow-other-keys)
+ (declare (dynamic-extent rest))
+ (cond
+ (char-stream
+ (if start ; default START to current position
+ (file-position char-stream start) ; reposition to START if provided
+ (setq start (file-position char-stream)))
+ (unless end ; default END to end of CHAR-STREAM
+ (setq end (cl-extensions::file-length char-stream)))
+ (apply #'call-next-method stream slots :start start :end end rest))
+ (t (call-next-method))))
+
+;;;; Analyzers must provide token streams through two means: (1) given a
+;;;; character stream, as from a query, and (2) given a document ID, from a
+;;;; mixed-in corpus. For plain-text corpora the latter is usually implemented
+;;;; in terms of the former. The default methods do this when
+;;;; MAKE-TS is defined to return a subclass of CHAR-TS.
+
+(defgeneric make-ts (char-stream analysis &key start end))
+
+;;; Token streams returned by MAKE-TS must also support
+;;; REINITIALIZE-INSTANCE with args :CHAR-STREAM :START and :END. It is
+;;; suggested that this be supported through methods on SHARED-INITIALIZE.
+
+
+;;;;
+;;;; Token streams are instances of classes defined as follows:
+;;;;
+;;;; (DEFCLASS <class> (<filter1> <filter2> ... <source>) ())
+;;;;
+
+(defclass token-filter () ())
+
+;;;; Sources & Filters must support the following methods:
+
+
+;;;; Tokenizers implement NEXT-TOKEN directly.
+;;;; Filters implement NEXT-TOKEN by processing the results of CALL-NEXT-METHOD.
+
+;;; Returns the next token from TS, or NIL if at end of stream.
+;;; Tokens are simple strings which are not elsewhere pointed at, i.e. clients
+;;; may alter them. (Use ALLOC-STRING, -COPY and -FREE).
+(defgeneric next-token (ts))
+
+;;; Tokenizers implement the following directly.
+;;; Filters must ensure these are maintained in the face of elisions and/or
+;;; buffering of tokens. Positions are assumed to be monotonically increasing.
+
+;;; Returns the position of the start of the last token returned by NEXT-TOKEN.
+;;; Undefined before the first token is read or after stream is repositioned,
+;;; i.e. only valid immediately after calls to NEXT-TOKEN.
+;;; Defined at eos to be the position at end of stream.
+(defgeneric token-start-pos (ts))
+
+;;; Returns the position of the end of the last token returned by NEXT-TOKEN.
+;;; Initially returns the position of the beginning of the text.
+;;; Undefined at eos.
+(defgeneric token-end-pos (ts))
+
+;;; Resets the position of a token stream. This should only be called with
+;;; positions previously returned by TOKEN-END-POSITION. Note that
+;;; SENTENCE-START-POS must also be restored when repositioning.
+(defgeneric (setf token-end-pos) (ts pos))
+
+
+;;; Returns the position of the beginning of the sentence containing the last
+;;; token returned by NEXT-TOKEN. Initially returns the beginning of the text.
+;;; Defined at eos to be the position at end of stream.
+(defgeneric sentence-start-pos (ts))
+
+;;; Must be called with appropriate position when stream is repositioned.
+(defgeneric (setf sentence-start-pos) (ts pos))
+
+;;; Returns the position of the beginning of the paragraph containing the last
+;;; token returned by NEXT-TOKEN. Initially returns the beginning of the text.
+;;; Defined at eos to be the position at end of stream.
+(defgeneric paragraph-start-pos (ts))
+
+;;; Must be called with appropriate position when stream is repositioned.
+(defgeneric (setf paragraph-start-pos) (ts pos))
+
+
+;;;; Tokenizers implement the following directly w/o filter intervention.
+
+;;; Returns a string containing the text between the named positions.
+(defgeneric get-text-string (start-pos end-pos ts))
+
+
+;;;; Protocol for typed token streams
+;;;; Types are keywords. The types :WORD and :SENT and :PARA are reserved.
+
+;;; Returns the list of types that may be seen.
+(defgeneric ts-types (ts))
+
+
+
+;;; DO-TOKENS: a handy macro
+
+(defmacro do-tokens ((vars ts &optional value) &body body)
+ (let ((ts-var (gensym "TS"))
+ (vars (if (consp vars) vars (list vars))))
+ `(let ((,ts-var ,ts))
+ (block do-tokens
+ (loop (multiple-value-bind ,vars (next-token ,ts-var)
+ (unless ,(car vars) (return ,value))
+ ,@body))))))
+
+
+;;; the following two functions properly belong elsewhere...
+
+(defun clean-text-string (string
+ &key (end (length string)) (ssb (make-ssb)) endsp)
+;;; Replaces sequences of whitespace characters with a single space. Unless
+;;; ENDSP is true leading & trailing whitespace is eliminated too.
+ (declare (simple-string string) (type byte28 end))
+ (check-type string simple-string)
+ (setf (ssb-pointer ssb) 0)
+ (let ((last-char-was-whitespace-p (not endsp)))
+ (dotimes (i end)
+ (declare (type byte28 i))
+ (let ((char (schar string i)))
+ (case char
+ ((#\space #\tab #\linefeed #\return #\page)
+ (unless last-char-was-whitespace-p
+ (ssb-push-extend #\space ssb)
+ (setq last-char-was-whitespace-p t)))
+ (t (ssb-push-extend char ssb)
+ (setq last-char-was-whitespace-p nil)))))
+ (when (and last-char-was-whitespace-p
+ (not endsp)
+ (not (zerop (ssb-pointer ssb))))
+ (decf (ssb-pointer ssb)))
+ (simple-string-copy (ssb-buffer ssb) (ssb-pointer ssb))))
+
+(defvar *line-width* 79)
+
+(defun print-paragraph (string indent stream)
+;;; Breaks lines in STRING, printing to STREAM. Lines (except the first) are
+;;; prefixed with INDENT. Returns the number of lines printed.
+ (let ((line-count 0)
+ (column indent)
+ (line-start 0)
+ (last-space nil)
+ (limit (length string))
+ (pos 0))
+ (loop
+ (when (= pos limit)
+ (when (or (zerop limit) (/= line-start limit))
+ (format stream "~A~%" (subseq string line-start limit))
+ (incf line-count))
+ (return line-count))
+ (let ((char (schar string pos)))
+ (when (char= char #\space)
+ (setf last-space pos))
+ (incf pos) (incf column)
+ (when (= column *line-width*)
+ (format stream "~A~%~VA"
+ (subseq string line-start (or last-space pos))
+ indent "")
+ (incf line-count)
+ (setq line-start (if last-space (1+ last-space) pos)
+ column (+ indent (- pos line-start))))))))
+
+
+
+
+#|
+
+(defmethod next-sentence ((stream ts))
+;;; Returns a list of the tokens in the next sentence on STREAM.
+ (let ((sentence-start (sentence-start-pos stream))
+ (last-token-end (token-end-pos stream))
+ (tokens ()))
+ (declare (type byte28 sentence-start last-token-end) (list tokens))
+ (do-tokens (token stream)
+ (unless (= (sentence-start-pos stream) sentence-start)
+ ;; back up one token to the begining of the new sentence
+ (setf (token-end-pos stream) last-token-end)
+ ;; SENTENCE-START-POS is ok where it is
+ (return))
+ (%push token tokens)
+ (setq last-token-end (token-end-pos stream)))
+ (nreverse tokens)))
+
+(defmethod next-paragraph ((stream ts))
+;;; Returns a list of the sentences in the next paragraph on STREAM.
+ (let ((paragraph-start (paragraph-start-pos stream))
+ (sentences ()))
+ (declare (type byte28 paragraph-start) (list sentences))
+ (loop
+ (let ((sentence (next-sentence stream)))
+ (unless sentence (return))
+ (%push sentence sentences)
+ (unless (= (paragraph-start-pos stream) paragraph-start)
+ (return))))
+ (nreverse sentences)))
+
+(defun stems-byte-offsets (function tokens char-stream tdb &key start end)
+;;; Calls (FUNCTION <token> <start pos> <length>) for occurence of a token in
+;;; TOKENS in STREAM between START and END.
+ (let ((table (make-hash-table :test 'equal :size (length tokens))))
+ (dolist (token tokens)
+ (setf (gethash token table) token))
+ (let ((stream (make-ts char-stream (tdb-corpus tdb))))
+ (do-tokens (token stream)
+ (let ((start-pos (token-start-pos stream)))
+ (declare (type byte28 start-pos))
+ (when (and end (> start-pos (the byte28 end)))
+ (return))
+ (when (and (gethash token table)
+ (or (null start) (>= start-pos (the byte28 start))))
+ (funcall function (gethash token table)
+ start-pos (token-end-pos stream))))))))
+
+|#
Oops, something went wrong.

0 comments on commit beeef0c

Please sign in to comment.