From e5d45caec66690a1ad6f5c89238c33ee59367577 Mon Sep 17 00:00:00 2001
From: pitekus <pitekus@a7a69d2c-e1b7-11de-9e61-43d41527e390>
Date: Sat, 23 Oct 2010 18:17:52 +0000
Subject: [PATCH] Fixed issue 2 which prevented morfette from compiling and/or
 working correctly on GHC-6.12. Moved several files out of src. Bumped
 version.

---
 src/INSTALL => INSTALL               |   0
 src/LICENSE => LICENSE               |   0
 src/Makefile => Makefile             |   4 +-
 README                               | 118 ++++++++++++++++++++++++++-
 src/Setup.lhs => Setup.lhs           |   0
 src/morfette.cabal => morfette.cabal |   7 +-
 src/GramLab/Morfette/NGrams.hs       |  46 -----------
 src/GramLab/Morfette/Utils.hs        |  13 ++-
 src/README                           | 117 --------------------------
 9 files changed, 134 insertions(+), 171 deletions(-)
 rename src/INSTALL => INSTALL (100%)
 rename src/LICENSE => LICENSE (100%)
 rename src/Makefile => Makefile (96%)
 mode change 120000 => 100644 README
 rename src/Setup.lhs => Setup.lhs (100%)
 rename src/morfette.cabal => morfette.cabal (90%)
 delete mode 100755 src/GramLab/Morfette/NGrams.hs
 delete mode 100644 src/README

diff --git a/src/INSTALL b/INSTALL
similarity index 100%
rename from src/INSTALL
rename to INSTALL
diff --git a/src/LICENSE b/LICENSE
similarity index 100%
rename from src/LICENSE
rename to LICENSE
diff --git a/src/Makefile b/Makefile
similarity index 96%
rename from src/Makefile
rename to Makefile
index 07fd19a..dbdc9e6 100644
--- a/src/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@ defaults: all
 
 
 #program name
-MORFETTE=../src/dist/build/morfette/morfette
+MORFETTE=dist/build/morfette/morfette
 
 
 #parameters for training and eval
@@ -13,7 +13,7 @@ MORFETTE=../src/dist/build/morfette/morfette
 # for french TYPE can be either ftb4 or ftbmax
 
 TYPE=ftb4
-TRAINDATADIR=../DATA/
+TRAINDATADIR=DATA/
 PREF=${TRAINDATADIR}/${TYPE}
 TRAINSET=${PREF}/ftb_1.pos.utf8.morpheteready
 DEVSET=${PREF}/ftb_2.pos.utf8.morpheteready
diff --git a/README b/README
deleted file mode 120000
index fece262..0000000
--- a/README
+++ /dev/null
@@ -1 +0,0 @@
-src/README
\ No newline at end of file
diff --git a/README b/README
new file mode 100644
index 0000000..a1976fd
--- /dev/null
+++ b/README
@@ -0,0 +1,117 @@
+=INTRODUCTION=
+
+Morfette website: http://sites.google.com/site/morfetteweb/
+
+Morfette is a tool for supervised learning of inflectional
+morphology. Given a corpus of sentences annotated with lemmas 
+and morphological labels, and optionally a lexicon, morfette 
+learns how to morphologically analyse new sentences. 
+
+In the learning stage Morfette fits two separate logistic regression
+models: one for morphological tagging and one for lemmatization. The
+predictions of the models are combined dynamically and produce a 
+globally plausible sequence of morphological-tag - lemma pairs for 
+a sentence.
+
+In Morfette lemmatization is cast as a classification task where
+a lemmatization class corresponds to the specification of the edit 
+operations which are needed to transform the inflected word form into
+the corresponding lemma.
+
+The basic approach is described in (Chrupala et al 2008 and Chrupala 2008). 
+The current version of Morfette uses an averaged perceptron to 
+fit the models, rather than Maximum Entropy training. The lemmatization 
+classes are Edit-Tree-based as described in (Chrupala 2008).
+
+=LICENSE= 
+The source code in the src directory is licensed under
+the BSD license.
+
+=INSTALLATION=
+Pre-built binaries are available from the project website. 
+If they don't work on your system you will
+need to build from source, using the GHC Haskell compiler. Build
+instructions are in [INSTALL]
+
+=USAGE=
+Usage: morfette command [OPTION...] [ARG...]
+train:    train models
+train [OPTION...] TRAIN-FILE MODEL-DIR 
+    --dict-file=PATH                      path to optional dictionary
+    --language-configuration=es|pl|tr|..  language configuration
+    --class-entropy-prune-threshold=NUM   class prune threshold
+
+predict:  predict postags and lemmas using saved model data
+predict [OPTION...] MODEL-DIR 
+    --beam=+INT  beam size to use
+    --tokenize   tokenize input
+
+eval:     evaluate morpho-tagging and lemmatization results
+eval [OPTION...] TRAIN-FILE GOLD-FILE TEST-FILE 
+    --ignore-case            ignore case for evaluation
+    --baseline-file=PATH     path to baseline results
+    --dict-file=PATH         path to optional dictionary
+    --ignore-punctuation     ignore punctuation for evaluation
+    --ignore-pos=POS-prefix  ignore POS starting with POS-prefix for evaluation
+
+
+=EXAMPLE USAGE=
+To train a new model:
+morfette train --dict-file=DICT TRAINING-FILE MODEL-DIR +RTS -K100m
+
+To use the model in MODEL-DIR to analyze new data:
+morfette predict MODEL-DIR < TEST-DATA > ANALYZED-TEST-DATA
+
+=DATA FORMAT=
+Morfette expects both training and testing data to be tokenized and
+split into sentences. The format of training data looks like this:
+
+Gómez Gómez np0000p
+sostiene sostener vmip3s0
+que que cs
+la el da0fs0
+propuesta propuesta ncfs000
+no no rn
+cambiará cambiar vmif3s0
+. . Fp
+
+La el da0fs0
+propuesta propuesta ncfs000
+será ser vsif3s0
+la el da0fs0
+misma mismo pi0fs000
+
+
+There is one token per line, with three columns separated by spaces or
+tabs. The columns contain word form, lemma and morphological tag
+respectively. Sentences are separated by an empty line. Text should be
+encoded in UTF-8.
+
+Test data format is similar, except only the first column is needed:
+
+Gómez
+sostiene
+que
+la
+propuesta
+no
+cambiará
+.
+
+La
+propuesta
+será
+la
+misma
+
+
+=References=
+[1] Grzegorz Chrupala, Georgiana Dinu and Josef van Genabith. 2008.
+    Learning Morphology with Morfette. In Proceedings of LREC 2008.
+    http://www.lrec-conf.org/proceedings/lrec2008/pdf/594_paper.pdf
+
+[2] Grzegorz Chrupala. 2008. Towards a Machine-Learning Architecture
+    for Lexical Functional Grammar Parsing. Chapter 6. PhD
+    dissertation, Dublin City
+    University. 
+    http://www.lsv.uni-saarland.de/personalPages/gchrupala/papers/phd.pdf
diff --git a/src/Setup.lhs b/Setup.lhs
similarity index 100%
rename from src/Setup.lhs
rename to Setup.lhs
diff --git a/src/morfette.cabal b/morfette.cabal
similarity index 90%
rename from src/morfette.cabal
rename to morfette.cabal
index 5e4f6af..6649a1e 100644
--- a/src/morfette.cabal
+++ b/morfette.cabal
@@ -1,5 +1,5 @@
 Name:		morfette
-Version:	0.3.1
+Version:	0.3.2
 Homepage:	http://sites.google.com/site/morfetteweb/
 Synopsis:	A tool for supervised learning of morphology
 Description:	Morfette is a tool for supervised learning of inflectional
@@ -15,6 +15,7 @@ category:       Natural Language Processing
 Extra-source-files: README, INSTALL, Makefile
 
 Executable:     morfette
+hs-source-dirs: src
 Main-Is:        Main.hs
 Other-Modules:  GramLab.Commands, GramLab.Morfette.LZipper, 
 		GramLab.Data.StringLike,
@@ -29,7 +30,7 @@ Other-Modules:  GramLab.Commands, GramLab.Morfette.LZipper,
 		GramLab.Morfette.Settings.Defaults, 
 		GramLab.Morfette.Features.Common, 
 		Lemma, POS, GramLab.Morfette.Utils
-Build-depends:	base >=3 && <=4 , containers, array, QuickCheck, mtl, 
+Build-depends:	base >=3 && <=5 , containers, array, mtl, 
 		directory, filepath, haskell98, pretty,
-		utf8-string, bytestring, binary
+		utf8-string, bytestring, binary, QuickCheck >= 2.3
 cc-options:     -Wall 
diff --git a/src/GramLab/Morfette/NGrams.hs b/src/GramLab/Morfette/NGrams.hs
deleted file mode 100755
index afbeddf..0000000
--- a/src/GramLab/Morfette/NGrams.hs
+++ /dev/null
@@ -1,46 +0,0 @@
---module GramLab.Morfette.NGrams ( )
---where
-
-import qualified Data.ByteString.Lazy.Char8 as S
-
---import qualified Data.List as S
-import Data.List
-import Data.Char
-import qualified GramLab.Data.MultiSet as MS
-import qualified Data.Map as Map
---import GramLab.Utils
-import GramLab.Morfette.LZipper
-import Data.Binary
-import System 
-
-sepPunct xs = let (before,rest) = S.span isPunctuation xs
-                  (this,after)  = S.span (not . isPunctuation) rest
-              in  filter (not . S.null) [before,this,after]
-
-tokenize = concatMap sepPunct . S.words
-
-toNGrams i z | atEnd z = []
-toNGrams i z = let Just w = focus z 
-               in (w, flatten (take i (left z)) (take i (right z))):toNGrams i (slide z)
-
-
-flatten l r = (l,r) --S.concat [S.unwords l,S.pack "_",S.unwords r]
-main = do
-  [n] <- getArgs
-  txt <- S.getContents
-  let paragraphs = map S.unlines (splitWith S.null (S.lines txt))
-      ngs = concatMap (\p -> toNGrams (read n) (fromList . tokenize $ txt)) paragraphs
-  flip mapM_ ngs $ \(w,(b,a)) -> S.putStrLn (S.unwords [w, S.unwords b, S.pack "__", S.unwords a])
-  
-
-splitWith ::  (a -> Bool) -> [a] -> [[a]]
-splitWith f s =  case dropWhile f s of
-                   [] -> []
-                   s' -> w : splitWith f s''
-                       where (w, s'') = break f s'
-
-instance (Binary a, Ord a) => Binary (MS.MultiSet a) where
-    put ms = put (MS.toAscOccurList ms)
-    get = do
-      xs <- get
-      return $ MS.fromAscOccurList xs
\ No newline at end of file
diff --git a/src/GramLab/Morfette/Utils.hs b/src/GramLab/Morfette/Utils.hs
index a87575a..f138689 100755
--- a/src/GramLab/Morfette/Utils.hs
+++ b/src/GramLab/Morfette/Utils.hs
@@ -7,8 +7,9 @@ module GramLab.Morfette.Utils ( train
 where
 import Prelude hiding (print,getContents,putStrLn,putStr
                       ,writeFile,readFile)
-import System.IO (stderr,stdout)
-import System.IO.UTF8
+import System.IO (stderr,stdout,stdin,hSetBinaryMode)
+import System.IO.UTF8 hiding (getContents,print,putStr,putStrLn)
+import qualified System.IO.UTF8 as UTF8
 import GramLab.Commands
 import qualified GramLab.Morfette.Models as Models
 import GramLab.Morfette.Models (Smth(..))
@@ -329,3 +330,11 @@ filterZip :: [Bool] -> [a] -> [a]
 filterZip xs ys = catMaybes $ zipWith (\b x -> if b then Just x else Nothing) xs ys
     
 
+getContents :: IO String -- local replacement for the hidden Prelude / System.IO.UTF8 getContents
+getContents = hSetBinaryMode stdin True >> UTF8.getContents -- set stdin to binary mode, then read via UTF8.getContents; NOTE(review): presumably avoids GHC 6.12 handle-level re-decoding of the UTF-8 bytes; confirm
+              
+putStr :: String -> IO () -- local replacement for the hidden Prelude / System.IO.UTF8 putStr
+putStr s = hSetBinaryMode stdout True >> UTF8.putStr s -- set stdout to binary mode (repeated on every call), then write s via UTF8.putStr
+
+putStrLn :: String -> IO () -- local replacement for the hidden Prelude / System.IO.UTF8 putStrLn
+putStrLn s = hSetBinaryMode stdout True >> UTF8.putStrLn s -- set stdout to binary mode (repeated on every call), then delegate to UTF8.putStrLn
diff --git a/src/README b/src/README
deleted file mode 100644
index a1976fd..0000000
--- a/src/README
+++ /dev/null
@@ -1,117 +0,0 @@
-=INTRODUCTION=
-
-Morfette website: http://sites.google.com/site/morfetteweb/
-
-Morfette is a tool for supervised learning of inflectional
-morphology. Given a corpus of sentences annotated with lemmas 
-and morphological labels, and optionally a lexicon, morfette 
-learns how to morphologically analyse new sentences. 
-
-In the learning stage Morfette fits two separate logistic regression
-models: one for morphological tagging and one for lemmatization. The
-predictions of the models are combined dynamically and produce a 
-globally plausible sequence of morphological-tag - lemma pairs for 
-a sentence.
-
-In Morfette lemmatization is cast as a classification task where a 
-a lemmatization class corresponds to the specification of the edit 
-operations which are needed to transform the inflected word form into
-the corresponding lemma.
-
-The basic approach is described in (Chrupala et al 2008 and Chrupala 2008). 
-The current version of Morfette uses an averaged perceptron to 
-fit the models, rather than Maximum Entropy training. The lemmatization 
-classes are Edit-Tree-based as described in (Chrupala 2008).
-
-=LICENSE= 
-The source code in the src directory is licensed under
-the BSD license.
-
-=INSTALLATION=
-Pre-built binaries are available from the project website. 
-If they don't work on your system you will
-need to build from source, using the GHC Haskell compiler. Build
-instructions are in [INSTALL]
-
-=USAGE=
-Usage: morfette command [OPTION...] [ARG...]
-train:    train models
-train [OPTION...] TRAIN-FILE MODEL-DIR 
-    --dict-file=PATH                      path to optional dictionary
-    --language-configuration=es|pl|tr|..  language configuration
-    --class-entropy-prune-threshold=NUM   class prune threshold
-
-predict:  predict postags and lemmas using saved model data
-predict [OPTION...] MODEL-DIR 
-    --beam=+INT  beam size to use
-    --tokenize   tokenize input
-
-eval:     evaluate morpho-tagging and lemmatization results
-eval [OPTION...] TRAIN-FILE GOLD-FILE TEST-FILE 
-    --ignore-case            ignore case for evaluation
-    --baseline-file=PATH     path to baseline results
-    --dict-file=PATH         path to optional dictionary
-    --ignore-punctuation     ignore punctuation for evaluation
-    --ignore-pos=POS-prefix  ignore POS starting with POS-prefix for evaluation
-
-
-=EXAMPLE USAGE=
-To train a new model:
-morfette train --dict-file=DICT TRAINING-FILE MODEL-DIR +RTS -K100m
-
-To use the model in MODEL-DIR to analyze new data:
-morfette predict MODEL-DIR < TEST-DATA > ANALYZED-TEST-DATA
-
-=DATA FORMAT=
-Morfette expects both training and testing data to be tokenized and
-split into sentences. The format of training data look like this:
-
-Gómez Gómez np0000p
-sostiene sostener vmip3s0
-que que cs
-la el da0fs0
-propuesta propuesta ncfs000
-no no rn
-cambiará cambiar vmif3s0
-. . Fp
-
-La el da0fs0
-propuesta propuesta ncfs000
-será ser vsif3s0
-la el da0fs0
-misma mismo pi0fs000
-
-
-There is one token per line, with three columns separated by spaces or
-tabs. The columns contain word form, lemma and morphological tag
-respectively. Sentences are separated by an empty line. Text should be
-encoded in UTF-8.
-
-Test data format is similar, except only the first column is needed:
-
-Gómez
-sostiene
-que
-la
-propuesta
-no
-cambiará
-.
-
-La
-propuesta
-será
-la
-misma
-
-
-=References=
-[1] Grzegorz Chrupala, Georgiana Dinu and Josef van Genabith. 2008.
-    Learning Morphology with Morfette. In Proceedings of LREC 2008.
-    http://www.lrec-conf.org/proceedings/lrec2008/pdf/594_paper.pdf
-
-[2] Grzegorz Chrupala. 2008. Towards a Machine-Learning Architecture
-    for Lexical Functional Grammar Parsing. Chapter 6. PhD
-    dissertation, Dublin City
-    University. 
-    http://www.lsv.uni-saarland.de/personalPages/gchrupala/papers/phd.pdf