From e5d45caec66690a1ad6f5c89238c33ee59367577 Mon Sep 17 00:00:00 2001 From: pitekus Date: Sat, 23 Oct 2010 18:17:52 +0000 Subject: [PATCH] Fixed issue 2 which prevented morfette from compiling and/or working correctly on GHC-6.12. Moved several files out of src. Bumped version. --- src/INSTALL => INSTALL | 0 src/LICENSE => LICENSE | 0 src/Makefile => Makefile | 4 +- README | 118 ++++++++++++++++++++++++++- src/Setup.lhs => Setup.lhs | 0 src/morfette.cabal => morfette.cabal | 7 +- src/GramLab/Morfette/NGrams.hs | 46 ----------- src/GramLab/Morfette/Utils.hs | 13 ++- src/README | 117 -------------------------- 9 files changed, 134 insertions(+), 171 deletions(-) rename src/INSTALL => INSTALL (100%) rename src/LICENSE => LICENSE (100%) rename src/Makefile => Makefile (96%) mode change 120000 => 100644 README rename src/Setup.lhs => Setup.lhs (100%) rename src/morfette.cabal => morfette.cabal (90%) delete mode 100755 src/GramLab/Morfette/NGrams.hs delete mode 100644 src/README diff --git a/src/INSTALL b/INSTALL similarity index 100% rename from src/INSTALL rename to INSTALL diff --git a/src/LICENSE b/LICENSE similarity index 100% rename from src/LICENSE rename to LICENSE diff --git a/src/Makefile b/Makefile similarity index 96% rename from src/Makefile rename to Makefile index 07fd19a..dbdc9e6 100644 --- a/src/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ defaults: all #program name -MORFETTE=../src/dist/build/morfette/morfette +MORFETTE=dist/build/morfette/morfette #parameters for training and eval @@ -13,7 +13,7 @@ MORFETTE=../src/dist/build/morfette/morfette # for french TYPE can be either ftb4 or ftbmax TYPE=ftb4 -TRAINDATADIR=../DATA/ +TRAINDATADIR=DATA/ PREF=${TRAINDATADIR}/${TYPE} TRAINSET=${PREF}/ftb_1.pos.utf8.morpheteready DEVSET=${PREF}/ftb_2.pos.utf8.morpheteready diff --git a/README b/README deleted file mode 120000 index fece262..0000000 --- a/README +++ /dev/null @@ -1 +0,0 @@ -src/README \ No newline at end of file diff --git a/README b/README new file mode 
100644 index 0000000..a1976fd --- /dev/null +++ b/README @@ -0,0 +1,117 @@ +=INTRODUCTION= + +Morfette website: http://sites.google.com/site/morfetteweb/ + +Morfette is a tool for supervised learning of inflectional +morphology. Given a corpus of sentences annotated with lemmas +and morphological labels, and optionally a lexicon, morfette +learns how to morphologically analyse new sentences. + +In the learning stage Morfette fits two separate logistic regression +models: one for morphological tagging and one for lemmatization. The +predictions of the models are combined dynamically and produce a +globally plausible sequence of morphological-tag - lemma pairs for +a sentence. + +In Morfette lemmatization is cast as a classification task where a +lemmatization class corresponds to the specification of the edit +operations which are needed to transform the inflected word form into +the corresponding lemma. + +The basic approach is described in (Chrupala et al 2008 and Chrupala 2008). +The current version of Morfette uses an averaged perceptron to +fit the models, rather than Maximum Entropy training. The lemmatization +classes are Edit-Tree-based as described in (Chrupala 2008). + +=LICENSE= +The source code in the src directory is licensed under +the BSD license. + +=INSTALLATION= +Pre-built binaries are available from the project website. +If they don't work on your system you will +need to build from source, using the GHC Haskell compiler. Build +instructions are in [INSTALL] + +=USAGE= +Usage: morfette command [OPTION...] [ARG...] +train: train models +train [OPTION...] TRAIN-FILE MODEL-DIR + --dict-file=PATH path to optional dictionary + --language-configuration=es|pl|tr|.. language configuration + --class-entropy-prune-threshold=NUM class prune threshold + +predict: predict postags and lemmas using saved model data +predict [OPTION...] 
MODEL-DIR + --beam=+INT beam size to use + --tokenize tokenize input + +eval: evaluate morpho-tagging and lemmatization results +eval [OPTION...] TRAIN-FILE GOLD-FILE TEST-FILE + --ignore-case ignore case for evaluation + --baseline-file=PATH path to baseline results + --dict-file=PATH path to optional dictionary + --ignore-punctuation ignore punctuation for evaluation + --ignore-pos=POS-prefix ignore POS starting with POS-prefix for evaluation + + +=EXAMPLE USAGE= +To train a new model: +morfette train --dict-file=DICT TRAINING-FILE MODEL-DIR +RTS -K100m + +To use the model in MODEL-DIR to analyze new data: +morfette predict MODEL-DIR < TEST-DATA > ANALYZED-TEST-DATA + +=DATA FORMAT= +Morfette expects both training and testing data to be tokenized and +split into sentences. The format of training data looks like this: + +Gómez Gómez np0000p +sostiene sostener vmip3s0 +que que cs +la el da0fs0 +propuesta propuesta ncfs000 +no no rn +cambiará cambiar vmif3s0 +. . Fp + +La el da0fs0 +propuesta propuesta ncfs000 +será ser vsif3s0 +la el da0fs0 +misma mismo pi0fs000 + + +There is one token per line, with three columns separated by spaces or +tabs. The columns contain word form, lemma and morphological tag +respectively. Sentences are separated by an empty line. Text should be +encoded in UTF-8. + +Test data format is similar, except only the first column is needed: + +Gómez +sostiene +que +la +propuesta +no +cambiará +. + +La +propuesta +será +la +misma + + +=References= +[1] Grzegorz Chrupala, Georgiana Dinu and Josef van Genabith. 2008. + Learning Morphology with Morfette. In Proceedings of LREC 2008. + http://www.lrec-conf.org/proceedings/lrec2008/pdf/594_paper.pdf + +[2] Grzegorz Chrupala. 2008. Towards a Machine-Learning Architecture + for Lexical Functional Grammar Parsing. Chapter 6. PhD + dissertation, Dublin City + University. 
+ http://www.lsv.uni-saarland.de/personalPages/gchrupala/papers/phd.pdf diff --git a/src/Setup.lhs b/Setup.lhs similarity index 100% rename from src/Setup.lhs rename to Setup.lhs diff --git a/src/morfette.cabal b/morfette.cabal similarity index 90% rename from src/morfette.cabal rename to morfette.cabal index 5e4f6af..6649a1e 100644 --- a/src/morfette.cabal +++ b/morfette.cabal @@ -1,5 +1,5 @@ Name: morfette -Version: 0.3.1 +Version: 0.3.2 Homepage: http://sites.google.com/site/morfetteweb/ Synopsis: A tool for supervised learning of morphology Description: Morfette is a tool for supervised learning of inflectional @@ -15,6 +15,7 @@ category: Natural Language Processing Extra-source-files: README, INSTALL, Makefile Executable: morfette +hs-source-dirs: src Main-Is: Main.hs Other-Modules: GramLab.Commands, GramLab.Morfette.LZipper, GramLab.Data.StringLike, @@ -29,7 +30,7 @@ Other-Modules: GramLab.Commands, GramLab.Morfette.LZipper, GramLab.Morfette.Settings.Defaults, GramLab.Morfette.Features.Common, Lemma, POS, GramLab.Morfette.Utils -Build-depends: base >=3 && <=4 , containers, array, QuickCheck, mtl, +Build-depends: base >=3 && <=5 , containers, array, mtl, directory, filepath, haskell98, pretty, - utf8-string, bytestring, binary + utf8-string, bytestring, binary, QuickCheck >= 2.3 cc-options: -Wall diff --git a/src/GramLab/Morfette/NGrams.hs b/src/GramLab/Morfette/NGrams.hs deleted file mode 100755 index afbeddf..0000000 --- a/src/GramLab/Morfette/NGrams.hs +++ /dev/null @@ -1,46 +0,0 @@ ---module GramLab.Morfette.NGrams ( ) ---where - -import qualified Data.ByteString.Lazy.Char8 as S - ---import qualified Data.List as S -import Data.List -import Data.Char -import qualified GramLab.Data.MultiSet as MS -import qualified Data.Map as Map ---import GramLab.Utils -import GramLab.Morfette.LZipper -import Data.Binary -import System - -sepPunct xs = let (before,rest) = S.span isPunctuation xs - (this,after) = S.span (not . isPunctuation) rest - in filter (not . 
S.null) [before,this,after] - -tokenize = concatMap sepPunct . S.words - -toNGrams i z | atEnd z = [] -toNGrams i z = let Just w = focus z - in (w, flatten (take i (left z)) (take i (right z))):toNGrams i (slide z) - - -flatten l r = (l,r) --S.concat [S.unwords l,S.pack "_",S.unwords r] -main = do - [n] <- getArgs - txt <- S.getContents - let paragraphs = map S.unlines (splitWith S.null (S.lines txt)) - ngs = concatMap (\p -> toNGrams (read n) (fromList . tokenize $ txt)) paragraphs - flip mapM_ ngs $ \(w,(b,a)) -> S.putStrLn (S.unwords [w, S.unwords b, S.pack "__", S.unwords a]) - - -splitWith :: (a -> Bool) -> [a] -> [[a]] -splitWith f s = case dropWhile f s of - [] -> [] - s' -> w : splitWith f s'' - where (w, s'') = break f s' - -instance (Binary a, Ord a) => Binary (MS.MultiSet a) where - put ms = put (MS.toAscOccurList ms) - get = do - xs <- get - return $ MS.fromAscOccurList xs \ No newline at end of file diff --git a/src/GramLab/Morfette/Utils.hs b/src/GramLab/Morfette/Utils.hs index a87575a..f138689 100755 --- a/src/GramLab/Morfette/Utils.hs +++ b/src/GramLab/Morfette/Utils.hs @@ -7,8 +7,9 @@ module GramLab.Morfette.Utils ( train where import Prelude hiding (print,getContents,putStrLn,putStr ,writeFile,readFile) -import System.IO (stderr,stdout) -import System.IO.UTF8 +import System.IO (stderr,stdout,stdin,hSetBinaryMode) +import System.IO.UTF8 hiding (getContents,print,putStr,putStrLn) +import qualified System.IO.UTF8 as UTF8 import GramLab.Commands import qualified GramLab.Morfette.Models as Models import GramLab.Morfette.Models (Smth(..)) @@ -329,3 +330,11 @@ filterZip :: [Bool] -> [a] -> [a] filterZip xs ys = catMaybes $ zipWith (\b x -> if b then Just x else Nothing) xs ys +getContents :: IO String +getContents = hSetBinaryMode stdin True >> UTF8.getContents + +putStr :: String -> IO () +putStr s = hSetBinaryMode stdout True >> UTF8.putStr s + +putStrLn :: String -> IO () +putStrLn s = hSetBinaryMode stdout True >> UTF8.putStrLn s diff --git 
a/src/README b/src/README deleted file mode 100644 index a1976fd..0000000 --- a/src/README +++ /dev/null @@ -1,117 +0,0 @@ -=INTRODUCTION= - -Morfette website: http://sites.google.com/site/morfetteweb/ - -Morfette is a tool for supervised learning of inflectional -morphology. Given a corpus of sentences annotated with lemmas -and morphological labels, and optionally a lexicon, morfette -learns how to morphologically analyse new sentences. - -In the learning stage Morfette fits two separate logistic regression -models: one for morphological tagging and one for lemmatization. The -predictions of the models are combined dynamically and produce a -globally plausible sequence of morphological-tag - lemma pairs for -a sentence. - -In Morfette lemmatization is cast as a classification task where a -a lemmatization class corresponds to the specification of the edit -operations which are needed to transform the inflected word form into -the corresponding lemma. - -The basic approach is described in (Chrupala et al 2008 and Chrupala 2008). -The current version of Morfette uses an averaged perceptron to -fit the models, rather than Maximum Entropy training. The lemmatization -classes are Edit-Tree-based as described in (Chrupala 2008). - -=LICENSE= -The source code in the src directory is licensed under -the BSD license. - -=INSTALLATION= -Pre-built binaries are available from the project website. -If they don't work on your system you will -need to build from source, using the GHC Haskell compiler. Build -instructions are in [INSTALL] - -=USAGE= -Usage: morfette command [OPTION...] [ARG...] -train: train models -train [OPTION...] TRAIN-FILE MODEL-DIR - --dict-file=PATH path to optional dictionary - --language-configuration=es|pl|tr|.. language configuration - --class-entropy-prune-threshold=NUM class prune threshold - -predict: predict postags and lemmas using saved model data -predict [OPTION...] 
MODEL-DIR - --beam=+INT beam size to use - --tokenize tokenize input - -eval: evaluate morpho-tagging and lemmatization results -eval [OPTION...] TRAIN-FILE GOLD-FILE TEST-FILE - --ignore-case ignore case for evaluation - --baseline-file=PATH path to baseline results - --dict-file=PATH path to optional dictionary - --ignore-punctuation ignore punctuation for evaluation - --ignore-pos=POS-prefix ignore POS starting with POS-prefix for evaluation - - -=EXAMPLE USAGE= -To train a new model: -morfette train --dict-file=DICT TRAINING-FILE MODEL-DIR +RTS -K100m - -To use the model in MODEL-DIR to analyze new data: -morfette predict MODEL-DIR < TEST-DATA > ANALYZED-TEST-DATA - -=DATA FORMAT= -Morfette expects both training and testing data to be tokenized and -split into sentences. The format of training data look like this: - -Gómez Gómez np0000p -sostiene sostener vmip3s0 -que que cs -la el da0fs0 -propuesta propuesta ncfs000 -no no rn -cambiará cambiar vmif3s0 -. . Fp - -La el da0fs0 -propuesta propuesta ncfs000 -será ser vsif3s0 -la el da0fs0 -misma mismo pi0fs000 - - -There is one token per line, with three columns separated by spaces or -tabs. The columns contain word form, lemma and morphological tag -respectively. Sentences are separated by an empty line. Text should be -encoded in UTF-8. - -Test data format is similar, except only the first column is needed: - -Gómez -sostiene -que -la -propuesta -no -cambiará -. - -La -propuesta -será -la -misma - - -=References= -[1] Grzegorz Chrupala, Georgiana Dinu and Josef van Genabith. 2008. - Learning Morphology with Morfette. In Proceedings of LREC 2008. - http://www.lrec-conf.org/proceedings/lrec2008/pdf/594_paper.pdf - -[2] Grzegorz Chrupala. 2008. Towards a Machine-Learning Architecture - for Lexical Functional Grammar Parsing. Chapter 6. PhD - dissertation, Dublin City - University. - http://www.lsv.uni-saarland.de/personalPages/gchrupala/papers/phd.pdf