Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Initial commit.

  • Loading branch information...
commit 27a78c9988385ef01e2fa3c73c761aac4986af9f 0 parents
@drbunsen authored
Showing with 175 additions and 0 deletions.
  1. +77 −0 README.md
  2. BIN  describe
  3. +77 −0 describe.hs
  4. +21 −0 license.txt
77 README.md
@@ -0,0 +1,77 @@
+`describe`: an exploratory data analysis tool
+=============================================
+
+### Description:
+
+`describe` is a simple command line utility written in Haskell for calculating common descriptive statistics on numerical data.
+
+### Installation:
+
+To install `describe`, follow these steps:
+
+1. Download the Haskell Platform.
+
+ * On a Mac:
+ `$ brew install haskell-platform`
+ * On Linux:
+ `$ sudo apt-get install ghc ghc-prof cabal-install`
+
+2. After successful installation, run the following commands to install the appropriate dependencies:
+
+ `$ cabal update`
+ `$ cabal install cabal-install`
+ `$ cabal install statistics`
+
+3. Next, compile `describe`:
+
+ `$ ghc -O2 describe.hs`
+
+4. Place describe in your path:
+
+ `$ mv describe ~/bin/`
+
+5. Test that describe is working:
+
+ `$ echo "1\n2\n3\n4" | describe`
+
+### Usage:
+
+`describe` accepts numerical data in column form, i.e. delimited by newline characters.
+
+As an example, consider the following data file with two columns of comma-separated values:
+
+```
+1e-10,3
+1e-10,6
+2.2345,2
+3.4569,1
+1e3,5
+```
+
+To analyze the first column (assuming the data above is saved as `data.csv`):
+
+`$ cut -f1 -d"," data.csv | describe | column -t`
+
+This will print the following summary to the screen:
+
+```
+Length : 5
+Min : 1.0e-10
+Max : 1000.0
+Range : 1000.0000
+Q1 : 0.0000
+Q2 : 2.2345
+Q3 : 335.6379
+IQR : 335.6379
+Trimean : 85.0267
+Midhinge : 167.8190
+Mean : 201.1383
+Mode : 1.0e-10
+Kurtosis : 0.2499
+Skewness : 1.5000
+Entropy : 1.9219
+```
+
+Further information on how to work with rows and multiple columns can be found [in this blog post][1].
+
+[1]: http://www.drbunsen.org/explorations-in-unix.html#describe
BIN  describe
Binary file not shown
77 describe.hs
@@ -0,0 +1,77 @@
+import qualified System.Environment as SE
+import qualified Data.ByteString.Lazy.Char8 as LC
+import qualified Data.ByteString.Char8 as BC
+import qualified Data.List as DL
+import qualified Data.Map as DM
+import qualified Data.Vector as DV
+import qualified Statistics.Sample as SS
+import qualified Statistics.Quantile as SQ
+import Text.Printf
+
+-- | Entry point: read all of standard input lazily, split it into lines,
+-- convert the lines to numbers and print the summary table.
+main :: IO ()
+main = LC.getContents >>= output . nums . LC.lines
+
+-- | Clean the input into a list of numbers.
+--
+-- Each lazy 'LC.ByteString' is treated as one input line. Blank and
+-- whitespace-only lines are skipped, so a stray empty line (for example at
+-- the end of a file) no longer aborts the program. Every remaining line must
+-- parse as a 'Double' (surrounding whitespace is allowed); otherwise the
+-- program stops with an error that names the offending line.
+nums :: [LC.ByteString] -> [Double]
+nums = map parse . filter (not . null . words) . map LC.unpack
+  where
+    -- Same acceptance rule as 'read' (exactly one parse, nothing but
+    -- whitespace left over), but with a descriptive failure message.
+    parse line = case [v | (v, rest) <- reads line, null (words rest)] of
+        [v] -> v
+        _   -> error ("describe: cannot parse number: " ++ show line)
+
+-- | Count how often each distinct item occurs in a list.
+--
+-- The result is a list of @(item, count)@ pairs with one entry per distinct
+-- item, in the order produced by 'DM.toList'.
+freqs :: (Ord k, Num a) => [k] -> [(k, a)]
+freqs xs = DM.toList (DM.fromListWith (+) (zip xs (repeat 1)))
+
+-- | Calculate the mode (most frequent value) of a list.
+--
+-- The frequency table from 'freqs' is sorted by descending count. The sort
+-- is stable, so when several values share the highest count the one that
+-- comes first in the frequency table is returned.
+--
+-- Calling this on an empty list is an error, since an empty list has no mode.
+mode :: Ord a => [a] -> a
+mode xs = case DL.sortBy byCountDesc (freqs xs) of
+    (value, _) : _ -> value
+    []             -> error "mode: empty list"
+  where
+    -- Larger counts sort first; equal counts compare as EQ.
+    byCountDesc (_, n1) (_, n2) = compare (n2 :: Int) n1
+
+-- | Calculate a quantile point of a list of numbers.
+--
+-- @qtile q t xs@ estimates the @q@-th of @t@ quantiles (e.g. @qtile 1 4@ is
+-- the first quartile) using 'SQ.continuousBy' with the 'SQ.medianUnbiased'
+-- parameters.
+qtile :: Int -> Int -> [Double] -> Double
+qtile q t = SQ.continuousBy SQ.medianUnbiased q t . DV.fromList
+
+-- | Calculate the midhinge: the average of the first and third quartiles.
+midhinge :: [Double] -> Double
+midhinge xs = (lowerQuartile + upperQuartile) / 2
+  where
+    lowerQuartile = qtile 1 4 xs
+    upperQuartile = qtile 3 4 xs
+
+-- | Calculate the trimean: the average of the second quartile (median) and
+-- the midhinge.
+trimean :: [Double] -> Double
+trimean xs = (median + midhinge xs) / 2
+  where
+    median = qtile 2 4 xs
+
+-- | Calculate the Shannon entropy (base 2) of a list of items.
+--
+-- Each distinct item contributes @p * logBase 2 p@, where @p@ is its count
+-- from 'freqs' divided by the total number of items; the result is the
+-- negated sum of those contributions.
+h :: (Floating a, Ord t) => [t] -> a
+h xs = negate (sum [p * logBase 2 p | (_, count) <- freqs xs, let p = count / total])
+  where
+    total = fromIntegral (length xs)
+
+-- | String formatter: apply a statistic to the list (converted to a vector)
+-- and render the result rounded to 4 decimal places.
+format :: (PrintfArg a, PrintfType r) => (DV.Vector b -> a) -> [b] -> r
+format f n = printf "%.4f" statistic
+  where
+    statistic = f (DV.fromList n)
+
+-- | Print each @(label, value)@ pair on its own line as @label : value@.
+display :: [(String, String)] -> IO ()
+display = mapM_ (putStrLn . render)
+  where
+    render (label, value) = label ++ " : " ++ value
+
+-- | Print the table of descriptive statistics for a list of numbers.
+--
+-- For empty input only the @Length@ row is printed: the remaining statistics
+-- are undefined for an empty sample, and 'minimum' / 'maximum' would
+-- otherwise abort the program. Min, Max and Mode are shown with 'show';
+-- every other statistic is rounded to 4 decimal places.
+output :: [Double] -> IO ()
+output [] = display [("Length", "0")]
+output xs = display table
+  where
+    lo = minimum xs
+    hi = maximum xs
+    -- Fixed-point rendering shared by the directly computed statistics.
+    fixed :: Double -> String
+    fixed = printf "%.4f"
+    table =
+        [ ("Length", show (length xs))
+        , ("Min", show lo)
+        , ("Max", show hi)
+        , ("Range", fixed (hi - lo))
+        -- Quartiles go through 'qtile' so they use the same estimator as
+        -- the midhinge and trimean rows.
+        , ("Q1", fixed (qtile 1 4 xs))
+        , ("Q2", fixed (qtile 2 4 xs))
+        , ("Q3", fixed (qtile 3 4 xs))
+        , ("IQR", format (SQ.midspread SQ.medianUnbiased 4) xs)
+        , ("Trimean", fixed (trimean xs))
+        , ("Midhinge", fixed (midhinge xs))
+        , ("Mean", format SS.mean xs)
+        , ("Mode", show (mode xs))
+        , ("Kurtosis", format SS.kurtosis xs)
+        , ("Skewness", format SS.skewness xs)
+        -- Entropy is computed in Double, like every other statistic here.
+        , ("Entropy", fixed (h xs))
+        ]
+
+
+
+
21 license.txt
@@ -0,0 +1,21 @@
+
+Copyright 2012 Seth Brown, http://www.drbunsen.org/explorations-in-unix.html
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

0 comments on commit 27a78c9

Please sign in to comment.
Something went wrong with that request. Please try again.