Skip to content
This repository has been archived by the owner on Nov 28, 2018. It is now read-only.

Commit

Permalink
Include working clogparse
Browse files Browse the repository at this point in the history
  • Loading branch information
chrisdone committed Oct 17, 2014
1 parent 9374a93 commit 19b0258
Show file tree
Hide file tree
Showing 5 changed files with 294 additions and 0 deletions.
152 changes: 152 additions & 0 deletions clogparse-0.2/Data/IRC/CLog/Parse.hs
@@ -0,0 +1,152 @@
{-# LANGUAGE
PatternGuards #-}

-- | Parse events from @clog@ output, such as the files
-- at <http://tunes.org/~nef/logs/haskell/>.
--
-- IRC has no single standard character encoding. This
-- module decodes messages as UTF-8 following common
-- practice on Freenode.

module Data.IRC.CLog.Parse
(
-- * Parsing log files
parseLog
-- * Configuring the parser
, Config(..)
, haskellConfig
-- * Re-export
, module Data.IRC.Event
) where

import Data.IRC.Event

import Data.Word
import Data.List
import Control.Applicative

import qualified Data.Foldable as F
import qualified Data.Attoparsec as P
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as B8
import qualified Data.Time as Time
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.Text.Encoding.Error as T
import qualified System.FilePath as Path
import qualified System.Environment as Env
import qualified System.IO.Error as IOError
import qualified Control.Exception as Ex
import qualified Data.Time.LocalTime.TimeZone.Series as Zone
import qualified Data.Time.LocalTime.TimeZone.Olson as Zone


-- | Configuring the parser.
data Config = Config
{ timeZone :: String -- ^ Timestamp time zone; an Olson time zone name.
, zoneInfo :: FilePath -- ^ Directory for time zone files; @$TZDIR@ overrides.
} deriving (Show)

-- | @'Config'@ value suitable for parsing @#haskell@ logs on Linux.
haskellConfig :: Config
haskellConfig = Config
{ timeZone = "America/Los_Angeles"
, zoneInfo = "/usr/share/zoneinfo" }


-- Many text encodings are used on IRC.
-- We decode clog metadata as ASCII.
-- We parse messages as UTF-8 in a lenient mode.

decode :: B.ByteString -> T.Text
decode = T.decodeUtf8With T.lenientDecode


-- Timestamps are in local time and must be converted.

type TimeConv = Time.LocalTime -> Time.UTCTime

getTimeConv :: FilePath -> IO TimeConv
getTimeConv p = Zone.localTimeToUTC' <$> Zone.getTimeZoneSeriesFromOlsonFile p

data TimeAdj = TimeAdj Time.Day TimeConv


-- Parsers.

notNewline :: Word8 -> Bool
notNewline w = w /= 13 && w /= 10

restOfLine :: P.Parser T.Text
restOfLine = decode <$> P.takeWhile notNewline <* P.take 1

nextLine :: P.Parser ()
nextLine = P.skipWhile notNewline <* P.take 1

digits :: Int -> P.Parser Int
digits n = atoi <$> P.count n digit where
atoi = foldl' (\m d -> m*10 + fromIntegral d - 48) 0
digit = P.satisfy isDigit
isDigit w = w >= 48 && w <= 57

time :: TimeAdj -> P.Parser Time.UTCTime
time (TimeAdj day conv) = f <$> d2 <* col <*> d2 <* col <*> d2 where
d2 = digits 2
col = P.word8 58
f h m s = conv . Time.LocalTime day $ Time.TimeOfDay h m (fromIntegral s)

event :: P.Parser Event
event = F.asum
[ str " --- " *> F.asum
[ userAct Join "join: "
, userAct Part "part: "
, userAct Quit "quit: "
, ReNick <$ str "nick: " <*> nick <* str " -> " <*> nick <* nextLine
, Mode <$ str "mode: " <*> nick <* str " set " <*> restOfLine
, Kick <$ str "kick: " <*> nick <* str " was kicked by " <*> nick <* chr ' ' <*> restOfLine
, global Log "log: "
, global Topic "topic: "
, global Names "names: "
]
, Talk <$ str " <" <*> nick <* str "> " <*> restOfLine
, Notice <$ str " -" <*> nick <*> restOfLine -- FIXME: parse host
, Act <$ str " * " <*> nick <* chr ' ' <*> restOfLine
] where
chr = P.word8 . fromIntegral . fromEnum
str = P.string . B8.pack
nick = (Nick . decode) <$> P.takeWhile (not . P.inClass " \n\r\t\v<>")
userAct f x = f <$ str x <*> nick <* chr ' ' <*> restOfLine
global f x = f <$ str x <*> restOfLine

line :: TimeAdj -> P.Parser EventAt
line adj =
P.try (EventAt <$> time adj <*> event)
<|> (NoParse <$> restOfLine)

safeRead :: (Read a) => String -> Maybe a
safeRead x | [(v,"")] <- reads x = Just v
safeRead _ = Nothing

getDay :: FilePath -> Time.Day
getDay p
| (_, [y1,y0,'.',m1,m0,'.',d1,d0]) <- Path.splitFileName p
, Just [y,m,d] <- mapM safeRead [[y1,y0],[m1,m0],[d1,d0]]
= Time.fromGregorian (2000 + fromIntegral y) m d
getDay p = error ("cannot parse date from filename: " ++ p)

-- | Parse a log file.
--
-- The file name (after any directory) is significant.
-- It is used to set the date for timestamps.
-- It should have the form @YY.MM.DD@, as do the files on
-- @tunes.org@.
parseLog :: Config -> FilePath -> IO [EventAt]
parseLog (Config{timeZone=tz, zoneInfo=zi}) p = do
tzdir <- either (const zi :: Ex.IOException -> FilePath) id <$> Ex.try (Env.getEnv "TZDIR")
adj <- TimeAdj (getDay p) <$> getTimeConv (Path.combine tzdir tz)
b <- B.readFile p
let go r@P.Fail{} = error $ show r
go (P.Partial g) = go $ g B.empty
go (P.Done _ x) = x
let es = go $ P.parse (P.manyTill (line adj) P.endOfInput) b
return es
81 changes: 81 additions & 0 deletions clogparse-0.2/Data/IRC/Event.hs
@@ -0,0 +1,81 @@
{-# LANGUAGE
DeriveDataTypeable #-}

-- | Represents events in an IRC channel.
-- These do not correspond precisely to messages of the IRC
-- protocol. They provide a somewhat higher-level view.

module Data.IRC.Event
( -- * Events
Nick (..)
, Event (..)
, EventAt(..)

-- * Generic events
, GenericEvent(..)
, decompose
) where

import qualified Data.Time as Time
import qualified Data.Text as T

import Data.Typeable ( Typeable )
import Data.Data ( Data, Constr, toConstr )

-- | Event with timestamp.
data EventAt
= EventAt Time.UTCTime Event -- ^ Event with timestamp.
| NoParse T.Text -- ^ Unparsable line.
deriving (Show, Eq, Ord, Typeable)
-- UTCTime lacks Data, so we can't derive it.
-- We don't want to force that orphan instance on users.

-- | IRC nicks.
newtype Nick = Nick T.Text
deriving (Show, Eq, Ord, Typeable, Data)

-- | Events in an IRC channel.
data Event
= Join Nick T.Text -- ^ User joined.
| Part Nick T.Text -- ^ User left the channel.
| Quit Nick T.Text -- ^ User quit the server.
| ReNick Nick Nick -- ^ User changed from one to another nick.
| Talk Nick T.Text -- ^ User spoke (@PRIVMSG@).
| Notice Nick T.Text -- ^ User spoke (@NOTICE@).
| Act Nick T.Text -- ^ User acted (@CTCP ACTION@).
| Kick Nick Nick T.Text -- ^ User was kicked by user.
| Mode Nick T.Text -- ^ User set mode on the channel.
| Log T.Text -- ^ Logging started or stopped.
| Topic T.Text -- ^ Topic listing or change.
| Names T.Text -- ^ Users list.
deriving (Show, Eq, Ord, Typeable, Data)

-- | For working with @'Event'@s generically.
--
-- Indicates the \"subject\" of an event, if any, followed
-- by other text.
--
-- The subject of a @'ReNick'@ event is the old nick.
data GenericEvent
= GenericEvent Constr (Maybe Nick) [T.Text]
deriving (Show, Eq, Typeable)

-- | Decompose an @'Event'@ into a @'GenericEvent'@.
decompose :: Event -> GenericEvent
decompose x = go x where
c = GenericEvent $ toConstr x

go (Join n t) = c (Just n) [t]
go (Part n t) = c (Just n) [t]
go (Quit n t) = c (Just n) [t]
go (Talk n t) = c (Just n) [t]
go (Notice n t) = c (Just n) [t]
go (Act n t) = c (Just n) [t]
go (Mode n t) = c (Just n) [t]

go (Log t) = c Nothing [t]
go (Topic t) = c Nothing [t]
go (Names t) = c Nothing [t]

go (ReNick n (Nick t)) = c (Just n) [t]
go (Kick a (Nick b) t) = c (Just a) [b,t]
26 changes: 26 additions & 0 deletions clogparse-0.2/LICENSE
@@ -0,0 +1,26 @@
Copyright (c) Keegan McAllister 2010

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the author nor the names of his contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4 changes: 4 additions & 0 deletions clogparse-0.2/Setup.hs
@@ -0,0 +1,4 @@
#! /usr/bin/runhaskell

import Distribution.Simple
main = defaultMain
31 changes: 31 additions & 0 deletions clogparse-0.2/clogparse.cabal
@@ -0,0 +1,31 @@
name: clogparse
version: 0.2
license: BSD3
license-file: LICENSE
synopsis: Parse IRC logs such as the #haskell logs on tunes.org
category: Text, IRC, Language
author: Keegan McAllister <mcallister.keegan@gmail.com>
maintainer: Keegan McAllister <mcallister.keegan@gmail.com>
build-type: Simple
cabal-version: >=1.2
description:
Use this library to parse IRC logs saved by the @clog@ bot on Freenode.
This includes the logs of @#haskell@ which are available from
<http://tunes.org/~nef/logs/haskell/>.
.
Suggestions and patches are welcome.

library
exposed-modules:
Data.IRC.Event
, Data.IRC.CLog.Parse
ghc-options: -Wall
build-depends:
base >= 3 && < 5
, bytestring >= 0.9
, text >= 0.8
, time >= 1.1
, filepath >= 1.1
, timezone-olson >= 0.1
, timezone-series >= 0.1
, attoparsec >= 0.8

0 comments on commit 19b0258

Please sign in to comment.