This repository has been archived by the owner on Nov 28, 2018. It is now read-only.
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
294 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,152 @@ | ||
{-# LANGUAGE | ||
PatternGuards #-} | ||
|
||
-- | Parse events from @clog@ output, such as the files | ||
-- at <http://tunes.org/~nef/logs/haskell/>. | ||
-- | ||
-- IRC has no single standard character encoding. This | ||
-- module decodes messages as UTF-8 following common | ||
-- practice on Freenode. | ||
|
||
module Data.IRC.CLog.Parse | ||
( | ||
-- * Parsing log files | ||
parseLog | ||
-- * Configuring the parser | ||
, Config(..) | ||
, haskellConfig | ||
-- * Re-export | ||
, module Data.IRC.Event | ||
) where | ||
|
||
import Data.IRC.Event | ||
|
||
import Data.Word | ||
import Data.List | ||
import Control.Applicative | ||
|
||
import qualified Data.Foldable as F | ||
import qualified Data.Attoparsec as P | ||
import qualified Data.ByteString as B | ||
import qualified Data.ByteString.Char8 as B8 | ||
import qualified Data.Time as Time | ||
import qualified Data.Text as T | ||
import qualified Data.Text.Encoding as T | ||
import qualified Data.Text.Encoding.Error as T | ||
import qualified System.FilePath as Path | ||
import qualified System.Environment as Env | ||
import qualified System.IO.Error as IOError | ||
import qualified Control.Exception as Ex | ||
import qualified Data.Time.LocalTime.TimeZone.Series as Zone | ||
import qualified Data.Time.LocalTime.TimeZone.Olson as Zone | ||
|
||
|
||
-- | Configuring the parser. | ||
data Config = Config | ||
{ timeZone :: String -- ^ Timestamp time zone; an Olson time zone name. | ||
, zoneInfo :: FilePath -- ^ Directory for time zone files; @$TZDIR@ overrides. | ||
} deriving (Show) | ||
|
||
-- | @'Config'@ value suitable for parsing @#haskell@ logs on Linux. | ||
haskellConfig :: Config | ||
haskellConfig = Config | ||
{ timeZone = "America/Los_Angeles" | ||
, zoneInfo = "/usr/share/zoneinfo" } | ||
|
||
|
||
-- Many text encodings are used on IRC. | ||
-- We decode clog metadata as ASCII. | ||
-- We parse messages as UTF-8 in a lenient mode. | ||
|
||
decode :: B.ByteString -> T.Text | ||
decode = T.decodeUtf8With T.lenientDecode | ||
|
||
|
||
-- Timestamps are in local time and must be converted. | ||
|
||
type TimeConv = Time.LocalTime -> Time.UTCTime | ||
|
||
getTimeConv :: FilePath -> IO TimeConv | ||
getTimeConv p = Zone.localTimeToUTC' <$> Zone.getTimeZoneSeriesFromOlsonFile p | ||
|
||
data TimeAdj = TimeAdj Time.Day TimeConv | ||
|
||
|
||
-- Parsers. | ||
|
||
notNewline :: Word8 -> Bool | ||
notNewline w = w /= 13 && w /= 10 | ||
|
||
restOfLine :: P.Parser T.Text | ||
restOfLine = decode <$> P.takeWhile notNewline <* P.take 1 | ||
|
||
nextLine :: P.Parser () | ||
nextLine = P.skipWhile notNewline <* P.take 1 | ||
|
||
digits :: Int -> P.Parser Int | ||
digits n = atoi <$> P.count n digit where | ||
atoi = foldl' (\m d -> m*10 + fromIntegral d - 48) 0 | ||
digit = P.satisfy isDigit | ||
isDigit w = w >= 48 && w <= 57 | ||
|
||
time :: TimeAdj -> P.Parser Time.UTCTime | ||
time (TimeAdj day conv) = f <$> d2 <* col <*> d2 <* col <*> d2 where | ||
d2 = digits 2 | ||
col = P.word8 58 | ||
f h m s = conv . Time.LocalTime day $ Time.TimeOfDay h m (fromIntegral s) | ||
|
||
event :: P.Parser Event | ||
event = F.asum | ||
[ str " --- " *> F.asum | ||
[ userAct Join "join: " | ||
, userAct Part "part: " | ||
, userAct Quit "quit: " | ||
, ReNick <$ str "nick: " <*> nick <* str " -> " <*> nick <* nextLine | ||
, Mode <$ str "mode: " <*> nick <* str " set " <*> restOfLine | ||
, Kick <$ str "kick: " <*> nick <* str " was kicked by " <*> nick <* chr ' ' <*> restOfLine | ||
, global Log "log: " | ||
, global Topic "topic: " | ||
, global Names "names: " | ||
] | ||
, Talk <$ str " <" <*> nick <* str "> " <*> restOfLine | ||
, Notice <$ str " -" <*> nick <*> restOfLine -- FIXME: parse host | ||
, Act <$ str " * " <*> nick <* chr ' ' <*> restOfLine | ||
] where | ||
chr = P.word8 . fromIntegral . fromEnum | ||
str = P.string . B8.pack | ||
nick = (Nick . decode) <$> P.takeWhile (not . P.inClass " \n\r\t\v<>") | ||
userAct f x = f <$ str x <*> nick <* chr ' ' <*> restOfLine | ||
global f x = f <$ str x <*> restOfLine | ||
|
||
line :: TimeAdj -> P.Parser EventAt | ||
line adj = | ||
P.try (EventAt <$> time adj <*> event) | ||
<|> (NoParse <$> restOfLine) | ||
|
||
safeRead :: (Read a) => String -> Maybe a | ||
safeRead x | [(v,"")] <- reads x = Just v | ||
safeRead _ = Nothing | ||
|
||
getDay :: FilePath -> Time.Day | ||
getDay p | ||
| (_, [y1,y0,'.',m1,m0,'.',d1,d0]) <- Path.splitFileName p | ||
, Just [y,m,d] <- mapM safeRead [[y1,y0],[m1,m0],[d1,d0]] | ||
= Time.fromGregorian (2000 + fromIntegral y) m d | ||
getDay p = error ("cannot parse date from filename: " ++ p) | ||
|
||
-- | Parse a log file. | ||
-- | ||
-- The file name (after any directory) is significant. | ||
-- It is used to set the date for timestamps. | ||
-- It should have the form @YY.MM.DD@, as do the files on | ||
-- @tunes.org@. | ||
parseLog :: Config -> FilePath -> IO [EventAt] | ||
parseLog (Config{timeZone=tz, zoneInfo=zi}) p = do | ||
tzdir <- either (const zi :: Ex.IOException -> FilePath) id <$> Ex.try (Env.getEnv "TZDIR") | ||
adj <- TimeAdj (getDay p) <$> getTimeConv (Path.combine tzdir tz) | ||
b <- B.readFile p | ||
let go r@P.Fail{} = error $ show r | ||
go (P.Partial g) = go $ g B.empty | ||
go (P.Done _ x) = x | ||
let es = go $ P.parse (P.manyTill (line adj) P.endOfInput) b | ||
return es |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
{-# LANGUAGE | ||
DeriveDataTypeable #-} | ||
|
||
-- | Represents events in an IRC channel. | ||
-- These do not correspond precisely to messages of the IRC | ||
-- protocol. They provide a somewhat higher-level view. | ||
|
||
module Data.IRC.Event | ||
( -- * Events | ||
Nick (..) | ||
, Event (..) | ||
, EventAt(..) | ||
|
||
-- * Generic events | ||
, GenericEvent(..) | ||
, decompose | ||
) where | ||
|
||
import qualified Data.Time as Time | ||
import qualified Data.Text as T | ||
|
||
import Data.Typeable ( Typeable ) | ||
import Data.Data ( Data, Constr, toConstr ) | ||
|
||
-- | Event with timestamp. | ||
data EventAt | ||
= EventAt Time.UTCTime Event -- ^ Event with timestamp. | ||
| NoParse T.Text -- ^ Unparsable line. | ||
deriving (Show, Eq, Ord, Typeable) | ||
-- UTCTime lacks Data, so we can't derive it. | ||
-- We don't want to force that orphan instance on users. | ||
|
||
-- | IRC nicks. | ||
newtype Nick = Nick T.Text | ||
deriving (Show, Eq, Ord, Typeable, Data) | ||
|
||
-- | Events in an IRC channel. | ||
data Event | ||
= Join Nick T.Text -- ^ User joined. | ||
| Part Nick T.Text -- ^ User left the channel. | ||
| Quit Nick T.Text -- ^ User quit the server. | ||
| ReNick Nick Nick -- ^ User changed from one to another nick. | ||
| Talk Nick T.Text -- ^ User spoke (@PRIVMSG@). | ||
| Notice Nick T.Text -- ^ User spoke (@NOTICE@). | ||
| Act Nick T.Text -- ^ User acted (@CTCP ACTION@). | ||
| Kick Nick Nick T.Text -- ^ User was kicked by user. | ||
| Mode Nick T.Text -- ^ User set mode on the channel. | ||
| Log T.Text -- ^ Logging started or stopped. | ||
| Topic T.Text -- ^ Topic listing or change. | ||
| Names T.Text -- ^ Users list. | ||
deriving (Show, Eq, Ord, Typeable, Data) | ||
|
||
-- | For working with @'Event'@s generically. | ||
-- | ||
-- Indicates the \"subject\" of an event, if any, followed | ||
-- by other text. | ||
-- | ||
-- The subject of a @'ReNick'@ event is the old nick. | ||
data GenericEvent | ||
= GenericEvent Constr (Maybe Nick) [T.Text] | ||
deriving (Show, Eq, Typeable) | ||
|
||
-- | Decompose an @'Event'@ into a @'GenericEvent'@. | ||
decompose :: Event -> GenericEvent | ||
decompose x = go x where | ||
c = GenericEvent $ toConstr x | ||
|
||
go (Join n t) = c (Just n) [t] | ||
go (Part n t) = c (Just n) [t] | ||
go (Quit n t) = c (Just n) [t] | ||
go (Talk n t) = c (Just n) [t] | ||
go (Notice n t) = c (Just n) [t] | ||
go (Act n t) = c (Just n) [t] | ||
go (Mode n t) = c (Just n) [t] | ||
|
||
go (Log t) = c Nothing [t] | ||
go (Topic t) = c Nothing [t] | ||
go (Names t) = c Nothing [t] | ||
|
||
go (ReNick n (Nick t)) = c (Just n) [t] | ||
go (Kick a (Nick b) t) = c (Just a) [b,t] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
Copyright (c) Keegan McAllister 2010 | ||
|
||
All rights reserved. | ||
|
||
Redistribution and use in source and binary forms, with or without | ||
modification, are permitted provided that the following conditions | ||
are met: | ||
1. Redistributions of source code must retain the above copyright | ||
notice, this list of conditions and the following disclaimer. | ||
2. Redistributions in binary form must reproduce the above copyright | ||
notice, this list of conditions and the following disclaimer in the | ||
documentation and/or other materials provided with the distribution. | ||
3. Neither the name of the author nor the names of his contributors | ||
may be used to endorse or promote products derived from this software | ||
without specific prior written permission. | ||
|
||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' | ||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | ||
DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR ANY | ||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | ||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | ||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON | ||
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
#! /usr/bin/runhaskell | ||
|
||
import Distribution.Simple | ||
main = defaultMain |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
name: clogparse | ||
version: 0.2 | ||
license: BSD3 | ||
license-file: LICENSE | ||
synopsis: Parse IRC logs such as the #haskell logs on tunes.org | ||
category: Text, IRC, Language | ||
author: Keegan McAllister <mcallister.keegan@gmail.com> | ||
maintainer: Keegan McAllister <mcallister.keegan@gmail.com> | ||
build-type: Simple | ||
cabal-version: >=1.2 | ||
description: | ||
Use this library to parse IRC logs saved by the @clog@ bot on Freenode. | ||
This includes the logs of @#haskell@ which are available from | ||
<http://tunes.org/~nef/logs/haskell/>. | ||
. | ||
Suggestions and patches are welcome. | ||
|
||
library | ||
exposed-modules: | ||
Data.IRC.Event | ||
, Data.IRC.CLog.Parse | ||
ghc-options: -Wall | ||
build-depends: | ||
base >= 3 && < 5 | ||
, bytestring >= 0.9 | ||
, text >= 0.8 | ||
, time >= 1.1 | ||
, filepath >= 1.1 | ||
, timezone-olson >= 0.1 | ||
, timezone-series >= 0.1 | ||
, attoparsec >= 0.8 |