/
Main.hs
128 lines (122 loc) · 6.27 KB
/
Main.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
module Main where
import Data.Char
import Data.List
import Data.Maybe
import Data.Matrix
import System.Random
import Control.Monad
main :: IO ()
main = do
-- Training a Naive Bayes classifier.
putStrLn "Training model..."
-- Reading text files with tweets. Data from Sentiment140.com.
niceTweets <- readFile "src/niceTweets.txt"
meanTweets <- readFile "src/meanTweets.txt"
-- Getting global random number generator to randomize data set.
generator <- getStdGen
let -- Matrix with processed lines (a column vector, but the type is Matrix).
allExamples :: Matrix (Int,[String])
allExamples = (niceTweets`labeledWith`1) <-> (meanTweets`labeledWith`0)
-- Number of examples to train and test the model on.
sampleSize :: Int
sampleSize = 1000
-- List of sampleSize random indices.
randomIndices :: [Int]
randomIndices = take sampleSize (randomRs (1,nrows allExamples) generator)
-- Test and train data sets with a 25-75 split.
test, train :: [(Int, [String])]
test = [allExamples!(i,1) | i <- take (sampleSize`div`4) randomIndices]
train = [allExamples!(i,1) | i <- drop (sampleSize`div`4) randomIndices]
-- Set of all words from our training examples.
vocabulary :: [String]
vocabulary = nub (concatMap snd train)
-- List with the number of training examples per label (matching labels).
examplesPerLabel :: [Double]
examplesPerLabel = map count [0,1] where
count l = foldr (\ (n,_) a -> a + if l == n then 1 else 0) 1 train
-- Matrix of learnt parameters (counts).
trainedParameters :: Matrix Double
trainedParameters = fit train vocabulary examplesPerLabel
-- Predictions of our trained model on the test set.
predictions :: [Int]
predictions = map (\ e -> predict e trainedParameters vocabulary examplesPerLabel)
[unwords example | example <- map snd test]
-- Percentage of incorrect predictions, where 0 < error < 1.
error :: Double
error = fromIntegral (length [(p,l) | (p,l) <- zip predictions (map fst test), p /= l])
/ fromIntegral (length test)
-- Showing error.
if error > 1.0
then putStrLn "Something went wrong."
else do putStrLn "Training complete. Error: "
print error
-- Running main program.
forever $ do
putStrLn "Tell me something..."
input <- getLine
putStrLn $ if predict input trainedParameters vocabulary examplesPerLabel == 1
then "That is nice."
else "That is not nice."
-- Separates semantically important characters (so they are treated as words)
-- and removes other punctuation from text to return a set of its words.
clean :: String -> [String]
clean text = (nub.words.filterpunc) text where
filterpunc :: String -> String
filterpunc [] = []
filterpunc (h:t) = let processed c | c `elem` "?!@#$%&^*=+-" = ' ' : c : " "
| c `elem` ".,:;_'`()[]{}0123456789" = []
| otherwise = toLower c : []
in processed h ++ filterpunc t
-- Processes the file to create a vector of tuples of form (label,sentence).
labeledWith :: String -> Int -> Matrix (Int,[String])
labeledWith text label = fromList (length labeledText) 1 labeledText where
labeledText :: [(Int,[String])]
labeledText = zip [label,label..] [clean line | line <- lines text]
-- Trains the classifier by returning a matrix where the (i,j) element is the
-- number of jth-labelled examples with ith word. (e.g. number of "Negative"
-- examples with word "sad")
fit :: [(Int,[String])] -> [String] -> [Double] -> Matrix Double
fit examples vocabulary labels = foldr addCount allOnesMatrix examples where
-- Laplace smoothing is done here by initializing a matrix with 1's.
allOnesMatrix :: Matrix Double
allOnesMatrix = matrix (length labels) (length vocabulary) (\ _ -> 1)
-- Traverses the sentence to update the matrix at every word.
addCount :: (Int,[String]) -> Matrix Double -> Matrix Double
addCount (label,sentence) parameters = let
add1 :: String -> Matrix Double -> Matrix Double
add1 word params = setElem (getElem i j params + 1) (i,j) params where
i = 1 + label
j = 1 + fromJust (elemIndex word vocabulary)
in foldr add1 parameters sentence
-- Predicts a label (e.g."Positive") given an input phrase and a trained model.
predict :: String -> Matrix Double -> [String] -> [Double] -> Int
predict phrase parameters vocabulary examplesPerLabel = let
-- Returns label corresponding to index with the highest probability in list.
highestProbability :: [Double] -> Int
highestProbability list = fromJust (elemIndex (maximum list) list)
-- Computes the probability that the input phrase has the given label.
-- (Not really conditional probabilities since we do not compute the
-- denominator). Using log probabilities to avoid underflow issues.
probability :: Int -> Double
probability label = sum (fmap log conditionals) + log labelProbability where
-- Probability p(y=l) that any phrase is tagged with label l.
labelProbability :: Double
labelProbability = examplesPerLabel!!label / sum examplesPerLabel
-- Vector of conditional probabilities p(x|y) that a y-labeled phrase has
-- the word x. We speed up computations by mapping ¬p(x|y) to the y row
-- and only modify it at the indices of the words from the input phrase.
conditionals :: Matrix Double
conditionals = negProbs `positiveAt` wordIndices where
-- Uses the count to compute the probability ¬p(x|y).
negProbs :: Matrix Double
negProbs = fmap (\ count -> 1 - (count/examplesPerLabel!!label))
(submatrix (label+1) (label+1) 1 (length vocabulary) parameters)
-- Reverses the probability to p(x|y) at every given index.
positiveAt :: Matrix Double -> [Int] -> Matrix Double
positiveAt ps [] = ps
positiveAt ps (j:js) = positiveAt (setElem (1 - getElem 1 (j+1) ps) (1,j+1) ps) js
-- Indices of the words from the input phrase that are in our vocabulary.
wordIndices :: [Int]
wordIndices = map (\ word -> (fromJust . elemIndex word) vocabulary)
[word | word <- clean phrase, word `elem` vocabulary]
in highestProbability (map probability [0 .. nrows parameters - 1])