# Setup Environment

In [1]:
;; Enable stack traces
;; (clojupyter.misc.stacktrace/set-print-stacktraces! true)
(require '[clojupyter.misc.helper :as helper])

(->> '[[clojure-opennlp "0.5.0"]
       [kixi/stats "0.5.0"]
       [io.forward/clojure-mail "1.0.7"]
       [clojure2d "1.1.0"]
       [metasoarous/oz "1.5.0"]
       [clj-time "0.15.0"]
       [net.cgrand/xforms "0.18.2"]]
     (map helper/add-dependencies)
     doall)

(print (str "Done!"))

Done!

nil

In [2]:
;; Load VADER as local repository
;; The vader repo binary must be installed in this directory ./maven-repository
(do
    (use '[cemerick.pomegranate :only (add-dependencies)])
    (add-dependencies 
        :coordinates '[[local/vader "2.0.1"]] 
        :repositories {"local/vader" (str (.toURI (java.io.File. "./maven_repository")))}))

{[local/vader "2.0.1"] nil}

In [3]:
;; Build namespace
(ns drafts.sentiment_analysis
    (:import [net.nunoachenriques.vader SentimentAnalysis]
             [net.nunoachenriques.vader.lexicon English]
             [net.nunoachenriques.vader.text TokenizerEnglish]
             [java.io FileInputStream File]
             [javax.mail Session]
             [javax.mail.internet MimeMessage]
             [java.util Properties])
    (:require [kixi.stats.core :as stats]
              [clojure-mail.core :as mail]
              [clojure-mail.message :refer (read-message)]
              [oz.notebook.clojupyter :as oz]
              [clj-time.core :as t]
              [clj-time.coerce :as c]
              [net.cgrand.xforms :as x])
    (:use [clojure.repl :only (doc source)]
          [clojure.pprint :only (pprint print-table)]
          [opennlp.nlp :only (make-sentence-detector)]))

*ns*



#namespace[drafts.sentiment_analysis]

In [4]:
(set! *warn-on-reflection* true)

true

# Analyzing Sentiment w/ Vader

In [5]:
(def language (English.))
(def tokenizer (TokenizerEnglish.))

#'drafts.sentiment_analysis/tokenizer

In [6]:
(def sa (SentimentAnalysis. language tokenizer))

#'drafts.sentiment_analysis/sa

In [7]:
(. sa (getSentimentAnalysis "Yay!! You are the best!"))



{"negative" 0.0, "neutral" 0.261, "positive" 0.739, "compound" 0.8582}

In [8]:
;; Avoiding reflection by type hint:
(. ^SentimentAnalysis sa (getSentimentAnalysis "Yay!! You are the best!"))

{"negative" 0.0, "neutral" 0.261, "positive" 0.739, "compound" 0.8582}

# Reading Emails

In [9]:
(def maildir-path "data/enron_mail/maildir")

#'drafts.sentiment_analysis/maildir-path

In [10]:
(def sample-msg 
    (-> "data/enron_mail/maildir/arnold-j/_sent_mail/36."
        (clojure.java.io/as-file)
        (mail/file->message)
        (read-message)))

(pprint sample-msg)

{:cc (),
 :bcc (),
 :headers
 [{"Message-ID" "<33491127.1075857594966.JavaMail.evans@thyme>"}
  {"Date" "Tue, 21 Nov 2000 13:16:00 -0800 (PST)"}
  {"From" "john.arnold@enron.com"}
  {"To" "slafontaine@globalp.com"}
  {"Subject" "re:mkts"}
  {"Mime-Version" "1.0"}
  {"Content-Type" "text/plain; charset=us-ascii"}
  {"Content-Transfer-Encoding" "7bit"}
  {"X-From" "John Arnold"}
  {"X-To" "slafontaine@globalp.com @ ENRON"}
  {"X-cc" ""}
  {"X-bcc" ""}
  {"X-Folder" "\\John_Arnold_Dec2000\\Notes Folders\\'sent mail"}
  {"X-Origin" "Arnold-J"}
  {"X-FileName" "Jarnold.nsf"}],
 :date-sent #inst "2000-11-21T21:16:00.000-00:00",
 :date-received nil,
 :from ({:address "john.arnold@enron.com", :name nil}),
 :id "<33491127.1075857594966.JavaMail.evans@thyme>",
 :sender nil,
 :content-type "text/plain; charset=us-ascii",
 :multipart? false,
 :body
 {:content-type "text/plain; charset=us-ascii",
  :body
  "Hey:\nHaven't had the best of months.  Like you had some good positions but others \nwiped o

nil

# Read in Files

In [11]:
(defn get-files [start-path re]
    (->> start-path
         (clojure.java.io/as-file)
         (file-seq)
         (map #(.getPath ^File %))
         (filter #(re-matches re %))))

#'drafts.sentiment_analysis/get-files

In [12]:
#_(def xform-msg-files
    (comp (map mail/file->message)
          (map read-message)))

(defn raw-message->message-data [m]
    {:to    (-> (get m :to) (first) (get :address))
     :from  (-> (get m :from) (first) (get :address))
     :date-sent (get m :date-sent)
     :date-received (get m :date-received)
     :subject (get m :subject)
     :body  (get-in m [:body :body])})

(def xform-msg-files
    (comp (map mail/file->message)
          (map read-message)
          (map raw-message->message-data)))


#'drafts.sentiment_analysis/xform-msg-files

In [13]:
(def sent-mail-re #"data\/enron_mail\/maildir\/.*\/_sent_mail\/.*")
(def sent-msg-paths (get-files maildir-path sent-mail-re))

#'drafts.sentiment_analysis/sent-msg-paths

In [14]:
#_(defn msg-reduce
    ([] [])
    ([acc] acc)
    ([acc m]
        (conj acc {:to    (-> (get m :to) (first) (get :address))
                   :from  (-> (get m :from) (first) (get :address))
                   :date-sent (get m :date-sent)
                   :date-received (get m :date-received)
                   :subject (get m :subject)
                   :body  (get-in m [:body :body])})))

In [15]:
#_(def msgs (transduce xform-msg-files msg-reduce sent-msg-paths))

#_(def msgs (into [] xform-msg-files sent-msg-paths))

(def msgs (sequence xform-msg-files sent-msg-paths))

#'drafts.sentiment_analysis/msgs

In [16]:
(count msgs)

30237

# Add Message Sentiment

In [17]:
(defn remove-line-breaks [text]
    (clojure.string/replace text #"\n" ""))

#'drafts.sentiment_analysis/remove-line-breaks

In [18]:
(def get-sentences (make-sentence-detector "./models/en-sent.bin"))

#'drafts.sentiment_analysis/get-sentences

In [19]:
#_(defn add-sentiment
    ([] [])
    ([acc] acc)
    ([acc msg]
      (conj acc (conj msg {:avg-sentiment (->> msg
                                     (:body)
                                     (get-sentences)
                                     (map remove-line-breaks)
                                     (map #(. sa (getSentimentAnalysis %)))
                                     (map #(get % "compound"))
                                     (transduce identity stats/mean))}))))

(defn msg->avg-sentiment [msg]
  (->> msg
       (:body)
       (get-sentences)
       (transduce
        (map (fn [sentence]
               (-> sentence
                   remove-line-breaks
                   (#(. ^SentimentAnalysis sa (getSentimentAnalysis %)))
                   (get "compound"))))
        stats/mean)))

#'drafts.sentiment_analysis/msg->avg-sentiment

In [20]:
#_(def sentiment (transduce identity add-sentiment (filter #(< (count (get % :body)) 4000) msgs)))

(def sentiment 
    (sequence
          (comp 
            (filter #(< (count (get % :body)) 4000))
            (map (fn [msg] (conj msg {:avg-sentiment (msg->avg-sentiment msg)}))))
          msgs))

#'drafts.sentiment_analysis/sentiment

# Plot Sentiment Over Time

In [21]:
#_(pprint (->> (take 10 sentiment)
             (map #(select-keys % [:date-sent :avg-sentiment]))))

(->> sentiment
     (take 10)
     (map #(select-keys % [:date-sent :avg-sentiment]))
     print-table)


|                   :date-sent |       :avg-sentiment |
|------------------------------+----------------------|
| Tue May 15 03:23:00 IDT 2001 | 0.055699998778956275 |
| Fri May 11 00:07:00 IDT 2001 |  0.12040000160535176 |
| Thu May 17 00:13:00 IDT 2001 |  0.05083000175654888 |
| Thu Apr 26 21:27:00 IDT 2001 |  0.44040000438690186 |
| Wed May 02 04:54:00 IDT 2001 |   0.3900999998052915 |
| Mon Apr 30 23:46:00 IDT 2001 |  0.01218292698627565 |
| Tue May 01 00:10:00 IDT 2001 |                  0.0 |
| Thu Apr 26 04:26:00 IDT 2001 |                  0.0 |
| Thu Apr 19 03:40:00 IDT 2001 | -0.09866666793823242 |
| Sat Apr 21 06:40:00 IDT 2001 |  0.25231666505957645 |


nil

In [22]:
(defn same-day? [t1 t2]
    (t/equal? (t/floor t1 t/day) (t/floor t2 t/day)))

#'drafts.sentiment_analysis/same-day?

In [23]:
#_(def xform-get-time-data
    (comp (map #(select-keys % [:date-sent :avg-sentiment]))
          (map #(hash-map :date (-> (c/from-date (:date-sent %))
                                    (t/floor t/day)
                                    (c/to-date))
                          :avg-sentiment (:avg-sentiment %)))))

(defn get-time-data [{:keys [date-sent avg-sentiment]}]
    {:date (-> date-sent
               c/from-date
               (t/floor t/day)
               (c/to-date))
     :avg-sentiment avg-sentiment})

#'drafts.sentiment_analysis/get-time-data

In [24]:
#_(pprint (eduction xform-get-time-data (take 5 sentiment)))

(->> sentiment
     (eduction (comp (take 5)
                     (map get-time-data)))
     print-table)


|                        :date |       :avg-sentiment |
|------------------------------+----------------------|
| Tue May 15 03:00:00 IDT 2001 | 0.055699998778956275 |
| Thu May 10 03:00:00 IDT 2001 |  0.12040000160535176 |
| Wed May 16 03:00:00 IDT 2001 |  0.05083000175654888 |
| Thu Apr 26 03:00:00 IDT 2001 |  0.44040000438690186 |
| Wed May 02 03:00:00 IDT 2001 |   0.3900999998052915 |


nil

In [25]:
#_(defn reduce-daily-sentiment
    ([] {})
    ([acc] 
     (reduce #(conj %1 {(first %2) 
                        (transduce identity stats/mean (second %2))}) (sorted-map) acc))
    ([acc x]
     (let [{date :date sentiment :avg-sentiment} x]
            (if (contains? acc date)
             (update acc date conj sentiment)
             (conj acc {date [sentiment]})))))

In [26]:
#_(def average-sentiment-data (transduce xform-get-time-data reduce-daily-sentiment sentiment))

(def average-sentiment-data (into (sorted-map)
                                  (comp (map get-time-data)
                                        (x/by-key :date
                                                  :avg-sentiment
                                                   x/avg))
                                  sentiment))

#'drafts.sentiment_analysis/average-sentiment-data

In [27]:
(count average-sentiment-data)

556

In [28]:
(defn average [coll]
  (/ (reduce + coll)
      (count coll)))

(defn moving-average [period coll] 
  (lazy-cat (repeat (dec period) nil) 
            (map average (partition period 1  coll))))

#'drafts.sentiment_analysis/moving-average

In [39]:
#_(def time-series-data
    (->> average-sentiment-data
         (#(vector (map first %) (map second %)))
         (#(vector (first %) (second %) (moving-average 30 (second %))))
         (apply map vector)
         (map #(hash-map :date (str (nth % 0))
                         :avg-sentiment (nth % 1)
                         :moving-avg (nth % 2)))))

#_(def time-series-data
    (->> average-sentiment-data
         ((juxt keys vals))
         ((fn [[dates values]]
              [(map str dates) values (moving-average 30 values)]))
         (apply map vector)
         (map (partial zipmap [:date :avg-sentiment :moving-avg]))))

(def time-series-data
    (->> average-sentiment-data
         (#(vector (keys %)
                   (vals %)
                   (moving-average 30 (vals %))))
         (apply map (fn [date v smoothed-v]
                        {:date (str date)
                         :avg-sentiment v
                         :moving-avg smoothed-v}))))

#'drafts.sentiment_analysis/time-series-data

In [40]:
#_time-series-data
(print-table time-series-data)


|                        :date |         :avg-sentiment |         :moving-avg |
|------------------------------+------------------------+---------------------|
| Fri Dec 10 02:00:00 IST 1999 |    0.15384195645396806 |                     |
| Sat Dec 11 02:00:00 IST 1999 |    0.21958139434803364 |                     |
| Sun Dec 12 02:00:00 IST 1999 |    0.08632857033184596 |                     |
| Mon Dec 13 02:00:00 IST 1999 |    0.11342400834208508 |                     |
| Tue Dec 14 02:00:00 IST 1999 |    0.09556597257824022 |                     |
| Wed Dec 15 02:00:00 IST 1999 |    0.11260412064069529 |                     |
| Thu Dec 16 02:00:00 IST 1999 |     0.1545073264109946 |                     |
| Fri Dec 17 02:00:00 IST 1999 |     0.1889564860387895 |                     |
| Sun Dec 19 02:00:00 IST 1999 |     0.1463428994915329 |                     |
| Mon Dec 20 02:00:00 IST 1999 |     0.1827259939381497 |                     |
| Tue Dec 21 02:00:00 IST 1999 |    0.2

| Thu Apr 06 02:00:00 IST 2000 |     0.1104968170418557 |  0.1352384130868617 |
| Fri Apr 07 02:00:00 IST 2000 |    0.10843267507310887 | 0.13446096954145395 |
| Sat Apr 08 02:00:00 IST 2000 |    0.10700291960717286 | 0.13479156063702563 |
| Sun Apr 09 02:00:00 IST 2000 |    0.05770000070333481 | 0.13348468404749342 |
| Mon Apr 10 02:00:00 IST 2000 |    0.09728936981720447 | 0.12894963725035125 |
| Tue Apr 11 02:00:00 IST 2000 |    0.09709485746215268 | 0.12818058524185305 |
| Wed Apr 12 02:00:00 IST 2000 |    0.15329752664108645 |  0.1293290207533253 |
| Thu Apr 13 02:00:00 IST 2000 |    0.10193078426778807 | 0.12845691033836915 |
| Fri Apr 14 03:00:00 IDT 2000 |    0.15488674319472984 | 0.12934084075952726 |
| Sat Apr 15 03:00:00 IDT 2000 |     0.1658858967642499 | 0.13105247547752372 |
| Sun Apr 16 03:00:00 IDT 2000 |    0.09026000040272872 | 0.12085072569658405 |
| Mon Apr 17 03:00:00 IDT 2000 |     0.1252356682572611 |  0.1207431388461006 |
| Tue Apr 18 03:00:00 IDT 2000 |     0.1

| Fri Jul 28 03:00:00 IDT 2000 |    0.13275897068050418 | 0.12135831559484872 |
| Sat Jul 29 03:00:00 IDT 2000 |    0.10822869499557111 | 0.12019582464917893 |
| Sun Jul 30 03:00:00 IDT 2000 |    0.19473081170165332 | 0.12117460756204622 |
| Mon Jul 31 03:00:00 IDT 2000 |    0.12699937934423833 | 0.11944360560254977 |
| Tue Aug 01 03:00:00 IDT 2000 |     0.1588319608654053 | 0.11871800421779569 |
| Wed Aug 02 03:00:00 IDT 2000 |    0.10458932935903309 | 0.13131098203952654 |
| Thu Aug 03 03:00:00 IDT 2000 |    0.16316391980982878 | 0.13393748780893067 |
| Fri Aug 04 03:00:00 IDT 2000 |    0.12056141964522596 | 0.13483380255630392 |
| Sat Aug 05 03:00:00 IDT 2000 |                    0.0 | 0.13036379463373785 |
| Sun Aug 06 03:00:00 IDT 2000 |    0.10102102626368013 | 0.12897256019887762 |
| Mon Aug 07 03:00:00 IDT 2000 |    0.14645146659825878 | 0.12710751528098202 |
| Tue Aug 08 03:00:00 IDT 2000 |    0.17108359610610138 | 0.12910014306022316 |
| Wed Aug 09 03:00:00 IDT 2000 |    0.10

| Sat Nov 11 02:00:00 IST 2000 |     0.0873250039294362 | 0.12303818720870403 |
| Sun Nov 12 02:00:00 IST 2000 |    0.18910613308559707 | 0.12363563734488076 |
| Mon Nov 13 02:00:00 IST 2000 |     0.1112335227909459 | 0.12352909471948752 |
| Tue Nov 14 02:00:00 IST 2000 |     0.1376937708544332 |  0.1261146277743564 |
| Wed Nov 15 02:00:00 IST 2000 |    0.09383530030294854 |   0.127583242907388 |
| Thu Nov 16 02:00:00 IST 2000 |    0.10721468965838396 |  0.1264269981861425 |
| Fri Nov 17 02:00:00 IST 2000 |    0.10361984235003732 | 0.12535307271592833 |
| Sat Nov 18 02:00:00 IST 2000 |   0.035959071853754736 | 0.12114585925270394 |
| Sun Nov 19 02:00:00 IST 2000 |    0.21147633125111934 |  0.1241081572548311 |
| Mon Nov 20 02:00:00 IST 2000 |    0.12248170510221795 | 0.12347591759013544 |
| Tue Nov 21 02:00:00 IST 2000 |    0.12904005622124523 | 0.12803689669337862 |
| Wed Nov 22 02:00:00 IST 2000 |       0.13585386088784 |  0.1303024058481363 |
| Thu Nov 23 02:00:00 IST 2000 |    0.22

| Wed Feb 28 02:00:00 IST 2001 |    0.08935144917265257 | 0.11787898860781695 |
| Thu Mar 01 02:00:00 IST 2001 |    0.10292308612025422 | 0.11817937531595116 |
| Fri Mar 02 02:00:00 IST 2001 |    0.09171043543317085 | 0.11744821583563674 |
| Sat Mar 03 02:00:00 IST 2001 |    0.09647667015586993 | 0.11598893722510148 |
| Sun Mar 04 02:00:00 IST 2001 |    0.13288866568291646 | 0.11582158783023579 |
| Mon Mar 05 02:00:00 IST 2001 |     0.0851498614543449 |  0.1123872743541374 |
| Tue Mar 06 02:00:00 IST 2001 |    0.11438662557056122 | 0.11244323711791168 |
| Wed Mar 07 02:00:00 IST 2001 |    0.12505499244794677 |  0.1119784738585133 |
| Thu Mar 08 02:00:00 IST 2001 |    0.16252980973260134 |   0.113834260078682 |
| Fri Mar 09 02:00:00 IST 2001 |    0.14214159244649754 | 0.11564226259847038 |
| Sat Mar 10 02:00:00 IST 2001 |     0.1701032085320005 | 0.11633617744099264 |
| Sun Mar 11 02:00:00 IST 2001 | -0.0010857153683900833 | 0.11163987731878713 |
| Mon Mar 12 02:00:00 IST 2001 |    0.10

| Tue Jun 19 03:00:00 IDT 2001 |    0.27711249451256464 | 0.11383042027989852 |
| Fri Jun 22 03:00:00 IDT 2001 |    0.18745310035528112 | 0.11606612811392489 |
| Sun Jun 24 03:00:00 IDT 2001 |    0.17196257918974056 | 0.11763575160435129 |
| Mon Jun 25 03:00:00 IDT 2001 |    0.17262966614527006 | 0.12035303675652056 |
| Tue Jun 26 03:00:00 IDT 2001 |    0.16614375188946723 | 0.12080043647684822 |
| Wed Jun 27 03:00:00 IDT 2001 |    0.15508818226781756 | 0.12925893148371653 |
| Thu Jun 28 03:00:00 IDT 2001 |  -0.021562501788139343 | 0.12300374920707798 |
| Mon Jul 02 03:00:00 IDT 2001 |     0.3955333350847165 | 0.13295003728676852 |
| Tue Jul 03 03:00:00 IDT 2001 |    -0.1509000062942505 | 0.12319767525822153 |
| Fri Jul 06 03:00:00 IDT 2001 |    0.08617272973060608 | 0.12069625749210054 |
| Thu Jul 19 03:00:00 IDT 2001 |   -0.03288888931274414 | 0.11481159315450608 |
| Tue Aug 14 03:00:00 IDT 2001 |    0.05822500213980675 | 0.11034230316972206 |
| Thu Aug 16 03:00:00 IDT 2001 |  -0.036

nil

In [41]:
;; (def line-plot
;;   {:data {:values time-series-data}
;;    :width 400
;;    :height 400
;;    :encoding {:x {:field "date", :type "temporal"}
;;               :y {:field "moving-avg"}}
;;    :mark {:type "line" :stroke "red"}})

(def layered-line-plot
    {:width 600
     :height 600
     :data {:values time-series-data}
     :layer [{:mark {:type "line", :stroke "lightblue"}
              :encoding {:x {:field "date", :type "temporal"}
                         :y {:field "avg-sentiment"}}},
             {:mark {:type "line", :stroke "green"}
              :encoding {:x {:field "date", :type "temporal"}
                         :y {:field "moving-avg"}}}]})

;; Render the plot
;; (oz/view! line-plot)
(oz/view! layered-line-plot)