In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import json
import os
import sys
print(sys.version)
from __future__ import division

### Get the raw data

In [None]:
%%bash
# Unpack the Slack export into ./slack_logs.
# Written to be safely re-runnable (Restart Kernel -> Run All):
#   * stop at the first failing command instead of carrying on silently
#   * mkdir -p does not fail when the directory already exists
#   * the archive is only moved if it is still next to the notebook
#   * unzip -o overwrites existing files instead of prompting
set -euo pipefail

ARCHIVE=NeIC_Slack_export_Dec10_2017.zip

mkdir -p slack_logs
if [ -f "$ARCHIVE" ]; then
    mv "$ARCHIVE" slack_logs/
fi
unzip -o "slack_logs/$ARCHIVE" -d slack_logs


### Inspect directory structure

In [None]:
# List the top-level contents of the slack_logs/ directory.
%ls slack_logs/

In [None]:
dirs = %sx ls -d slack_logs/*/
dirs

Let's first try out a single channel.

In [None]:
# Pick the fourth directory from `dirs` as the channel to experiment with.
# NOTE: `dir` shadows the built-in dir(); the name is kept because later
# cells read it.
dir = dirs[3]
# os.listdir returns entries in arbitrary order; sort for a stable display.
sorted(os.listdir(dir))

Look at the structure of the JSON files

In [None]:
# os.listdir returns entries in arbitrary order, so sort them: otherwise
# "the first file" can differ between runs and machines.
dates = sorted(os.listdir(dir))
d = dates[0]
# os.path.join works whether or not `dir` ends with a path separator.
with open(os.path.join(dir, d), "r") as f:
    raw_json = json.load(f)
# Pretty-print the parsed content to see how the file is structured.
dump = json.dumps(raw_json, indent=4)
print(dump)


Aha, `subtype` is only present if the entry is not a regular message

In [None]:
# Show type (plus subtype, when there is one) and text of every entry of
# the file loaded above.
for entry in raw_json:
    if "subtype" not in entry:
        print(entry["type"], entry["text"])
    else:
        print(entry["type"], entry["subtype"], entry["text"])


We now extract all regular messages

In [None]:
# Collect the text of every regular message of the selected channel.
# Only *.json files are read, so stray files (e.g. OS metadata) cannot break
# the load, and the names are sorted so `messages` has a reproducible order
# (presumably chronological, if the files are named by ISO date -- TODO confirm).
dates = sorted(name for name in os.listdir(dir) if name.endswith(".json"))
messages = []
for d in dates:
    with open(os.path.join(dir, d), "r") as f:
        raw_json = json.load(f)

    # Entries that carry a "subtype" key are not regular messages; skip them.
    messages.extend(j["text"] for j in raw_json if "subtype" not in j)

for m in messages:
    print(m)

### Simple natural language processing

Natural language toolkit tests

In [None]:
import nltk as nltk

In [None]:
import re

# Runs of , . ! ? become tokens of their own: the capturing group makes
# re.split keep the delimiters, and `+` keeps e.g. "?!" or "..." together.
# Compiled once instead of once per token.
_PUNCT_RUN = re.compile(r'([,.!?]+)')


def tokenize(message):
    """Split one message into word and punctuation tokens.

    The message is split on any whitespace (spaces, tabs, newlines), so words
    on consecutive lines are not glued together. Each chunk is then split
    around runs of , . ! ? which are kept as separate tokens.

    Parameters
    ----------
    message : str
        Raw message text.

    Returns
    -------
    list of str
        Non-empty tokens in their original order.
    """
    tokens = []
    for chunk in message.split():
        # re.split yields empty strings at the edges of a match; drop them.
        tokens.extend(piece for piece in _PUNCT_RUN.split(chunk) if piece)
    return tokens


words = [token for m in messages for token in tokenize(m)]

# Frequency distribution over all tokens; show the 20 most frequent ones.
dist1 = nltk.FreqDist(words)
dist1.most_common(20)


Cumulative plot of word usage

In [None]:
# Open a new figure at 100 dpi; the FreqDist plot below is drawn after it.
fig = plt.figure(dpi=100)
# Cumulative counts of the 20 most frequent tokens.
dist1.plot(20, cumulative=True)
word_total = len(words)
print("Total number of words: %s" % word_total)

**Analyze emojis:**

In [None]:
all_words = " ".join(words)

# Emoji codes are written as :name:. The name must be non-empty (so a bare
# "::" is not counted) and may contain "+" or "-" in addition to word
# characters, as in :+1: or :-1:.
# NOTE(review): other colon-delimited text (e.g. the middle of "10:30:00")
# still matches; tighten the pattern if that turns out to matter.
emojis = re.findall(r":[\w+\-]+:", all_words)
dist = nltk.FreqDist(emojis)
dist.most_common(20)

Let's look at lexical diversity, i.e. the ratio of the number of distinct words to the total number of words

In [None]:
def lexical_diversity(text):
    """Return the ratio of distinct tokens to total tokens.

    Parameters
    ----------
    text : iterable
        Hashable tokens (e.g. a list of words). Any iterable is accepted,
        including generators; it is consumed once.

    Returns
    -------
    float
        ``len(set(tokens)) / len(tokens)``, a value in (0, 1] for non-empty
        input, and ``0.0`` for empty input (instead of raising
        ZeroDivisionError).
    """
    tokens = list(text)  # materialise so one-shot iterables work too
    if not tokens:
        return 0.0
    # Explicit float so the result does not depend on a __future__ import
    # made in another cell.
    return len(set(tokens)) / float(len(tokens))

In [None]:
# Lexical diversity of the token list built above: distinct tokens divided
# by total tokens.
lexical_diversity(words)

### LDA test case

In [None]:
import lda
import lda.datasets

# Load the Reuters example data bundled with the `lda` package: the matrix X
# plus the matching vocabulary and document titles.
X = lda.datasets.load_reuters()
vocab = lda.datasets.load_reuters_vocab()
titles = lda.datasets.load_reuters_titles()
print(X.shape)
print(X.sum())

# Only configure the model here; it is fitted in the next cell.
model = lda.LDA(n_topics=20, n_iter=1500, random_state=1)

In [None]:
import lda
import lda.datasets

# Reload the Reuters example data so this cell runs on its own.
X = lda.datasets.load_reuters()
vocab = lda.datasets.load_reuters_vocab()
titles = lda.datasets.load_reuters_titles()

# Fixed random_state keeps the fitted topics reproducible between runs.
model = lda.LDA(n_topics=20, n_iter=1500, random_state=1)
model.fit(X)  # model.fit_transform(X) is also available
topic_word = model.topic_word_  # model.components_ also works

n_top_words = 8
vocab_array = np.array(vocab)  # built once instead of once per topic
for i, topic_dist in enumerate(topic_word):
    # argsort is ascending, so take the last n_top_words indices in reverse
    # order to list the highest-weighted words first.
    topic_words = vocab_array[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))