# Analyzing Slack logs from NeIC channels

## TODO
- use examples from github.com/jalajthanaki/NLPython
- train sentiment analysis engine on some samples
- remove backslashes from words like it's and don't (seems to be some issue with how lda treats it, it separates don and t)
- upload slack logs zipfile to google drive
- count number of contributors in each channel (using `<u..>` string matching)
- do some analysis of time of first post each day in each channel


### Import packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# __future__ imports come first so this cell stays valid when the notebook
# is exported to a plain script (they must precede every other statement).
from __future__ import division  # "/" is true division, also on Python 2

import os
import sys

print(sys.version)
# Python 2 only, left disabled:
#reload(sys)
#sys.setdefaultencoding('utf8')

### Get the raw data

In [None]:
%%bash
# Unpack the Slack export into slack_logs/.  Written so the cell can be
# re-run: the directory may already exist, the archive may already have been
# moved, and previously extracted files are overwritten without prompting.
set -e
mkdir -p slack_logs
cd slack_logs
if [ -f ../NeIC_Slack_export_Dec10_2017.zip ]; then
    mv ../NeIC_Slack_export_Dec10_2017.zip .
fi
unzip -o NeIC_Slack_export_Dec10_2017.zip
cd ..

### Inspect directory structure and file format

In [None]:
dirs = %sx ls -d slack_logs/*/
for n,i in enumerate(dirs):
    print(n,i)

#### Let's first explore one channel

In [None]:
# Pick the coderefinery channel by name rather than by its position in the
# listing, which shifts as soon as a channel directory is added or removed.
# NOTE: `dir` shadows the builtin of the same name; it is kept because the
# following cells refer to it.
dir = next(d for d in dirs if d.rstrip("/").endswith("/coderefinery"))
os.listdir(dir)

Look at the structure of the json files

In [None]:
import json

# os.listdir() returns entries in arbitrary order, so sort the names before
# taking the first one (presumably the files are named by date -- confirm)
dates = sorted(os.listdir(dir))
d = dates[0] # pick the first date

# read in contents of json file
with open(os.path.join(dir, d), "r") as f:
    raw_json = json.load(f)

# dump json
dump = json.dumps(raw_json, indent=4)
print(dump)

Aha, `subtype` is only present if it's not a regular message

In [None]:
# Print type and text of every entry of the file; entries that carry a
# "subtype" key get that value printed in between.
for entry in raw_json:
    if "subtype" not in entry:
        print(entry["type"], entry["text"])
    else:
        print(entry["type"], entry["subtype"], entry["text"])


### Extracting messages

We now extract all regular messages in one channel

In [None]:
# this is still the coderefinery channel; the file names are sorted because
# os.listdir() gives no ordering guarantee and the message order should be
# the same on every run
dates = sorted(os.listdir(dir))
messages = []

for d in dates:
    with open(os.path.join(dir, d), "r") as f:
        raw_json = json.load(f)

    # entries without a "subtype" key are the regular messages
    messages.extend(j["text"] for j in raw_json if "subtype" not in j)


Add all words in all messages to one list

In [None]:
import re

# Lower-case every whitespace-separated token of every message.  Punctuation
# stays attached to its word: none of the earlier experiments with stripping
# or splitting off ,.!? (or appending a full stop per message) is applied.
words = [tok.lower() for msg in messages for tok in msg.split()]

words[-100:]

#### Let's do this for all the Slack channels:

In [None]:
# channel name = directory path without the "slack_logs/" prefix and slashes
all_channels = []
for path in dirs:
    all_channels.append(path.replace("slack_logs/", "").replace("/", ""))

# dictionary keyed by channel name; every value starts out as None
words_in_channels = {name: None for name in all_channels}

In [None]:
# flatten a collection of messages into one list of words
import re
def join_messages(messages):
    """Return the words of all messages as one flat, lower-cased list.

    Every message is split on whitespace with ``str.split()``.  Punctuation is
    left attached to the word it follows, and messages that are empty or
    contain only whitespace contribute nothing.

    Args:
        messages: iterable of message strings.

    Returns:
        A list with the lower-cased words in message order.
    """
    return [word.lower() for message in messages for word in message.split()]


In [None]:
# join messages in all channels into elements of words_in_channels dict
for channel in all_channels:
    channel_dir = os.path.join("slack_logs", channel)
    # sorted: os.listdir() order is arbitrary, and the order in which the
    # files are read determines the word order stored for the channel
    dates = sorted(os.listdir(channel_dir))
    messages = []
    for d in dates:
        with open(os.path.join(channel_dir, d), "r") as f:
            raw_json = json.load(f)

        # exclude non-message messages (entries that carry a "subtype" key)
        messages.extend(j["text"] for j in raw_json if "subtype" not in j)
    words_in_channels[channel] = join_messages(messages)
    print("channel {} has {} words".format(channel, len(words_in_channels[channel])))

Remove empty channels

In [None]:
# Loop over a snapshot of the keys: deleting entries while iterating over the
# live key view raises "dictionary changed size during iteration" on Python 3.
for i in list(words_in_channels.keys()):
    if len(words_in_channels[i]) == 0:
        words_in_channels.pop(i, None)

Plot number of words in channels

In [None]:
plt.rcParams["figure.figsize"] = [12,9]
# Materialise the channel names as a list (a dict view is not a sequence on
# Python 3) and derive the counts from that same list so both stay aligned.
x = list(words_in_channels.keys())
y = [len(words_in_channels[i]) for i in x]
# horizontal bars: word counts along the x axis, channel names along the y axis
ax = sns.barplot(x=y, y=x)

ax.set_xlim([0,200000])

From now on, let's focus on the largest channels

In [None]:
# fixed list of channel names that the analysis cells below iterate over
channels = ["tryggve","general","xt","web","random","arc-debugging","ndgf","coderefinery"]

### Simple natural language processing

Natural language toolkit tests

In [None]:
import nltk as nltk

Frequency distribution of words

In [None]:
# frequency table per channel, plus its 20 most frequent entries
dists = {}
most_common_words = {}
for name in channels:
    freq = nltk.FreqDist(words_in_channels[name])
    dists[name] = freq
    most_common_words[name] = freq.most_common(20)


We create a dataframe to work with:

In [None]:
# one column per channel, holding that channel's most_common() entries
df_words = pd.DataFrame(most_common_words)
df_words.head(30)

In [None]:
# collect the first element of every cell of df_words into one set
common_words = {cell[0] for _, ranking in df_words.iterrows() for cell in ranking}
common_words

> *Exercise: create a new dataframe with the common words as indices and the number of appearances as values along rows*

### Lexical diversity (type-token ratio)

Let's look at lexical diversity, i.e. the ratio of the number of distinct words to the total number of words

In [None]:
def lexical_diversity(text):
    """Return the type-token ratio of a sequence of tokens.

    Args:
        text: sized collection of hashable tokens (e.g. a list of words).

    Returns:
        Number of distinct tokens divided by the total number of tokens, as a
        float.  An empty collection yields 0.0 instead of raising
        ZeroDivisionError.
    """
    total = len(text)
    if total == 0:
        return 0.0
    # float() keeps this a true division even where "/" on ints truncates
    return len(set(text)) / float(total)

In [None]:
# report the type-token ratio for every channel in `channels`
for name in channels:
    ratio = lexical_diversity(words_in_channels[name])
    print("Lexical diversity in %s is %f"%(name,ratio))

Linguistic richness is clearly greatest in `random`, closely followed by `general`!

### Collocations, contexts and similar words

In [None]:
# this is how you can get help directly in the notebook!
nltk.word_tokenize?

In [None]:
nltk.Text?

In [None]:
nltk.Text.collocations?

**Collocations (sequences of words that co-occur more often than expected by chance)**

In [None]:
for channel in channels:
    # glue the channel's words back into one string and let NLTK tokenize it
    tokens = nltk.word_tokenize(" ".join(words_in_channels[channel]))
    print(channel)
    print("------------")
    nltk.Text(tokens).collocations()
    print("------------------------------------------------")
    print("")


**Searching for words**

What do people in different channels find boring?

In [None]:
for channel in channels:
    # glue the channel's words back into one string and let NLTK tokenize it
    channel_text = nltk.Text(nltk.word_tokenize(" ".join(words_in_channels[channel])))
    print(channel)
    print("------------")
    # show every occurrence of the word together with its surrounding context
    channel_text.concordance("boring")
    print("------------------------------------------------")
    print("")



Words that appear in similar contexts:

In [None]:
nltk.Text.similar?

In [None]:
for channel in channels:
    joined = " ".join(words_in_channels[channel])
    # NLTK does its own tokenization of the re-joined channel text
    channel_text = nltk.Text(nltk.word_tokenize(joined))
    print(channel)
    print("------------")
    channel_text.similar("good")
    print("------------------------------------------------")
    print("")


## "Sentiment analysis": emojis

Emojis in the Slack logs are expressed like `:slightly_smiling_face:` 

In [None]:
# flatten the per-channel word lists into one list, then into one long string
words = []
for channel_words in words_in_channels.values():
    words.extend(channel_words)
all_words = " ".join(words)

# only letters and underscores between the colons: filters out strings like :43:
emojis = re.findall(r":[a-zA-Z_]+:", all_words)

dist = nltk.FreqDist(emojis)
dist.most_common(30)


Let's find the unique emojis

In [None]:
# `most_common_emojis` is not assigned by any earlier cell, so build it here:
# for every channel, the 40 most frequent emoji strings with their counts.
most_common_emojis = {}
for channel in channels:
    channel_text = " ".join(words_in_channels[channel])
    # only letters and underscores between the colons (skips e.g. :43:)
    channel_dist = nltk.FreqDist(re.findall(r":[a-zA-Z_]+:", channel_text))
    most_common_emojis[channel] = channel_dist.most_common(40)

# union of the emoji names over all channel rankings
unique_emojis = set()
for ranking in most_common_emojis.values():
    unique_emojis.update(pair[0] for pair in ranking)
unique_emojis

Let's investigate a few key emojis

In [None]:
import emoji
common_emojis = [u":disappointed:",u":wink:",u":slightly_smiling_face:",u":simple_smile:",
              u":thumbsup:",u":clap:",u":stuck_out_tongue:",u":smile:",u":grinning:"]

# show how each alias renders, followed by the raw alias and a blank line
for i in common_emojis:
    # NOTE(review): newer releases of the emoji package reportedly replace
    # `use_aliases=True` with `language="alias"` -- confirm against the
    # installed version.
    print(emoji.emojize('NeIC is %s'%i, use_aliases=True))
    print(i)
    # a bare `print` only emits a blank line on Python 2; on Python 3 it is a
    # no-op expression, so print an empty string explicitly
    print("")

hmm, `emoji` package doesn't understand `:simple_smile:`

In [None]:
# For every channel: how often each entry of common_emojis occurs in the
# channel's word list (list.count, i.e. whole-token matches), in
# common_emojis order.
channel_emojis = {}
for channel in channels:
    tokens = words_in_channels[channel]
    channel_emojis[channel] = [tokens.count(alias) for alias in common_emojis]

channel_emojis

In [None]:
# Reduce every channel's emoji ranking to the emojis of interest and give the
# ones that are missing from the ranking an explicit count of 0.
# NOTE(review): the ranking is rebuilt here (40 most frequent emoji strings per
# channel) so that the cell does not depend on state from other cells, and the
# undefined name `key_emojis` was replaced by `common_emojis`, which is assumed
# to be the intended list -- confirm.
most_common_emojis = {}
for channel in channels:
    channel_text = " ".join(words_in_channels[channel])
    ranking = nltk.FreqDist(re.findall(r":[a-zA-Z_]+:", channel_text)).most_common(40)
    # (emoji, count) pairs from the ranking that are emojis of interest
    l1 = [x for x in ranking if x[0] in common_emojis]
    found = [x[0] for x in l1]
    # emojis of interest that did not make it into the ranking
    l2 = [(x, 0) for x in common_emojis if x not in found]
    print(l1)
    print(l2)
    most_common_emojis[channel] = sorted(l1 + l2)
most_common_emojis

In [None]:
# one column per channel, one row per entry of the count lists
df_emojis = pd.DataFrame(channel_emojis)
df_emojis.head(10)

In [None]:
#emojis, y = zip(*df_emojis.coderefinery)
#df_emojis['emojis'] = emojis
#df_emojis.set_index('emojis',drop=True, inplace=True)
#df_emojis.head(5)

Use real emojis as indices in dataframe

In [None]:
# run every alias through emojize and use the results as the row index
emojis = [emoji.emojize(alias, use_aliases=True) for alias in common_emojis]
df_emojis["emojis"] = emojis
df_emojis.set_index('emojis', inplace=True, drop=True)
df_emojis.head(10)

Doesn't look good with the unsupported emoji. Let's do some pandas magics

In [None]:
# The rows follow the order of common_emojis, where position 2 is
# :slightly_smiling_face: and position 3 is :simple_smile:.
row_keep = df_emojis.index[2]    # label of the row that is kept
row_delete = df_emojis.index[3]  # label of the row that gets folded into it

In [None]:
# fold the counts of row_delete into row_keep, then remove row_delete
merged_counts = df_emojis.loc[row_keep] + df_emojis.loc[row_delete]
df_emojis.loc[row_keep] = merged_counts
df_emojis.drop([row_delete], inplace=True)

df_emojis


Extract only the numbers from the dataframe tuples

In [None]:
#for x in df_emojis:
#    dummy, y = zip(*df_emojis[x])
#    df_emojis[x] = y
#df_emojis.head(5)

Normalize to total number of selected emojis in each channel

In [None]:
# express every count as a percentage of its column (channel) total
column_totals = df_emojis.sum()
df_tmp = (100 * df_emojis) / column_totals
df_tmp.round(1)

### Conclusions

(virtually speaking...)
- NeIC people seem to be rather happy overall 
- There's not a lot of clapping and thumbs-up-giving, except for `XT` and `Tryggve` people 
- On the other hand, `XT`-ers don't smile as much as other channels, but they laugh quite a bit
- The most ambiguous communication takes place on `random` and `general`, as evidenced by the high proportion of winking
- `NDGF`-ers are the most disappointed channel. Anything we can do to help guys? 😉 

In [None]:
# don't really need a heatmap plot

#emojis currently don't work as y-labels
#%matplotlib inline
##normalize:
#plt.rcParams["figure.figsize"] = [12.0, 8.0]
#plt.rcParams['figure.dpi'] = 300
#sns.set(font='Segoe UI Emoji')
#norm = 100*df_emojis / df_emojis.sum()

#g = sns.heatmap(norm,linewidths=.5,annot=True,cbar=True)
#plt.show()

### LDA 

First need some preprocessing

In [None]:
# one long space-separated string per channel in `channels`
joined_words_in_channels = {
    name: " ".join(words_in_channels[name]) for name in channels
}


In [None]:
import textmining

# every channel's joined text is added as one document, in `channels` order
tdm = textmining.TermDocumentMatrix()
for name in channels:
    tdm.add_doc(joined_words_in_channels[name])

# persist the matrix as csv
tdm.write_csv('matrix2.csv', cutoff=1)


In [None]:
# Materialise the rows once instead of walking tdm.rows() twice: the first row
# is used as the vocabulary, the remaining rows as the count matrix.
tdm_rows = list(tdm.rows(cutoff=1))
vocab = tdm_rows[0]
titles = channels
X = np.array(tdm_rows[1:])

X.shape

In [None]:
import lda

model = lda.LDA(n_topics=20, n_iter=1500, random_state=1)
model.fit(X)  # model.fit_transform(X) is also available
topic_word = model.topic_word_  # model.components_ also works
n_top_words = 8
vocab_array = np.array(vocab)
for topic_id, weights in enumerate(topic_word):
    # argsort is ascending: walk it backwards to get the highest weights first
    top_indices = np.argsort(weights)[:-(n_top_words+1):-1]
    print('Topic {}: {}'.format(topic_id, ' '.join(vocab_array[top_indices])))

From the fit model we can look at the topic-word probabilities

In [None]:
topic_word = model.topic_word_

# print the total of each of the first five rows of the topic-word matrix
for topic_id in range(5):
    row_total = sum(topic_word[topic_id, :])
    print("topic: {} sum: {}".format(topic_id, row_total))

In [None]:
n = 15
vocab_array = np.array(vocab)
for topic_id, weights in enumerate(topic_word):
    # vocabulary ordered by ascending weight; the reversed tail holds the top n
    ranked = vocab_array[np.argsort(weights)]
    print('*Topic {}\n- {}'.format(topic_id, ' '.join(ranked[:-(n+1):-1])))

The other information we get from the model is document-topic probabilities

In [None]:
doc_topic = model.doc_topic_
# Walk over the documents that exist (one title per document) instead of a
# hard-coded count of 8, so the cell keeps working when `channels` changes.
for n, title in enumerate(titles):
    topic_most_pr = doc_topic[n].argmax()  # index of the highest-valued topic
    print("doc: {} topic: {}\n{}...".format(n, topic_most_pr, title[:50]))

In [None]:
# One stem plot per document (channel) showing its values over all topics.
# Panel count and x-range are taken from doc_topic rather than hard-coded
# (previously 8 panels and 20 topics), so changing `channels` or n_topics
# does not require editing this cell.
n_docs, n_topics = doc_topic.shape
# squeeze=False always returns a 2-D array of axes, also for a single panel
f, ax = plt.subplots(n_docs, 1, figsize=(8, 12), sharex=True, squeeze=False)
ax = ax[:, 0]
for k in range(n_docs):
    ax[k].stem(doc_topic[k,:], linefmt='r-',
               markerfmt='ro', basefmt='w-')
    ax[k].set_xlim(-0.5, n_topics - 0.5)
    ax[k].set_xticks(range(n_topics))
    ax[k].set_ylim(0, .5)
    ax[k].set_ylabel("Prob")
    ax[k].set_title("{}".format(channels[k]))

# with a shared x axis only the bottom panel needs the axis label
ax[-1].set_xlabel("Topic")

plt.tight_layout()
plt.show()

In [None]:
#f, ax= plt.subplots(5, 1, figsize=(8, 6), sharex=True)
#for i, k in enumerate([0, 5, 9, 14, 19]):
#    ax[i].stem(topic_word[k,:], linefmt='b-',
#               markerfmt='bo', basefmt='w-')
#    ax[i].set_xlim(-50,4350)
#    ax[i].set_ylim(0, 0.08)
#    ax[i].set_ylabel("Prob")
#    ax[i].set_title("topic {}".format(k))

#ax[4].set_xlabel("word")

#plt.tight_layout()
#plt.show()