In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell runs, so edits to the local `src` package are picked up without a
# kernel restart.
%reload_ext autoreload
%autoreload 2

In [None]:
import os, sys
import re
import json
import glob
import datetime
from collections import Counter
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import random

from nltk.corpus import stopwords
from wordcloud import WordCloud

In [None]:
# Put the parent directory (resolved against the current working directory)
# at the front of sys.path so the `src` package imported below can be found.
# `rpath` is also reused further down to build the data directory paths.
rpath = os.path.abspath('..')
if rpath not in sys.path:  # guard: re-running this cell must not add duplicates
    sys.path.insert(0, rpath)

from src.loader import SlackDataLoader
import src.utils as utils

Columns we can get from a slack message
message_type, message_content, sender_id, time_sent, message_distribution, time_thread_start, reply_count, reply_user_count, time_thread_end, reply_users

From a single slack message, we can get

The message
Type (message, file, link, etc)
The sender_id (assigned by slack)
The time the message was sent
The team (I don't know yet what this field represents)
The type of the message (broadcast message, in-house, or plain message)
The thread the message generated (from the thread we can get):
7.1 Text/content of the message
7.2 The thread time of the message
7.3 The thread count (reply count)
7.4 The number of users that replied to the message (count of users that participated in the thread)
7.5 The time the last thread message was sent
7.6 The users that participated in the thread (their ids are stored as well)

In [None]:
def _plot_sender_counts(counts, title):
    """Draw a single bar chart of per-sender message counts and show it."""
    fig, ax = plt.subplots(figsize=(15, 7.5))
    counts.plot.bar(ax=ax)
    ax.set_title(title, size=15, fontweight='bold')
    ax.set_xlabel("Sender Name", size=18)
    ax.set_ylabel("Frequency", size=14)
    ax.tick_params(axis='both', labelsize=12)
    plt.show()


def get_top_20_user(data, channel='Random', top_n=20, bottom_n=10):
    """Plot the most and the least active message senders.

    Two bar charts are shown: the `top_n` senders with the most messages and
    the `bottom_n` senders with the fewest. Nothing is returned.

    Parameters
    ----------
    data : DataFrame-like
        Must expose a 'sender_name' column; one row is counted as one message.
    channel : str
        Channel name interpolated into the chart titles.
    top_n : int
        Number of most active senders to show (default 20).
    bottom_n : int
        Number of least active senders to show (default 10).
    """
    # value_counts() sorts descending by default; compute it once and reuse it
    # for both charts instead of recounting the column per chart.
    message_counts = data['sender_name'].value_counts()

    if message_counts.empty:
        # A bar chart of nothing is meaningless (and the plotting backend may
        # fail on it), so report and bail out instead.
        print(f'No messages to plot for #{channel}')
        return

    _plot_sender_counts(
        message_counts.head(top_n),
        f'Top {top_n} Message Senders in #{channel} channels',
    )
    _plot_sender_counts(
        message_counts.tail(bottom_n),
        f'Bottom {bottom_n} Message Senders in #{channel} channels',
    )

def draw_avg_reply_count(data, channel='Random'):
    """Bar-chart the 20 senders with the highest mean 'reply_count'.

    Rows of `data` are grouped by 'sender_name', 'reply_count' is averaged per
    sender, and the 20 largest averages are plotted. `channel` only appears in
    the chart title.
    """
    mean_replies_by_sender = data.groupby('sender_name')['reply_count'].mean()
    ranked = mean_replies_by_sender.sort_values(ascending=False)
    ranked[:20].plot.bar(figsize=(15, 7.5))

    plt.title(
        f'Average Number of reply count per Sender in #{channel}',
        size=20,
        fontweight='bold',
    )
    plt.xlabel("Sender Name", size=18)
    plt.ylabel("Frequency", size=18)
    plt.xticks(size=14)
    plt.yticks(size=14)
    plt.show()

def draw_avg_reply_users_count(data, channel='Random'):
    """Bar-chart the 20 senders with the highest mean 'reply_users_count'.

    Rows of `data` are grouped by 'sender_name', 'reply_users_count' is
    averaged per sender, and the 20 largest averages are plotted. `channel`
    only appears in the chart title.
    """
    per_sender = data.groupby('sender_name')['reply_users_count']
    top_senders = per_sender.mean().sort_values(ascending=False)[:20]
    top_senders.plot.bar(figsize=(15, 7.5))

    plt.title(
        f'Average Number of reply user count per Sender in #{channel}',
        size=20,
        fontweight='bold',
    )
    plt.xlabel("Sender Name", size=18)
    plt.ylabel("Frequency", size=18)
    plt.xticks(size=14)
    plt.yticks(size=14)
    plt.show()

def draw_wordcloud(msg_content, week):
    """Render a word cloud built from a collection of message texts.

    Parameters
    ----------
    msg_content : iterable
        Message texts. Entries that are not strings (for example NaN values
        coming from a DataFrame column) are skipped.
    week : str
        Label interpolated into the figure title.

    Nothing is returned; the figure is shown. If no text is left after
    filtering, a notice is printed and no figure is drawn.
    """
    # str.join raises TypeError on non-string items, so keep real strings only.
    all_words = ' '.join(msg for msg in msg_content if isinstance(msg, str))

    if not all_words.strip():
        # There is nothing to lay out; skip instead of letting the word cloud
        # generation fail on empty text.
        print(f'No text available to build a word cloud for {week}')
        return

    word_cloud = WordCloud(
        background_color='#975429',
        width=500,
        height=300,
        random_state=21,  # fixed seed so the layout is reproducible
        max_words=500,
        mode='RGBA',
        max_font_size=140,
        stopwords=stopwords.words('english'),
    ).generate(all_words)

    plt.figure(figsize=(15, 7.5))
    plt.imshow(word_cloud, interpolation="bilinear")
    plt.axis('off')
    plt.title(f'WordCloud for {week}', size=30)
    # Run the layout pass only after the title exists so the title is taken
    # into account when the margins are computed.
    plt.tight_layout()
    plt.show()

def draw_user_reaction(data, channel='General'):
    """Bar-chart summed reply statistics for the 10 most-replied-to senders.

    'reply_count' and 'reply_users_count' are summed per 'sender_name'; the
    10 senders with the largest total 'reply_count' are plotted with both
    totals side by side. `channel` only appears in the chart title.
    """
    totals_by_sender = data.groupby('sender_name')[['reply_count', 'reply_users_count']].sum()
    most_replied = totals_by_sender.sort_values(by='reply_count', ascending=False)[:10]
    most_replied.plot.bar(figsize=(15, 7.5))

    plt.title(f'User with the most reaction in #{channel}', size=25)
    plt.xlabel("Sender Name", size=18)
    plt.ylabel("Frequency", size=18)
    plt.xticks(size=14)
    plt.yticks(size=14)
    plt.show()

Data Loading

In [None]:
# SlackDataLoader is already imported in the path-setup cell near the top of
# the notebook, so it is not imported again here.

# Root of the Slack export, located under the repository root (`rpath`).
data_dir = os.path.join(rpath, 'data')

# Initialize DataLoader
data_loader = SlackDataLoader(data_dir)

# Workspace-level metadata and the id <-> name lookup tables
channels = data_loader.get_channels()
users = data_loader.get_users()
userNamesById, userIdsByName = data_loader.get_user_map()
channelNamesById, channelIdsByName = data_loader.get_channel_map()

# Parse every channel once; both dicts are keyed by channel id
all_channels_slack_data = {}
all_channels_slack_reactions = {}
for channel in channels:
    # The final '' component keeps the trailing path separator that the
    # original hand-built path had (the loader may rely on it -- TODO confirm).
    ch_path = os.path.join(data_dir, 'anonymized', channel["name"], '')
    all_channels_slack_reactions[channel["id"]] = data_loader.parse_slack_reaction(ch_path, channel["name"])
    all_channels_slack_data[channel["id"]] = data_loader.slack_parser(ch_path)

In [None]:
# Preview the first rows of every parsed channel. Each frame is passed to
# display() separately so it gets its own rich rendering; a list of frames
# would be shown as one plain-text repr instead.
for channel_id, channel_df in all_channels_slack_data.items():
    print(f'Channel {channel_id}')
    display(channel_df.head())