In [1]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import datasets
import multiprocessing as mp
import torch
import torchvision

from datasets import Dataset
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
from tqdm.notebook import tqdm

# Render matplotlib figures inline in the notebook and use the ggplot theme.
%matplotlib inline
plt.style.use('ggplot')

# Sentiment-analysis pipeline used by classify_messages(). The checkpoint name
# indicates a DistilBERT model fine-tuned on SST-2 (English).
classifier = pipeline("sentiment-analysis", model='distilbert-base-uncased-finetuned-sst-2-english', framework='pt')
# Point both the pipeline and its model at PyTorch's "mps" device.
# NOTE(review): the device is hard-coded, so this cell presumably fails on
# machines without MPS support -- confirm, and consider selecting the device
# based on availability.
classifier.device = torch.device("mps")
classifier.model = classifier.model.to("mps")

In [2]:
def read_json(fp):
  """Parse the JSON file at `fp` and return the decoded object.

  Args:
    fp: Path of the JSON file to read.

  Returns:
    The decoded JSON document.

  Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If the content is not valid JSON.
  """
  # Pin the encoding: without it, open() uses the platform locale, so
  # non-ASCII message text (accents, emoji) can fail to decode on some systems.
  with open(fp, 'r', encoding='utf-8') as f:
    return json.load(f)

  
def import_all_messages(dir):
  """Load and concatenate the contents of every `.json` file directly in `dir`.

  Args:
    dir: Path of a directory holding JSON message files.

  Returns:
    A single list with the items of each file appended in filename order,
    or an empty list when `dir` is not an existing directory.
  """
  # isdir rather than exists: a plain file at this path would make
  # os.listdir raise instead of yielding "no messages".
  if not os.path.isdir(dir):
    return []
  msgs = []
  # os.listdir order is arbitrary; sorting keeps the result deterministic.
  for fn in sorted(os.listdir(dir)):
    # Skip stray non-JSON entries (e.g. macOS .DS_Store) that would
    # otherwise abort the whole import with a decode error.
    if not fn.endswith('.json'):
      continue
    msgs.extend(read_json(os.path.join(dir, fn)))
  return msgs


def get_channel_type(ch):
  """Classify a channel record as 'mpim', 'private' or 'public'.

  `ch` must provide the 'is_mpim' flag; 'is_private' is only consulted
  when 'is_mpim' is falsy.
  """
  if ch['is_mpim']:
    return 'mpim'
  return 'private' if ch['is_private'] else 'public'

  
def _tag_human_messages(chan_msgs, users, channel, channel_type):
  """Return the messages written by known, non-bot users, annotated in place
  with 'channel', 'channel_type' and 'username'."""
  tagged = []
  for msg in chan_msgs:
    user = users.get(msg.get('user'))
    # Drop messages with no author, an author missing from users.json, or a bot author.
    if not user or user['is_bot']:
      continue
    msg['channel'] = channel
    msg['channel_type'] = channel_type
    msg['username'] = user['name']
    tagged.append(msg)
  return tagged


def import_slack_messages(dir):
  """Import every human-authored message from the export directory `dir`.

  Reads `users.json`, `channels.json`, `mpims.json` and `dms.json` from `dir`.
  Channel and mpim messages are loaded from the sub-directory named after the
  channel's 'name_normalized'; DM messages from the sub-directory named after
  the DM id.

  Args:
    dir: Root directory of the export.

  Returns:
    A list of message dicts. Each dict is the loaded message with three keys
    added: 'channel', 'channel_type' ('mpim', 'private', 'public' or 'dm')
    and 'username'.

  Raises:
    OSError: If one of the four index files is missing.
  """
  users = {u['id']: u for u in read_json(os.path.join(dir, "users.json"))}
  channels = {c['id']: c for c in read_json(os.path.join(dir, "channels.json"))}
  mpims = {c['id']: c for c in read_json(os.path.join(dir, "mpims.json"))}
  dms = {c['id']: c for c in read_json(os.path.join(dir, "dms.json"))}

  # mpims are handled exactly like channels; on an id clash the mpim entry wins.
  channels = {**channels, **mpims}

  msgs = []
  for ch in tqdm(channels.values(), total=len(channels), desc='channels'):
    chan_msgs = import_all_messages(os.path.join(dir, ch['name_normalized']))
    msgs += _tag_human_messages(
      chan_msgs, users, ch['name_normalized'], get_channel_type(ch))

  for dm_id, dm in tqdm(dms.items(), total=len(dms), desc='dms'):
    chan_msgs = import_all_messages(os.path.join(dir, dm_id))
    # A DM member may be absent from users.json; fall back to the raw member
    # id instead of crashing on None['name'].
    usernames = [users.get(u, {}).get('name', u) for u in dm['members']]
    dm_label = 'dm-' + '-'.join(usernames)
    msgs += _tag_human_messages(chan_msgs, users, dm_label, 'dm')
  return msgs


def build_messages_df(dirs):
  """Build the unscored messages DataFrame from one or more export directories.

  Args:
    dirs: Iterable of export root directories, each passed to
      import_slack_messages().

  Returns:
    A DataFrame indexed by 'ts' with columns 'username', 'text', 'channel',
    'channel_type' and 'score'. Only rows whose text is longer than 3 and
    shorter than 512 characters are kept; 'score' is initialised to None.
  """
  msgs = []
  for dir in dirs:
    msgs += import_slack_messages(dir)
  print("imported", len(msgs), "messages")

  # Naming the columns up front keeps the frame well-formed even when nothing
  # was imported (otherwise set_index('ts') fails with a KeyError).
  columns = ['ts', 'username', 'text', 'channel', 'channel_type']
  df = pd.DataFrame(msgs, columns=columns).set_index('ts')

  # Character-length window for usable messages.
  # NOTE(review): 512 presumably mirrors the classifier's input limit, but
  # this counts characters, not tokens -- confirm.
  text_len = df['text'].str.len()
  # .copy() so the 'score' assignment below does not write into a filtered
  # view (avoids SettingWithCopyWarning).
  df = df[(text_len < 512) & (text_len > 3)].copy()
  df['score'] = None
  return df


def read_messages_batched(dir):
  """Read every `.csv` batch file in `dir` into one DataFrame sorted by 'ts'.

  Args:
    dir: Directory containing the batch CSV files.

  Returns:
    The concatenation of all batches, ordered by the 'ts' column.

  Raises:
    ValueError: If `dir` contains no CSV files (nothing to concatenate).
  """
  # Ignore stray non-CSV entries (e.g. .DS_Store) and read in a stable order.
  paths = [
    os.path.join(dir, fn)
    for fn in sorted(os.listdir(dir))
    if fn.endswith('.csv')
  ]
  df = pd.concat([pd.read_csv(p) for p in paths])
  # sort_values returns a new frame; the result must be kept, otherwise the
  # caller receives the rows in file order.
  return df.sort_values(by=['ts'], kind='stable')


def classify_messages(df, batch_size=16):
  """Score the 'text' column of `df` with the module-level `classifier`.

  Returns one signed score per row: the classifier's score, negated when the
  predicted label is 'NEGATIVE'.
  """
  ds = Dataset.from_pandas(df)
  predictions = classifier(KeyDataset(ds, "text"), batch_size=batch_size)
  return [
    pred['score'] * (-1 if pred['label'] == 'NEGATIVE' else 1)
    for pred in tqdm(predictions, total=len(ds))
  ]


def write_messages_batched(df, dir, batch_size=25000):
  """Write `df` to `dir` as consecutive CSV files of at most `batch_size` rows.

  Files are named `batch-001.csv`, `batch-002.csv`, ... in row order.

  Args:
    df: DataFrame to write.
    dir: Output directory; created if it does not exist.
    batch_size: Maximum number of rows per file.
  """
  os.makedirs(dir, exist_ok=True)
  # Slice the frame that was passed in (previously the global `msg_df` was
  # written and the `df` argument was ignored). iloc keeps the slicing
  # positional regardless of the index type.
  starts = range(0, len(df), batch_size)
  for batch_no, start in enumerate(starts, start=1):
    batch = df.iloc[start:start + batch_size]
    batch.to_csv(os.path.join(dir, f'batch-{batch_no:03d}.csv'))

# n = 25000
# c = 0
# for i in range(0, len(msg_df), n):
#   c += 1
#   df = msg_df[i:i+n]
#   scores = classify_messages(df)
#   df['score'] = scores
#   df.to_csv(os.path.join('./../localdata/slack-data/scored', f'batch-{c:03d}.csv'))


In [3]:
# Uncomment to build messages from source
# dirs = [
#   "./../localdata/slack-data/pub",
#   "./../localdata/slack-data/priv",
#   "./../localdata/slack-data/dms_031023",
# ]
# build_messages_df(dirs)


# Load the pre-scored messages and keep only those whose sentiment score is
# beyond +/-0.9995.
msg_df = pd.read_csv('./../localdata/slack-data/messages-scored.csv')
msg_df = msg_df[msg_df['score'].abs() > 0.9995]
#msg_df.sample(100)


In [4]:
# Restrict the analysis to messages posted in public channels.
df = msg_df.loc[msg_df['channel_type'].eq('public')]
# ax = df['username'].value_counts().head(20).plot(kind='bar', figsize=(10, 5))
# plt.show()