In [133]:
import json
import hashlib
import pickle
import numpy as np
import random
import string
import os
import shutil
import subprocess
import sys
import pandas as pd
from zipfile import ZipFile
from collections import Counter
from pathlib import Path

In [145]:
def unzip_msg_files(zip_path, target_dir):
    """Extract only the ``.json`` members of a zip archive.

    Args:
        zip_path: Path of the zip archive to read.
        target_dir: Directory the matching members are extracted into.
    """
    with ZipFile(zip_path, 'r') as archive:
        # Select the members first, then extract them in one call.
        json_members = [member for member in archive.namelist()
                        if member.endswith(".json")]
        archive.extractall(target_dir, members=json_members)
                
def read_json(file):
    """Load and return the parsed JSON content of ``file``.

    The file is decoded as UTF-8 explicitly instead of with the platform's
    locale encoding, so non-ASCII content is read the same way on every
    machine.

    Args:
        file: Path (``str`` or path-like) of the JSON file.

    Returns:
        The Python object produced by ``json.load``.
    """
    with open(file, "r", encoding="utf-8") as f:
        return json.load(f)
    
def read_convo(file):
    """Read a conversation json file and return it as a dict.

    The value returned by ``read_json`` is passed through ``json.loads``
    once more, i.e. the file is expected to hold a JSON string that itself
    contains the conversation as JSON.
    """
    encoded_convo = read_json(file)
    return json.loads(encoded_convo)

def hash_name(name):
    """Return the SHA-1 hex digest of ``name`` (simplified version, no salt)."""
    digest = hashlib.sha1()
    digest.update(name.encode())
    return digest.hexdigest()

def create_group_id(groupchat):
    """Derive a group id from the participant names of ``groupchat``.

    The names are sorted before being concatenated and hashed, so the id
    does not depend on the order of ``groupchat["participants"]``.
    """
    ordered_names = list(groupchat["participants"])
    ordered_names.sort()
    return hash_name("".join(ordered_names))

def find_most_common(participant_list):
    """Return the element that occurs most often in ``participant_list``."""
    ranked = Counter(participant_list).most_common(1)
    top_entry = ranked[0]  # (element, count) pair
    return top_entry[0]


def fix_dropout_dict(data_path):
    """Load ``dropout.json`` from ``data_path``, fix its key and add a name.

    Every other ``*.json`` file in the folder is read as a conversation and
    its participants are collected; the participant that occurs most often
    is stored under ``"name"`` (presumably the person who exported the
    data - TODO confirm). The ``"is_dropout"`` entry is moved to the key
    ``"still_cogsci"`` with its value unchanged.
    NOTE(review): the two key names read like opposites while the value is
    not inverted - confirm this is intended.

    Args:
        data_path: Folder containing ``dropout.json`` and the conversation
            json files.

    Returns:
        The dropout dict with the keys ``"still_cogsci"`` and ``"name"``.

    Raises:
        KeyError: if ``dropout.json`` has no ``"is_dropout"`` entry.
        IndexError: if no participants were found in the folder.
    """
    folder = Path(data_path)
    convo_files = [file for file in folder.glob("*.json")
                   if file.name != "dropout.json"]

    # Conversation files are decoded through the shared read_convo helper
    # rather than repeating its double json decoding here.
    participant_list = []
    for file in convo_files:
        participant_list.extend(read_convo(file)["participants"])

    dropout_dict = read_json(folder / "dropout.json")
    dropout_dict["still_cogsci"] = dropout_dict.pop("is_dropout")
    dropout_dict["name"] = find_most_common(participant_list)
    return dropout_dict

def add_reactions(msg, rel_list):
    """Append one "reaction" edge per reaction on ``msg`` to ``rel_list``.

    Each entry of ``msg["reactions"]`` becomes the edge's "from" value and
    the message sender its "to" value. A message without a "reactions" key
    leaves ``rel_list`` untouched. The list is modified in place and
    nothing is returned.
    """
    if "reactions" not in msg:
        return
    for reactor in msg["reactions"]:
        rel_list.append({
            "from": reactor,
            "to": msg["sender_name"],
            "timestamp": msg["timestamp_ms"],
            "rel_type": "reaction",
        })

            
            
def create_member_edges(group_convo, group_id):
    """Return one "group" edge per participant, pointing at ``group_id``.

    The resulting frame has the columns from / to / timestamp / rel_type;
    membership edges carry no timestamp, so that column is NaN.
    """
    members = group_convo["participants"]
    edge_columns = {
        "from": members,
        "to": group_id,
        "timestamp": np.nan,
        "rel_type": "group",
    }
    return pd.DataFrame(edge_columns)

def process_group_messages(group_convo, group_id):
    """Build the message and reaction edges of one group conversation.

    Args:
        group_convo: Conversation dict with a "messages" list; every
            message must provide "sender_name" and "timestamp_ms".
        group_id: Identifier used as the "to" value of every message edge.

    Returns:
        DataFrame with the columns from / to / timestamp / rel_type: one
        "msg" row per message, followed by the "reaction" rows collected
        by ``add_reactions``.
    """
    # Read the messages from the argument; the previous version used the
    # notebook-level ``test_group`` and therefore ignored ``group_convo``.
    messages = group_convo["messages"]
    group_msgs = pd.DataFrame(index=range(len(messages)),
                              columns=["from", "to", "timestamp", "rel_type"])
    group_msgs = group_msgs.assign(to=group_id, rel_type="msg")
    rel_list = []
    for i, msg in enumerate(messages):
        group_msgs.loc[i, "from"] = msg["sender_name"]
        group_msgs.loc[i, "timestamp"] = msg["timestamp_ms"]
        add_reactions(msg, rel_list)
    return pd.concat([group_msgs, pd.DataFrame(rel_list)])

def process_group_edges(group_convo):
    """Return all edges of a group conversation in a single frame.

    Message/reaction edges come first, membership edges after them; the
    index of the combined frame is renumbered from 0.
    """
    gid = create_group_id(group_convo)
    edge_frames = [
        process_group_messages(group_convo, gid),
        create_member_edges(group_convo, gid),
    ]
    combined = pd.concat(edge_frames)
    return combined.reset_index(drop=True)

In [121]:
# Folder that holds the zipped message exports.
data_dir = Path("./test_data")

# Extract the .json members of every zip into "<zip name>_unzipped" inside
# data_dir.
data_files = data_dir.glob("*.zip")
for file in data_files:
    # file.name[:-4] drops the ".zip" suffix.
    # NOTE: data_target stays defined after the loop (last zip processed)
    # and is reused by a later cell.
    data_target = data_dir / f"{file.name[:-4]}_unzipped"
    unzip_msg_files(file, data_target)

In [131]:
# Take the first "*unzipped" folder found under data_dir.
# NOTE(review): the glob results are not sorted here, so which folder comes
# first may vary - confirm which one is intended.
data_path = list(data_dir.glob("./*unzipped/"))[0]

In [142]:
# Build one corrected dropout dict per unzipped folder.
unzipped_dirs = data_dir.glob("./*unzipped/")
dropout_dicts = [fix_dropout_dict(dat_dir) for dat_dir in unzipped_dirs] 

In [144]:
# Display the corrected dropout dicts.
dropout_dicts

[{'still_cogsci': '1', 'name': 'e4b3bd39cbd6fc6a9c7bd6c795351274e545981e'},
 {'still_cogsci': '1', 'name': 'c9c16ab451702a84a0b886d34bbfe439c95e5a44'}]

In [140]:
# Corrected dropout dict for the single folder selected as data_path.
dropout_dict = fix_dropout_dict(data_path)

In [37]:
# Load two sample conversations for experimenting.
# NOTE: data_target is the value left over from the unzip loop, i.e. the
# folder of the last zip that was processed.
# presumably the first file is a group chat and the second a one-to-one
# chat (going by the variable names) - verify against the data.
test_groupchat = data_target / "0TW8SR0DHVLTOC3HT5E58WKKWOJFIFX5.json"
test_chatname = data_target / "9Q9QMR3VY9P1VHO8W5OO1JJUB239MY8C.json"
test_group = read_convo(test_groupchat)
test_chat = read_convo(test_chatname)

In [84]:
def add_reactions(msg, rel_list):
    """Append one "reaction" edge per reaction on ``msg`` to ``rel_list``.

    Each entry of ``msg["reactions"]`` becomes the edge's "from" value and
    the message sender its "to" value. A message without a "reactions" key
    leaves ``rel_list`` untouched. The list is modified in place and
    nothing is returned.
    """
    if "reactions" not in msg:
        return
    for reactor in msg["reactions"]:
        rel_list.append({
            "from": reactor,
            "to": msg["sender_name"],
            "timestamp": msg["timestamp_ms"],
            "rel_type": "reaction",
        })

            
            
def create_member_edges(group_convo, group_id):
    """Return one "group" edge per participant, pointing at ``group_id``.

    The resulting frame has the columns from / to / timestamp / rel_type;
    membership edges carry no timestamp, so that column is NaN.
    """
    members = group_convo["participants"]
    edge_columns = {
        "from": members,
        "to": group_id,
        "timestamp": np.nan,
        "rel_type": "group",
    }
    return pd.DataFrame(edge_columns)

def process_group_messages(group_convo, group_id):
    """Build the message and reaction edges of one group conversation.

    Args:
        group_convo: Conversation dict with a "messages" list; every
            message must provide "sender_name" and "timestamp_ms".
        group_id: Identifier used as the "to" value of every message edge.

    Returns:
        DataFrame with the columns from / to / timestamp / rel_type: one
        "msg" row per message, followed by the "reaction" rows collected
        by ``add_reactions``.
    """
    # Read the messages from the argument; the previous version used the
    # notebook-level ``test_group`` and therefore ignored ``group_convo``.
    messages = group_convo["messages"]
    group_msgs = pd.DataFrame(index=range(len(messages)),
                              columns=["from", "to", "timestamp", "rel_type"])
    group_msgs = group_msgs.assign(to=group_id, rel_type="msg")
    rel_list = []
    for i, msg in enumerate(messages):
        group_msgs.loc[i, "from"] = msg["sender_name"]
        group_msgs.loc[i, "timestamp"] = msg["timestamp_ms"]
        add_reactions(msg, rel_list)
    return pd.concat([group_msgs, pd.DataFrame(rel_list)])

def process_group_edges(group_convo):
    """Return all edges of a group conversation in a single frame.

    Message/reaction edges come first, membership edges after them; the
    index of the combined frame is renumbered from 0.
    """
    gid = create_group_id(group_convo)
    edge_frames = [
        process_group_messages(group_convo, gid),
        create_member_edges(group_convo, gid),
    ]
    combined = pd.concat(edge_frames)
    return combined.reset_index(drop=True)

In [88]:
# Inspect the raw message list of the sample chat.
# NOTE(review): this prints the whole list; a slice would keep the output short.
test_chat["messages"]

[{'sender_name': 'c9c16ab451702a84a0b886d34bbfe439c95e5a44',
  'timestamp_ms': 1607601841372,
  'receiver_name': '7338bebc715f1aac0c392d7ef8b7c3c1ea2c24bd'},
 {'sender_name': '7338bebc715f1aac0c392d7ef8b7c3c1ea2c24bd',
  'timestamp_ms': 1607589233444,
  'receiver_name': 'c9c16ab451702a84a0b886d34bbfe439c95e5a44'},
 {'sender_name': '7338bebc715f1aac0c392d7ef8b7c3c1ea2c24bd',
  'timestamp_ms': 1607589161096,
  'receiver_name': 'c9c16ab451702a84a0b886d34bbfe439c95e5a44'},
 {'sender_name': 'c9c16ab451702a84a0b886d34bbfe439c95e5a44',
  'timestamp_ms': 1607584776813,
  'receiver_name': '7338bebc715f1aac0c392d7ef8b7c3c1ea2c24bd'},
 {'sender_name': '7338bebc715f1aac0c392d7ef8b7c3c1ea2c24bd',
  'timestamp_ms': 1607552037011,
  'receiver_name': 'c9c16ab451702a84a0b886d34bbfe439c95e5a44'},
 {'sender_name': 'c9c16ab451702a84a0b886d34bbfe439c95e5a44',
  'timestamp_ms': 1607447349440,
  'receiver_name': '7338bebc715f1aac0c392d7ef8b7c3c1ea2c24bd'},
 {'sender_name': 'c9c16ab451702a84a0b886d34bbfe439c9

In [None]:
def process_msgs(convo):
    """Build the message and reaction edges of one conversation.

    Entries that carry a "call_duration" key are skipped.

    Args:
        convo: Conversation dict with a "messages" list; every kept message
            must provide "sender_name", "receiver_name" and "timestamp_ms".

    Returns:
        DataFrame with the columns from / to / timestamp / rel_type: one
        "msg" row per kept message, followed by the "reaction" rows
        collected by ``add_reactions``.
    """
    # Use the argument; the previous version read the notebook-level
    # ``test_chat`` and ignored ``convo``.
    messages = convo["messages"]
    msgs = pd.DataFrame(index=range(len(messages)),
                        columns=["from", "to", "timestamp", "rel_type"])
    msgs = msgs.assign(rel_type="msg")
    # Start from a fresh list on every call; the bare ``rel_list`` expression
    # used before never created it.
    rel_list = []
    for i, msg in enumerate(messages):
        if "call_duration" in msg:
            continue
        msgs.loc[i, "from"] = msg["sender_name"]
        msgs.loc[i, "to"] = msg["receiver_name"]
        msgs.loc[i, "timestamp"] = msg["timestamp_ms"]
        add_reactions(msg, rel_list)
    # Rows belonging to skipped entries never received a sender; drop them.
    return pd.concat([msgs.dropna(subset=["from"]),
                      pd.DataFrame(rel_list)])

In [96]:
# Scratch version of process_msgs, run directly on the sample chat.
msgs = pd.DataFrame(index=range(len(test_chat["messages"])), 
                    columns=["from", "to", "timestamp", "rel_type"])
msgs = msgs.assign(rel_type = "msg")
# Start with an empty list so the cell runs on a fresh kernel and does not
# pile up reactions from earlier runs.
rel_list = []
for i, msg in enumerate(test_chat["messages"]):
    # Entries with a "call_duration" key are skipped.
    if "call_duration" in msg.keys():
        continue
    msgs.loc[i, "from"] = msg["sender_name"]
    msgs.loc[i, "to"] = msg["receiver_name"]
    msgs.loc[i, "timestamp"] = msg["timestamp_ms"]
    add_reactions(msg, rel_list)
# Rows of skipped entries have no sender and are dropped before the
# reaction edges are appended.
final_msgs = pd.concat([msgs.dropna(subset=["from"])
                        , pd.DataFrame(rel_list)])

In [97]:
# Display the combined message and reaction edges of the sample chat.
final_msgs

Unnamed: 0,from,to,timestamp,rel_type
0,c9c16ab451702a84a0b886d34bbfe439c95e5a44,7338bebc715f1aac0c392d7ef8b7c3c1ea2c24bd,1607601841372,msg
1,7338bebc715f1aac0c392d7ef8b7c3c1ea2c24bd,c9c16ab451702a84a0b886d34bbfe439c95e5a44,1607589233444,msg
2,7338bebc715f1aac0c392d7ef8b7c3c1ea2c24bd,c9c16ab451702a84a0b886d34bbfe439c95e5a44,1607589161096,msg
3,c9c16ab451702a84a0b886d34bbfe439c95e5a44,7338bebc715f1aac0c392d7ef8b7c3c1ea2c24bd,1607584776813,msg
4,7338bebc715f1aac0c392d7ef8b7c3c1ea2c24bd,c9c16ab451702a84a0b886d34bbfe439c95e5a44,1607552037011,msg
...,...,...,...,...
720,c9c16ab451702a84a0b886d34bbfe439c95e5a44,7338bebc715f1aac0c392d7ef8b7c3c1ea2c24bd,1587333840757,reaction
721,c9c16ab451702a84a0b886d34bbfe439c95e5a44,7338bebc715f1aac0c392d7ef8b7c3c1ea2c24bd,1587330889386,reaction
722,c9c16ab451702a84a0b886d34bbfe439c95e5a44,7338bebc715f1aac0c392d7ef8b7c3c1ea2c24bd,1581797886559,reaction
723,c9c16ab451702a84a0b886d34bbfe439c95e5a44,7338bebc715f1aac0c392d7ef8b7c3c1ea2c24bd,1581240947364,reaction
