In [1]:
import json
import hashlib
import pickle
import numpy as np
import random
import string
import os
import shutil
import subprocess
import sys
import pandas as pd
from zipfile import ZipFile
from collections import Counter
from pathlib import Path

In [136]:
def unzip_msg_files(zip_path, target_dir):
    """Extract every ``.json`` member of a zip archive into ``target_dir``.

    Members whose name does not end in ".json" stay inside the archive and
    are not written to disk.
    """
    with ZipFile(zip_path, 'r') as archive:
        # Select the JSON members first, then extract them in a single call.
        json_members = [name for name in archive.namelist()
                        if name.endswith(".json")]
        archive.extractall(target_dir, members=json_members)

                
def yield_msg_files(zip_path):
    """Generate one decoded item per member of the zip archive.

    Each member is parsed as JSON and the parsed value is then parsed as
    JSON a second time (the members are expected to hold a JSON string that
    itself contains JSON). When the first parse does not produce something
    ``json.loads`` accepts (e.g. it already is a dict), the member's *name*
    is yielded instead, so consumers receive a ``str`` for such entries.
    """
    with ZipFile(zip_path, 'r') as archive:
        for member_name in archive.namelist():
            with archive.open(member_name, "r") as handle:
                first_pass = json.load(handle)
                try:
                    item = json.loads(first_pass)
                except TypeError:
                    # Not double-encoded: hand back the member name instead.
                    item = member_name
                yield item


def count_msg_files(zip_path):
    """Return the number of members stored in the zip file.

    Every archive member is counted, whatever its name or content.
    """
    with ZipFile(zip_path, "r") as archive:
        member_names = archive.namelist()
    return len(member_names)
    

def read_zip_file(zip_path, file_name):
    """Parse the member ``file_name`` of the zip archive as JSON and return it."""
    with ZipFile(zip_path, "r") as archive:
        raw_bytes = archive.read(file_name)
    return json.loads(raw_bytes)
                    
def read_json(file):
    """Load a JSON document from disk.

    Parameters
    ----------
    file : str or path-like
        Location of the JSON file.

    Returns
    -------
    The decoded JSON value (dict, list, str, ...).
    """
    # Decode explicitly as UTF-8: without ``encoding`` Python falls back to
    # the platform locale, which garbles non-ASCII text on some systems.
    with open(file, "r", encoding="utf-8") as f:
        return json.load(f)
    
def read_convo(file):
    """Read a conversation file whose JSON content is itself a JSON string.

    The file is loaded with ``read_json`` and the resulting value is decoded
    a second time with ``json.loads``.
    """
    encoded_convo = read_json(file)
    return json.loads(encoded_convo)

def hash_name(name):
    """Return the hexadecimal SHA-1 digest of ``name`` (simplified: unsalted)."""
    digest = hashlib.sha1()
    digest.update(name.encode())
    return digest.hexdigest()

def create_group_id(groupchat):
    """Derive a group id by hashing the sorted, concatenated participant names.

    Sorting makes the id independent of the order in which the participants
    are listed.
    """
    ordered_names = sorted(groupchat["participants"])
    return hash_name("".join(ordered_names))

def find_most_common(participant_list):
    """Return the element that occurs most often in ``participant_list``.

    Raises ``IndexError`` when the list is empty.
    """
    tally = Counter(participant_list)
    ranked = tally.most_common(1)
    top_item, _occurrences = ranked[0]
    return top_item




def add_reactions(msg, rel_list):
    """Append one "reaction" edge per entry of ``msg["reactions"]`` to ``rel_list``.

    Each edge goes from the reaction entry to the message's sender and carries
    the message's timestamp. Messages without a "reactions" key leave
    ``rel_list`` untouched. The list is modified in place; nothing is returned.
    """
    if "reactions" not in msg:
        return
    rel_list.extend(
        {"from": reaction,
         "to": msg["sender_name"],
         "timestamp": msg["timestamp_ms"],
         "rel_type": "reaction"}
        for reaction in msg["reactions"]
    )

            
            
def create_member_edges(group_convo, group_id):
    """Build the participant --> group membership edges of a conversation.

    Returns a DataFrame with one row per participant; membership edges have
    no timestamp (NaN) and ``rel_type`` "group".
    """
    members = group_convo["participants"]
    edge_columns = {
        "from": members,
        "to": group_id,
        "timestamp": np.nan,
        "rel_type": "group",
    }
    return pd.DataFrame(edge_columns)

def process_group_messages(group_convo, group_id):
    """Build the message and reaction edges of one group conversation.

    Every message becomes a row sender --> ``group_id`` with ``rel_type``
    "msg". Reaction edges gathered by ``add_reactions`` are appended below
    the message rows.

    Parameters
    ----------
    group_convo : dict
        Conversation with the keys "thread_type" and "messages"; each message
        provides "sender_name" and "timestamp_ms".
    group_id : str
        Identifier used as the target of every message edge.

    Returns
    -------
    pandas.DataFrame
        Columns "from", "to", "timestamp", "rel_type". The index is not
        reset, so message rows and reaction rows both start at 0.

    Raises
    ------
    AssertionError
        If the conversation's thread type is not "RegularGroup".
    """
    assert group_convo["thread_type"] == "RegularGroup"
    messages = group_convo["messages"]

    # Build the whole frame in one constructor call instead of writing it
    # cell by cell through ``.loc``, which is very slow for long chats.
    # dtype=object keeps the columns generic, as the preallocated frame did.
    group_msgs = pd.DataFrame(
        {
            "from": [msg["sender_name"] for msg in messages],
            "to": group_id,
            "timestamp": [msg["timestamp_ms"] for msg in messages],
            "rel_type": "msg",
        },
        index=pd.RangeIndex(len(messages)),
        dtype=object,
    )

    rel_list = []
    for msg in messages:
        add_reactions(msg, rel_list)
    return pd.concat([group_msgs, pd.DataFrame(rel_list)])

def process_group_edges(group_convo):
    """Run the complete group-chat pipeline for one conversation.

    Combines the message/reaction edges and the membership edges of the
    group into a single DataFrame with a fresh 0..n-1 index.
    """
    group_id = create_group_id(group_convo)
    edge_frames = [
        process_group_messages(group_convo, group_id),
        create_member_edges(group_convo, group_id),
    ]
    return pd.concat(edge_frames).reset_index(drop=True)


def process_msgs(convo):
    """Build the message and reaction edges of a two-person conversation.

    Messages carrying a "call_duration" key are skipped. Every other message
    becomes a row sender --> receiver with ``rel_type`` "msg". Reaction edges
    gathered by ``add_reactions`` are appended below the message rows.

    Parameters
    ----------
    convo : dict
        Conversation with the keys "participants", "thread_type" and
        "messages"; each kept message provides "sender_name",
        "receiver_name" and "timestamp_ms".

    Returns
    -------
    pandas.DataFrame or None
        ``None`` when the conversation has exactly one participant. Otherwise
        a frame with the columns "from", "to", "timestamp", "rel_type".
        Message rows are indexed by their position in ``convo["messages"]``;
        the index is not reset after the reaction rows are appended.

    Raises
    ------
    AssertionError
        If the conversation's thread type is not "Regular".
    """
    if len(convo["participants"]) == 1:
        return None
    assert convo["thread_type"] == "Regular"

    # Collect plain Python lists and build the frame once; filling a
    # preallocated frame cell by cell through ``.loc`` is very slow.
    positions, senders, receivers, timestamps = [], [], [], []
    rel_list = []
    for i, msg in enumerate(convo["messages"]):
        if "call_duration" in msg:
            continue
        positions.append(i)
        senders.append(msg["sender_name"])
        receivers.append(msg["receiver_name"])
        timestamps.append(msg["timestamp_ms"])
        add_reactions(msg, rel_list)

    # dtype=object keeps the columns generic, as the preallocated frame did;
    # the index preserves each message's original position.
    msgs = pd.DataFrame(
        {
            "from": senders,
            "to": receivers,
            "timestamp": timestamps,
            "rel_type": "msg",
        },
        index=pd.Index(positions, dtype="int64"),
        dtype=object,
    )
    # Rows without a sender are dropped, as before.
    return pd.concat([msgs.dropna(subset=["from"]), pd.DataFrame(rel_list)])


def fix_dropout_dict(data_path):
    """Load ``dropout.json`` from a person's archive, rename its key, add a name.

    The name is taken to be the most frequent participant across the first
    (up to) two "Regular" conversations found in the archive.

    Parameters
    ----------
    data_path : str or path-like
        Path to the person's zip file.

    Returns
    -------
    dict
        Content of "dropout.json" with the value of "is_dropout" moved to
        "still_cogsci" and the inferred "name" added.

    Raises
    ------
    ValueError
        If the archive contains no "Regular" conversation, so that no name
        can be inferred (the earlier implementation looped forever here).
    """
    participant_list = []
    num_two_person = 0
    # One pass is enough; restarting the scan can never discover new files.
    for convo in yield_msg_files(data_path):
        if isinstance(convo, str):
            # Entries the generator could not decode arrive as a file name.
            continue
        if convo["thread_type"] != "Regular":
            continue
        participant_list.extend(convo["participants"])
        num_two_person += 1
        if num_two_person == 2:
            break

    if num_two_person == 0:
        raise ValueError(
            f"no 'Regular' conversation found in {data_path}; cannot infer name"
        )

    dropout_dict = read_zip_file(data_path, "dropout.json")
    # NOTE(review): the value is copied unchanged although the two key names
    # read like opposites -- confirm the intended meaning with the data owner.
    dropout_dict["still_cogsci"] = dropout_dict.pop("is_dropout")
    dropout_dict["name"] = find_most_common(participant_list)
    return dropout_dict


def process_person(data_path):
    """Process all conversations of one person into a single edge table.

    Parameters
    ----------
    data_path : str or path-like
        Path to the person's zip file.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the edge frames of every "Regular" and
        "RegularGroup" conversation, with the columns "from", "to",
        "timestamp", "rel_type". An empty frame with these columns is
        returned when no conversation produced any edges.

    Notes
    -----
    Entries the generator yields as a file name (``str``) are skipped.
    Unknown thread types are printed and skipped.
    """
    df_list = []
    for convo in yield_msg_files(data_path):
        if isinstance(convo, str):
            continue
        thread_type = convo["thread_type"]
        if thread_type == "Regular":
            edges = process_msgs(convo)
        elif thread_type == "RegularGroup":
            edges = process_group_edges(convo)
        else:
            print(thread_type)
            continue
        # process_msgs returns None for single-participant conversations.
        if edges is not None:
            df_list.append(edges)

    if not df_list:
        # pd.concat raises on an empty list; return an empty, well-formed
        # frame so callers can keep concatenating results.
        return pd.DataFrame(columns=["from", "to", "timestamp", "rel_type"])
    return pd.concat(df_list)

In [127]:
# Directory holding one zip archive per person.
DATA_DIR = Path("./data")
# Sort the archives: glob order depends on the filesystem, and the position
# in this list is what identifies a person in the results below.
data_paths = sorted(DATA_DIR.glob("*.zip"))
# One dropout record (with inferred name) per archive, in data_paths order.
dropout_list = [fix_dropout_dict(data_path) for data_path in data_paths]

In [132]:
# One edge table per person, stored at the same position as its archive
# in data_paths.
data_list = [None] * len(data_paths)
for i, data_path in enumerate(data_paths):
    print(f"processing person {i+1} out of {len(data_paths)}...")
    person_edges = process_person(data_path)
    data_list[i] = person_edges
print("all done!")

processing person 1 out of 19...
processing convo 1 out of 174...
processing convo 2 out of 174...
processing convo 3 out of 174...
processing convo 4 out of 174...
processing convo 5 out of 174...
processing convo 6 out of 174...
processing convo 7 out of 174...
processing convo 8 out of 174...
processing convo 9 out of 174...
processing convo 10 out of 174...
processing convo 11 out of 174...
processing convo 12 out of 174...
processing convo 13 out of 174...
processing convo 14 out of 174...
processing convo 15 out of 174...
processing convo 16 out of 174...
processing convo 17 out of 174...
processing convo 18 out of 174...
processing convo 19 out of 174...
processing convo 20 out of 174...
processing convo 21 out of 174...
processing convo 22 out of 174...
processing convo 23 out of 174...
processing convo 24 out of 174...
processing convo 25 out of 174...
processing convo 26 out of 174...
processing convo 27 out of 174...
processing convo 28 out of 174...
processing convo 29 out 

processing convo 68 out of 159...
processing convo 69 out of 159...
processing convo 70 out of 159...
processing convo 71 out of 159...
processing convo 72 out of 159...
processing convo 73 out of 159...
processing convo 74 out of 159...
processing convo 75 out of 159...
processing convo 76 out of 159...
processing convo 77 out of 159...
processing convo 78 out of 159...
processing convo 79 out of 159...
processing convo 80 out of 159...
processing convo 81 out of 159...
processing convo 82 out of 159...
processing convo 83 out of 159...
processing convo 84 out of 159...
processing convo 85 out of 159...
processing convo 86 out of 159...
processing convo 87 out of 159...
processing convo 88 out of 159...
processing convo 89 out of 159...
processing convo 90 out of 159...
processing convo 91 out of 159...
processing convo 92 out of 159...
processing convo 93 out of 159...
processing convo 94 out of 159...
processing convo 95 out of 159...
processing convo 96 out of 159...
processing con

processing convo 149 out of 299...
processing convo 150 out of 299...
processing convo 151 out of 299...
processing convo 152 out of 299...
processing convo 153 out of 299...
processing convo 154 out of 299...
processing convo 155 out of 299...
processing convo 156 out of 299...
processing convo 157 out of 299...
processing convo 158 out of 299...
processing convo 159 out of 299...
processing convo 160 out of 299...
processing convo 161 out of 299...
processing convo 162 out of 299...
processing convo 163 out of 299...
processing convo 164 out of 299...
processing convo 165 out of 299...
processing convo 166 out of 299...
processing convo 167 out of 299...
processing convo 168 out of 299...
processing convo 169 out of 299...
processing convo 170 out of 299...
processing convo 171 out of 299...
processing convo 172 out of 299...
processing convo 173 out of 299...
processing convo 174 out of 299...
processing convo 175 out of 299...
processing convo 176 out of 299...
processing convo 177

processing convo 87 out of 119...
processing convo 88 out of 119...
processing convo 89 out of 119...
processing convo 90 out of 119...
processing convo 91 out of 119...
processing convo 92 out of 119...
processing convo 93 out of 119...
processing convo 94 out of 119...
processing convo 95 out of 119...
processing convo 96 out of 119...
processing convo 97 out of 119...
processing convo 98 out of 119...
processing convo 99 out of 119...
processing convo 100 out of 119...
processing convo 101 out of 119...
processing convo 102 out of 119...
processing convo 103 out of 119...
processing convo 104 out of 119...
processing convo 105 out of 119...
processing convo 106 out of 119...
processing convo 107 out of 119...
processing convo 108 out of 119...
processing convo 109 out of 119...
processing convo 110 out of 119...
processing convo 111 out of 119...
processing convo 112 out of 119...
processing convo 113 out of 119...
processing convo 114 out of 119...
processing convo 115 out of 119..

processing convo 212 out of 323...
processing convo 213 out of 323...
processing convo 214 out of 323...
processing convo 215 out of 323...
processing convo 216 out of 323...
processing convo 217 out of 323...
processing convo 218 out of 323...
processing convo 219 out of 323...
processing convo 220 out of 323...
processing convo 221 out of 323...
processing convo 222 out of 323...
processing convo 223 out of 323...
processing convo 224 out of 323...
processing convo 225 out of 323...
processing convo 226 out of 323...
processing convo 227 out of 323...
processing convo 228 out of 323...
processing convo 229 out of 323...
processing convo 230 out of 323...
processing convo 231 out of 323...
processing convo 232 out of 323...
processing convo 233 out of 323...
processing convo 234 out of 323...
processing convo 235 out of 323...
processing convo 236 out of 323...
processing convo 237 out of 323...
processing convo 238 out of 323...
processing convo 239 out of 323...
processing convo 240

processing convo 134 out of 162...
processing convo 135 out of 162...
processing convo 136 out of 162...
processing convo 137 out of 162...
processing convo 138 out of 162...
processing convo 139 out of 162...
processing convo 140 out of 162...
processing convo 141 out of 162...
processing convo 142 out of 162...
processing convo 143 out of 162...
processing convo 144 out of 162...
processing convo 145 out of 162...
processing convo 146 out of 162...
processing convo 147 out of 162...
processing convo 148 out of 162...
processing convo 149 out of 162...
processing convo 150 out of 162...
processing convo 151 out of 162...
processing convo 152 out of 162...
processing convo 153 out of 162...
processing convo 154 out of 162...
processing convo 155 out of 162...
processing convo 156 out of 162...
processing convo 157 out of 162...
processing convo 158 out of 162...
processing convo 159 out of 162...
processing convo 160 out of 162...
processing convo 161 out of 162...
processing convo 162

processing convo 104 out of 234...
processing convo 105 out of 234...
processing convo 106 out of 234...
processing convo 107 out of 234...
processing convo 108 out of 234...
processing convo 109 out of 234...
processing convo 110 out of 234...
processing convo 111 out of 234...
processing convo 112 out of 234...
processing convo 113 out of 234...
processing convo 114 out of 234...
processing convo 115 out of 234...
processing convo 116 out of 234...
processing convo 117 out of 234...
processing convo 118 out of 234...
processing convo 119 out of 234...
processing convo 120 out of 234...
processing convo 121 out of 234...
processing convo 122 out of 234...
processing convo 123 out of 234...
processing convo 124 out of 234...
processing convo 125 out of 234...
processing convo 126 out of 234...
processing convo 127 out of 234...
processing convo 128 out of 234...
processing convo 129 out of 234...
processing convo 130 out of 234...
processing convo 131 out of 234...
processing convo 132

processing convo 106 out of 189...
processing convo 107 out of 189...
processing convo 108 out of 189...
processing convo 109 out of 189...
processing convo 110 out of 189...
processing convo 111 out of 189...
processing convo 112 out of 189...
processing convo 113 out of 189...
processing convo 114 out of 189...
processing convo 115 out of 189...
processing convo 116 out of 189...
processing convo 117 out of 189...
processing convo 118 out of 189...
processing convo 119 out of 189...
processing convo 120 out of 189...
processing convo 121 out of 189...
processing convo 122 out of 189...
processing convo 123 out of 189...
processing convo 124 out of 189...
processing convo 125 out of 189...
processing convo 126 out of 189...
processing convo 127 out of 189...
processing convo 128 out of 189...
processing convo 129 out of 189...
processing convo 130 out of 189...
processing convo 131 out of 189...
processing convo 132 out of 189...
processing convo 133 out of 189...
processing convo 134

processing convo 153 out of 169...
processing convo 154 out of 169...
processing convo 155 out of 169...
processing convo 156 out of 169...
processing convo 157 out of 169...
processing convo 158 out of 169...
processing convo 159 out of 169...
processing convo 160 out of 169...
processing convo 161 out of 169...
processing convo 162 out of 169...
processing convo 163 out of 169...
processing convo 164 out of 169...
processing convo 165 out of 169...
processing convo 166 out of 169...
processing convo 167 out of 169...
processing convo 168 out of 169...
processing convo 169 out of 169...
done!
processing person 11 out of 19...
processing convo 1 out of 125...
processing convo 2 out of 125...
Pending
processing convo 3 out of 125...
processing convo 4 out of 125...
processing convo 5 out of 125...
processing convo 6 out of 125...
processing convo 7 out of 125...
processing convo 8 out of 125...
processing convo 9 out of 125...
processing convo 10 out of 125...
processing convo 11 out of

processing convo 25 out of 349...
processing convo 26 out of 349...
processing convo 27 out of 349...
processing convo 28 out of 349...
processing convo 29 out of 349...
processing convo 30 out of 349...
processing convo 31 out of 349...
processing convo 32 out of 349...
processing convo 33 out of 349...
processing convo 34 out of 349...
processing convo 35 out of 349...
processing convo 36 out of 349...
processing convo 37 out of 349...
processing convo 38 out of 349...
processing convo 39 out of 349...
processing convo 40 out of 349...
processing convo 41 out of 349...
processing convo 42 out of 349...
processing convo 43 out of 349...
processing convo 44 out of 349...
processing convo 45 out of 349...
processing convo 46 out of 349...
processing convo 47 out of 349...
processing convo 48 out of 349...
processing convo 49 out of 349...
processing convo 50 out of 349...
processing convo 51 out of 349...
processing convo 52 out of 349...
processing convo 53 out of 349...
processing con

processing convo 265 out of 349...
processing convo 266 out of 349...
processing convo 267 out of 349...
processing convo 268 out of 349...
processing convo 269 out of 349...
processing convo 270 out of 349...
processing convo 271 out of 349...
processing convo 272 out of 349...
processing convo 273 out of 349...
processing convo 274 out of 349...
processing convo 275 out of 349...
processing convo 276 out of 349...
processing convo 277 out of 349...
processing convo 278 out of 349...
processing convo 279 out of 349...
processing convo 280 out of 349...
processing convo 281 out of 349...
processing convo 282 out of 349...
processing convo 283 out of 349...
processing convo 284 out of 349...
processing convo 285 out of 349...
processing convo 286 out of 349...
processing convo 287 out of 349...
processing convo 288 out of 349...
processing convo 289 out of 349...
processing convo 290 out of 349...
processing convo 291 out of 349...
processing convo 292 out of 349...
processing convo 293

processing convo 153 out of 176...
processing convo 154 out of 176...
processing convo 155 out of 176...
processing convo 156 out of 176...
processing convo 157 out of 176...
processing convo 158 out of 176...
processing convo 159 out of 176...
processing convo 160 out of 176...
processing convo 161 out of 176...
processing convo 162 out of 176...
processing convo 163 out of 176...
processing convo 164 out of 176...
processing convo 165 out of 176...
processing convo 166 out of 176...
processing convo 167 out of 176...
processing convo 168 out of 176...
processing convo 169 out of 176...
processing convo 170 out of 176...
processing convo 171 out of 176...
processing convo 172 out of 176...
processing convo 173 out of 176...
processing convo 174 out of 176...
processing convo 175 out of 176...
processing convo 176 out of 176...
done!
processing person 15 out of 19...
processing convo 1 out of 219...
processing convo 2 out of 219...
processing convo 3 out of 219...
processing convo 4 ou

processing convo 218 out of 219...
processing convo 219 out of 219...
done!
processing person 16 out of 19...
processing convo 1 out of 461...
processing convo 2 out of 461...
processing convo 3 out of 461...
processing convo 4 out of 461...
processing convo 5 out of 461...
processing convo 6 out of 461...
processing convo 7 out of 461...
processing convo 8 out of 461...
processing convo 9 out of 461...
processing convo 10 out of 461...
processing convo 11 out of 461...
processing convo 12 out of 461...
processing convo 13 out of 461...
processing convo 14 out of 461...
processing convo 15 out of 461...
processing convo 16 out of 461...
processing convo 17 out of 461...
processing convo 18 out of 461...
processing convo 19 out of 461...
processing convo 20 out of 461...
processing convo 21 out of 461...
processing convo 22 out of 461...
processing convo 23 out of 461...
processing convo 24 out of 461...
processing convo 25 out of 461...
processing convo 26 out of 461...
processing conv

processing convo 244 out of 461...
processing convo 245 out of 461...
processing convo 246 out of 461...
processing convo 247 out of 461...
processing convo 248 out of 461...
processing convo 249 out of 461...
processing convo 250 out of 461...
processing convo 251 out of 461...
processing convo 252 out of 461...
processing convo 253 out of 461...
processing convo 254 out of 461...
processing convo 255 out of 461...
processing convo 256 out of 461...
processing convo 257 out of 461...
processing convo 258 out of 461...
processing convo 259 out of 461...
processing convo 260 out of 461...
processing convo 261 out of 461...
processing convo 262 out of 461...
processing convo 263 out of 461...
processing convo 264 out of 461...
processing convo 265 out of 461...
processing convo 266 out of 461...
processing convo 267 out of 461...
processing convo 268 out of 461...
processing convo 269 out of 461...
processing convo 270 out of 461...
processing convo 271 out of 461...
processing convo 272

processing convo 19 out of 285...
processing convo 20 out of 285...
processing convo 21 out of 285...
processing convo 22 out of 285...
processing convo 23 out of 285...
processing convo 24 out of 285...
processing convo 25 out of 285...
processing convo 26 out of 285...
processing convo 27 out of 285...
processing convo 28 out of 285...
processing convo 29 out of 285...
processing convo 30 out of 285...
processing convo 31 out of 285...
processing convo 32 out of 285...
processing convo 33 out of 285...
processing convo 34 out of 285...
processing convo 35 out of 285...
processing convo 36 out of 285...
processing convo 37 out of 285...
processing convo 38 out of 285...
processing convo 39 out of 285...
processing convo 40 out of 285...
processing convo 41 out of 285...
processing convo 42 out of 285...
processing convo 43 out of 285...
processing convo 44 out of 285...
processing convo 45 out of 285...
processing convo 46 out of 285...
processing convo 47 out of 285...
processing con

processing convo 254 out of 285...
processing convo 255 out of 285...
processing convo 256 out of 285...
processing convo 257 out of 285...
processing convo 258 out of 285...
processing convo 259 out of 285...
processing convo 260 out of 285...
processing convo 261 out of 285...
processing convo 262 out of 285...
processing convo 263 out of 285...
processing convo 264 out of 285...
processing convo 265 out of 285...
processing convo 266 out of 285...
processing convo 267 out of 285...
processing convo 268 out of 285...
processing convo 269 out of 285...
processing convo 270 out of 285...
processing convo 271 out of 285...
processing convo 272 out of 285...
processing convo 273 out of 285...
processing convo 274 out of 285...
processing convo 275 out of 285...
processing convo 276 out of 285...
processing convo 277 out of 285...
processing convo 278 out of 285...
processing convo 279 out of 285...
processing convo 280 out of 285...
processing convo 281 out of 285...
processing convo 282

processing convo 13 out of 370...
processing convo 14 out of 370...
processing convo 15 out of 370...
processing convo 16 out of 370...
processing convo 17 out of 370...
processing convo 18 out of 370...
processing convo 19 out of 370...
processing convo 20 out of 370...
processing convo 21 out of 370...
processing convo 22 out of 370...
processing convo 23 out of 370...
processing convo 24 out of 370...
processing convo 25 out of 370...
processing convo 26 out of 370...
processing convo 27 out of 370...
processing convo 28 out of 370...
processing convo 29 out of 370...
processing convo 30 out of 370...
processing convo 31 out of 370...
processing convo 32 out of 370...
processing convo 33 out of 370...
processing convo 34 out of 370...
processing convo 35 out of 370...
processing convo 36 out of 370...
processing convo 37 out of 370...
processing convo 38 out of 370...
processing convo 39 out of 370...
processing convo 40 out of 370...
processing convo 41 out of 370...
processing con

processing convo 254 out of 370...
processing convo 255 out of 370...
processing convo 256 out of 370...
processing convo 257 out of 370...
processing convo 258 out of 370...
processing convo 259 out of 370...
Pending
processing convo 260 out of 370...
processing convo 261 out of 370...
processing convo 262 out of 370...
processing convo 263 out of 370...
processing convo 264 out of 370...
processing convo 265 out of 370...
processing convo 266 out of 370...
processing convo 267 out of 370...
processing convo 268 out of 370...
processing convo 269 out of 370...
processing convo 270 out of 370...
processing convo 271 out of 370...
processing convo 272 out of 370...
processing convo 273 out of 370...
processing convo 274 out of 370...
processing convo 275 out of 370...
processing convo 276 out of 370...
processing convo 277 out of 370...
processing convo 278 out of 370...
processing convo 279 out of 370...
processing convo 280 out of 370...
processing convo 281 out of 370...
processing c

In [137]:
# Stack every person's edges; the same edge can show up in several archives,
# so exact duplicate rows are removed afterwards (first occurrence is kept).
master_df = pd.concat(data_list, axis=0)
unique_master = master_df.drop_duplicates(keep="first")

In [141]:
# Share of de-duplicated edges that have no timestamp.
pd.isna(unique_master["timestamp"]).mean()

0.009979331963010723

In [115]:
# One row per person, built from the dropout records collected above.
dropout_df = pd.DataFrame(data=dropout_list)

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
Name: name, dtype: bool