In [1]:
import json
import hashlib
import pickle
import numpy as np
import random
import string
import os
import shutil
import subprocess
import sys
import pandas as pd
from zipfile import ZipFile
from collections import Counter
from pathlib import Path

In [126]:
def unzip_msg_files(zip_path, target_dir):
    """Extract the ``.json`` members of a zip archive into ``target_dir``.

    Members whose name does not end in ``.json`` are left inside the archive.
    Nothing is returned.
    """
    with ZipFile(zip_path, 'r') as archive:
        for member_name in archive.namelist():
            # Only the JSON members are of interest; skip everything else.
            if not member_name.endswith(".json"):
                continue
            archive.extract(member_name, target_dir)

                
def yield_msg_files(zip_path):
    """Generate one item per member of the zip archive at ``zip_path``.

    Each member is parsed as JSON and the parsed value is then parsed as JSON
    a second time, i.e. a member is expected to hold a JSON-encoded string
    that itself contains JSON. When that raises ``TypeError`` (for instance
    because the first parse already produced a dict), the member's *name* is
    yielded instead of its content.
    """
    with ZipFile(zip_path, 'r') as archive:
        for member_name in archive.namelist():
            with archive.open(member_name, "r") as member_stream:
                try:
                    decoded_once = json.load(member_stream)
                    yield json.loads(decoded_once)
                except TypeError:
                    # Not a double-encoded document: hand back the name so the
                    # caller can recognise and skip it.
                    yield member_name


def count_msg_files(zip_path):
    """Return the number of members stored in the zip archive.

    Every archive member is counted, whatever its name or content.
    """
    with ZipFile(zip_path, "r") as archive:
        member_names = archive.namelist()
    return len(member_names)
    

def read_zip_file(zip_path, file_name):
    """Parse a single member of a zip archive as JSON.

    ``file_name`` is the member's name inside the archive at ``zip_path``.
    The decoded JSON value is returned.
    """
    with ZipFile(zip_path, "r") as archive:
        raw_bytes = archive.read(file_name)
    return json.loads(raw_bytes)
                    
def read_json(file):
    """Load and return the JSON document stored at path ``file``.

    The file is decoded as UTF-8 explicitly. Without an ``encoding`` argument
    ``open`` falls back to the platform's locale encoding, which can mis-decode
    non-ASCII characters (e.g. participant names) on systems whose default is
    not UTF-8.
    """
    with open(file, "r", encoding="utf-8") as f:
        return json.load(f)
    
def read_convo(file):
    """Read a conversation file and return the decoded conversation.

    The file is loaded with ``read_json`` and the loaded value is then
    decoded as JSON once more, so the file is expected to contain a
    JSON-encoded string.
    """
    encoded_convo = read_json(file)
    return json.loads(encoded_convo)

def hash_name(name):
    """Return the hexadecimal SHA-1 digest of ``name`` (unsalted)."""
    digest = hashlib.sha1()
    digest.update(name.encode())
    return digest.hexdigest()

def create_group_id(groupchat):
    """Derive a group id from the chat's participants.

    The entries of ``groupchat["participants"]`` are sorted, concatenated
    without a separator and passed through ``hash_name``, so the id does not
    depend on the order in which participants are listed.
    """
    ordered_members = list(groupchat["participants"])
    ordered_members.sort()
    return hash_name("".join(ordered_members))

def find_most_common(participant_list):
    """Return the element that occurs most often in ``participant_list``."""
    top_entries = Counter(participant_list).most_common(1)
    winner, _occurrences = top_entries[0]
    return winner




def add_reactions(msg, rel_list):
    """Append one relation dict per reaction on ``msg`` to ``rel_list``.

    Messages without a ``"reactions"`` key are ignored. For every entry of
    ``msg["reactions"]`` a dict is appended whose ``"from"`` is that entry,
    whose ``"to"`` is the message's sender and whose ``"timestamp"`` is the
    message's ``"timestamp_ms"``. ``rel_list`` is modified in place.
    """
    if "reactions" not in msg:
        return
    for reactor in msg["reactions"]:
        rel_list.append({
            "from": reactor,
            "to": msg["sender_name"],
            "timestamp": msg["timestamp_ms"],
            "rel_type": "reaction",
        })

            
            
def create_member_edges(group_convo, group_id):
    """Build the membership edges (participant -> group) of one conversation.

    Returns a DataFrame with one row per entry of
    ``group_convo["participants"]``; every row points at ``group_id``, has a
    NaN timestamp and the relation type ``"group"``.
    """
    edge_columns = {
        "from": group_convo["participants"],
        "to": group_id,
        "timestamp": np.nan,  # membership carries no timestamp
        "rel_type": "group",
    }
    return pd.DataFrame(edge_columns)

def process_group_messages(group_convo, group_id):
    """Build the message and reaction edges of one group conversation.

    Every message becomes a row ``sender -> group_id`` with
    ``rel_type == "msg"``. Reactions gathered by ``add_reactions`` are
    appended below the message rows; those rows point at the message's
    sender rather than at the group.

    Parameters
    ----------
    group_convo : dict
        Conversation whose ``"thread_type"`` is ``"RegularGroup"`` and whose
        ``"messages"`` items provide ``"sender_name"`` and ``"timestamp_ms"``.
    group_id
        Value written to the ``"to"`` column of every message row.

    Returns
    -------
    pandas.DataFrame
        Columns ``from``, ``to``, ``timestamp``, ``rel_type``. Message rows
        and reaction rows each keep their own 0-based index, so labels may
        repeat.

    Raises
    ------
    AssertionError
        If ``group_convo`` is not a ``"RegularGroup"`` thread.
    """
    # Check the conversation that was passed in. The check used to read a
    # global named ``convo``, which fails with NameError on a fresh kernel and
    # otherwise validates whatever conversation the notebook touched last.
    assert group_convo["thread_type"] == "RegularGroup"
    group_msgs = pd.DataFrame(index=range(len(group_convo["messages"])),
                              columns=["from", "to", "timestamp", "rel_type"])
    group_msgs = group_msgs.assign(to = group_id, rel_type = "msg")
    rel_list = []
    for i, msg in enumerate(group_convo["messages"]):
        group_msgs.loc[i, "from"] = msg["sender_name"]
        group_msgs.loc[i, "timestamp"] = msg["timestamp_ms"]
        add_reactions(msg, rel_list)
    return pd.concat([group_msgs, pd.DataFrame(rel_list)])

def process_group_edges(group_convo):
    """Run the complete group-chat pipeline for one conversation.

    The group id is derived from the participants, then the message/reaction
    edges and the membership edges are stacked (in that order) into a single
    DataFrame with a fresh 0-based index.
    """
    gid = create_group_id(group_convo)
    edge_frames = [
        process_group_messages(group_convo, gid),
        create_member_edges(group_convo, gid),
    ]
    stacked = pd.concat(edge_frames)
    return stacked.reset_index(drop=True)


def process_msgs(convo):
    """Turn a two-person conversation into a DataFrame of edges.

    Returns ``None`` when the conversation lists exactly one participant.
    Otherwise the thread must be of type ``"Regular"``. Messages carrying a
    ``"call_duration"`` key are skipped; every other message yields a
    ``sender -> receiver`` row with ``rel_type == "msg"``, and its reactions
    are appended after the message rows.
    """
    if len(convo["participants"]) == 1:
        return None
    assert convo["thread_type"] == "Regular"

    messages = convo["messages"]
    # Pre-allocate one row per message; rows of skipped messages stay empty
    # and are removed again by the dropna below.
    edge_frame = pd.DataFrame(index=range(len(messages)),
                              columns=["from", "to", "timestamp", "rel_type"])
    edge_frame = edge_frame.assign(rel_type = "msg")

    reaction_rows = []
    for row, message in enumerate(messages):
        if "call_duration" not in message:
            edge_frame.loc[row, "from"] = message["sender_name"]
            edge_frame.loc[row, "to"] = message["receiver_name"]
            edge_frame.loc[row, "timestamp"] = message["timestamp_ms"]
            add_reactions(message, reaction_rows)

    kept_messages = edge_frame.dropna(subset=["from"])
    return pd.concat([kept_messages, pd.DataFrame(reaction_rows)])


def fix_dropout_dict(data_path):
    """Load ``dropout.json`` from an archive, rename its key and add a name.

    The archive is scanned once, collecting the participants of the first two
    conversations whose ``"thread_type"`` is ``"Regular"``. The participant
    that occurs most often among them is stored under ``"name"``
    (presumably the owner of the archive, who should appear in every
    two-person conversation - TODO confirm). With only one such conversation
    the first listed participant wins the tie.

    Parameters
    ----------
    data_path
        Path of the zip archive; it must contain a ``dropout.json`` member
        with an ``"is_dropout"`` key.

    Returns
    -------
    dict
        The content of ``dropout.json`` with ``"is_dropout"`` moved to
        ``"still_cogsci"`` and an added ``"name"`` entry.

    Raises
    ------
    ValueError
        If the archive holds no ``"Regular"`` conversation with participants.
        The earlier implementation re-scanned the archive forever in that
        case.
    """
    needed_two_person = 2
    participant_list = []
    num_two_person = 0
    for convo in yield_msg_files(data_path):
        # yield_msg_files hands back the member name (a str) for members that
        # are not double-encoded conversations, e.g. dropout.json itself;
        # indexing such a value with a string key would raise TypeError.
        if not isinstance(convo, dict):
            continue
        if convo.get("thread_type") != "Regular":
            continue
        participant_list.extend(convo["participants"])
        num_two_person += 1
        if num_two_person == needed_two_person:
            break

    if not participant_list:
        raise ValueError(
            f"no two-person conversation with participants found in {data_path}"
        )

    dropout_dict = read_zip_file(data_path, "dropout.json")
    # NOTE(review): the value is moved unchanged to the new key - confirm that
    # "is_dropout" and "still_cogsci" really share the same polarity.
    dropout_dict["still_cogsci"] = dropout_dict.pop("is_dropout")
    dropout_dict["name"] = find_most_common(participant_list)
    return dropout_dict

In [80]:
# Directory holding one zip archive per participant.
DATA_DIR = Path("./data")
# Sort the archives: glob makes no promise about the order of its results, so
# without sorting `data_paths[0]` and the row order of everything derived from
# this list could differ between machines and runs.
data_paths = sorted(DATA_DIR.glob("*.zip"))
# One dropout record per archive, in the same order as `data_paths`.
dropout_list = [fix_dropout_dict(zip_path) for zip_path in data_paths]

In [125]:
# Build the per-conversation edge frames for the first archive only.
data_path = data_paths[0]
num_files = count_msg_files(data_path)
df_list = []
for i, convo in enumerate(yield_msg_files(data_path)):
    print(f"processing convo {i+1} out of {num_files}...")
    if isinstance(convo, str):
        # yield_msg_files yields the member name for archive members that are
        # not conversations; there is nothing to process for those.
        continue
    thread_type = convo["thread_type"]
    if thread_type == "Regular":
        edges = process_msgs(convo)
    elif thread_type == "RegularGroup":
        edges = process_group_edges(convo)
    else:
        # Unknown thread type: report it and move on.
        print(thread_type)
        continue
    # process_msgs returns None for single-participant conversations; keep
    # df_list free of None instead of relying on pd.concat to drop them.
    if edges is not None:
        df_list.append(edges)


processing convo 1 out of 173...
processing convo 2 out of 173...
processing convo 3 out of 173...
processing convo 4 out of 173...
processing convo 5 out of 173...
processing convo 6 out of 173...
processing convo 7 out of 173...
processing convo 8 out of 173...
processing convo 9 out of 173...
processing convo 10 out of 173...
processing convo 11 out of 173...
processing convo 12 out of 173...
processing convo 13 out of 173...
processing convo 14 out of 173...
processing convo 15 out of 173...
processing convo 16 out of 173...
processing convo 17 out of 173...
processing convo 18 out of 173...
processing convo 19 out of 173...
processing convo 20 out of 173...
processing convo 21 out of 173...
processing convo 22 out of 173...
processing convo 23 out of 173...
processing convo 24 out of 173...
processing convo 25 out of 173...
processing convo 26 out of 173...
processing convo 27 out of 173...
processing convo 28 out of 173...
processing convo 29 out of 173...
processing convo 30 out

In [124]:
# Number of members in the archive; count_msg_files counts every zip entry.
count_msg_files(data_path)

173

In [108]:
# Stack the per-conversation edge frames into one table. Each frame keeps its
# own row labels, so index values repeat across conversations in master_df.
master_df = pd.concat(df_list)

In [112]:
# Distinct (target, relation type) pairs.
# NOTE(review): as laid out here this expression is not the last one in the
# cell, so its result would not be displayed - confirm whether it was meant to
# be a cell of its own.
master_df[["to", "rel_type"]].drop_duplicates()

# Identifier (40-character hex string) of the target to inspect.
sus_id = "d9fb861e9674011cd75bc0313afe1672bdd3d759"

# Every edge that points at that target.
master_df[master_df["to"] == sus_id]

Unnamed: 0,from,to,timestamp,rel_type
0,4b01401aca70648e91556d8666179b84ab1af68b,d9fb861e9674011cd75bc0313afe1672bdd3d759,1577092657453.0,msg
1,23f06d0b724d5b5ff893a269281d1a79f0bcb6b2,d9fb861e9674011cd75bc0313afe1672bdd3d759,1577087169190.0,msg
2,12688c448b8135515026f4d932de436b37083fae,d9fb861e9674011cd75bc0313afe1672bdd3d759,,group
3,23f06d0b724d5b5ff893a269281d1a79f0bcb6b2,d9fb861e9674011cd75bc0313afe1672bdd3d759,,group
4,4b01401aca70648e91556d8666179b84ab1af68b,d9fb861e9674011cd75bc0313afe1672bdd3d759,,group


In [119]:
# Rows without a timestamp. create_member_edges writes NaN timestamps for the
# participant -> group membership rows, so those are expected to show up here.
master_df[master_df["timestamp"].isna()]

Unnamed: 0,from,to,timestamp,rel_type
2,12688c448b8135515026f4d932de436b37083fae,d9fb861e9674011cd75bc0313afe1672bdd3d759,,group
3,23f06d0b724d5b5ff893a269281d1a79f0bcb6b2,d9fb861e9674011cd75bc0313afe1672bdd3d759,,group
4,4b01401aca70648e91556d8666179b84ab1af68b,d9fb861e9674011cd75bc0313afe1672bdd3d759,,group
127,96f594c25021c24ecac85799a70c168db6a5d4b5,fb17367811aa92906f8b12e767d35744efd94978,,group
128,1809bac6d0946f66ff2aca1d43a64cb451081d17,fb17367811aa92906f8b12e767d35744efd94978,,group
...,...,...,...,...
8,df674372f1074947d7976f02bd0f680cc16598b7,9ea0b6a4c29d19dfd4ba04d3c4c963c7b33c34d6,,group
9,4b01401aca70648e91556d8666179b84ab1af68b,9ea0b6a4c29d19dfd4ba04d3c4c963c7b33c34d6,,group
10,574a35bc5bc5040d1c5c99bf9a3bff3b3318d638,9ea0b6a4c29d19dfd4ba04d3c4c963c7b33c34d6,,group
11,42f809348fb5f78489f0a8e11a49129c6d127db4,9ea0b6a4c29d19dfd4ba04d3c4c963c7b33c34d6,,group


In [115]:
# One row per archive, built from the dicts returned by fix_dropout_dict.
dropout_df = pd.DataFrame(dropout_list)
dropout_df

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
Name: name, dtype: bool