In [1]:
import json
import hashlib
import pickle
import numpy as np
import random
import string
import os
import shutil
import subprocess
import sys
import pandas as pd
from zipfile import ZipFile
from collections import Counter
from pathlib import Path

In [2]:
def unzip_msg_files(zip_path, target_dir):
    """Extract every archive member whose name ends with ".json" into target_dir.

    Members with any other suffix are left inside the archive.
    """
    with ZipFile(zip_path, 'r') as archive:
        json_members = [name for name in archive.namelist() if name.endswith(".json")]
        for member in json_members:
            archive.extract(member, target_dir)

                
def yield_msg_files(zip_path):
    """Generate one item per member of the zip archive at zip_path.

    Each member is JSON-decoded and the result is JSON-decoded a second time
    (the members are expected to hold a JSON string that itself contains
    JSON). When decoding raises TypeError -- e.g. the first decode already
    produced a dict -- the member's name (a str) is yielded instead, so
    callers must be prepared to receive either a decoded object or a str.
    """
    with ZipFile(zip_path, 'r') as archive:
        for member in archive.namelist():
            with archive.open(member, "r") as handle:
                try:
                    item = json.loads(json.load(handle))
                except TypeError:
                    item = member
                yield item


def count_msg_files(zip_path):
    """Return the number of entries in the zip archive at zip_path.

    Every archive member is counted, whatever its name or content.
    """
    with ZipFile(zip_path, "r") as archive:
        member_names = archive.namelist()
    return len(member_names)
    

def read_zip_file(zip_path, file_name):
    """Parse the member file_name of the archive at zip_path as JSON and return it."""
    with ZipFile(zip_path, "r") as archive, archive.open(file_name, "r") as member:
        parsed = json.load(member)
    return parsed
                    
def read_json(file):
    """Open file in text mode and return its parsed JSON content."""
    with open(file, "r") as handle:
        content = json.load(handle)
    return content
    
def read_convo(file):
    """Load a conversation file and return the decoded object.

    The file is decoded once by read_json and the resulting value is decoded
    again with json.loads, i.e. the file is expected to contain a JSON string
    that itself holds JSON.
    """
    encoded_convo = read_json(file)
    return json.loads(encoded_convo)

def hash_name(name):
    """Return the hexadecimal SHA-1 digest of name (simplified: no salt is applied)."""
    encoded = name.encode()
    digest = hashlib.sha1(encoded)
    return digest.hexdigest()

def create_group_id(groupchat):
    """Derive a group id from the chat's participants.

    The entries of groupchat["participants"] are sorted, concatenated without
    a separator and hashed with hash_name, so the id does not depend on the
    order in which participants are listed.
    """
    ordered_names = sorted(groupchat["participants"])
    return hash_name("".join(ordered_names))

def find_most_common(participant_list):
    """Return the element that occurs most often in participant_list.

    Raises IndexError when participant_list is empty.
    """
    top_entries = Counter(participant_list).most_common(1)
    winner, _count = top_entries[0]
    return winner




def add_reactions(msg, rel_list):
    """Append one "reaction" relation to rel_list per entry of msg["reactions"].

    Each relation points from the reaction entry to the message's sender and
    reuses the message's own timestamp. Messages without a "reactions" key
    leave rel_list untouched. rel_list is modified in place; nothing is
    returned.
    """
    if "reactions" not in msg.keys():
        return
    for reactor in msg["reactions"]:
        rel_list.append({
            "from": reactor,
            "to": msg["sender_name"],
            "timestamp": msg["timestamp_ms"],
            "rel_type": "reaction",
        })

            
            
def create_member_edges(group_convo, group_id):
    """
    Build one participant --> group row per entry of group_convo["participants"].

    Every row has "to" set to group_id and "rel_type" set to "group".
    NB: membership edges carry no time information, so "timestamp" is NaN.
    """
    edge_columns = {
        "from": group_convo["participants"],
        "to": group_id,
        "timestamp": np.nan,
        "rel_type": "group",
    }
    return pd.DataFrame(edge_columns)

def process_group_messages(group_convo, group_id):
    """Turn a group conversation into a frame of message and reaction edges.

    One "msg" row (sender --> group_id) is produced per message, followed by
    the reaction rows collected by add_reactions. The conversation must have
    thread_type "RegularGroup" (checked with an assert).
    """
    assert group_convo["thread_type"] == "RegularGroup"
    messages = group_convo["messages"]
    edge_columns = ["from", "to", "timestamp", "rel_type"]
    # Pre-allocate one row per message; "to" and "rel_type" are constant.
    group_msgs = pd.DataFrame(index=range(len(messages)), columns=edge_columns)
    group_msgs = group_msgs.assign(to=group_id, rel_type="msg")
    reactions = []
    for row, message in enumerate(messages):
        group_msgs.loc[row, "from"] = message["sender_name"]
        group_msgs.loc[row, "timestamp"] = message["timestamp_ms"]
        add_reactions(message, reactions)
    reaction_edges = pd.DataFrame(reactions)
    return pd.concat([group_msgs, reaction_edges])

def process_group_edges(group_convo):
    """Run the full group-chat pipeline for one conversation.

    Returns the message/reaction edges followed by the participant --> group
    membership edges, re-indexed from 0.
    """
    group_id = create_group_id(group_convo)
    edge_frames = [
        process_group_messages(group_convo, group_id),
        create_member_edges(group_convo, group_id),
    ]
    combined = pd.concat(edge_frames)
    return combined.reset_index(drop=True)


def process_msgs(convo):
    """Turn a two-person conversation into a frame of message and reaction edges.

    Returns None when the conversation lists exactly one participant.
    Messages carrying a "call_duration" key are skipped. The conversation
    must have thread_type "Regular" (checked with an assert).
    """
    if len(convo["participants"]) == 1:
        return None
    assert convo["thread_type"] == "Regular"
    messages = convo["messages"]
    msgs = pd.DataFrame(index=range(len(messages)),
                        columns=["from", "to", "timestamp", "rel_type"])
    msgs = msgs.assign(rel_type="msg")
    reactions = []
    for row, message in enumerate(messages):
        if "call_duration" not in message.keys():
            msgs.loc[row, "from"] = message["sender_name"]
            msgs.loc[row, "to"] = message["receiver_name"]
            msgs.loc[row, "timestamp"] = message["timestamp_ms"]
            add_reactions(message, reactions)
    # Rows of skipped messages were never filled in and still have no sender.
    sent_msgs = msgs.dropna(subset=["from"])
    reaction_edges = pd.DataFrame(reactions)
    return pd.concat([sent_msgs, reaction_edges])


def fix_dropout_dict(data_path):
    """Load dropout.json from the archive, rename its key and add a name.

    The "is_dropout" entry is stored under the key "still_cogsci" instead,
    and "name" is set to the participant that occurs most often across the
    first two "Regular" conversations in the archive (presumably the archive
    owner, who is the only person expected in both -- confirm against the
    export format).

    The archive is scanned exactly once. If it holds fewer than two
    "Regular" conversations the owner cannot be told apart from the
    conversation partner, so "name" is set to None rather than guessed;
    a wrong guess would later be treated as a consenting participant.

    Raises KeyError if dropout.json has no "is_dropout" entry.
    """
    participant_list = []
    num_two_person = 0
    for convo in yield_msg_files(data_path):
        # yield_msg_files yields the member name (a str) for entries that are
        # not conversations, e.g. dropout.json itself; skip those.
        if not isinstance(convo, dict):
            continue
        if convo["thread_type"] == "Regular":
            num_two_person += 1
            participant_list.extend(convo["participants"])
            if num_two_person == 2:
                break

    dropout_dict = read_zip_file(data_path, "dropout.json")
    dropout_dict["still_cogsci"] = dropout_dict.pop("is_dropout")
    if num_two_person == 2:
        dropout_dict["name"] = find_most_common(participant_list)
    else:
        dropout_dict["name"] = None
    return dropout_dict


def process_person(data_path):
    """
    Process every conversation in one person's zip archive.

    "Regular" threads go through process_msgs and "RegularGroup" threads
    through process_group_edges; any other thread type is only printed.
    Archive entries that yield_msg_files hands back as a str are skipped.
    Returns the concatenated frame, or None when pd.concat raises ValueError
    (e.g. nothing was collected).
    """
    frames = []
    for convo in yield_msg_files(data_path):
        if type(convo) is str:
            continue
        thread_type = convo["thread_type"]
        if thread_type == "Regular":
            frames.append(process_msgs(convo))
        elif thread_type == "RegularGroup":
            frames.append(process_group_edges(convo))
        else:
            print(thread_type)
    try:
        combined = pd.concat(frames)
    except ValueError:
        combined = None
    return combined


def create_dropout_df(data_paths):
    """Build a frame with one row per archive from its fixed-up dropout.json.

    Each path in data_paths is passed to fix_dropout_dict; the resulting
    dicts become the rows, in the order of data_paths.
    """
    dropout_records = [fix_dropout_dict(data_path) for data_path in data_paths]
    return pd.DataFrame(dropout_records)

def anonymize_filename(data_path):
    """Rename data_path so that everything from the first " -" on is dropped.

    "<prefix> - <real name>.zip" becomes "<prefix>.zip" in the same folder
    (the real name is appended by the upload form). A file whose name does
    not contain " -" is left untouched.

    NOTE: Path.rename may silently replace an existing file with the target
    name on Unix, so two uploads sharing a prefix would collide.
    """
    marker_pos = data_path.name.find(" -")
    if marker_pos == -1:
        # str.find signals "not found" with -1; slicing with it would chop the
        # final character and yield names like "data-1.zi.zip".
        return
    new_name = data_path.parent / (data_path.name[:marker_pos] + ".zip")
    data_path.rename(new_name)
    
def anonymize_folder(data_folder):
    """Apply anonymize_filename to every file in data_folder matching "*-*.zip"."""
    for zip_file in data_folder.glob("*-*.zip"):
        anonymize_filename(zip_file)
        
def find_unique_ids(dropout_df, full_df):
    """Return the set of ids (people and groups) used for consent filtering.

    People are the distinct values of dropout_df["name"]; groups are the
    distinct "to" values of the rows in full_df whose rel_type is "group".
    The groups are taken from the full_df argument, not from a notebook
    global, so the result depends only on the inputs.
    """
    unique_people = dropout_df["name"].unique()
    is_group_edge = full_df["rel_type"] == "group"
    unique_groups = full_df.loc[is_group_edge, "to"].unique()
    return set(unique_people) | set(unique_groups)


def filter_consent(full_df):
    """Keep only the rows of full_df whose "from" and "to" are both known ids.

    The known ids come from find_unique_ids. NOTE: dropout_df is not a
    parameter; it is read from the notebook's global namespace, so the cell
    that creates it must have run first.
    """
    unique_ids = find_unique_ids(dropout_df, full_df)
    sender_ok = full_df["from"].isin(unique_ids)
    receiver_ok = full_df["to"].isin(unique_ids)
    return full_df[sender_ok & receiver_ok]

In [12]:
# Folder holding one zip archive per participant.
DATA_DIR = Path("./data")
# Strip real names from the uploaded file names before anything else reads them.
anonymize_folder(DATA_DIR)
# Path.glob yields entries in arbitrary order; sort so that "person i" and the
# first-wins de-duplication further down are reproducible between runs.
data_paths = sorted(DATA_DIR.glob("*.zip"))

In [13]:
# One row per archive, built from its dropout.json plus the inferred name.
# filter_consent reads this frame from the global namespace.
dropout_df = create_dropout_df(data_paths)

In [14]:
# One result slot per archive; None marks "not processed yet", so the
# processing loop can be re-run and only fill the gaps.
data_list = [None] * len(data_paths)

In [15]:
# Fill every still-empty slot of data_list; slots that already hold a result
# are skipped, so this cell can be re-run after an interruption.
for i, data_path in enumerate(data_paths):
    if data_list[i] is not None:
        continue
    print(f"processing person {i+1} out of {len(data_paths)}...")
    try:
        data_list[i] = process_person(data_path)
    except FileNotFoundError:
        # Leave the slot as None and move on to the next archive.
        print("no file here")
print("all done!")

processing person 1 out of 29...
Pending
processing person 2 out of 29...
processing person 3 out of 29...
PendingGroup
PendingGroup
Pending
processing person 4 out of 29...
Pending
Pending
Pending
processing person 5 out of 29...
processing person 6 out of 29...
Pending
Pending
Pending
processing person 7 out of 29...
Pending
PendingGroup
Pending
Pending
Pending
processing person 8 out of 29...
processing person 9 out of 29...
processing person 10 out of 29...
processing person 11 out of 29...
PendingGroup
processing person 12 out of 29...
PendingGroup
processing person 13 out of 29...
Pending
processing person 14 out of 29...
Pending
Pending
processing person 15 out of 29...
PendingGroup
Pending
processing person 16 out of 29...
Pending
processing person 17 out of 29...
processing person 18 out of 29...
Pending
processing person 19 out of 29...
processing person 20 out of 29...
processing person 21 out of 29...
PendingGroup
Pending
processing person 22 out of 29...
Pending
Pending
pr

In [19]:
# Stack the per-person frames into one edge list. Slots that are still None
# are dropped by pd.concat; it raises ValueError if every slot is None.
master_df = pd.concat(data_list)

In [20]:
# The same conversation shows up in the archive of every participant, so
# identical edges are de-duplicated. The whole edge is compared, not just the
# timestamp: group membership edges all have a NaN timestamp and reactions
# reuse their message's timestamp, so comparing on timestamp alone would
# collapse all memberships into one row and discard the reactions.
unique_master = master_df.drop_duplicates(subset=["from", "to", "timestamp", "rel_type"])
consent_df = filter_consent(unique_master)

In [21]:
# Write the consent-filtered edge list to disk (the frame's index is written
# as the first, unnamed CSV column).
consent_df.to_csv("all_messages.csv")