Here we merge a directory of chat logs. Given the path of the directory and the name of an output file, we write a single JSON file containing all of the logs.

Note that we run `file_formatting` on each individual JSON file to add the proper delimiters and formatting, then run it again on the merged output file so that the final formatting is consistent.

In [6]:
import json 
import os
import argparse
import dateutil.parser
import numpy

#Formats file into JSON file
def file_formatting(fp):
    """Collect every JSON structure found in a file into one JSON array string.

    The file may hold several JSON values back to back. Scanning always jumps
    to the next ``{`` when one exists and only falls back to the next ``[``
    when no ``{`` remains, so the members of a top-level array of objects are
    collected one by one rather than as a single nested list.

    Args:
        fp: Path of the file to read. It is decoded as UTF-8 (the encoding
            RFC 8259 specifies for JSON interchange) rather than with the
            platform default, so non-ASCII text reads the same everywhere.

    Returns:
        A string holding a JSON array (``indent=2``) of every structure
        decoded before the first decoding failure; ``"[]"`` if none is found.
    """
    decoder = json.JSONDecoder()
    with open(fp, encoding="utf-8") as f:
        data = f.read()

    parsed = []  # individually decoded JSON structures, in file order
    head = 0  # current scan position in data
    while True:
        # Prefer the next object; fall back to an array only if no '{' is left.
        start = data.find('{', head)
        if start == -1:
            start = data.find('[', head)
        if start == -1:
            break  # nothing left that could start a JSON structure
        try:
            struct, head = decoder.raw_decode(data, start)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            break  # stop at the first opener that is not valid JSON
        parsed.append(struct)
    return json.dumps(parsed, indent=2)

#Merges a directory of JSON files into one
def merge_chats(dir, output):
    """Merge every regular file directly inside ``dir`` into one JSON file.

    Each file is normalised with ``file_formatting`` and appended to
    ``output``; the concatenation is then run through ``file_formatting``
    once more so the final file holds a single JSON array.

    Args:
        dir: Directory holding the log files (subdirectories are ignored).
        output: Path of the merged file; it is overwritten if it exists.
    """
    with open(output, "w") as out:
        # os.listdir() returns names in arbitrary order; sort them so the
        # merged file is reproducible between runs and machines.
        for name in sorted(os.listdir(dir)):
            path = os.path.join(dir, name)
            if not os.path.isfile(path):
                continue
            formatted = file_formatting(path)
            # Skip files that produced no structures: an empty "[]" written
            # after the last object would be picked up by the second pass
            # below as a stray empty-list element.
            if formatted.strip() != "[]":
                out.write(formatted)
    # Second pass: turn the concatenated arrays into one flat array.
    merged = file_formatting(output)
    with open(output, "w") as out:
        out.write(merged)

#Merge all chats into one json file
chat_dir, chat_out = "Conti Chat Logs 2020", "logs/chat_logs.json"
merge_chats(chat_dir, chat_out)

#Merge all jabber into one json file
jabber_dir, jabber_out = (
    "Conti Jabber Chat Logs 2021 - 2022",
    "logs/jabber_logs.json",
)
merge_chats(jabber_dir, jabber_out)

Here we build a list of the unique users found in an input JSON file.

In [7]:
#Returns a unique np array of users from a JSON file
def user_parser(data_json):
    """Extract the unique usernames from a chat-log JSON file.

    Every record's ``from`` and ``to`` value is lower-cased and cut at the
    first ``@``; the record is updated in place with that bare username.

    Args:
        data_json: Path to a JSON file holding a list of records, each with
            string ``from`` and ``to`` entries.

    Returns:
        A tuple ``(users, records)``: ``users`` is the ``numpy.unique``
        array (sorted, no duplicates) of all usernames seen, ``records`` is
        the loaded data with ``from``/``to`` rewritten.

    Raises:
        KeyError: If a record has no ``from`` or ``to`` entry.
    """
    with open(data_json) as f:
        records = json.load(f)

    users = []
    for record in records:
        # Sender and recipient are normalised the same way.
        for field in ('from', 'to'):
            username = record[field].lower().split('@')[0]
            record[field] = username
            users.append(username)
    return numpy.unique(numpy.array(users)), records

#Updates master files with only username
def clean_users(data_json, output):
    """Overwrite ``output`` with the JSON serialisation of ``data_json``.

    Args:
        data_json: JSON-serialisable data (e.g. the records returned by
            ``user_parser``).
        output: Path of the file to (re)write.
    """
    with open(output, 'w') as handle:
        serialised = json.dumps(data_json)
        handle.write(serialised)

# Reduce every from/to value to the part before '@', then write the updated
# records back over the same merged file.
jabber_users, jab_json_with_usernames = user_parser(data_json=jabber_out)
clean_users(data_json=jab_json_with_usernames, output=jabber_out)

chat_users, chat_json_with_usernames = user_parser(data_json=chat_out)
clean_users(data_json=chat_json_with_usernames, output=chat_out)

Our time parser accepts the path to a JSON file and returns the parsed data with each `ts` timestamp converted to a `datetime` object.

In [8]:
#Returns JSON object with datetime value 
def time_parser(data_json):
    """Load a JSON file and convert every record's ``ts`` to a datetime.

    Args:
        data_json: Path to a JSON file holding a list of records, each with
            an ISO 8601 string under ``ts``.

    Returns:
        The loaded records, with each ``ts`` replaced by the value
        ``dateutil.parser.isoparse`` returns for it. The result therefore
        can no longer be passed to ``json.dumps`` without a custom
        ``default``.

    Raises:
        KeyError: If a record has no ``ts`` entry.
        ValueError: If a ``ts`` value is not a valid ISO 8601 string.
    """
    with open(data_json) as f:
        records = json.load(f)
    for record in records:
        record['ts'] = dateutil.parser.isoparse(record['ts'])  # ISO 8601 extended format
    return records


Some basic analysis of, and numbers about, our users:

In [10]:
#Evaluates user lists
# Users present in both data sets, and users exclusive to each one.
both = [name for name in jabber_users if name in chat_users]
unique_chat = [name for name in chat_users if name not in jabber_users]
unique_jabber = [name for name in jabber_users if name not in chat_users]
all_users = numpy.union1d(jabber_users, chat_users)

# Save each list as plain text, one username per line.
for _path, _names in (
    ('user_lists/users.txt', all_users),
    ('user_lists/jabber_unique_users.txt', unique_jabber),
    ('user_lists/chat_unique_users.txt', unique_chat),
):
    with open(_path, 'w') as file:
        numpy.savetxt(file, _names, fmt='%s')

# Print the size of each group followed by its members.
for _label, _names in (
    ("Number of users in both:", both),
    ("Number of users only in Jabber:", unique_jabber),
    ("Number of users only in Chat:", unique_chat),
):
    print(_label, len(_names))
    print(numpy.array(_names))

Number of users in both: 183
['admin' 'admintest' 'admu' 'ahtyng' 'alarm' 'alarm2' 'alaska' 'ali'
 'alter' 'andy' 'atlant' 'axel' 'azot' 'baget' 'baly' 'band' 'batka'
 'baxter' 'bekeeper' 'bentley' 'bill' 'bob' 'boby' 'bonen' 'bourbon' 'bra'
 'braun' 'brom' 'buggati' 'bullet' 'bumer' 'buza' 'calmar' 'cany' 'carter'
 'casper' 'ceram' 'chaos' 'chip' 'clipper' 'contisupport' 'cooler'
 'cosmos' 'cruz' 'dandis' 'dandmen' 'darc' 'defender' 'deploy' 'derek'
 'dick' 'dominik' 'doomsday' 'dove' 'driver' 'electronic' 'elon' 'fast'
 'flip' 'food' 'forbes' 'ford' 'forest' 'fox' 'frank' 'frog' 'ganesh'
 'germes' 'ghost' 'globus' 'grajdanin' 'grant' 'green' 'gringo' 'grom'
 'gus' 'hash' 'hlor' 'hof' 'idgo' 'jumbo' 'kagas' 'kaktus' 'kent'
 'kerasid' 'kevin' 'killer' 'kingston' 'kramer' 'kran' 'lemur' 'leo'
 'licor' 'log' 'logan' 'longer' 'love' 'lucas' 'mango' 'many' 'marcus'
 'marsel' 'matiz' 'mavelek' 'mavemat' 'max' 'mentos' 'merch' 'miguel'
 'modar' 'modnik' 'moms' 'moon' 'morgan' 'mors' 'muchach