In [None]:
import os
import json
from datetime import datetime
import pandas as pd

def convert_to_datetime(timestamp):
    """
    Converts a Unix timestamp in milliseconds to a naive Python datetime in UTC.

    The result is expressed in UTC (with tzinfo stripped) so that it is directly
    comparable with the naive-UTC values that convert_string_to_datetime produces
    from '...Z' strings. A plain datetime.fromtimestamp() call would instead return
    local wall-clock time, shifting values by the machine's UTC offset.

    Args:
        timestamp (int | float): Milliseconds since the Unix epoch.

    Returns:
        datetime: A naive datetime object representing the instant in UTC.
    """
    from datetime import timezone

    # Convert timestamp from milliseconds to seconds
    timestamp_in_seconds = timestamp / 1000
    # Build the value in UTC, then drop tzinfo to keep the naive-datetime contract
    dt = datetime.fromtimestamp(timestamp_in_seconds, tz=timezone.utc)
    return dt.replace(tzinfo=None)

def convert_string_to_datetime(date_string):
    """
    Parses a timestamp string such as '2023-08-08T23:26:55.761Z' into a datetime.

    The trailing 'Z' is matched as a literal character, so the returned value
    is naive (it carries no tzinfo).

    Args:
        date_string (str): Text of the form 'YYYY-MM-DDTHH:MM:SS.ffffffZ';
            the fractional-seconds part is required.

    Returns:
        datetime: The parsed, timezone-naive datetime object.

    Raises:
        ValueError: If date_string does not match the expected layout.
    """
    return datetime.strptime(date_string, '%Y-%m-%dT%H:%M:%S.%fZ')


def _read_json_lines(path):
    """Decode every non-blank line of the file at ``path`` as one JSON document.

    Args:
        path (str): Path of a newline-delimited JSON file.

    Returns:
        list: The decoded value of each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8: open() otherwise falls back to the locale codec, which can
    # mangle or reject non-ASCII chat text (assumes the exports are UTF-8 JSON).
    with open(path, "r", encoding="utf-8") as fh:
        # Blank lines (e.g. a trailing newline) carry no data and would make
        # json.loads raise, so they are skipped.
        return [json.loads(line) for line in fh if line.strip()]


def _extract_chat_records(batches):
    """Flatten decoded message batches into one record dict per usable message.

    A message is skipped when it has no 'content' key, or when it has neither a
    'timestamp' nor an 'edited' key.

    Args:
        batches (iterable): Decoded JSON lines; each one is iterated to obtain
            the individual message dicts.

    Returns:
        list[dict]: Records with the keys 'channel', 'author', 'id',
        'mention_list', 'replies_list', 'timestamp', 'content', 'react_cnt'
        and 'react_dict'.
    """
    records = []
    for batch in batches:
        for message in batch:
            if 'content' not in message:
                continue
            if 'timestamp' not in message and 'edited' not in message:
                continue

            react_dict = message.get('reaction_counts', {})

            # When a message was edited, the edit time (an ISO string) is used in
            # place of the original millisecond timestamp.
            if 'edited' in message:
                dt = convert_string_to_datetime(message['edited'])
            else:
                dt = convert_to_datetime(message['timestamp'])

            records.append({'channel': message['channel'],
                            'author': message['author'],
                            'id': message['_id'],
                            'mention_list': message.get('mentions', []),
                            'replies_list': message.get('replies', []),
                            'timestamp': dt,
                            'content': message['content'],
                            # sum() of an empty mapping is 0, so no special case is needed
                            'react_cnt': sum(react_dict.values()),
                            'react_dict': react_dict
                            })
    return records


def return_channel_chats(base_dir):
    """Load every chat message stored under the per-room sub-directories of ``base_dir``.

    Each entry of ``base_dir`` is treated as a directory, and each file inside it
    as a newline-delimited JSON export of message batches.

    Args:
        base_dir (str): Directory that contains one sub-directory per chat room.

    Returns:
        list[dict]: One record per usable message (see _extract_chat_records).

    Raises:
        json.JSONDecodeError: If any file contains a malformed JSON line.
    """
    chat_result = []
    # sorted() makes the row order reproducible; os.listdir order is arbitrary.
    for name in sorted(os.listdir(base_dir)):
        subdir = os.path.join(base_dir, name)
        for file in sorted(os.listdir(subdir)):
            batches = _read_json_lines(os.path.join(subdir, file))
            chat_result.extend(_extract_chat_records(batches))

    return chat_result


def return_staff_chats(base_dir):
    """Load every chat message stored in the files directly inside ``base_dir``.

    Unlike return_channel_chats, a file containing malformed JSON is reported on
    stdout and skipped rather than aborting the whole load.

    Args:
        base_dir (str): Directory that contains the newline-delimited JSON exports.

    Returns:
        list[dict]: One record per usable message (see _extract_chat_records).
    """
    chat_result = []
    # sorted() makes the row order reproducible; os.listdir order is arbitrary.
    for file in sorted(os.listdir(base_dir)):
        path = os.path.join(base_dir, file)
        try:
            batches = _read_json_lines(path)
        except json.JSONDecodeError as exc:
            # Best-effort load: name the offending file instead of dumping its
            # entire contents, and carry on with the remaining files.
            print(f"Skipping {path}: invalid JSON ({exc})")
            continue

        chat_result.extend(_extract_chat_records(batches))

    return chat_result

def sort_by_react_count(data):
    """Return a new list of the records in ``data`` ordered by 'react_cnt', highest first.

    Records with equal counts keep their original relative order; ``data``
    itself is not modified.
    """
    def reaction_total(record):
        return record['react_cnt']

    ranked = list(data)
    ranked.sort(key=reaction_total, reverse=True)
    return ranked

# Staff chats: export files sit directly inside this directory.
base_dir = "D:/andrew_tate/staff_chats"
chat_result_staff_chat = pd.DataFrame(return_staff_chats(base_dir))

# Private channels: one sub-directory per chat room.
base_dir = "D:/andrew_tate/private"
chat_result_private = pd.DataFrame(return_channel_chats(base_dir))

# Public channels: same layout as the private ones.
base_dir = "D:/andrew_tate/public"
chat_result_public = pd.DataFrame(return_channel_chats(base_dir))

# Optional (not run): rank a list of message records by total reactions, e.g.
# sorted_chat = sort_by_react_count(chat_result)

In [None]:
import pickle
# Persist each chat DataFrame to its own pickle file in the current working directory.
for _pkl_name, _frame in (
    ('staff_chat_df.pkl', chat_result_staff_chat),
    ('private_chat_df.pkl', chat_result_private),
    ('public_chat_df.pkl', chat_result_public),
):
    with open(_pkl_name, 'wb') as fh:
        pickle.dump(_frame, fh)

In [None]:
users_fpath = "D:/andrew_tate/users.json"

# The file is read as newline-delimited JSON (one user document per line).
# Explicit UTF-8: open() otherwise falls back to the locale codec, which can
# mangle or reject non-ASCII usernames (assumes the export is UTF-8 JSON).
with open(users_fpath, 'r', encoding='utf-8') as fh:
    _users = fh.readlines()

# Blank lines (e.g. a trailing newline) carry no data and would make json.loads raise.
users = [json.loads(line) for line in _users if line.strip()]

# Flatten the nested 'user' / 'member' sections of each document into one row.
user_records = []
for user in users:
    account = user['user']
    membership = user['member']
    user_records.append({
        'user_id': account['_id'],
        'coin_balance': account['coin_balance'],
        'score': account['score'],
        'username': account['username'],
        'joined_at': convert_string_to_datetime(membership['joined_at']),
        # 'attributes' and 'roles' are optional; default to an empty list
        'attributes': account.get('attributes', []),
        'roles': membership.get('roles', [])
    })

user_df = pd.DataFrame(user_records)

# Persist the user table next to the chat pickles.
with open('user_df.pkl', 'wb') as fh:
    pickle.dump(user_df, fh)