In [None]:
import os
import json
import pandas as pd
import numpy as np
from datetime import datetime

# String Finder

In [None]:
def search_strings_in_json(directory, strings_to_find):
    """Recursively search the ``.json`` files under ``directory`` for substrings.

    Matching is case-insensitive: both the file content and every search
    string are lower-cased before comparison. Files are treated as plain
    text (they are not parsed as JSON).

    Args:
        directory: Root folder to walk with ``os.walk``.
        strings_to_find: Iterable of ``str`` values to look for.

    Returns:
        A list of ``(filepath, string)`` tuples, one per file/search-string
        pair that matched. ``string`` is the value exactly as the caller
        passed it.

    Files that cannot be opened or decoded as UTF-8 are reported with
    ``print`` and skipped, so one bad file does not abort the search.
    """
    # Lower-case each needle once, keeping the caller's spelling for reporting.
    needles = [(string, string.lower()) for string in strings_to_find]

    matches = []
    for root, _dirs, files in os.walk(directory):
        for file in files:
            if not file.endswith('.json'):
                continue
            filepath = os.path.join(root, file)
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    # Lower-case the content once per file, not once per needle.
                    content = f.read().lower()
            except Exception as e:
                print(f"Error reading {filepath}: {e}")
                continue
            for string, lowered in needles:
                if lowered in content:
                    matches.append((filepath, string))
    return matches

# Identifiers to search for. Previously used search sets are kept for reference:
# strings_to_find = ['2307148732639320', '2090275401330376', '2090275404663709', '100056956201608']
strings_to_find = ['2328713727338469']

#company info and company group:61565211886319 --> people_sets_1 & child_groups, 1910535509304367

#7775445322516960
#565159963841935
#1910351919322726
#100040716736805

# Folder to scan; alternative export folders are left commented out.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
# directory = '/Users/dan/Documents/local_code/test/data/woolies/workplace/organization'
# directory = '/Users/dan/Documents/local_code/test/data/woolies/workplace/groups'
directory = '/Users/dan/Documents/local_code/test/data/woolies/workplace/user_profiles'
matches = search_strings_in_json(directory, strings_to_find)

# Print every hit, then the total number of hits (0 when nothing matched).
count = 0
for count, match in enumerate(matches, start=1):
    print(f"Found '{match[1]}' in file: {match[0]}")
print(count)

# JSON to pkl converter

In [None]:
import os
import json
import pandas as pd
import itertools
import pickle

def read_json_files(root_dir):
    """Load one JSON file, or every JSON file under a directory, into a dict.

    Args:
        root_dir: Either a path ending in ``.json`` (a single file) or a
            directory that is walked recursively with ``os.walk``.

    Returns:
        A dict mapping a key to the parsed JSON content.

        * Single file: the key is the file name up to its first ``.``
          (``.../post_campaigns_1.json`` -> ``"post_campaigns_1"``).
        * Directory: the key is ``<containing folder>/<file name>``, so
          files with the same name in different subfolders get distinct
          keys instead of overwriting one another.

    Files containing malformed JSON are reported with ``print`` and left out
    of the result. Errors raised while opening a file are not caught.
    """
    data = {}

    def load_into(key, path):
        # Parse one file; malformed JSON is reported and skipped, not raised.
        with open(path, 'r', encoding='utf-8') as f:
            try:
                data[key] = json.load(f)
            except json.JSONDecodeError as e:
                print(f"Error reading {path}: {e}")

    if root_dir.endswith('.json'):
        load_into(os.path.basename(root_dir).split('.')[0], root_dir)
    else:
        for root, _dirs, files in os.walk(root_dir):
            for file in files:
                if file.endswith('.json'):
                    # Key on the folder actually being walked. For files
                    # directly under root_dir this is the same key as before;
                    # nested files no longer collide on their bare file name.
                    load_into(root + '/' + file, os.path.join(root, file))
    return data

def extract_data(data):
    """Flatten parsed JSON content into a DataFrame with MultiIndex columns.

    Args:
        data: Dict (as returned by ``read_json_files``) whose values are
            either a list of item dicts or a single item dict.

    Returns:
        A ``pandas.DataFrame`` with one row per extracted record. Column
        labels are tuples:

        * ``(name, name)`` for the basic fields ``timestamp``, ``media``,
          ``fbid`` and ``ent_name``;
        * ``(ent_field_name, label)`` for plain ``label_values`` entries;
        * ``(ent_field_name, label, entry_label)`` for ``dict`` entries;
        * ``(key,)`` for items that have no ``label_values`` at all.

        When nothing yields a column (empty input, or only empty items) the
        plain column-less DataFrame is returned instead of raising.

    Repeated values for the same column within one record are collected
    into a list. Falsy values such as ``0``, ``""`` and ``False`` are kept
    rather than being dropped or overwritten.
    """
    basic_fields = ('timestamp', 'media', 'fbid', 'ent_name')
    records = []

    def pick_value(entry):
        # Same precedence as `value or timestamp_value`, except that a real
        # falsy 'value' (0, "", False) is kept when there is no
        # 'timestamp_value' to fall back on, instead of turning into None.
        value = entry.get('value')
        if not value:
            timestamp_value = entry.get('timestamp_value')
            if timestamp_value is not None:
                value = timestamp_value
        return value

    def check_exists_append(record, key, value):
        # Only a missing or None slot counts as empty; testing truthiness
        # here would silently overwrite stored values such as 0 or "".
        existing = record.get(key)
        if existing is None:
            record[key] = value
        elif isinstance(existing, list):
            existing.append(value)
        else:
            record[key] = [existing, value]

    def process_data(item, parent_record, parent_labels):
        # An item carrying any basic field starts a fresh record; otherwise
        # work on a copy of the parent's record so the parent is not mutated.
        if any(field in item for field in basic_fields):
            record = {}
        else:
            record = parent_record.copy()
        labels = parent_labels.copy()

        # Extract basic fields
        for field in basic_fields:
            if field in item:
                record[field] = item[field]

        # Process 'label_values'
        label_values = item.get('label_values', [])
        if label_values:
            for lv in label_values:
                ent_field_name = lv.get('ent_field_name', '')
                label = lv.get('label', '')
                key_name = [ent_field_name, label]
                # Labels inherited from an enclosing 'vec' take precedence
                # over this entry's own (ent_field_name, label) pair.
                prefix = labels if labels else key_name

                if 'value' in lv or 'timestamp_value' in lv:
                    check_exists_append(record, tuple(prefix), pick_value(lv))
                elif 'vec' in lv:
                    # NOTE(review): each nested item is processed against
                    # parent_record (not the record being built here) and is
                    # appended to the output as its own record, ahead of the
                    # current one. Kept as-is — confirm this is intended.
                    nested_items = lv.get('vec')
                    if nested_items:
                        for nested_item in nested_items:
                            process_data({'label_values': [nested_item]}, parent_record, key_name)
                elif 'dict' in lv:
                    for entry in lv.get('dict', {}):
                        temp_label = entry.get('label', '')
                        check_exists_append(record, tuple(prefix + [temp_label]), pick_value(entry))
        else:
            # Handle case for generic JSON file: every key becomes a 1-tuple column
            for key, value in item.items():
                check_exists_append(record, (key,), value)
        records.append(record.copy())

    # Each value of `data` is either a list of items or a single item
    for data_value in data.values():
        if isinstance(data_value, list):
            for item in data_value:
                process_data(item, {}, [])
        else:
            process_data(data_value, {}, [])

    def create_multiindex_df(rows):
        # Give every column a tuple label; plain string keys become (k, k)
        def flatten_dict(d):
            return {(k if isinstance(k, tuple) else (k, k)): v for k, v in d.items()}

        df = pd.DataFrame([flatten_dict(d) for d in rows])

        column_tuples = df.columns.tolist()
        if len(column_tuples) == 0:
            # MultiIndex.from_tuples raises TypeError on an empty list, so
            # return the column-less frame unchanged.
            return df

        multi_index = pd.MultiIndex.from_tuples(column_tuples)
        return pd.DataFrame(df.values, columns=multi_index)

    return create_multiindex_df(records)


def convert_to_csv(root_dir):
    """Convert every ``.json`` file under ``root_dir`` into a pickled DataFrame.

    Despite its name, this function writes pickle (``.pkl``) files, not CSV.
    The folder structure of ``root_dir`` is mirrored into a sibling folder
    named ``<root_dir>_pkl_converted``; each ``name.json`` becomes
    ``name.pkl`` at the same relative location.

    Args:
        root_dir: Directory to convert. A trailing path separator is
            tolerated. An empty string converts nothing.

    Raises:
        AssertionError: If ``root_dir`` ends with ``.json`` (a file path was
            passed instead of a directory).

    Each file is loaded with ``read_json_files`` and flattened with
    ``extract_data``; the path of every file written is printed.
    """
    assert not root_dir.endswith('.json'), f"ERROR: Path must be a directory not a file {root_dir}"

    if not root_dir:
        # os.walk('') yields nothing; keep that no-op instead of letting
        # normpath turn '' into '.' and converting the working directory.
        return

    # Normalise first so a trailing slash cannot shift which path component
    # is treated as the source folder's name.
    source_root = os.path.normpath(root_dir)
    output_base = os.path.join(
        os.path.dirname(source_root),
        os.path.basename(source_root) + "_pkl_converted",
    )

    for root, _dirs, files in os.walk(source_root):
        # Mirror this folder's position relative to the source root.
        relative_dir = os.path.relpath(root, source_root)
        output_root = os.path.normpath(os.path.join(output_base, relative_dir))

        for file in files:
            if not file.endswith('.json'):
                continue

            # Process the file
            filepath = os.path.join(root, file)
            temp_data = read_json_files(filepath)
            temp_records = extract_data(temp_data)

            # Create output directory if it doesn't exist
            os.makedirs(output_root, exist_ok=True)

            # Swap only the trailing extension; str.replace would also
            # rewrite a '.json' appearing earlier in the file name.
            output_filename = os.path.splitext(file)[0] + '.pkl'
            output_path = os.path.join(output_root, output_filename)
            temp_records.to_pickle(output_path, protocol=pickle.HIGHEST_PROTOCOL)
            print(f"Saved {output_path}")

# Example use

In [None]:
# Extract and view single file
# NOTE(review): this path is absolute (rooted at '/'), whereas the directory
# conversion cell uses the relative folder 'workplace' — confirm which is intended.
temp_data = read_json_files('/workplace/organization/post_campaigns_1.json')
# Flatten the parsed JSON into a DataFrame with MultiIndex columns.
temp_records = extract_data(temp_data)
# Bare last expression so the notebook renders the resulting DataFrame.
temp_records

# Convert whole directory to pkl

In [None]:
# Despite its name, convert_to_csv writes one pickled DataFrame (.pkl) per JSON
# file, mirroring the folder tree into 'workplace_pkl_converted'.
root_dir = 'workplace'
convert_to_csv(root_dir)