## Libraries

In [1]:
import pandas as pd
import numpy as np
import json

## Functions

### Flattener function

In [2]:
def _columns_holding(df, kind):
    """Return the labels of the columns of ``df`` that contain at least one
    cell whose exact type is ``kind`` (e.g. ``list`` or ``dict``).

    Cells are scanned with plain Python iteration instead of
    ``DataFrame.applymap``, which is deprecated in recent pandas releases.
    """
    return [
        label
        for position, label in enumerate(df.columns)
        if any(type(cell) is kind for cell in df.iloc[:, position])
    ]


def flatten_nested_df(df):
    """Flatten list- and dict-valued columns of ``df`` until none can be expanded.

    List columns are exploded into one row per element (the index is reset
    after each explode). Dict columns are expanded with ``pd.json_normalize``
    into new columns named ``"<column>.<key>"`` and the source column is
    dropped. The process repeats because exploding a list may reveal dicts
    and vice versa.

    Columns that fail to expand are reported with ``print`` and left as they
    are; the loop stops once the only nested columns left are ones that
    already failed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells may hold Python lists or dicts.

    Returns
    -------
    pandas.DataFrame
        The flattened frame. If an unexpected error occurs, the frame as it
        stood at that point is returned (best effort).
    """
    # Sets instead of lists: only membership matters for the stop condition,
    # and repeated failures of the same column should not pile up.
    list_error = set()
    dict_error = set()

    try:
        list_columns = _columns_holding(df, list)
        dict_columns = _columns_holding(df, dict)

        # Keep going while nested columns exist and at least one of them has
        # not already been recorded as un-expandable.
        while (list_columns or dict_columns) and (
            set(list_columns) != list_error or set(dict_columns) != dict_error
        ):
            for x in list_columns:
                try:
                    # ignore_index=True == explode followed by reset_index(drop=True)
                    df = df.explode(x, ignore_index=True)
                except Exception:
                    # Exception (not a bare except) so a kernel interrupt
                    # still stops the run.
                    print(f"Error en: {x}")
                    list_error.add(x)

            for y in dict_columns:
                try:
                    non_null = df[y].dropna()
                    df_new = pd.json_normalize(non_null.tolist()).add_prefix(f'{y}.')
                    # json_normalize on a list yields a fresh 0..k-1 index.
                    # Restore the original row labels so the index merge puts
                    # each expanded dict on the row it came from, even when
                    # the column contains nulls.
                    df_new.index = non_null.index
                    df = df.merge(df_new, how='left', left_index=True, right_index=True)
                    del df[y]
                except Exception:
                    print(f"Error en: {y}")
                    dict_error.add(y)

            list_columns = _columns_holding(df, list)
            dict_columns = _columns_holding(df, dict)

        return df
    except Exception:
        # Best effort: hand back whatever has been flattened so far.
        return df




### Verify flattening

In [3]:
def verify_flattening(df):
    """Print which columns of ``df`` still contain list or dict cells.

    A fully flattened frame prints two empty lists. Nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    """
    def columns_holding(kind):
        # Plain iteration over the cells replaces DataFrame.applymap, which is
        # deprecated in recent pandas releases.
        return [
            label
            for position, label in enumerate(df.columns)
            if any(type(cell) is kind for cell in df.iloc[:, position])
        ]

    list_columns = columns_holding(list)
    dict_columns = columns_holding(dict)

    print(f"Listas: {list_columns}, Diccionarios: {dict_columns}")

### Iteration pipeline

In [10]:
def smasher(json_data):
    """Normalize and flatten every record of ``json_data`` into one DataFrame.

    Each record is passed through ``pd.json_normalize`` and
    ``flatten_nested_df``. The per-record frames are collected and
    concatenated once at the end, rather than re-concatenating the growing
    result on every iteration (which copies all accumulated rows each time).

    Progress and the running ``(rows, columns)`` shape are printed after each
    record.

    Parameters
    ----------
    json_data : sequence
        Indexable collection of JSON records (indexed ``0 .. len - 1``).

    Returns
    -------
    pandas.DataFrame
        All flattened records, newest record first; each record keeps its own
        row index. An empty DataFrame when ``json_data`` is empty.
    """
    flat_frames = []
    total_rows = 0
    seen_columns = set()

    for x in range(len(json_data)):
        j_norm = pd.json_normalize(json_data[x])
        j_flat = flatten_nested_df(j_norm)
        flat_frames.append(j_flat)

        # Track the shape the combined frame would have, without building it:
        # rows add up and columns are the union of all labels seen so far.
        total_rows += len(j_flat)
        seen_columns.update(j_flat.columns)

        print(f"Progreso: {x+1} / {len(json_data)}")
        print((total_rows, len(seen_columns)))

    if not flat_frames:
        # pd.concat raises on an empty list; keep the empty-frame result.
        return pd.DataFrame()

    # Reverse so the last record comes first, matching the order produced by
    # repeatedly putting each new frame in front of the accumulated one.
    return pd.concat(flat_frames[::-1])

## Read JSON file

In [5]:
# Load the entire JSON document into memory as Python objects.
with open('large-file.json', mode='r', encoding="utf8") as source_file:
    json_data = json.load(source_file)

## Execute

In [11]:
# Flatten every record in json_data and combine the results into a single DataFrame.
final_flat = smasher(json_data)

Progreso: 1 / 11351
(1, 17)
Progreso: 2 / 11351
(2, 28)
Progreso: 3 / 11351
(3, 28)
Progreso: 4 / 11351
(4, 34)
Progreso: 5 / 11351
(5, 34)
Progreso: 6 / 11351
(6, 34)
Progreso: 7 / 11351
(7, 34)
Progreso: 8 / 11351
(8, 34)
Progreso: 9 / 11351
(9, 34)
Progreso: 10 / 11351
(13, 94)
Progreso: 11 / 11351
(14, 94)
Progreso: 12 / 11351
(15, 94)
Progreso: 13 / 11351
(16, 94)
Progreso: 14 / 11351
(17, 94)
Progreso: 15 / 11351
(18, 359)
Progreso: 16 / 11351
(19, 376)
Progreso: 17 / 11351
(20, 411)
Progreso: 18 / 11351
(21, 411)
Progreso: 19 / 11351
(22, 411)
Progreso: 20 / 11351
(23, 411)
Progreso: 21 / 11351
(24, 411)
Progreso: 22 / 11351
(25, 411)
Progreso: 23 / 11351
(26, 411)
Progreso: 24 / 11351
(29, 411)
Progreso: 25 / 11351
(30, 411)
Progreso: 26 / 11351
(31, 411)
Progreso: 27 / 11351
(32, 411)
Progreso: 28 / 11351
(33, 411)
Progreso: 29 / 11351
(39, 411)
Progreso: 30 / 11351
(40, 411)
Progreso: 31 / 11351
(41, 411)
Progreso: 32 / 11351
(42, 411)
Progreso: 33 / 11351
(45, 411)
Progreso:

## Save to file

In [15]:
import os

# The CSV is written into the kernel's current working directory.
file_name = 'final_flat.csv'
dir_path = os.getcwd()
csv_path = os.path.join(dir_path, file_name)


In [16]:
# Export the flattened table as UTF-8 CSV; index=False leaves the DataFrame index out of the file.
final_flat.to_csv(csv_path, index=False, encoding="utf-8")