In [10]:
import pandas as pd
import glob

In [11]:
def read_partials(key, **kargs):
    all_files = glob.glob(key)
    li = []
    for filename in all_files:
        df = pd.read_csv(filename, **kargs)
        li.append(df)

    return pd.concat(li, axis=0, ignore_index=True)

# Reading Tablas

In [12]:
tablas = read_partials("tablas*.csv", delimiter=";")

In [13]:
tablas.head()

Unnamed: 0,environment,database,path,last_modified,year,month,day
0,production,tb_raw_mydatabase_01,s3:/tb_raw_mydatabase_01,20200101120000,2021,1,1
1,production,tb_raw_mydatabase_02,s3:/tb_raw_mydatabase_02,20200101120000,2021,1,1
2,production,tb_raw_mydatabase_03,s3:/tb_raw_mydatabase_03,20200101120000,2021,1,1
3,production,tb_raw_mydatabase_01,s3:/tb_raw_mydatabase_01,20200201120000,2021,2,1
4,production,tb_raw_mydatabase_02,s3:/tb_raw_mydatabase_02,20200201120000,2021,2,1


# Reading Columns

In [14]:
colunas = read_partials("colunas*.csv", delimiter=";")

In [15]:
colunas.head()

Unnamed: 0,environment,database,column,year,month,day
0,production,tb_raw_mydatabase_01,coluna1,2021,1,1
1,production,tb_raw_mydatabase_01,coluna2,2021,1,1
2,production,tb_raw_mydatabase_01,coluna3,2021,1,1
3,production,tb_raw_mydatabase_01,coluna4,2021,1,1
4,production,tb_raw_mydatabase_01,coluna5,2021,1,1


# Merging Files

In [17]:
tablas_cols = tablas.merge(colunas, left_on=['database', 'year', 'month', 'day'], right_on=['database', 'year', 'month', 'day'], suffixes=('_left', '_right'))

In [18]:
tablas_cols.head()

Unnamed: 0,environment_left,database,path,last_modified,year,month,day,environment_right,column
0,production,tb_raw_mydatabase_01,s3:/tb_raw_mydatabase_01,20200101120000,2021,1,1,production,coluna1
1,production,tb_raw_mydatabase_01,s3:/tb_raw_mydatabase_01,20200101120000,2021,1,1,production,coluna2
2,production,tb_raw_mydatabase_01,s3:/tb_raw_mydatabase_01,20200101120000,2021,1,1,production,coluna3
3,production,tb_raw_mydatabase_01,s3:/tb_raw_mydatabase_01,20200101120000,2021,1,1,production,coluna4
4,production,tb_raw_mydatabase_01,s3:/tb_raw_mydatabase_01,20200101120000,2021,1,1,production,coluna5


# Isolating data

In [19]:
last = 20200501120000

In [20]:
tablas_cols.dtypes

environment_left     object
database             object
path                 object
last_modified         int64
year                  int64
month                 int64
day                   int64
environment_right    object
column               object
dtype: object

In [21]:
tablas_atual = tablas_cols.loc[(tablas_cols['last_modified'] > last)]

In [36]:
tablas_atual.head(100)

Unnamed: 0,environment_left,database,path,last_modified,year,month,day,environment_right,column
0,production,tb_raw_mydatabase_01,s3:/tb_raw_mydatabase_01,20200601120000,2021,6,1,production,coluna1
1,production,tb_raw_mydatabase_01,s3:/tb_raw_mydatabase_01,20200601120000,2021,6,1,production,coluna2
2,production,tb_raw_mydatabase_01,s3:/tb_raw_mydatabase_01,20200601120000,2021,6,1,production,coluna3
3,production,tb_raw_mydatabase_01,s3:/tb_raw_mydatabase_01,20200601120000,2021,6,1,production,coluna4
4,production,tb_raw_mydatabase_01,s3:/tb_raw_mydatabase_01,20200601120000,2021,6,1,production,coluna5
5,production,tb_raw_mydatabase_02,s3:/tb_raw_mydatabase_02,20200601120000,2021,6,1,production,coluna1
6,production,tb_raw_mydatabase_02,s3:/tb_raw_mydatabase_02,20200601120000,2021,6,1,production,coluna2
7,production,tb_raw_mydatabase_02,s3:/tb_raw_mydatabase_02,20200601120000,2021,6,1,production,coluna3
8,production,tb_raw_mydatabase_02,s3:/tb_raw_mydatabase_02,20200601120000,2021,6,1,production,coluna4
9,production,tb_raw_mydatabase_02,s3:/tb_raw_mydatabase_02,20200601120000,2021,6,1,production,coluna5


# Reset Index

# Saving JSON

In [40]:
result = tablas_atual.to_json()
count  = len(tablas_atual.index)

In [49]:
result

'{"environment_left":{"0":"production","1":"production","2":"production","3":"production","4":"production","5":"production","6":"production","7":"production","8":"production","9":"production","10":"production","11":"production","12":"production","13":"production","14":"production"},"database":{"0":"tb_raw_mydatabase_01","1":"tb_raw_mydatabase_01","2":"tb_raw_mydatabase_01","3":"tb_raw_mydatabase_01","4":"tb_raw_mydatabase_01","5":"tb_raw_mydatabase_02","6":"tb_raw_mydatabase_02","7":"tb_raw_mydatabase_02","8":"tb_raw_mydatabase_02","9":"tb_raw_mydatabase_02","10":"tb_raw_mydatabase_03","11":"tb_raw_mydatabase_03","12":"tb_raw_mydatabase_03","13":"tb_raw_mydatabase_03","14":"tb_raw_mydatabase_03"},"path":{"0":"s3:\\/tb_raw_mydatabase_01","1":"s3:\\/tb_raw_mydatabase_01","2":"s3:\\/tb_raw_mydatabase_01","3":"s3:\\/tb_raw_mydatabase_01","4":"s3:\\/tb_raw_mydatabase_01","5":"s3:\\/tb_raw_mydatabase_02","6":"s3:\\/tb_raw_mydatabase_02","7":"s3:\\/tb_raw_mydatabase_02","8":"s3:\\/tb_raw_myda

# Reading JSON

In [43]:
import json

In [44]:
tb = json.loads(result) 

In [59]:
tb.keys()

dict_keys(['environment_left', 'database', 'path', 'last_modified', 'year', 'month', 'day', 'environment_right', 'column'])

In [63]:
tablas_json = {}
for i in range(0, count):
    obj = {}
    obj['environment_left'] = tb['environment_left'][f'{i}']
    obj['database'] = tb['database'][f'{i}']
    obj['path'] = tb['path'][f'{i}']
    obj['last_modified'] = tb['last_modified'][f'{i}']
    obj['year'] = tb['year'][f'{i}']
    obj['month'] = tb['month'][f'{i}']
    obj['day'] = tb['day'][f'{i}']
    obj['column'] = tb['column'][f'{i}']
    obj['description'] = "descricao"
    obj['type'] = "string"
    
    tablas_json[tb['database'][f'{i}']] = obj

print(tablas_json)


{'tb_raw_mydatabase_01': {'environment_left': 'production', 'database': 'tb_raw_mydatabase_01', 'path': 's3:/tb_raw_mydatabase_01', 'last_modified': 20200601120000, 'year': 2021, 'month': 6, 'day': 1, 'column': 'coluna5'}, 'tb_raw_mydatabase_02': {'environment_left': 'production', 'database': 'tb_raw_mydatabase_02', 'path': 's3:/tb_raw_mydatabase_02', 'last_modified': 20200601120000, 'year': 2021, 'month': 6, 'day': 1, 'column': 'coluna5'}, 'tb_raw_mydatabase_03': {'environment_left': 'production', 'database': 'tb_raw_mydatabase_03', 'path': 's3:/tb_raw_mydatabase_03', 'last_modified': 20200601120000, 'year': 2021, 'month': 6, 'day': 1, 'column': 'coluna5'}}
