In [1]:
import pandas as pd
import glob

In [2]:
def read_partials(key, **kargs):
    all_files = glob.glob(key)
    li = []
    for filename in all_files:
        df = pd.read_csv(filename, **kargs)
        li.append(df)

    return pd.concat(li, axis=0, ignore_index=True)

# Reading Tablas

In [3]:
tablas = read_partials("tablas*.csv", delimiter=";")

In [4]:
tablas.head()

Unnamed: 0,environment,database,path,last_modified,year,month,day
0,production,tb_raw_mydatabase_01,s3:/tb_raw_mydatabase_01,20200101120000,2021,1,1
1,production,tb_raw_mydatabase_02,s3:/tb_raw_mydatabase_02,20200101120000,2021,1,1
2,production,tb_raw_mydatabase_03,s3:/tb_raw_mydatabase_03,20200101120000,2021,1,1
3,production,tb_raw_mydatabase_01,s3:/tb_raw_mydatabase_01,20200201120000,2021,2,1
4,production,tb_raw_mydatabase_02,s3:/tb_raw_mydatabase_02,20200201120000,2021,2,1


# Reading Columns

In [5]:
colunas = read_partials("colunas*.csv", delimiter=";")

In [6]:
colunas.head()

Unnamed: 0,environment,database,column,year,month,day
0,production,tb_raw_mydatabase_01,coluna1,2021,1,1
1,production,tb_raw_mydatabase_01,coluna2,2021,1,1
2,production,tb_raw_mydatabase_01,coluna3,2021,1,1
3,production,tb_raw_mydatabase_01,coluna4,2021,1,1
4,production,tb_raw_mydatabase_01,coluna5,2021,1,1


# Merging Files

In [7]:
tablas_cols = tablas.merge(colunas, left_on=['database', 'year', 'month', 'day'], right_on=['database', 'year', 'month', 'day'], suffixes=('_left', '_right'))

In [8]:
tablas_cols.head()

Unnamed: 0,environment_left,database,path,last_modified,year,month,day,environment_right,column
0,production,tb_raw_mydatabase_01,s3:/tb_raw_mydatabase_01,20200101120000,2021,1,1,production,coluna1
1,production,tb_raw_mydatabase_01,s3:/tb_raw_mydatabase_01,20200101120000,2021,1,1,production,coluna2
2,production,tb_raw_mydatabase_01,s3:/tb_raw_mydatabase_01,20200101120000,2021,1,1,production,coluna3
3,production,tb_raw_mydatabase_01,s3:/tb_raw_mydatabase_01,20200101120000,2021,1,1,production,coluna4
4,production,tb_raw_mydatabase_01,s3:/tb_raw_mydatabase_01,20200101120000,2021,1,1,production,coluna5


# Isolating data

In [9]:
last = 20200501120000

In [10]:
tablas_cols.dtypes

environment_left     object
database             object
path                 object
last_modified         int64
year                  int64
month                 int64
day                   int64
environment_right    object
column               object
dtype: object

In [165]:
tablas_atual = tablas_cols.loc[(tablas_cols['last_modified'] > last)]

In [166]:
tablas_atual.head(100)

Unnamed: 0,environment_left,database,path,last_modified,year,month,day,environment_right,column
5,production,tb_raw_mydatabase_02,s3:/tb_raw_mydatabase_02,20200101120000,2021,1,1,production,coluna1
6,production,tb_raw_mydatabase_02,s3:/tb_raw_mydatabase_02,20200101120000,2021,1,1,production,coluna2
7,production,tb_raw_mydatabase_02,s3:/tb_raw_mydatabase_02,20200101120000,2021,1,1,production,coluna3
8,production,tb_raw_mydatabase_02,s3:/tb_raw_mydatabase_02,20200101120000,2021,1,1,production,coluna4
9,production,tb_raw_mydatabase_02,s3:/tb_raw_mydatabase_02,20200101120000,2021,1,1,production,coluna5
10,production,tb_raw_mydatabase_03,s3:/tb_raw_mydatabase_03,20200101120000,2021,1,1,production,coluna1
11,production,tb_raw_mydatabase_03,s3:/tb_raw_mydatabase_03,20200101120000,2021,1,1,production,coluna2
12,production,tb_raw_mydatabase_03,s3:/tb_raw_mydatabase_03,20200101120000,2021,1,1,production,coluna3
13,production,tb_raw_mydatabase_03,s3:/tb_raw_mydatabase_03,20200101120000,2021,1,1,production,coluna4
14,production,tb_raw_mydatabase_03,s3:/tb_raw_mydatabase_03,20200101120000,2021,1,1,production,coluna5


# Reset Index

# Saving JSON

In [33]:
tablas_atual = tablas_atual.reset_index(drop=True)
result = tablas_atual.to_json()
count  = len(tablas_atual.index)

In [34]:
result

'{"environment_left":{"0":"production","1":"production","2":"production","3":"production","4":"production","5":"production","6":"production","7":"production","8":"production","9":"production","10":"production","11":"production","12":"production","13":"production","14":"production"},"database":{"0":"tb_raw_mydatabase_01","1":"tb_raw_mydatabase_01","2":"tb_raw_mydatabase_01","3":"tb_raw_mydatabase_01","4":"tb_raw_mydatabase_01","5":"tb_raw_mydatabase_02","6":"tb_raw_mydatabase_02","7":"tb_raw_mydatabase_02","8":"tb_raw_mydatabase_02","9":"tb_raw_mydatabase_02","10":"tb_raw_mydatabase_03","11":"tb_raw_mydatabase_03","12":"tb_raw_mydatabase_03","13":"tb_raw_mydatabase_03","14":"tb_raw_mydatabase_03"},"path":{"0":"s3:\\/tb_raw_mydatabase_01","1":"s3:\\/tb_raw_mydatabase_01","2":"s3:\\/tb_raw_mydatabase_01","3":"s3:\\/tb_raw_mydatabase_01","4":"s3:\\/tb_raw_mydatabase_01","5":"s3:\\/tb_raw_mydatabase_02","6":"s3:\\/tb_raw_mydatabase_02","7":"s3:\\/tb_raw_mydatabase_02","8":"s3:\\/tb_raw_myda

# Reading JSON

In [35]:
import json

In [36]:
tb = json.loads(result) 

In [37]:
tb.keys()

dict_keys(['environment_left', 'database', 'path', 'last_modified', 'year', 'month', 'day', 'environment_right', 'column'])

In [38]:
tb

{'environment_left': {'0': 'production',
  '1': 'production',
  '2': 'production',
  '3': 'production',
  '4': 'production',
  '5': 'production',
  '6': 'production',
  '7': 'production',
  '8': 'production',
  '9': 'production',
  '10': 'production',
  '11': 'production',
  '12': 'production',
  '13': 'production',
  '14': 'production'},
 'database': {'0': 'tb_raw_mydatabase_01',
  '1': 'tb_raw_mydatabase_01',
  '2': 'tb_raw_mydatabase_01',
  '3': 'tb_raw_mydatabase_01',
  '4': 'tb_raw_mydatabase_01',
  '5': 'tb_raw_mydatabase_02',
  '6': 'tb_raw_mydatabase_02',
  '7': 'tb_raw_mydatabase_02',
  '8': 'tb_raw_mydatabase_02',
  '9': 'tb_raw_mydatabase_02',
  '10': 'tb_raw_mydatabase_03',
  '11': 'tb_raw_mydatabase_03',
  '12': 'tb_raw_mydatabase_03',
  '13': 'tb_raw_mydatabase_03',
  '14': 'tb_raw_mydatabase_03'},
 'path': {'0': 's3:/tb_raw_mydatabase_01',
  '1': 's3:/tb_raw_mydatabase_01',
  '2': 's3:/tb_raw_mydatabase_01',
  '3': 's3:/tb_raw_mydatabase_01',
  '4': 's3:/tb_raw_mydatabas

## Lendo as tabelas existentes no JSON

In [39]:
databases   = {}

for i in range(0, count):
    if tb['database'][f'{i}'] not in databases.keys():
        databases[tb['database'][f'{i}']] = {}


print(databases)

{'tb_raw_mydatabase_01': {}, 'tb_raw_mydatabase_02': {}, 'tb_raw_mydatabase_03': {}}


## Corrigindo formato do JSON e agrupando colunas

In [40]:
for i in range(0, count):
    databases[tb['database'][f'{i}']]['path'] = tb['path'][f'{i}']
    databases[tb['database'][f'{i}']]['last_modified'] = tb['last_modified'][f'{i}']
    databases[tb['database'][f'{i}']]['year'] = tb['year'][f'{i}']
    databases[tb['database'][f'{i}']]['month'] = tb['month'][f'{i}']
    databases[tb['database'][f'{i}']]['day'] = tb['day'][f'{i}']
    
    if 'columns' not in databases[tb['database'][f'{i}']].keys():
        databases[tb['database'][f'{i}']]['columns']  = []
    
    databases[tb['database'][f'{i}']]['columns'].append({
        "id": tb['column'][f'{i}'],
        "type": "text",
        "info": {
            "label": tb['column'][f'{i}'],
            "notes": tb['column'][f'{i}']
        }
    })
    

databases['tb_raw_mydatabase_01']

{'path': 's3:/tb_raw_mydatabase_01',
 'last_modified': 20200601120000,
 'year': 2021,
 'month': 6,
 'day': 1,
 'columns': [{'id': 'coluna1',
   'type': 'text',
   'info': {'label': 'coluna1', 'notes': 'coluna1'}},
  {'id': 'coluna2',
   'type': 'text',
   'info': {'label': 'coluna2', 'notes': 'coluna2'}},
  {'id': 'coluna3',
   'type': 'text',
   'info': {'label': 'coluna3', 'notes': 'coluna3'}},
  {'id': 'coluna4',
   'type': 'text',
   'info': {'label': 'coluna4', 'notes': 'coluna4'}},
  {'id': 'coluna5',
   'type': 'text',
   'info': {'label': 'coluna5', 'notes': 'coluna5'}}]}

# Motor

In [41]:
import http.client

In [42]:
import http.client

connection = http.client.HTTPConnection('18.219.144.92', 80, timeout=10)
print(connection)

<http.client.HTTPConnection object at 0x0000023D67ADDC18>


## Validando a conexao

In [43]:
import http.client

AUTHORIZATION="0090ee76-cc91-4956-ab94-616b0ddaa1c9"
SERVER="18.219.144.92"

conn = http.client.HTTPConnection(SERVER)

### Package

In [54]:
payload = json.dumps({
  "id": "tb_raw_mydatabase_03"
})

headers = {
    'Authorization': AUTHORIZATION,
    'Content-Type': 'application/json'
}

conn.request("POST", "/api/3/action/package_show", payload, headers)
res = conn.getresponse()
data = res.read().decode("utf-8")
print(data)

{"help": "http://18.219.144.92/api/3/action/help_show?name=package_show", "success": true, "result": {"author": "dodmunl", "author_email": "dodmunl@teste.com", "creator_user_id": "3541244f-6c60-497d-84ba-0aba791450e4", "id": "e5f5a43a-d0b7-42f4-887c-2c87dfc2e7b8", "isopen": false, "license_id": null, "license_title": null, "maintainer": null, "maintainer_email": null, "metadata_created": "2021-04-30T12:49:21.705081", "metadata_modified": "2021-04-30T12:49:22.090919", "name": "tb_raw_mydatabase_03", "notes": "Base carregada por processo automatico", "num_resources": 1, "num_tags": 0, "organization": {"id": "6b35db05-c162-44aa-badf-0d0c35b08814", "name": "bank", "title": "Bank", "type": "organization", "description": "", "image_url": "", "created": "2021-04-30T00:30:38.243966", "is_organization": true, "approval_status": "approved", "state": "active"}, "owner_org": "6b35db05-c162-44aa-badf-0d0c35b08814", "private": true, "state": "active", "title": "tb_raw_mydatabase_03", "type": "datase

### Resource

In [45]:
payload = json.dumps({
  "resource_id": "9177cd00-7edb-4e56-8361-dea4a37edd65"
})
headers = {
  'Authorization': AUTHORIZATION,
  'Content-Type': 'application/json'
}
conn.request("POST", "/api/3/action/datastore_search", payload, headers)
res = conn.getresponse()
data = res.read().decode("utf-8")
print(data)

{"help": "http://18.219.144.92/api/3/action/help_show?name=datastore_search", "success": true, "result": {"include_total": true, "limit": 100, "records_format": "objects", "resource_id": "9177cd00-7edb-4e56-8361-dea4a37edd65", "total_estimation_threshold": null, "records": [], "fields": [{"id": "_id", "type": "int"}, {"id": "coluna1", "type": "text", "info": {"label": "coluna1", "notes": "coluna1"}}, {"id": "coluna2", "type": "text", "info": {"label": "coluna2", "notes": "coluna2"}}, {"id": "coluna3", "type": "text", "info": {"label": "coluna3", "notes": "coluna3"}}, {"id": "coluna4", "type": "text", "info": {"label": "coluna4", "notes": "coluna4"}}, {"id": "coluna5", "type": "text", "info": {"label": "coluna5", "notes": "coluna5"}}], "_links": {"start": "/api/3/action/datastore_search", "next": "/api/3/action/datastore_search?offset=100"}, "total": 0, "total_was_estimated": false}}


## Main

In [62]:
#
# Retorna o pacote
#
def get_package(package_id: str):
    payload = json.dumps({
      "id": package_id
    })

    headers = {
        'Authorization': AUTHORIZATION,
        'Content-Type': 'application/json'
    }

    conn.request("POST", "/api/3/action/package_show", payload, headers)
    res = conn.getresponse()
    data = res.read().decode("utf-8")
    
    return json.loads(data)

In [63]:
#
# Retorna o recurso
#
def get_datastore(resource_id: str):
    payload = json.dumps({
      "resource_id": resource_id
    })

    headers = {
        'Authorization': AUTHORIZATION,
        'Content-Type': 'application/json'
    }

    conn.request("POST", "/api/3/action/datastore_search", payload, headers)
    res = conn.getresponse()
    data = res.read().decode("utf-8")
    
    return json.loads(data)

In [47]:
#
# Cria um novo pacote
#
def set_package(data):
    payload = json.dumps(data)

    headers = {
        'Authorization': AUTHORIZATION,
        'Content-Type': 'application/json'
    }

    conn.request("POST", "/api/3/action/package_create", payload, headers)
    res = conn.getresponse()
    data = res.read().decode("utf-8")
    
    return json.loads(data)

In [167]:
#
# Cria um novo datastore
#
def set_datastore(data):
    payload = json.dumps(data)

    headers = {
        'Authorization': AUTHORIZATION,
        'Content-Type': 'application/json'
    }

    conn.request("POST", "/api/3/action/datastore_create", payload, headers)
    res = conn.getresponse()
    data = res.read().decode("utf-8")
    
    return json.loads(data)

In [155]:
def set_package_values(file, package, keys):
        
    package_custom_json = read_partials(file, delimiter=";")
    package_custom = json.loads(package_custom_json.to_json(orient="records"))[0]

    for key in keys:
        print(key)
        print(package[key])
        print(package_custom[key])
        package[key] = package_custom[key] if key in package_custom else package[key]

    return package

In [154]:

if os.path.isfile(f'package_{database}.csv'):
    print(database)

tb_raw_mydatabase_03


In [161]:
def create_new_package(database):
    package = {
        "name": database['name'],
        "title": database['name'],
        "owner_org": "bank",
        "notes": "Base carregada por processo automatico",
        "author": "dorotheu",
        "author_email": "dorotheu@teste.com",
        "private": True,
        "extras": [
            {
                "key": "badge",
                "value": "gold"
            },
            {
                "key": "last_modified",
                "value": database['last_modified']
            },
            {
                "key": "path",
                "value": database['path']
            }
        ]
    }
    print(package)

    import os.path
    name = database['name']
    if os.path.isfile(f'package_{name}.csv'):
        print("entrou aqui")
        package = set_package_values(f'package_{name}.csv', package, ['name', 'title', 'notes'])

    print(package)
    return set_package(package)

In [None]:
def create_new_resource(database):
    datastore = {
        "resource": {
            "package_id": database,
            "name": database,
            "description": "Dicionario de Dados",
            "format": "Redshift"
        },
        "fields": databases[database]['columns'],
    }

    return set_datastore(datastore)

In [164]:
for database in databases:
    is_new_package   = False
    is_new_datastore = False
    
    # Procurar pacote no CKAN
    ckan_data = get_package(database)
    
    # Se nao houver nenhum pacote com o id especificado, entao os dados sao importados
    if "error" in ckan_data and ckan_data['error']['__type'] == "Not Found Error":
        
        databases[database]['name'] = database
        ckan_data = create_new_package(databases[database])
        is_new_package = True
    
    
    # Carregando recursos
    resource = {}
    for res in ckan_data['result']['resources']:
        if res['format'] == 'Redshift':
            resource = res
            
    
    # Se nao houver nenhum recurso do tipo Redshift no pacote, entao os dados sao importados
    if len(resource) == 0:
        
        is_new_datastore = True
    
    # Se for um novo pacote ou se o pacote ainda nao tiver recursos, os processos acima irao
    # incluir os dados e nenhuma outra modificacao deve ser feita.
    #if is_new_package or is_new_datastore:
    #    continue
        
    
    # Get resource
    #for resource in ckan_data['result']['resources']:
    #    resource['data'] = get_datastore(resource['id'])
    
    


{'name': 'tb_raw_mydatabase_03', 'title': 'tb_raw_mydatabase_03', 'owner_org': 'bank', 'notes': 'Base carregada por processo automatico', 'author': 'dorotheu', 'author_email': 'dorotheu@teste.com', 'private': True, 'extras': [{'key': 'badge', 'value': 'gold'}, {'key': 'last_modified', 'value': 20200601120000}, {'key': 'path', 'value': 's3:/tb_raw_mydatabase_03'}]}
entrou aqui
name
tb_raw_mydatabase_03
tb_raw_mydatabase_03
title
tb_raw_mydatabase_03
Minha tabela
notes
Base carregada por processo automatico
Essa e a minha tabela
{'name': 'tb_raw_mydatabase_03', 'title': 'Minha tabela', 'owner_org': 'bank', 'notes': 'Essa e a minha tabela', 'author': 'dorotheu', 'author_email': 'dorotheu@teste.com', 'private': True, 'extras': [{'key': 'badge', 'value': 'gold'}, {'key': 'last_modified', 'value': 20200601120000}, {'key': 'path', 'value': 's3:/tb_raw_mydatabase_03'}]}
