## Pré-processamento dos arquivos coletados de métricas

Esse pré-processamento visa gerar um novo arquivo a partir do arquivo original extraído via Prometheus, adicionando novos campos para posterior processamento.
Os novos campos gerados compreendem um hash de identificação da aplicação, além de colunas de tempo (hora e minuto).

### Métricas coletadas:
- Consumo de CPU
- Consumo de Memória
- Pods encerrados por Estouro de memória (OOMKilled)
- Consumo excessivo de CPU (throttled)




In [1]:
import pandas as pd
import hashlib
import glob
import os

## Funções gerais        

In [2]:
def extract_system_name(namespace):
    """Return the system identifier contained in a namespace name.

    The namespace is expected to look like ``<system>-<environment>``; the
    text after the last ``-`` is treated as the environment and dropped.
    A namespace without any ``-`` is returned unchanged.

    Args:
        namespace: Namespace name, e.g. ``"spunet-pro"``.

    Returns:
        The part of ``namespace`` before its last ``-``.
    """
    # Split only once, from the right: cutting at the first occurrence of
    # "-<environment>" would truncate names such as "x-prod-pro" to "x".
    return namespace.rsplit('-', 1)[0]

def create_folder(path):
    """Create the directory ``path`` and any missing parent directories.

    Calling it for a directory that already exists is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok=True tolerates an existing directory, but still reports the
    # case where a regular file is sitting at the requested path.
    os.makedirs(path, exist_ok=True)

def hash(ambiente, pod):
    """Return the MD5 hex digest identifying an (environment, pod) pair.

    The two values are formatted back to back, with no separator, encoded
    as UTF-8 and hashed.

    Args:
        ambiente: Environment name.
        pod: Pod name.

    Returns:
        The 32-character hexadecimal MD5 digest.
    """
    # NOTE: this function shadows the builtin ``hash`` in this module.
    chave = '{}{}'.format(ambiente, pod)
    resumo = hashlib.md5()
    resumo.update(chave.encode('utf-8'))
    return resumo.hexdigest()


## Pré-processamento de coletas de consumo de cpu

In [3]:
# Pre-processing of the collected CPU usage files
header_list = ["Sistema", "Ambiente", "Modulo", "Pod", "Uso_CPU"]
path_processados = '/dados/metrics/cpu/processados'
path_projeto = '/home/56740050368/Treinamento/IA-PUC_Minas/Trabalho_Cientifico'
path_metricas = '/dados/metrics/cpu/'
csv_files = glob.glob(path_projeto+path_metricas+ "*.gz")

# Make sure the input and output directories exist
create_folder(path_projeto+path_metricas)
create_folder(path_projeto+path_processados)

for file in csv_files:
    # The collected files have no header row; columns are named here
    data = pd.read_csv(file, sep=';', header=None, names=header_list)

    # The collection time is encoded in the file name right after the
    # "cpu_" prefix: two digits for the hour, then two for the minute
    filename = os.path.basename(file)
    hora = int(filename[4:6])
    # Named "minuto" so the builtin min() is not overwritten for the
    # rest of the session
    minuto = int(filename[6:8])
    print(filename)

    # Add the time columns
    data['Hora'] = hora
    data['Minuto'] = minuto

    # Add the application identification hash (environment + pod)
    data['Hash'] = [hash(x, y) for x, y in zip(data['Ambiente'], data['Pod'])]

    # Save the enriched file
    data.to_csv(f'{path_projeto}{path_processados}/cpu_{hora}_{minuto}.csv', index=False)

print('Fim processamento de CPU')



cpu_17000403112022.csv.gz
cpu_10000503112022.csv.gz
cpu_00150503112022.csv.gz
cpu_21300503112022.csv.gz
cpu_06450503112022.csv.gz
cpu_22450603112022.csv.gz
cpu_20300503112022.csv.gz
cpu_16000403112022.csv.gz
cpu_14300403112022.csv.gz
cpu_04000503112022.csv.gz
cpu_13300503112022.csv.gz
cpu_23000603112022.csv.gz
cpu_05450503112022.csv.gz
cpu_14000403112022.csv.gz
cpu_20450403112022.csv.gz
cpu_14150603112022.csv.gz
cpu_15000803112022.csv.gz
cpu_12150403112022.csv.gz
cpu_19150503112022.csv.gz
cpu_23450503112022.csv.gz
cpu_23300403112022.csv.gz
cpu_23150303112022.csv.gz
cpu_00450403112022.csv.gz
cpu_11150403112022.csv.gz
cpu_18300503112022.csv.gz
cpu_08450403112022.csv.gz
cpu_22000403112022.csv.gz
cpu_09150503112022.csv.gz
cpu_13000603112022.csv.gz
cpu_09000503112022.csv.gz
cpu_17450403112022.csv.gz
cpu_05000403112022.csv.gz
cpu_18450703112022.csv.gz
cpu_01000503112022.csv.gz
cpu_15150503112022.csv.gz
cpu_02300603112022.csv.gz
cpu_19300403112022.csv.gz
cpu_06300603112022.csv.gz
cpu_02450403

## Pré-processamento de coletas de consumo de Memória

In [4]:
# Pre-processing of the collected memory usage files
header_list = ["Sistema", "Ambiente", "Modulo", "Pod", "Uso_Memoria"]
path_processados = '/dados/metrics/memoria/processados'
path_metricas = '/dados/metrics/memoria/'
csv_files = glob.glob(path_projeto+path_metricas+ "*.gz")

# Make sure the input and output directories exist
create_folder(path_projeto+path_metricas)
create_folder(path_projeto+path_processados)

for file in csv_files:
    # The collected files have no header row; columns are named here
    data = pd.read_csv(file, sep=';', header=None, names=header_list)

    # The collection time is encoded in the file name right after the
    # "memory_" prefix: two digits for the hour, then two for the minute
    filename = os.path.basename(file)
    print(filename)
    hora = int(filename[7:9])
    # Named "minuto" so the builtin min() is not overwritten for the
    # rest of the session
    minuto = int(filename[9:11])

    # Add the time columns
    data['Hora'] = hora
    data['Minuto'] = minuto
    # Add the application identification hash (environment + pod)
    data['Hash'] = [hash(x, y) for x, y in zip(data['Ambiente'], data['Pod'])]

    # Save the enriched file
    data.to_csv(f'{path_projeto}{path_processados}/memoria_{hora}_{minuto}.csv', index=False)

print('Fim processsamento de Memoria')


memory_12000603112022.csv.gz
memory_20450403112022.csv.gz
memory_12450403112022.csv.gz
memory_08150503112022.csv.gz
memory_12300403112022.csv.gz
memory_01450403112022.csv.gz
memory_10150303112022.csv.gz
memory_15300403112022.csv.gz
memory_19000403112022.csv.gz
memory_01300303112022.csv.gz
memory_05150303112022.csv.gz
memory_03150403112022.csv.gz
memory_11300503112022.csv.gz
memory_14000403112022.csv.gz
memory_05300603112022.csv.gz
memory_11000503112022.csv.gz
memory_07450603112022.csv.gz
memory_03300403112022.csv.gz
memory_08300503112022.csv.gz
memory_13300503112022.csv.gz
memory_15450503112022.csv.gz
memory_10300303112022.csv.gz
memory_07150403112022.csv.gz
memory_02000503112022.csv.gz
memory_23450503112022.csv.gz
memory_14150603112022.csv.gz
memory_18300503112022.csv.gz
memory_00000303112022.csv.gz
memory_16300803112022.csv.gz
memory_21300503112022.csv.gz
memory_22150403112022.csv.gz
memory_04300403112022.csv.gz
memory_23150403112022.csv.gz
memory_20300503112022.csv.gz
memory_1730040

## Pré-processamento das coletas de pods que foram encerrados por erros de memória (OOMKilled)

In [5]:
# Pre-processing of the collection of pods terminated by memory errors (OOMKilled)
header_list = ["Sistema", "Ambiente", "Modulo", "Pod", "Error", "Qtd"]
path_processados = '/dados/metrics/error/processados'
path_metricas = '/dados/metrics/error/'
csv_files = glob.glob(path_projeto+path_metricas+ "*.gz")

# Make sure the input and output directories exist
create_folder(path_projeto+path_metricas)
create_folder(path_projeto+path_processados)

for file in csv_files:
    # The collected files have no header row; columns are named here
    data = pd.read_csv(file, sep=';', header=None, names=header_list)

    # The collection time is encoded in the file name right after the
    # "error_" prefix: two digits for the hour, then two for the minute
    filename = os.path.basename(file)
    print(filename)

    hora = int(filename[6:8])
    # Named "minuto" so the builtin min() is not overwritten for the
    # rest of the session
    minuto = int(filename[8:10])

    # Add the time columns
    data['Hora'] = hora
    data['Minuto'] = minuto
    # Add the application identification hash (environment + pod)
    data['Hash'] = [hash(x, y) for x, y in zip(data['Ambiente'], data['Pod'])]

    # Save the enriched file
    data.to_csv(f'{path_projeto}{path_processados}/error_{hora}_{minuto}.csv', index=False)

print('Fim processsamento de Erros de Memória')

error_04000503112022.csv.gz
error_01150303112022.csv.gz
error_17300303112022.csv.gz
error_09150503112022.csv.gz
error_05150303112022.csv.gz
error_06150303112022.csv.gz
error_01450403112022.csv.gz
error_04450303112022.csv.gz
error_22150303112022.csv.gz
error_19150403112022.csv.gz
error_14450503112022.csv.gz
error_01000403112022.csv.gz
error_09450403112022.csv.gz
error_14300303112022.csv.gz
error_05000403112022.csv.gz
error_15450403112022.csv.gz
error_00450403112022.csv.gz
error_10150303112022.csv.gz
error_12300403112022.csv.gz
error_10000403112022.csv.gz
error_23450403112022.csv.gz
error_20300403112022.csv.gz
error_00300503112022.csv.gz
error_04300303112022.csv.gz
error_23000403112022.csv.gz
error_13150403112022.csv.gz
error_03150303112022.csv.gz
error_08000303112022.csv.gz
error_22300503112022.csv.gz
error_17150403112022.csv.gz
error_10300203112022.csv.gz
error_17450303112022.csv.gz
error_18150403112022.csv.gz
error_07150303112022.csv.gz
error_05450503112022.csv.gz
error_00000303112022

## Pré-processamento das coletas de pods que excederam o consumo de CPU configurado (throttled)

In [6]:
# Pre-processing of the collection of pods that exceeded their configured CPU (throttled)
header_list = ["Sistema", "Ambiente", "Modulo", "Pod", "Uso_CPU"]
path_processados = '/dados/metrics/throttled/processados'
path_metricas = '/dados/metrics/throttled/'
csv_files = glob.glob(path_projeto+path_metricas+ "*.gz")

# Make sure the input and output directories exist
create_folder(path_projeto+path_metricas)
create_folder(path_projeto+path_processados)

for file in csv_files:
    # The collected files have no header row; columns are named here
    data = pd.read_csv(file, sep=';', header=None, names=header_list)

    # The collection time is encoded in the file name right after the
    # "cpu_throttled_" prefix: two digits for the hour, then two for the minute
    filename = os.path.basename(file)
    print(filename)
    hora = int(filename[14:16])
    # Named "minuto" so the builtin min() is not overwritten for the
    # rest of the session
    minuto = int(filename[16:18])

    # Add the time columns
    data['Hora'] = hora
    data['Minuto'] = minuto
    # Add the application identification hash (environment + pod)
    data['Hash'] = [hash(x, y) for x, y in zip(data['Ambiente'], data['Pod'])]

    # Save the enriched file
    data.to_csv(f'{path_projeto}{path_processados}/cpu_throttled_{hora}_{minuto}.csv', index=False)

print('Fim processsamento de CPU Throttled')

cpu_throttled_23000503112022.csv.gz
cpu_throttled_06300603112022.csv.gz
cpu_throttled_10150303112022.csv.gz
cpu_throttled_13150503112022.csv.gz
cpu_throttled_10000403112022.csv.gz
cpu_throttled_05450503112022.csv.gz
cpu_throttled_12000603112022.csv.gz
cpu_throttled_08150803112022.csv.gz
cpu_throttled_10300303112022.csv.gz
cpu_throttled_10450403112022.csv.gz
cpu_throttled_20450403112022.csv.gz
cpu_throttled_00450403112022.csv.gz
cpu_throttled_02450403112022.csv.gz
cpu_throttled_19000403112022.csv.gz
cpu_throttled_00000303112022.csv.gz
cpu_throttled_09150503112022.csv.gz
cpu_throttled_07150403112022.csv.gz
cpu_throttled_17300403112022.csv.gz
cpu_throttled_23300403112022.csv.gz
cpu_throttled_21300603112022.csv.gz
cpu_throttled_22000403112022.csv.gz
cpu_throttled_03150403112022.csv.gz
cpu_throttled_15150603112022.csv.gz
cpu_throttled_23150303112022.csv.gz
cpu_throttled_19150403112022.csv.gz
cpu_throttled_15000703112022.csv.gz
cpu_throttled_08300503112022.csv.gz
cpu_throttled_23450503112022

In [9]:
# Load one processed CPU file (hour 0, minute 0) to count the distinct
# systems, environments, modules and applications (pods) it contains
path_processados = '/dados/metrics/cpu/processados'
arquivo_amostra = path_projeto + path_processados + '/cpu_0_0.csv'
dados = pd.read_csv(arquivo_amostra, sep=',')
dados.head()

Unnamed: 0,Sistema,Ambiente,Modulo,Pod,Uso_CPU,Hora,Minuto,Hash
0,spunet,spunet-pro,acervo-backend,acervo-backend-c84f8db5c-6pz9k,0.000659,0,0,ce4468ee8389709c20b40ed1c09a36c1
1,spunet,spunet-pro,acervo-backend,acervo-backend-c84f8db5c-pq4t5,0.001051,0,0,0dcf61756612a9edfc06d254802e4b28
2,spunet,spunet-pro,acervo-backend,acervo-backend-c84f8db5c-6lvqr,0.000866,0,0,19d9963e0460ed384a60ef724eb79ba0
3,spunet,spunet-hom,acervo-backend,acervo-backend-5b4555f876-96qpp,0.000601,0,0,71fda736ff39dfe174e01df9d489ee11
4,spunet,spunet-pro,acervo-backend,acervo-backend-c84f8db5c-sjg2h,0.000955,0,0,1e300feefc2f05d7e69bbae4dd0a7bda


In [10]:
# Number of distinct values in the "Sistema" column
qtd_sistemas = len(dados['Sistema'].unique())
print('Quantidade de sistemas: ', qtd_sistemas)

Quantidade de sistemas:  599


In [11]:
# Number of distinct values in the "Ambiente" column
qtd_ambientes = len(dados['Ambiente'].unique())
print('Quantidade de ambientes: ', qtd_ambientes)

Quantidade de ambientes:  2174


In [12]:
# Number of distinct values in the "Modulo" column
qtd_modulos = len(dados['Modulo'].unique())
print('Quantidade de Modulos: ', qtd_modulos)

Quantidade de Modulos:  2946


In [13]:
# Number of distinct values in the "Pod" column
qtd_pods = len(dados['Pod'].unique())
print('Quantidade de Aplicações (Pods): ', qtd_pods)

Quantidade de Aplicações (Pods):  11488
