# Challenge FIAP 2024 | 1TSCPV-2024
## Grupo DataStorm
#### Ana Beatriz Azevedo RM 557420
#### Heloiza Oliveira RM 558881
#### Isabelle Nahas RM 557405
#### Matheus Madrid RM 555799
#### Sara Sitta RM 555113

# O objetivo desta etapa, dentro do BigQuery, é integrar todas as tabelas em um único DataFrame, que será usado para treinar o modelo. Para isso, vamos descartar as subprefeituras que não são comuns a todos os DataFrames e as demais colunas que não serão usadas.

# Leitura dos dataframes

In [None]:
import pandas as pd
from pandas_gbq import to_gbq
import bigframes.pandas as bf


In [None]:
# Configure the BigQuery DataFrames session: location and billing project.
_bq_options = bf.options.bigquery
_bq_options.location = "US"
_bq_options.project = "elated-drive-432523-s4"

In [None]:
# Fully-qualified ids of the four TRUSTED tables, in the order they are
# unpacked below (alagamento, interrupcoes, ocorrencias, clima).
_trusted_tables = (
    "elated-drive-432523-s4.TRUSTED.DF_ALAGAMENTO_TT",
    "elated-drive-432523-s4.TRUSTED.DF_INTERRUPCOES_TT",
    "elated-drive-432523-s4.TRUSTED.DF_OCORRENCIAS_TT",
    "elated-drive-432523-s4.TRUSTED.DF_CLIMA_TT",
)

# One read_gbq call per table, bound to the per-source frame names.
df_alagamento, df_interrupcoes, df_ocorrencias, df_clima = map(
    bf.read_gbq, _trusted_tables
)




In [None]:
# Show the column dtypes of each source frame, one frame after another.
for _frame in (df_clima, df_interrupcoes, df_ocorrencias, df_alagamento):
    print(_frame.dtypes)

DATA                    timestamp[us, tz=UTC][pyarrow]
IndiceDePluviometria                           Float64
SUB                                    string[pyarrow]
Tem Max                                        Float64
Tem Min                                        Float64
UR                                             Float64
dtype: object
SUB                            string[pyarrow]
DATA            timestamp[us, tz=UTC][pyarrow]
INTERRUPCOES                             Int64
dtype: object
SUB                               string[pyarrow]
DATA               timestamp[us, tz=UTC][pyarrow]
ALAGAMENTO                                  Int64
DESLIZAMENTO                                Int64
INUNDACAO                                   Int64
QUEDA DE ARVORE                             Int64
dtype: object
SUB                             string[pyarrow]
DATA             timestamp[us, tz=UTC][pyarrow]
ALAGAMENTO                                Int64
INTRANSITAVEL                             I

# Identificação das Subprefeituras comuns a todos os Dataframes, ajustes das SUBs com nomes distintos e remoção das SUBs não coincidentes

In [None]:
# Collect the distinct SUB labels of each source frame as plain Python sets
# so they can be compared with set algebra.
subs_alagamento, subs_interrupcoes, subs_ocorrencias, subs_clima = (
    set(_frame['SUB'].unique())
    for _frame in (df_alagamento, df_interrupcoes, df_ocorrencias, df_clima)
)

In [None]:
# Count how many rows of the climate frame have a missing SUB.
_sub_clima = df_clima['SUB']
quantidade_nulos = _sub_clima.isnull().sum()
quantidade_nulos

0

In [None]:
_sub_sets = (subs_alagamento, subs_ocorrencias, subs_interrupcoes, subs_clima)

# Labels present in every frame.
comuns = set.intersection(*_sub_sets)

# Labels present in at least one frame.
todos_valores = set.union(*_sub_sets)

# Labels that are missing from at least one frame.
diferencas = todos_valores.difference(comuns)

print("Subs que não são comuns aos DataFrames:", diferencas)

Subs que não são comuns aos DataFrames: {'PA', 'EM', 'GU', 'SA', 'MP', 'IQ', 'VM', 'ST', 'PR', 'SM', 'MARC', 'NA'}


In [None]:
# Normalize SUB labels that were wrong / off-standard: 'Co' is rewritten as 'SE'
# in every source frame.
_sub_rename = {'Co': 'SE'}
for _frame in (df_clima, df_alagamento, df_interrupcoes, df_ocorrencias):
    _frame['SUB'] = _frame['SUB'].replace('Co', 'SE')

# `comuns` was computed from the labels as they were BEFORE this rename, while
# the filter that follows tests the renamed labels against it. Without this
# step, rows renamed to 'SE' are dropped whenever 'SE' itself was not already
# common to every frame. Rebuild the common set from the per-frame label sets
# with the same rename applied, so the filter and the data agree.
comuns = set.intersection(*(
    {_sub_rename.get(_sub, _sub) for _sub in _subs}
    for _subs in (subs_alagamento, subs_ocorrencias, subs_interrupcoes, subs_clima)
))

In [None]:
# Keep only the rows whose SUB is in `comuns`; every other SUB is discarded
# from each frame.
_mask_alagamento = df_alagamento['SUB'].isin(comuns)
_mask_ocorrencias = df_ocorrencias['SUB'].isin(comuns)
_mask_interrupcoes = df_interrupcoes['SUB'].isin(comuns)
_mask_clima = df_clima['SUB'].isin(comuns)

df_alagamento = df_alagamento[_mask_alagamento]
df_ocorrencias = df_ocorrencias[_mask_ocorrencias]
df_interrupcoes = df_interrupcoes[_mask_interrupcoes]
df_clima = df_clima[_mask_clima]

# Padronização do formato da coluna DATA

In [None]:
# Reduce the DATA timestamp of every frame to its calendar date, so all four
# frames share the same DATA representation.
for _frame in (df_interrupcoes, df_clima, df_alagamento, df_ocorrencias):
    _frame['DATA'] = _frame['DATA'].dt.date

In [None]:
# Print the DATA dtype of each frame to confirm the conversion.
for _frame in (df_alagamento, df_interrupcoes, df_ocorrencias, df_clima):
    print(_frame['DATA'].dtype)


date32[day][pyarrow]
date32[day][pyarrow]
date32[day][pyarrow]
date32[day][pyarrow]


In [None]:
# Materialize each frame as a local pandas DataFrame via to_pandas(),
# rebinding the same names.
df_alagamento, df_interrupcoes, df_ocorrencias, df_clima = (
    _frame.to_pandas()
    for _frame in (df_alagamento, df_interrupcoes, df_ocorrencias, df_clima)
)

# Junção de todos os dataframes em uma tabela única e tratamento final

In [None]:
dataframes = [df_alagamento, df_interrupcoes, df_ocorrencias, df_clima]

# Start from the first frame and outer-join each remaining one on (SUB, DATA).
# Both df_alagamento and df_ocorrencias carry an ALAGAMENTO column, so the
# default merge suffixes produce ALAGAMENTO_x and ALAGAMENTO_y.
# NOTE(review): the displayed result contains repeated (SUB, DATA) rows;
# presumably a source frame has several rows per key -- confirm this is intended.
df_refined, *_remaining = dataframes
for df in _remaining:
    df_refined = df_refined.merge(df, on=['SUB', 'DATA'], how='outer')

df_refined


Unnamed: 0,SUB,DATA,ALAGAMENTO_x,INTRANSITAVEL,TRANSITAVEL,INTERRUPCOES,ALAGAMENTO_y,DESLIZAMENTO,INUNDACAO,QUEDA DE ARVORE,IndiceDePluviometria,Tem Max,Tem Min,UR
0,AD,2022-01-01,,,,,,,,,5.5,28.9,17.9,54.9
1,AD,2022-01-02,,,,1,,,,,0.7,32.3,19.9,42.7
2,AD,2022-01-03,,,,1,0,0,0,1,3.0,32.1,20.7,47.1
3,AD,2022-01-04,,,,2,,,,,8.4,28.9,20.8,58.4
4,AD,2022-01-05,,,,1,,,,,16.5,28.7,20.3,60.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56710,VP,2024-06-28,,,,1,,,,,,,,
56711,VP,2024-06-28,,,,1,,,,,,,,
56712,VP,2024-06-28,,,,1,,,,,,,,
56713,VP,2024-06-28,,,,1,,,,,,,,


In [None]:
# Columns whose missing values (left by the outer merge) are replaced by 0.
columns_to_fill = [
    'ALAGAMENTO_x', 'INTRANSITAVEL', 'TRANSITAVEL',
    'INTERRUPCOES',
    'ALAGAMENTO_y', 'DESLIZAMENTO', 'INUNDACAO', 'QUEDA DE ARVORE',
]

_filled = df_refined[columns_to_fill].fillna(0)
df_refined[columns_to_fill] = _filled
df_refined

Unnamed: 0,SUB,DATA,ALAGAMENTO_x,INTRANSITAVEL,TRANSITAVEL,INTERRUPCOES,ALAGAMENTO_y,DESLIZAMENTO,INUNDACAO,QUEDA DE ARVORE,IndiceDePluviometria,Tem Max,Tem Min,UR
0,AD,2022-01-01,0,0,0,0,0,0,0,0,5.5,28.9,17.9,54.9
1,AD,2022-01-02,0,0,0,1,0,0,0,0,0.7,32.3,19.9,42.7
2,AD,2022-01-03,0,0,0,1,0,0,0,1,3.0,32.1,20.7,47.1
3,AD,2022-01-04,0,0,0,2,0,0,0,0,8.4,28.9,20.8,58.4
4,AD,2022-01-05,0,0,0,1,0,0,0,0,16.5,28.7,20.3,60.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56710,VP,2024-06-28,0,0,0,1,0,0,0,0,,,,
56711,VP,2024-06-28,0,0,0,1,0,0,0,0,,,,
56712,VP,2024-06-28,0,0,0,1,0,0,0,0,,,,
56713,VP,2024-06-28,0,0,0,1,0,0,0,0,,,,


In [None]:
# Binarize INTERRUPCOES: every value greater than or equal to 1 becomes 1.
_has_interruption = df_refined['INTERRUPCOES'].ge(1)
df_refined.loc[_has_interruption, 'INTERRUPCOES'] = 1

In [None]:
# Columns removed from the refined table.
_unused_columns = [
    'INTRANSITAVEL',
    'TRANSITAVEL',
    'ALAGAMENTO_y',
    'DESLIZAMENTO',
    'INUNDACAO',
]
df_refined = df_refined.drop(columns=_unused_columns)

In [None]:
# Drop the rows in which all four climate measurements are missing.
_climate_columns = ['IndiceDePluviometria', 'Tem Max', 'Tem Min', 'UR']
df_refined = df_refined.dropna(subset=_climate_columns, how='all')
df_refined

Unnamed: 0,SUB,DATA,ALAGAMENTO_x,INTERRUPCOES,QUEDA DE ARVORE,IndiceDePluviometria,Tem Max,Tem Min,UR
0,AD,2022-01-01,0,0,0,5.5,28.9,17.9,54.9
1,AD,2022-01-02,0,1,0,0.7,32.3,19.9,42.7
2,AD,2022-01-03,0,1,1,3.0,32.1,20.7,47.1
3,AD,2022-01-04,0,1,0,8.4,28.9,20.8,58.4
4,AD,2022-01-05,0,1,0,16.5,28.7,20.3,60.2
...,...,...,...,...,...,...,...,...,...
56665,VP,2024-05-29,0,1,0,0.0,19.68,10.37,55.47
56666,VP,2024-05-29,0,1,0,0.0,19.68,10.37,55.47
56667,VP,2024-05-30,0,1,1,0.0,18.64,10.51,70.64
56668,VP,2024-05-31,0,1,0,0.0,20.62,10.8,67.62


In [None]:
# Convert the DATA column to pandas datetime.
_datas = df_refined['DATA']
df_refined['DATA'] = pd.to_datetime(_datas)

In [None]:
# Show the dtype of DATA after the conversion.
print(df_refined.dtypes['DATA'])

datetime64[ns]


In [None]:

# Repeats the DATA -> datetime conversion; the dtype printed earlier already
# shows datetime64[ns], so this leaves the column as it is.
df_refined = df_refined.assign(DATA=pd.to_datetime(df_refined['DATA']))

# Transferência para a camada final REFINED

In [None]:
from pandas_gbq import to_gbq

In [None]:
# Salvando na camada Refined

import pandas as pd
from pandas_gbq import to_gbq

# Destination of the refined table: project, then dataset.table.
project_id = "elated-drive-432523-s4"
table_id = "REFINED.DF_REFINED"

# Upload df_refined, replacing the destination table if it already exists.
to_gbq(
    df_refined,
    destination_table=table_id,
    project_id=project_id,
    if_exists='replace',
)


100%|██████████| 1/1 [00:00<00:00, 5882.61it/s]
