# Conheça o Colab — H₂ Verde

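Este notebook executa a análise exploratória (EDA), a engenharia de features, o cálculo do índice de atratividade e o georreferenciamento do conjunto de dados de hidrogênio renovável, gravando todas as saídas na pasta `outputs` para que os resultados sejam reprodutíveis.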

In [None]:
!pip -q install reverse_geocode pycountry

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
import reverse_geocode as rg
import pycountry
from google.colab import drive
# Mount Google Drive at /content/drive so the project folder under MyDrive
# (see BASE in the next cell) can be read from and written to.
drive.mount('/content/drive')

In [None]:
# Project locations. Outputs always go to <BASE>/outputs; the dataset is read
# from the Drive project folder when it is there, otherwise from a copy placed
# directly under /content.
BASE = '/content/drive/MyDrive/h2_project'
OUT_DIR = os.path.join(BASE, 'outputs')
os.makedirs(OUT_DIR, exist_ok=True)

_dataset_file = 'renewable_hydrogen_dataset_2535.csv'
_drive_dataset = os.path.join(BASE, 'renewable_hydrogen_dataset', _dataset_file)
DATASET_PATH = (
    _drive_dataset
    if os.path.exists(_drive_dataset)
    else '/content/' + _dataset_file
)

## Carregar dados

In [None]:
# Read the dataset CSV located at DATASET_PATH into the working DataFrame.
df = pd.read_csv(DATASET_PATH)
# Last expression of the cell: in a notebook this displays (rows, columns).
df.shape

## EDA básica

In [None]:
# Basic exploratory summary. Each table is written to OUT_DIR as CSV and the
# correlation matrix of the numeric columns is also saved as a heatmap image.
shape = df.shape
cols = df.columns.tolist()

desc_num = df.describe().T
desc_obj = df.describe(include='object').T
missing = df.isna().sum()
numeric_part = df.select_dtypes(include=[np.number])
corr = numeric_part.corr()

# Export the summary tables in a fixed order, one CSV per table.
for csv_name, table in (
    ('rh_desc_numeric.csv', desc_num),
    ('rh_desc_object.csv', desc_obj),
    ('rh_missing.csv', missing),
    ('rh_corr.csv', corr),
):
    table.to_csv(os.path.join(OUT_DIR, csv_name))

# Render the correlation heatmap to disk and release the figure afterwards so
# it is not displayed inline or kept in memory.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, cmap='viridis', ax=ax)
fig.tight_layout()
fig.savefig(os.path.join(OUT_DIR, 'rh_corr_heatmap.png'))
plt.close(fig)

## Engenharia de features

In [None]:
# Derived features built from the PV and wind power columns.
pv_kw = df['PV_Power_kW']
wind_kw = df['Wind_Power_kW']

total_kw = pv_kw + wind_kw
df['Total_Renewable_Power_kW'] = total_kw

# A zero denominator is turned into NaN first, so the ratios come out as NaN
# for those rows instead of +/-inf.
df['H2_per_kW'] = df['Hydrogen_Production_kg/day'].div(total_kw.replace(0, np.nan))
df['Solar_to_Wind_Ratio'] = pv_kw.div(wind_kw.replace(0, np.nan))

# Keep only the first occurrence of every column label.
first_occurrence = ~df.columns.duplicated()
df = df.loc[:, first_occurrence].copy()

## Normalização e índice de atratividade

In [None]:
# Columns that feed the attractiveness index. Each one is min-max scaled so
# that the weights below act on comparable ranges regardless of the unit.
score_cols = ['Hydrogen_Production_kg/day', 'System_Efficiency_%', 'Electrolyzer_Efficiency_%', 'Feasibility_Score', 'H2_per_kW']
scaler = MinMaxScaler()
norm = scaler.fit_transform(df[score_cols])
norm_df = pd.DataFrame(norm, columns=[c + '_norm' for c in score_cols])

# Write the scaled values column by column instead of concatenating norm_df
# onto df. With pd.concat, running this cell a second time appended a second
# set of '*_norm' columns with the same labels; df['..._norm'] then returned a
# DataFrame instead of a Series and the index computation below failed.
# Assigning by label overwrites the existing columns, so the cell can be
# re-run safely.
df = df.reset_index(drop=True)
for norm_col in norm_df.columns:
    df[norm_col] = norm_df[norm_col].to_numpy()

# Weights of the individual components; they add up to 1.0.
w = {'prod': 0.30, 'feas': 0.30, 'sys': 0.20, 'elec': 0.10, 'h2kw': 0.10}

# Weighted sum of the normalised components.
# NOTE(review): a NaN in any component (e.g. H2_per_kW for rows whose total
# renewable power is 0) makes the index NaN for that row — confirm that this
# is the intended treatment for such rows.
df['Attractiveness_Index'] = (
    w['prod']  * df['Hydrogen_Production_kg/day_norm'] +
    w['feas']  * df['Feasibility_Score_norm'] +
    w['sys']   * df['System_Efficiency_%_norm'] +
    w['elec']  * df['Electrolyzer_Efficiency_%_norm'] +
    w['h2kw']  * df['H2_per_kW_norm']
)

## Georreferenciamento (reverse geocoding)

In [None]:
# Pair every row's latitude with its longitude and resolve all pairs with a
# single reverse-geocoding call.
coords = [(lat, lon) for lat, lon in zip(df['Latitude'], df['Longitude'])]
results = rg.search(coords)

# Pull the country and city out of each result; a missing key becomes ''.
country_names = []
city_names = []
for hit in results:
    country_names.append(hit.get('country', ''))
    city_names.append(hit.get('city', ''))
df['country_name'] = country_names
df['city_name'] = city_names
def to_iso2(name):
    """Return the ISO 3166-1 alpha-2 code for a country name.

    The lookup is best effort: when ``name`` is not a string or pycountry
    cannot resolve it, ``name`` itself is returned unchanged so the caller
    keeps the original label instead of failing.
    """
    if not isinstance(name, str):
        # Missing values (None/NaN) cannot be looked up; pass them through.
        return name
    try:
        return pycountry.countries.lookup(name).alpha_2
    except LookupError:
        # Only "no such country" is an expected failure here. Anything else
        # (interrupts, programming errors) must surface, not be swallowed.
        return name
# Derive a two-letter country code for every row; names that to_iso2 cannot
# resolve are carried over unchanged.
df['country_iso2'] = df['country_name'].map(to_iso2)

## Ranking e exportação

In [None]:
# Columns exported with the ranking. 'City' is the dataset's own column, not
# the reverse-geocoded 'city_name'.
ranking_cols = [
    'City',
    'country_name',
    'country_iso2',
    'Hydrogen_Production_kg/day',
    'System_Efficiency_%',
    'Electrolyzer_Efficiency_%',
    'Feasibility_Score',
    'H2_per_kW',
    'Attractiveness_Index',
]

# Full ranking, best index first, written without the row index.
by_index_desc = df.sort_values(by='Attractiveness_Index', ascending=False)
ranking = by_index_desc.loc[:, ranking_cols]
ranking.to_csv(os.path.join(OUT_DIR, 'h2_hub_ranking.csv'), index=False)

# Horizontal bar chart of the 20 best rows; the y axis is inverted so the
# top-ranked entry is drawn at the top.
# NOTE(review): if a City label repeats within the top 20, the bars presumably
# share one tick on the categorical axis — confirm City is unique per row.
top20 = ranking.head(20)
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top20['City'], top20['Attractiveness_Index'])
ax.invert_yaxis()
ax.set_xlabel('Attractiveness_Index')
ax.set_title('Top 20 cidades para H₂ verde')
fig.tight_layout()
fig.savefig(os.path.join(OUT_DIR, 'top20_attractiveness.png'))
plt.close(fig)