In [5]:
# ============================================
# 00 - DATA INGEST
# ============================================

import pandas as pd
from pathlib import Path

# Path to the raw dataset (relative path, resolved against the current working directory)
DATA_PATH = Path('../data/internet_adoption.csv')

# Load the raw CSV into a DataFrame
df = pd.read_csv(DATA_PATH)

# -------------------------
# First look
# -------------------------
# Shape of the freshly loaded frame, followed by its first rows.
print("Formato inicial:", df.shape)
display(df.head())

# -------------------------
# Dtypes and missing values
# -------------------------
print("\nINFO:")
# DataFrame.info() writes its report straight to stdout and returns None,
# so it must not be wrapped in print() (that would emit a stray "None").
df.info()

# Number of missing values in each column.
print("\nNULOS POR COLUNA:")
print(df.isna().sum())

# -------------------------
# Coerce numeric columns
# -------------------------

# Every column that is expected to hold numeric data
num_cols = [
    "Population",
    "Internet_Penetration (%)",
    "Broadband_Speed (Mbps)",
    "Mobile_Broadband_Speed (Mbps)",
    "GDP_Per_Capita (USD)",
    "Education_Attainment_Index (%)",
    "Mobile_Data_Usage (GB per User)",
    "Digital_Investment (M USD)",
    "Digital_Literacy_Score (%)",
    "Sentiment_Score (Social Media)",
    "Urban_Population_Split (%)",
    "Rural_Population_Split (%)",
    "Latitude",
    "Longitude",
    "Internet_Access_Cost (USD per Mbps)",
    "Device_Penetration (%)",
    "Cybersecurity_Incidents (Count)",
    "E_Commerce_Penetration (%)",
    "Government_Digital_Policy_Index (%)",
    "Network_Latency (ms)",
    "Cloud_Service_Adoption (%)",
    "IoT_Device_Density (per 1000 people)",
    "AI_Adoption_Index (%)",
    "Data_Privacy_Regulation_Strength (%)",
    "Energy_Consumption_for_Connectivity (kWh)",
]

# Keep only the expected columns that are actually present in the frame;
# names missing from the data are skipped silently.
present_num_cols = [col for col in num_cols if col in df.columns]

# Coerce each present column to a numeric dtype; values that cannot be
# parsed become NaN (errors='coerce') instead of raising.
for col in present_num_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# -------------------------
# Missing-value handling
# -------------------------

# Step 1: rows without an internet-penetration value are dropped.
df = df.dropna(subset=["Internet_Penetration (%)"])

# Step 2: remaining gaps are zero-filled, but only in numeric columns.
# A blanket df.fillna(0) would also write the integer 0 into text columns
# (e.g. Country, Date), leaving them with mixed types. Passing a
# {column: value} dict makes fillna touch only the listed columns.
numeric_fill = {col: 0 for col in df.select_dtypes(include="number").columns}
df = df.fillna(numeric_fill)

# -------------------------
# Save the cleaned version
# -------------------------
OUTPUT_PATH = Path('../data/internet_adoption_clean.csv')
# index=False keeps the DataFrame index out of the written file
df.to_csv(OUTPUT_PATH, index=False)

# Report where the file was written and the final (rows, columns) shape
print("\nArquivo limpo salvo em:", OUTPUT_PATH)
print("Novo formato:", df.shape)


Formato inicial: (6350, 28)


Unnamed: 0,Country,Date,Population,Internet_Penetration (%),Broadband_Speed (Mbps),Mobile_Broadband_Speed (Mbps),GDP_Per_Capita (USD),Education_Attainment_Index (%),Mobile_Data_Usage (GB per User),Digital_Investment (M USD),...,Device_Penetration (%),Cybersecurity_Incidents (Count),E_Commerce_Penetration (%),Government_Digital_Policy_Index (%),Network_Latency (ms),Cloud_Service_Adoption (%),IoT_Device_Density (per 1000 people),AI_Adoption_Index (%),Data_Privacy_Regulation_Strength (%),Energy_Consumption_for_Connectivity (kWh)
0,Algeria,2015-01-01,25258314,37.436774,46.839032,8.533871,5327.303548,45.489677,2.167742,164.128065,...,68.104839,4174,33.408387,37.136774,77.050645,27.834194,22.950645,15.652581,27.985161,248.871935
1,Algeria,2015-02-01,25258314,37.484286,47.2275,8.6875,5331.729286,45.555714,2.185357,165.386071,...,68.1225,3753,33.52,37.128214,77.045,27.856429,22.936429,15.681429,27.989643,248.7575
2,Algeria,2015-03-01,25258314,37.629677,47.436129,8.852903,5342.169032,45.60871,2.182581,166.200645,...,68.108065,4148,33.545806,37.155806,77.120968,27.870968,23.034839,15.686452,27.993548,248.524839
3,Algeria,2015-04-01,25258314,37.257667,47.5,9.058333,5354.909667,45.643333,2.199,166.618667,...,68.147,4032,33.579333,37.12,77.255333,27.880333,23.083667,15.706667,28.01,248.497333
4,Algeria,2015-05-01,25258314,37.492258,47.659677,8.886452,5360.973226,45.662258,2.205484,167.409032,...,68.143548,4195,33.59871,37.155484,77.075161,27.930323,23.11129,15.729355,28.018065,248.983548



INFO:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6350 entries, 0 to 6349
Data columns (total 28 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Country                                    6350 non-null   object 
 1   Date                                       6350 non-null   object 
 2   Population                                 6350 non-null   int64  
 3   Internet_Penetration (%)                   6350 non-null   float64
 4   Broadband_Speed (Mbps)                     6350 non-null   float64
 5   Mobile_Broadband_Speed (Mbps)              6350 non-null   float64
 6   GDP_Per_Capita (USD)                       6350 non-null   float64
 7   Education_Attainment_Index (%)             6350 non-null   float64
 8   Mobile_Data_Usage (GB per User)            6350 non-null   float64
 9   Digital_Investment (M USD)                 6350 non-null   float64
 10  Digital_Literacy_