In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Load the Excel workbook from the project's data directory (relative path).
DATA_PATH = "../data/cia_countries.xlsx"
df = pd.read_excel(DATA_PATH, engine='openpyxl')

In [3]:
# Preview the first rows to check that the workbook was parsed as expected.
df.head()

Unnamed: 0,Name,Continent,Area (km²),Population,GDP (USD),Unemployment Rate,Taxes (% of GDP),External Debt (USD),Exchange Rate (USD),Internet Users,Internet Users Percentage of Population,Airports,Roadways (km),Militar Expenditures (% of GDP)
0,Afghanistan,Asia,652230,37466414,2065,23.9,23.9,7.0,7.87,4717013.0,13.5,46.0,34903.0,1.2
1,Akrotiri,Middle East,123,18195,0,,,,0.885,,,1.0,,
2,Albania,Europe,28748,3088385,13965,5.83,5.83,71.8,102.43,2196613.0,71.85,3.0,3945.0,1.3
3,Algeria,Africa,2381740,43576691,11511,11.7,11.7,27.5,131.085,24819531.0,59.58,149.0,104000.0,6.0
4,American Samoa,Oceania,224,46366,11200,29.8,29.8,12.2,1.0,17000.0,31.3,3.0,241.0,


# Data Wrangling

**Mover la columna _'GDP (USD)'_ al final para mejorar la visualización antes y después de emplear _one-hot encoding_ (OHE)**

In [65]:
# Put the target column last so it stays visually separated from the features.
moved_column = 'GDP (USD)'
columns = [*df.columns.drop(moved_column), moved_column]

df = df.reindex(columns=columns)

**Eliminar los valores atípicos de la columna de interés _'GDP (USD)'_ (valores ≤ 50) y convertirla a valores categóricos**

In [66]:
# Keep only rows whose GDP is strictly above 50; lower values are treated as
# outliers, and rows with a missing GDP are dropped too (NaN > 50 is False).
# .copy() makes the filtered frame independent of the original, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
df = df.loc[df['GDP (USD)'] > 50].copy()

In [67]:
# Income classes by GDP: (50, 5000] -> low, (5000, 25000] -> medium, above 25000 -> high.
labels = ['Ingreso bajo', 'Ingreso medio', 'Ingreso alto']
bins = [50, 5000, 25000, float('inf')]

gdp_classes = pd.cut(df['GDP (USD)'], bins=bins, labels=labels, right=True)
df['GDP (USD)'] = gdp_classes

In [68]:
# List the distinct income classes present after binning.
df['GDP (USD)'].unique()

['Ingreso bajo', 'Ingreso medio', 'Ingreso alto']
Categories (3, object): ['Ingreso bajo' < 'Ingreso medio' < 'Ingreso alto']

In [69]:
# Number of rows and columns remaining after the GDP filter.
df.shape

(229, 14)

## Análisis exploratorio de datos

In [70]:
# Count missing values per column and report only the columns that have any.
missing_counts = df.isnull().sum()
missing_counts = missing_counts[missing_counts > 0]

# Columns to impute once the EDA is done.
null_cols = missing_counts.index.tolist()

for col_name, n_missing in missing_counts.items():
    print(f"Columna {col_name}: {n_missing} valores faltantes")

Columna Unemployment Rate: 10 valores faltantes
Columna Taxes (% of GDP): 10 valores faltantes
Columna External Debt (USD): 18 valores faltantes
Columna Exchange Rate (USD): 1 valores faltantes
Columna Internet Users: 4 valores faltantes
Columna Internet Users Percentage of Population: 4 valores faltantes
Columna Airports: 6 valores faltantes
Columna Roadways (km): 6 valores faltantes
Columna Militar Expenditures (% of GDP): 62 valores faltantes


In [36]:
# Summary statistics for the columns that contain missing values.
# NOTE(review): 'Unemployment Rate' and 'Taxes (% of GDP)' show almost identical
# statistics in the rendered output — possibly duplicated data at the source; verify.
df[null_cols].describe()

Unnamed: 0,Unemployment Rate,Taxes (% of GDP),External Debt (USD),Exchange Rate (USD),Internet Users,Internet Users Percentage of Population,Airports,Roadways (km),Militar Expenditures (% of GDP)
count,218.0,218.0,210.0,228.0,224.0,224.0,222.0,222.0,166.0
mean,10.284266,10.493807,53.685238,83.344899,18781530.0,56.778125,185.144144,205317.3,2.055241
std,10.385827,10.794723,33.325176,181.293256,68624710.0,28.414128,963.708522,955577.6,2.17933
min,0.3,0.3,0.0,0.3049,805.0,1.31,1.0,0.0,0.2
25%,3.7925,3.7925,33.225,1.0,269707.5,31.1925,7.0,2617.5,1.0775
50%,6.9,6.9,48.9,3.8757,2353472.0,60.19,31.5,21124.5,1.6
75%,11.9225,11.9825,69.125,48.97875,9301003.0,80.7475,100.25,86574.0,2.315
max,77.0,77.0,237.6,839.1,751886100.0,99.65,13513.0,10582650.0,24.0


## Imputación de datos faltantes (KNNImputer)

In [71]:
# Feature columns to scale and impute: everything except the non-numeric
# 'Name' and 'Continent' columns and the target 'GDP (USD)'.
# Selecting by name replaces the hard-coded positional slice (2:13), which would
# silently include the target if the column-reordering cell had not been run.
imp_cols = df.columns.drop(['Name', 'Continent', 'GDP (USD)'])
imp_cols

Index(['Area (km²)', 'Population', 'Unemployment Rate', 'Taxes (% of GDP)',
       'External Debt (USD)', 'Exchange Rate (USD)', 'Internet Users',
       'Internet Users Percentage of Population', 'Airports', 'Roadways (km)',
       'Militar Expenditures (% of GDP)'],
      dtype='object')

In [72]:
# Normalise 'Area (km²)' to a numeric column: strip commas, keep the first
# number found in each entry, and turn anything unparseable into NaN.
area_text = (
    df['Area (km²)']
    .astype(str)
    .str.replace(',', '', regex=False)
    .str.extract(r'(\d+\.?\d*)')[0]
)
df['Area (km²)'] = pd.to_numeric(area_text, errors='coerce')

**El paso inmediatamente anterior se realizó porque una de las entradas de la variable _'Area (km²)'_ presenta un formato distinto que impide escalar correctamente los datos**

**Escalado de los datos para la imputación, e imputación**

In [73]:
# Feature matrix restricted to the columns selected for imputation.
X = df.loc[:, imp_cols]

In [75]:
# Standardise each feature before the distance-based imputation.
# NOTE(review): the scaler is fitted on the full dataset, before cross-validation;
# consider fitting it inside each fold (e.g. via a Pipeline) to avoid leakage.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [78]:
# Fill the remaining gaps from the 5 nearest rows, weighting neighbours by distance.
knn_settings = {'n_neighbors': 5, 'weights': 'distance'}
imputer = KNNImputer(**knn_settings)
X_imputed = imputer.fit_transform(X_scaled)

In [100]:
# Write the imputed matrix back into the frame.
# NOTE: X_imputed was computed from the standardised matrix (X_scaled), so these
# columns now hold scaled values rather than the original units.
df[imp_cols] = X_imputed

In [84]:
# Re-check for missing values; nothing is printed when every column is complete.
remaining = df.isnull().sum()
for col_name, n_missing in remaining[remaining > 0].items():
    print(f"Columna {col_name}: {n_missing} valores faltantes")

# Aprendizaje automático: Modelos, Validación cruzada y Métricas

In [102]:
# Features: the columns listed in imp_cols (the ones that were scaled and imputed),
# reused here instead of repeating the hard-coded positional slice 2:13.
X = df[imp_cols]
# Target: the binned GDP class.
y = df['GDP (USD)']

**Definición de modelos**

In [86]:
# Candidate classifiers; the estimators that accept a seed share the same one.
RANDOM_STATE = 42
models = {
    'Logistic Regression': LogisticRegression(
        max_iter=1000,
        random_state=RANDOM_STATE,
    ),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'KNN': KNeighborsClassifier(),
}

**Validación cruzada de 5 partes**

In [87]:
# 5 stratified folds, shuffled with a fixed seed so the splits are reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

**Métricas de evaluación**

In [88]:
# Accuracy plus macro-averaged precision, recall and F1.
macro_metrics = {
    'precision': precision_score,
    'recall': recall_score,
    'f1_macro': f1_score,
}
scoring = {
    'accuracy': make_scorer(accuracy_score),
    **{key: make_scorer(fn, average='macro') for key, fn in macro_metrics.items()},
}

**Evaluar modelos**

In [105]:
def _mean_test_scores(estimator):
    """Cross-validate `estimator` on (X, y) and return the mean test score per metric."""
    cv_scores = cross_validate(estimator, X, y, cv=cv, scoring=scoring)
    return {metric: cv_scores[f'test_{metric}'].mean() for metric in scoring}


# Model name -> {metric name -> score averaged over the folds}.
results = {label: _mean_test_scores(estimator) for label, estimator in models.items()}

**Mostrar resultados**

In [111]:
# One row per model, one column per metric, rounded to four decimals.
results_df = pd.DataFrame(results).T.round(4)
results_df

Unnamed: 0,accuracy,precision,recall,f1_macro
Logistic Regression,0.7593,0.7714,0.7615,0.7615
Random Forest,0.7597,0.7741,0.7485,0.7555
KNN,0.6853,0.7002,0.6987,0.6902


**Conclusión**

Primeramente, se puede ver que el modelo _K-Nearest Neighbors_ fue el de peor desempeño.<br> 
Entre los modelos de _Logistic Regression_ y _Random Forest_, las métricas _recall_ y _F1_ favorecen a _Logistic Regression_, mientras que _precision_ favorece ligeramente a _Random Forest_ (0.7741 frente a 0.7714); por otro lado, la diferencia en _accuracy_ (0.7593 frente a 0.7597) es despreciable para fines prácticos.<br>
Por lo anterior, se concluye que el mejor modelo es el de _Logistic Regression_