In [6]:
import pandas as pd
import re

# ==========================================
# 1. DATA LOADING
# ==========================================
print("Cargando datasets...")
# Read the three cleaned CSV files in a fixed order and bind them positionally:
# the cost-of-living table first, then the two digital-nomad index tables.
df_cost, df_circle, df_moving = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/cost-of-living-clean.csv',
        './data/digital-nomad-index-cicleloop-clean.csv',
        './data/digital-nomad-index-movingto-clean.csv',
    )
)

# ==========================================
# 2. FEATURE ENGINEERING
# ==========================================
print("Calculando variables agregadas...")

# A. Housing: arithmetic mean of the two 1-bedroom rent columns.
#    A missing value in either column yields NaN for that row.
df_cost['nomad_housing_cost'] = (
    df_cost['rent_1br_city_center']
    .add(df_cost['rent_1br_outside_center'])
    .div(2)
)

# B. Meal: one inexpensive restaurant meal plus one cappuccino.
df_cost['daily_meal_cost'] = df_cost['meal_inexpensive_restaurant'].add(
    df_cost['cappuccino_restaurant']
)

# C. Basket: row-wise total over the grocery columns listed below.
grocery_cols = [
    "milk_1l",
    "bread_white_500g",
    "rice_white_1kg",
    "eggs_12",
    "cheese_local_1kg",
    "chicken_fillet_1kg",
    "beef_1kg",
    "apples_1kg",
    "bananas_1kg",
    "oranges_1kg",
    "tomatoes_1kg",
    "potatoes_1kg",
    "onions_1kg",
    "lettuce_1unit",
    "water_1_5l_supermarket",
]
df_cost['basic_basket_cost'] = df_cost.loc[:, grocery_cols].sum(axis=1)

# ==========================================
# 3. TAX CLEANING
# ==========================================
print("Procesando impuestos...")
def procesar_impuestos(texto):
    """Extract the lowest and highest number found in a free-text tax value.

    Parameters
    ----------
    texto : any
        Raw tax description (e.g. "5-45%" or "24%"). Non-string values are
        converted with ``str()`` before the numbers are searched.

    Returns
    -------
    pandas.Series
        Two positional elements ``[minimum, maximum]`` as floats. When only one
        number is present it is used for both. ``[None, None]`` is returned
        when ``texto`` is missing or contains no number at all.
    """
    if pd.isna(texto):
        return pd.Series([None, None])

    # Unsigned integers or decimals; a '-' between two numbers acts as a
    # separator, it is never read as a minus sign.
    valores = [float(coincidencia)
               for coincidencia in re.findall(r"(\d+(?:\.\d+)?)", str(texto))]

    if not valores:
        return pd.Series([None, None])

    # With a single number min and max coincide, so one return covers both cases.
    return pd.Series([min(valores), max(valores)])

# Split the free-text 'taxes' column into two columns (lowest / highest number found).
df_moving[['tax_min', 'tax_max']] = df_moving['taxes'].apply(procesar_impuestos)

# ==========================================
# 4. COUNTRY NAME STANDARDIZATION
# ==========================================
print("Corrigiendo nombres de países (USA vs United States)...")

# Corrections dictionary: {name used in the rankings: name used in the cities table},
# grouped here by target name.
mapa_correcciones = {
    # United States
    'USA': 'United States',
    'U.S.': 'United States',
    # United Kingdom
    'UK': 'United Kingdom',
    'Great Britain': 'United Kingdom',
    # United Arab Emirates
    'United Arab Emirates': 'United Arab Emirates',
    'UAE': 'United Arab Emirates',
    # Czech Republic
    'Czechia': 'Czech Republic',
}

# For each ranking table: rename the key column to 'country_name', then replace
# the aliases above with the spelling used as the merge key.
df_circle = df_circle.rename(columns={'country': 'country_name'}).assign(
    country_name=lambda tabla: tabla['country_name'].replace(mapa_correcciones)
)
df_moving = df_moving.rename(columns={'country': 'country_name'}).assign(
    country_name=lambda tabla: tabla['country_name'].replace(mapa_correcciones)
)

# ==========================================
# 5. THE MERGE
# ==========================================
print("Unificando tablas...")
# Both rankings are attached to the cost table with a left join on 'country_name':
# every row of df_cost is kept, and rows whose country is absent from a ranking
# get NaN in that ranking's columns.
#
# validate='many_to_one' makes pandas raise MergeError when a ranking table holds
# the same country_name more than once (for instance after two aliases have been
# normalized to a single name). Without the check such duplicates would silently
# multiply the rows coming from df_cost.

# Merge 1: cost of living + CircleLoop ranking
df_master = df_cost.merge(
    df_circle,
    on='country_name',
    how='left',
    validate='many_to_one',
)
# Merge 2: master + MovingTo ranking. The suffixes are applied only to column names
# present on BOTH sides of this second merge: '_circle' is appended to the column
# already in df_master and '_moving' to the one coming from df_moving.
df_master = df_master.merge(
    df_moving,
    on='country_name',
    how='left',
    suffixes=('_circle', '_moving'),
    validate='many_to_one',
)

# ==========================================
# 6. FINAL VERIFICATION
# ==========================================

# Spot-check whether the rows for the United States and the United Kingdom
# picked up values from the two rankings.
test_countries = ['United States', 'United Kingdom']
cols_ver = ['country_name', 'digital_nomad_score', 'overall_score']
print("Verificando países clave (Deberían tener datos ahora):")
display(
    df_master
    .loc[df_master['country_name'].isin(test_countries), cols_ver]
    .drop_duplicates()
    .head()
)

# Save the merged table to disk, leaving the index out of the file.
df_master.to_csv(path_or_buf='./data/MASTER_DATASET_EDA.csv', index=False)
print("Archivo guardado como: ./data/MASTER_DATASET_EDA.csv")

Cargando datasets...
Calculando variables agregadas...
Procesando impuestos...
Corrigiendo nombres de países (USA vs United States)...
Unificando tablas...
Verificando países clave (Deberían tener datos ahora):


Unnamed: 0,country_name,digital_nomad_score,overall_score
21,United States,49.27,
28,United Kingdom,63.43,


Archivo guardado como: ./data/MASTER_DATASET_EDA.csv


In [7]:
# Print a summary of df_master: index range, column names, non-null counts and dtypes.
df_master.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4742 entries, 0 to 4741
Data columns (total 81 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   city_name                            4742 non-null   object 
 1   country_name                         4742 non-null   object 
 2   meal_inexpensive_restaurant          4742 non-null   float64
 3   meal_midrange_restaurant_2p          4742 non-null   float64
 4   mcmeal_fastfood                      4742 non-null   float64
 5   beer_domestic_restaurant_0_5l        4742 non-null   float64
 6   beer_imported_restaurant_0_33l       4742 non-null   float64
 7   cappuccino_restaurant                4742 non-null   float64
 8   soda_restaurant_0_33l                4742 non-null   float64
 9   water_restaurant_0_33l               4742 non-null   float64
 10  milk_1l                              4742 non-null   float64
 11  bread_white_500g              

In [5]:
# Show up to 20 randomly chosen rows of df_master.
# - n is capped at the frame length because sample() raises ValueError when asked
#   for more rows than exist (with the default replace=False).
# - random_state is fixed (arbitrary value) so re-running the cell shows the same rows.
df_master.sample(n=min(20, len(df_master)), random_state=42)

Unnamed: 0,city_name,country_name,meal_inexpensive_restaurant,meal_midrange_restaurant_2p,mcmeal_fastfood,beer_domestic_restaurant_0_5l,beer_imported_restaurant_0_33l,cappuccino_restaurant,soda_restaurant_0_33l,water_restaurant_0_33l,...,overall_score,internet_speed,cost_of_living,safety,visa_ease,quality_of_life,taxes,tax_free_period,tax_min,tax_max
2835,Cagnes-sur-Mer,France,14.75,53.74,9.48,5.8,5.01,2.77,2.46,1.66,...,,,,,,,,,,
1881,Obihiro,Japan,4.47,14.89,6.7,2.23,2.23,3.05,1.0,0.79,...,68.0,90.0,50.0,95.0,70.0,95.0,5-45%,183 days/year,5.0,45.0
93,Guadalajara,Mexico,7.74,30.94,6.19,1.8,3.09,2.48,0.97,0.64,...,87.0,82.0,90.0,75.0,94.0,86.0,1.92-35%,183 days/year,1.92,35.0
1004,Arusha,Tanzania,2.57,25.72,5.04,0.86,2.06,2.21,0.64,0.32,...,,,,,,,,,,
4532,Rovinj,Croatia,12.56,44.67,4.89,3.21,4.19,1.81,2.47,1.72,...,84.0,84.0,76.0,88.0,88.0,89.0,24%,1 year,24.0,24.0
417,Raleigh,United States,15.0,65.0,8.0,5.25,7.0,4.96,2.28,1.74,...,,,,,,,,,,
4442,Dadeldhura,Nepal,1.54,11.53,5.38,1.92,1.38,1.305,0.62,0.15,...,,,,,,,,,,
507,Willemstad,Curacao,22.18,83.18,10.26,2.43,4.44,3.3,2.26,1.71,...,,,,,,,,,,
2126,Andimeshk,Iran,3.94,20.16,4.01,0.95,1.0,1.51,0.93,0.51,...,,,,,,,,,,
3219,Paro,Bhutan,3.08,18.45,3.08,9.92,2.46,1.32,0.47,0.16,...,,,,,,,,,,
