# Extracción del conjunto de datos

Se instala la librería de Meteostar y se verifican las estaciones mas cercanas a la ciudad de Bogotá.

In [7]:
!pip install meteostat



In [8]:
from meteostat import Stations
import pandas as pd

# Crear el objeto Stations
stations = Stations()

# Filtrar estaciones cercanas a Bogotá (lat: 4.61, lon: -74.08)
stations = stations.nearby(4.61, -74.08)

# Obtener las 10 estaciones más cercanas
stations_df = stations.fetch(10)

# Seleccionar columnas clave para visualizar mejor
cols = ['name', 'country', 'region', 'latitude', 'longitude', 'elevation']
stations_pretty = stations_df[cols]

# Mostrar tabla ordenada y legible
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
print(stations_pretty)


                             name country region  latitude  longitude  elevation
id                                                                              
80222           Bogota / Eldorado      CO    CAM    4.7167   -74.1500     2547.0
SKGY0       Guaymaral / Quirotama      CO    BOL    4.8123   -74.0649     2557.0
80234  Villavicencio / Vanguardia      CO    MET    4.1667   -73.6167      423.0
69377   Apiay (TMQ-53) / La Palma      CO    MET    4.0700   -73.5500      378.0
80219   Girardot / Santiago Villa      CO    CAM    4.2833   -74.8000      286.0
SKQU0      Mariquita / San Rafael      CO    TOL    5.2167   -74.8833      467.0
80214            Ibague / Perales      CO    TOL    4.4333   -75.1500      928.0
80149        Manizales / La Nubia      CO    CAL    5.0333   -75.4667     2080.0
80211           Armenia / El Eden      CO    QUI    4.5000   -75.7167     1204.0
80210          Pereira / Matecana      CO    RIS    4.8167   -75.7333     1342.0


In [9]:
from meteostat import Daily
from datetime import datetime

# Estación con histórico largo (ejemplo)
station_id = '80222'

start = datetime(1900, 1, 1)
end = datetime.now()

# Descargar datos diarios
data = Daily(station_id, start, end)
df1 = data.fetch()

# Guardar en CSV
df1.to_csv(f'datos_meteorologicos_{station_id}.csv', index=True)
print(df1.head())





            tavg  tmin  tmax  prcp  snow  wdir  wspd  wpgt  pres  tsun
time                                                                  
1941-03-02  17.1   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN
1941-03-12  16.9   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN
1941-03-13  19.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN
1941-03-16  16.5   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN
1941-03-19  17.3   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN


En segundo lugar, se utiliza la librería Open-Meteo para imputar los valores faltantes en la serie de tiempo, aprovechando datos horarios disponibles que permiten calcular promedios diarios y así completar de manera más precisa las observaciones ausentes en el conjunto de datos original.

In [10]:
pip install openmeteo-requests


Collecting openmeteo-requests
  Downloading openmeteo_requests-1.4.0-py3-none-any.whl.metadata (9.7 kB)
Collecting openmeteo-sdk>=1.4.0 (from openmeteo-requests)
  Downloading openmeteo_sdk-1.20.0-py3-none-any.whl.metadata (935 bytes)
Downloading openmeteo_requests-1.4.0-py3-none-any.whl (6.0 kB)
Downloading openmeteo_sdk-1.20.0-py3-none-any.whl (15 kB)
Installing collected packages: openmeteo-sdk, openmeteo-requests
Successfully installed openmeteo-requests-1.4.0 openmeteo-sdk-1.20.0


In [11]:
pip install requests-cache retry-requests numpy pandas

Collecting requests-cache
  Downloading requests_cache-1.2.1-py3-none-any.whl.metadata (9.9 kB)
Collecting retry-requests
  Downloading retry_requests-2.0.0-py3-none-any.whl.metadata (2.6 kB)
Collecting cattrs>=22.2 (from requests-cache)
  Downloading cattrs-24.1.3-py3-none-any.whl.metadata (8.4 kB)
Collecting url-normalize>=1.4 (from requests-cache)
  Downloading url_normalize-2.2.0-py3-none-any.whl.metadata (4.9 kB)
Downloading requests_cache-1.2.1-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/61.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading retry_requests-2.0.0-py3-none-any.whl (15 kB)
Downloading cattrs-24.1.3-py3-none-any.whl (66 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.5/66.5 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading url_normalize-2.2.0-py3-none-any.whl (14 kB)
Installing collected packages: url-normalize, cattrs, retry-requests, requests-cache
Successfully installe

In [12]:
import openmeteo_requests
import requests_cache
import pandas as pd
from retry_requests import retry

# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after = -1)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)

# Make sure all required weather variables are listed here
# The order of variables in hourly or daily is important to assign them correctly below
url = "https://archive-api.open-meteo.com/v1/archive"
params = {
    "latitude": 4.6097,
    "longitude": -74.0817,
    "start_date": "1972-01-01",
    "end_date": "1973-12-31",
    "hourly": "temperature_2m"
}
responses = openmeteo.weather_api(url, params=params)

# Process first location. Add a for-loop for multiple locations or weather models
response = responses[0]
print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation {response.Elevation()} m asl")
print(f"Timezone {response.Timezone()}{response.TimezoneAbbreviation()}")
print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")

# Process hourly data. The order of variables needs to be the same as requested.
hourly = response.Hourly()
hourly_temperature_2m = hourly.Variables(0).ValuesAsNumpy()

hourly_data = {"date": pd.date_range(
    start = pd.to_datetime(hourly.Time(), unit = "s", utc = True),
    end = pd.to_datetime(hourly.TimeEnd(), unit = "s", utc = True),
    freq = pd.Timedelta(seconds = hourly.Interval()),
    inclusive = "left"
)}

hourly_data["temperature_2m"] = hourly_temperature_2m

hourly_dataframe = pd.DataFrame(data = hourly_data)

# Guardar el DataFrame en un archivo CSV
hourly_dataframe.to_csv('temperature_data_1972_1973.csv', index=False)

# Mostrar el DataFrame
print(hourly_dataframe)



Coordinates 4.604569435119629°N -73.97866821289062°E
Elevation 2582.0 m asl
Timezone NoneNone
Timezone difference to GMT+0 0 s
                           date  temperature_2m
0     1972-01-01 00:00:00+00:00       12.859000
1     1972-01-01 01:00:00+00:00       12.509001
2     1972-01-01 02:00:00+00:00       12.159000
3     1972-01-01 03:00:00+00:00       11.859000
4     1972-01-01 04:00:00+00:00       11.859000
...                         ...             ...
17539 1973-12-31 19:00:00+00:00       15.609000
17540 1973-12-31 20:00:00+00:00       15.309000
17541 1973-12-31 21:00:00+00:00       14.209001
17542 1973-12-31 22:00:00+00:00       13.559000
17543 1973-12-31 23:00:00+00:00       11.609000

[17544 rows x 2 columns]


In [13]:
import openmeteo_requests
import requests_cache
import pandas as pd
from retry_requests import retry

# Setup del cliente de Open-Meteo con caché y reintentos automáticos
cache_session = requests_cache.CachedSession('.cache', expire_after = -1)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)

# Parámetros para la consulta a la API de Open-Meteo
url = "https://archive-api.open-meteo.com/v1/archive"
params = {
    "latitude": 4.6097,
    "longitude": -74.0817,
    "start_date": "1940-01-01",
    "end_date": "2025-03-31",
    "hourly": "temperature_2m"
}
responses = openmeteo.weather_api(url, params=params)

# Procesar la respuesta para una sola ubicación
response = responses[0]
print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation {response.Elevation()} m asl")
print(f"Timezone {response.Timezone()}{response.TimezoneAbbreviation()}")
print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")

# Extraer datos horarios
hourly = response.Hourly()
hourly_temperature_2m = hourly.Variables(0).ValuesAsNumpy()

# Construir DataFrame con datos horarios
hourly_data = {"date": pd.date_range(
    start = pd.to_datetime(hourly.Time(), unit = "s", utc = True),
    end = pd.to_datetime(hourly.TimeEnd(), unit = "s", utc = True),
    freq = pd.Timedelta(seconds = hourly.Interval()),
    inclusive = "left"
)}
hourly_data["temperature_2m"] = hourly_temperature_2m
hourly_dataframe = pd.DataFrame(data = hourly_data)

# Guardar el DataFrame horario
hourly_dataframe.to_csv('temperature_data_1972_1973.csv', index=False)
print("Datos horarios guardados en 'temperature_data_1972_1973.csv'")
print(hourly_dataframe.head())

# Calcular el promedio diario de temperatura
hourly_dataframe["date_only"] = hourly_dataframe["date"].dt.date
daily_avg_temperature = hourly_dataframe.groupby("date_only")["temperature_2m"].mean().reset_index()
daily_avg_temperature.columns = ["date", "avg_temperature_2m"]

# Guardar el DataFrame diario
daily_avg_temperature.to_csv("Promedio_openmeteo.csv", index=False)
print("Promedio diario de temperatura guardado en 'daily_avg_temperature_1972_1973.csv'")
print(daily_avg_temperature.head())


Coordinates 4.604569435119629°N -73.97866821289062°E
Elevation 2582.0 m asl
Timezone NoneNone
Timezone difference to GMT+0 0 s
Datos horarios guardados en 'temperature_data_1972_1973.csv'
                       date  temperature_2m
0 1940-01-01 00:00:00+00:00       11.667000
1 1940-01-01 01:00:00+00:00       11.516999
2 1940-01-01 02:00:00+00:00       11.117000
3 1940-01-01 03:00:00+00:00       11.266999
4 1940-01-01 04:00:00+00:00       10.816999
Promedio diario de temperatura guardado en 'daily_avg_temperature_1972_1973.csv'
         date  avg_temperature_2m
0  1940-01-01           12.314916
1  1940-01-02           12.083667
2  1940-01-03           12.000333
3  1940-01-04           11.898250
4  1940-01-05           12.131583


In [14]:
import pandas as pd

print("Columnas del DataFrame df1:")
print(df1.columns)

missing_tavg = df1['tavg'].isnull().sum()
print(f"\nCantidad de valores faltantes en la columna 'tavg' de df1: {missing_tavg}")


Columnas del DataFrame df1:
Index(['tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt', 'pres', 'tsun'], dtype='object')

Cantidad de valores faltantes en la columna 'tavg' de df1: 492


In [17]:
df1 = df1.reset_index()

print("Columnas del DataFrame stations_pretty:")
print(stations_pretty.columns)

print("\nColumnas del DataFrame df1:")
print(df1.columns)

print("\nColumnas del DataFrame hourly_dataframe:")
print(hourly_dataframe.columns)
daily_avg_temperature = daily_avg_temperature.rename(columns={'date': 'time'})

print("\nColumnas del DataFrame daily_avg_temperature:")
daily_avg_temperature.columns

# Eliminar la columna que es de tipo int64
df1 = df1.loc[:, ~df1.columns.duplicated()]

Columnas del DataFrame stations_pretty:
Index(['name', 'country', 'region', 'latitude', 'longitude', 'elevation'], dtype='object')

Columnas del DataFrame df1:
Index(['index', 'time', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt', 'pres', 'tsun'], dtype='object')

Columnas del DataFrame hourly_dataframe:
Index(['date', 'temperature_2m', 'date_only'], dtype='object')

Columnas del DataFrame daily_avg_temperature:


Finalmente se unen los dos dataframes para imputar los valroes faltantes, los cuales en su mayoría se encuentran en 1972, por esta razón se decidió usar

In [16]:
import pandas as pd

# Asegurar que 'time' sea datetime en ambos DataFrames
df1['time'] = pd.to_datetime(df1['time'])
daily_avg_temperature['time'] = pd.to_datetime(daily_avg_temperature['time'])

# Hacer merge de df1 y daily_avg_temperature por 'time'
merged_df = pd.merge(df1, daily_avg_temperature, on='time', how='left')

# Rellenar los NaN de 'tavg' con los valores de 'avg_temperature_2m'
merged_df['tavg'] = merged_df['tavg'].fillna(merged_df['avg_temperature_2m'])

# Eliminar la columna auxiliar 'avg_temperature_2m'
merged_df = merged_df.drop(columns=['avg_temperature_2m'])

# Resultado final
df1 = merged_df
df1.to_csv('temperaturas_final.csv', index=False)
