In [1]:
import time
import datetime
import re
import pandas as pd
import numpy as np
from huggingface_hub import login
from datasets import Dataset, load_dataset
import matplotlib.pyplot as plt
import requests

# Log in to Hugging Face
HUGGINGFACE_TOKEN = "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"  
login(token=HUGGINGFACE_TOKEN)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def download_dataset_from_huggingface(repo_name, file_name):
    """
    Downloads the dataset from a Hugging Face repository.

    Parameters:
        repo_name (str): Name of the Hugging Face repository.
        file_name (str): Name of the dataset file in the repository.

    Returns:
        pd.DataFrame: The dataset as a Pandas DataFrame.
    """
    # Push the dataset to Hugging Face Hub
    repo_name = "andreitut/merged_kth_dataset"  # Replace with your desired repository name
   
    repo_url = f"https://huggingface.co/datasets/{repo_name}/resolve/main/{file_name}"
    dataset = pd.read_csv(repo_url, parse_dates=["time"], index_col="time")
    return dataset


In [3]:
def get_incremental_weather_data(latitude, longitude, start_date, end_date):
    """
    Fetch incremental weather data from Open-Meteo API for a specific date range.
    """
    url = "https://archive-api.open-meteo.com/v1/archive"
    params = {
        "latitude": latitude,
        "longitude": longitude,
        "start_date": start_date,
        "end_date": end_date,
        "hourly": ["temperature_2m", "precipitation", "wind_speed_10m", "wind_direction_10m"],
        "timezone": "auto",
    }
    response = requests.get(url, params=params)
    response.raise_for_status()
    data = response.json()

    # Extract hourly data
    hourly_data = data["hourly"]
    weather_df = pd.DataFrame(hourly_data)

    # Ensure 'time' exists and is in proper datetime format
    if "time" in weather_df.columns:
        weather_df['time'] = pd.to_datetime(weather_df['time'], errors='coerce')
        if weather_df['time'].isnull().any():
            raise ValueError("Invalid time format detected in API response.")
    else:
        raise KeyError("'time' column missing in the API response.")

    # Set 'time' as the index
    weather_df.set_index('time', inplace=True)

    # Debug outputs
    print("Index type:", weather_df.index.dtype)  # Confirm datetime
    print("Columns:", weather_df.columns)  # Verify other columns
    return weather_df


In [4]:
def update_and_push_dataset(repo_name, latitude, longitude):
    """
    Updates the weather dataset and pushes it to Hugging Face.
    """
    # Step 1: Load the existing dataset
    print("Loading existing dataset from Hugging Face...")
    existing_data = load_existing_dataset(repo_name)

    if not existing_data.empty:
        last_date = existing_data.index.max().strftime('%Y-%m-%d')  # Access index
    else:
        last_date = "2024-01-01"  # Default start date if dataset is empty

    # Step 2: Fetch new data if needed
    start_date = pd.Timestamp(last_date) + pd.Timedelta(days=1)
    end_date = pd.Timestamp.now().strftime('%Y-%m-%d')

    if start_date <= pd.Timestamp(end_date):
        print(f"Fetching weather data from {start_date} to {end_date}...")
        new_data = get_incremental_weather_data(latitude, longitude, start_date.strftime('%Y-%m-%d'), end_date)

        print("Fetched data columns:", new_data.columns)

        # Combine existing data with the new data
        updated_data = pd.concat([existing_data, new_data])

        # Handle duplicates and sort by index (time)
        updated_data = updated_data[~updated_data.index.duplicated(keep='last')].sort_index()
    else:
        print("No new data to fetch. Dataset is already up-to-date.")
        return

    # Step 3: Push updated dataset to Hugging Face
    print("Uploading updated dataset to Hugging Face...")
    hf_dataset = Dataset.from_pandas(updated_data)
    hf_dataset.push_to_hub(repo_name)
    print(f"Dataset successfully uploaded to Hugging Face Hub: {repo_name}")


In [5]:
# Function to load the dataset from Hugging Face
def load_existing_dataset(repo_name):
    """
    Loads the existing dataset from Hugging Face, if available.
    """
    try:
        # Load the dataset from Hugging Face
        hf_dataset = Dataset.load_from_hub(repo_name)
        df = hf_dataset.to_pandas()
        df['time'] = pd.to_datetime(df['time'])
        return df
    except:
        print("No existing dataset found. Starting fresh.")
        return pd.DataFrame()

In [6]:
# Parameters
repo_name = "andreitut/weatherDatasetProject"  # Replace with your repository name
latitude, longitude = 59.3293, 18.0686  # Stockholm coordinates


# Run the update process
update_and_push_dataset(repo_name, latitude, longitude)

Loading existing dataset from Hugging Face...
No existing dataset found. Starting fresh.
Fetching weather data from 2024-01-02 00:00:00 to 2024-12-28...


NameError: name 'requests' is not defined