In [1]:
import time
import datetime
import re
import pandas as pd
import numpy as np
from huggingface_hub import login
from datasets import Dataset, load_dataset
import matplotlib.pyplot as plt
import requests



In [2]:
from datasets import Dataset, load_dataset
import pandas as pd
import hopsworks
import os

In [3]:
# Connect to Hopsworks and take a handle on the project's feature store
project = hopsworks.login()
fs = project.get_feature_store()

# Look up the weather-history feature group, creating it if it does not exist yet
feature_group = fs.get_or_create_feature_group(
    name="weather_data_history",
    version=1,
    description="Weather data history",
    primary_key=['id'],
    event_time=['time'],
    online_enabled=True
)

# Read what is already stored, ordered oldest-first by 'time'
df_main = feature_group.read().sort_values(by='time', ascending=True)

# Key the frame by 'id' when that column is present
if 'id' in df_main.columns:
    df_main = df_main.set_index('id')

2025-01-05 20:04:00,106 INFO: Initializing external client
2025-01-05 20:04:00,108 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-05 20:04:01,457 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1205426
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.08s) 


In [4]:
def download_dataset_from_huggingface(repo_name, file_name):
    """
    Download a CSV file from a Hugging Face dataset repository.

    The file is read from the repository's `main` revision via its
    `resolve/main/<file_name>` URL.

    Parameters:
        repo_name (str): Repository id of the dataset, e.g. "user/dataset".
        file_name (str): Path of the CSV file inside the repository.

    Returns:
        pd.DataFrame: The file's contents, indexed by its parsed "time" column.
    """
    # Honour the caller's repo_name. A hard-coded repository id used to be
    # assigned here, which silently ignored the argument on every call.
    repo_url = f"https://huggingface.co/datasets/{repo_name}/resolve/main/{file_name}"
    return pd.read_csv(repo_url, parse_dates=["time"], index_col="time")


In [5]:
def get_incremental_weather_data(latitude, longitude, start_date, end_date):
    """
    Fetch hourly weather data from the Open-Meteo archive API for a date range.

    Parameters:
        latitude (float): Latitude of the location.
        longitude (float): Longitude of the location.
        start_date (str): First day of the range, passed through to the API
            (the caller in this notebook sends "YYYY-MM-DD").
        end_date (str): Last day of the range, same format as start_date.

    Returns:
        pd.DataFrame: One row per hourly timestamp, indexed by 'time', with
        one column per hourly variable returned by the API.

    Raises:
        requests.exceptions.HTTPError: The API answered with an error status.
        requests.exceptions.Timeout: The API did not answer in time.
        KeyError: The response has no "hourly" section or no 'time' values.
        ValueError: A returned timestamp could not be parsed.
    """
    url = "https://archive-api.open-meteo.com/v1/archive"
    params = {
        "latitude": latitude,
        "longitude": longitude,
        "start_date": start_date,
        "end_date": end_date,
        "hourly": ["temperature_2m", "precipitation", "wind_speed_10m", "wind_direction_10m"],
        "timezone": "auto",
    }
    # Without a timeout, requests waits indefinitely on a stalled connection
    # and the notebook cell would never finish.
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    data = response.json()

    # Extract hourly data
    if "hourly" not in data:
        raise KeyError("'hourly' section missing in the API response.")
    weather_df = pd.DataFrame(data["hourly"])

    # Ensure 'time' exists and is in proper datetime format
    if "time" not in weather_df.columns:
        raise KeyError("'time' column missing in the API response.")
    # errors='coerce' turns unparsable values into NaT so they can be detected below
    weather_df['time'] = pd.to_datetime(weather_df['time'], errors='coerce')
    if weather_df['time'].isnull().any():
        raise ValueError("Invalid time format detected in API response.")

    # Set 'time' as the index
    weather_df = weather_df.set_index('time')

    # Debug outputs
    print("Index type:", weather_df.index.dtype)  # Confirm datetime
    print("Columns:", weather_df.columns)  # Verify other columns
    return weather_df


In [6]:
def update_and_push_dataset(repo_name, latitude, longitude):
    """
    Bring the weather dataset up to date and return it.

    Loads the stored dataset with ``load_existing_dataset``, fetches the
    missing range up to today with ``get_incremental_weather_data`` and merges
    the two. Despite its name, this function performs no upload; the caller
    decides where the returned frame is stored.

    Parameters:
        repo_name (str): Hugging Face repository id of the stored dataset.
        latitude (float): Latitude forwarded to the weather fetch.
        longitude (float): Longitude forwarded to the weather fetch.

    Returns:
        pd.DataFrame: The merged data, indexed by time, de-duplicated (newest
        value wins) and sorted. When there is nothing to fetch, the existing
        dataset is returned as loaded (earlier versions returned None, which
        made callers fail on the next DataFrame operation).
    """
    # Step 1: Load the existing dataset
    print("Loading existing dataset from Hugging Face...")
    existing_data = load_existing_dataset(repo_name)

    # The logic below works on a time index; accept a loader that hands back
    # 'time' as a regular column instead.
    if not existing_data.empty and "time" in existing_data.columns:
        existing_data = existing_data.assign(
            time=pd.to_datetime(existing_data["time"])
        ).set_index("time")

    # Step 2: Fetch new data if needed
    if existing_data.empty:
        # Same effective start as before: default "2024-01-01" plus one day.
        start_date = pd.Timestamp("2024-01-02")
    else:
        # Start on the last stored day, not the day after: that day may only
        # be partially stored, and skipping it would leave a permanent gap.
        # Overlapping hours are removed by the de-duplication below.
        start_date = pd.Timestamp(existing_data.index.max()).normalize()
    end_date = pd.Timestamp.now().strftime('%Y-%m-%d')

    if start_date > pd.Timestamp(end_date):
        print("No new data to fetch. Dataset is already up-to-date.")
        return existing_data

    print(f"Fetching weather data from {start_date} to {end_date}...")
    new_data = get_incremental_weather_data(latitude, longitude, start_date.strftime('%Y-%m-%d'), end_date)

    print("Fetched data columns:", new_data.columns)

    # Combine existing data with the new data; skip the concat when there is
    # nothing stored so the fetched frame keeps its datetime index untouched.
    if existing_data.empty:
        updated_data = new_data
    else:
        updated_data = pd.concat([existing_data, new_data])

    # Handle duplicates and sort by index (time)
    updated_data = updated_data[~updated_data.index.duplicated(keep='last')].sort_index()

    # Step 3: Hand the result back. The old message claimed an upload that
    # never happened.
    print("Dataset updated (no upload is performed by this function).")
    return updated_data



In [7]:
# Function to load the dataset from Hugging Face
def load_existing_dataset(repo_name):
    """
    Load the stored dataset from the Hugging Face Hub, if available.

    Parameters:
        repo_name (str): Hugging Face repository id of the dataset.

    Returns:
        pd.DataFrame: The dataset's "train" split indexed by its parsed
        'time' column, or an empty DataFrame when it cannot be loaded
        (missing repository, no access, no 'time' column, ...).
    """
    try:
        # `datasets.Dataset` has no `load_from_hub`; the earlier call always
        # raised and the bare `except` hid it, so the stored data was never
        # read. `load_dataset` is the Hub loader.
        hf_dataset = load_dataset(repo_name, split="train")
        df = hf_dataset.to_pandas()
        df['time'] = pd.to_datetime(df['time'])
        # update_and_push_dataset reads the last timestamp from the index.
        return df.set_index('time')
    except Exception as exc:
        # Deliberately best-effort: any failure means "start from scratch",
        # but the reason is shown so real problems are not hidden.
        print("No existing dataset found. Starting fresh.")
        print(f"Reason: {type(exc).__name__}: {exc}")
        return pd.DataFrame()

In [8]:
# Target dataset repository and the location to fetch weather for
repo_name = "andreitut/weatherDatasetProject"  # Replace with your repository name
latitude = 59.3293  # Stockholm coordinates
longitude = 18.0686

# Refresh the dataset, discard rows that contain missing values,
# then preview the most recent rows
df = update_and_push_dataset(repo_name, latitude, longitude).dropna()
df.tail()

Loading existing dataset from Hugging Face...
No existing dataset found. Starting fresh.
Fetching weather data from 2024-01-02 00:00:00 to 2025-01-05...
Index type: datetime64[ns]
Columns: Index(['temperature_2m', 'precipitation', 'wind_speed_10m',
       'wind_direction_10m'],
      dtype='object')
Fetched data columns: Index(['temperature_2m', 'precipitation', 'wind_speed_10m',
       'wind_direction_10m'],
      dtype='object')
Uploading updated dataset to Hugging Face...


Unnamed: 0_level_0,temperature_2m,precipitation,wind_speed_10m,wind_direction_10m
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2025-01-03 21:00:00,-9.5,0.0,15.8,316.0
2025-01-03 22:00:00,-9.8,0.0,14.9,307.0
2025-01-03 23:00:00,-10.0,0.0,16.1,299.0
2025-01-04 00:00:00,-10.0,0.0,16.2,296.0
2025-01-04 01:00:00,-9.4,0.0,13.3,291.0


In [9]:
import pandas as pd

# Move the 'time' index into a regular column.
# Guarded so the cell can be re-run safely: an unconditional reset_index would,
# on a second run, turn the fresh integer index into an extra 'index' column
# (renamed to 'id') and corrupt the frame. The rename only has an effect when
# the index being reset is unnamed.
if 'time' not in df.columns:
    df = df.reset_index().rename(columns={'index': 'id'})
df

Unnamed: 0,time,temperature_2m,precipitation,wind_speed_10m,wind_direction_10m
0,2024-01-02 00:00:00,-4.0,0.1,16.6,85.0
1,2024-01-02 01:00:00,-4.0,0.2,17.2,79.0
2,2024-01-02 02:00:00,-4.4,0.2,17.0,84.0
3,2024-01-02 03:00:00,-4.6,0.2,17.3,86.0
4,2024-01-02 04:00:00,-4.8,0.1,18.8,86.0
...,...,...,...,...,...
8829,2025-01-03 21:00:00,-9.5,0.0,15.8,316.0
8830,2025-01-03 22:00:00,-9.8,0.0,14.9,307.0
8831,2025-01-03 23:00:00,-10.0,0.0,16.1,299.0
8832,2025-01-04 00:00:00,-10.0,0.0,16.2,296.0


In [10]:
# Turn the integer row index into the 'id' column that the feature group uses
# as its primary key.
# Guarded so the cell can be re-run safely: without the check, a second run
# would add another 'index' column, rename it, and leave two 'id' columns.
if 'id' not in df.columns:
    df = df.reset_index().rename(columns={'index': 'id'})
df

Unnamed: 0,id,time,temperature_2m,precipitation,wind_speed_10m,wind_direction_10m
0,0,2024-01-02 00:00:00,-4.0,0.1,16.6,85.0
1,1,2024-01-02 01:00:00,-4.0,0.2,17.2,79.0
2,2,2024-01-02 02:00:00,-4.4,0.2,17.0,84.0
3,3,2024-01-02 03:00:00,-4.6,0.2,17.3,86.0
4,4,2024-01-02 04:00:00,-4.8,0.1,18.8,86.0
...,...,...,...,...,...,...
8829,8829,2025-01-03 21:00:00,-9.5,0.0,15.8,316.0
8830,8830,2025-01-03 22:00:00,-9.8,0.0,14.9,307.0
8831,8831,2025-01-03 23:00:00,-10.0,0.0,16.1,299.0
8832,8832,2025-01-04 00:00:00,-10.0,0.0,16.2,296.0


In [11]:
# Hopsworks setup
# SECURITY: an API key used to be hard-coded in this cell. It has been removed;
# treat that key as leaked and revoke it in the Hopsworks UI.
# Supply the key from outside the notebook instead, e.g. through the
# HOPSWORKS_API_KEY environment variable, which hopsworks.login() reads.
project = hopsworks.login()
fs = project.get_feature_store()

# Get or create feature group
feature_group = fs.get_or_create_feature_group(
    name="weather_data_history",
    version=1,
    description="Weather data history",
    primary_key=['id'],
    event_time=['time'],
    online_enabled=True
)

2025-01-05 20:04:10,294 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-01-05 20:04:10,300 INFO: Initializing external client
2025-01-05 20:04:10,301 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-05 20:04:11,628 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1205426


In [12]:
# Write the prepared frame (with its 'id' and 'time' columns) into the
# weather_data_history feature group.
# NOTE(review): the recorded output shows an offline materialization job being
# launched by this call, so the message below may print before that job has
# finished — confirm if downstream steps depend on the offline data.
feature_group.insert(df)
print("Dataset successfully uploaded to Hopsworks Feature Store.")

Uploading Dataframe: 100.00% |██████████| Rows 8834/8834 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: weather_data_history_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1205426/jobs/named/weather_data_history_1_offline_fg_materialization/executions
Dataset successfully uploaded to Hopsworks Feature Store.
