In [2]:
%load_ext autoreload
%autoreload 2

In [29]:
import requests
from datetime import datetime
import os
from urllib.parse import urlencode
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
from pathlib import Path
import matplotlib.pyplot as plt

## Purpose

#### The purpose of this notebook is to fetch output data from PVOutput and process it into usable CSVs.

The data comes from the solar array used in the paper [Two-Stage Volt/Var Control in Active Distribution Networks With Multi-Agent Deep Reinforcement Learning Method](https://www.researchgate.net/publication/348641326_Two-Stage_VoltVar_Control_in_Active_Distribution_Networks_With_Multi-Agent_Deep_Reinforcement_Learning_Method).

The code here is reasonably general and can be used to fetch data for other arrays by changing `TARGET_SYSTEM_ID`.



In [12]:
# Root URL of the PVOutput service; endpoint paths are appended to it when a request is built.
BASE_URL = "https://pvoutput.org"
# API key, read from the API_KEY environment variable (raises KeyError when it is not set).
API_KEY = os.environ["API_KEY"]
# System ID sent alongside the API key in the X-Pvoutput-SystemId request header.
SYSTEM_ID = "86989"
# System ID of the array whose production data is downloaded (sent as the "sid1" query parameter).
TARGET_SYSTEM_ID = "48702"

In [4]:
def add_missing_datetimes(df, freq="5min", closed="left"):
    """Append a row for every expected timestamp missing from one day of data.

    The expected timestamps form a regular grid that starts at midnight of the
    day of the *first row* of ``df`` and spans 24 hours. Every grid point that
    does not occur in ``df["datetime"]`` is appended as a new row in which all
    other columns are NaN. The result is not sorted and keeps the original
    index labels; callers are expected to sort / reset the index themselves.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``datetime`` column holding timestamps. Not modified.
    freq : str, default "5min"
        Spacing of the grid as a pandas offset alias. ``"5min"`` is the same
        frequency as the deprecated alias ``"5T"``.
    closed : {"left", "right", None}, default "left"
        Which end of the 24-hour window is kept: ``"left"`` drops the closing
        midnight, ``"right"`` drops the opening midnight, ``None`` keeps both.

    Returns
    -------
    pandas.DataFrame
        A new frame: the rows of ``df`` followed by one row per missing
        timestamp. An empty ``df`` is returned as an (empty) copy.

    Raises
    ------
    ValueError
        If ``closed`` is not one of the accepted values.
    """
    if df.empty:
        # No first row means there is no day to build a grid for.
        return df.copy()

    # Positional access: the first row defines the day, whatever the index labels are.
    start = pd.Timestamp(df["datetime"].iloc[0]).normalize()
    end = start + pd.Timedelta(days=1)

    # Build the grid with both endpoints and trim by hand. The date_range keyword
    # for this was renamed from closed= to inclusive= between pandas versions, so
    # avoiding it keeps the function working on old and new releases alike.
    expected = pd.date_range(start, end, freq=freq)
    if closed == "left":
        expected = expected[expected != end]
    elif closed == "right":
        expected = expected[expected != start]
    elif closed is not None:
        raise ValueError("closed has to be either 'left', 'right' or None")

    # Index.difference returns the missing timestamps sorted and de-duplicated.
    missing = expected.difference(pd.DatetimeIndex(df["datetime"]))
    if missing.empty:
        return df.copy()

    missing_rows = pd.DataFrame({"datetime": missing})
    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
    return pd.concat([df, missing_rows])

In [61]:
class PVOutput:
    """Small client that downloads one day of status history from PVOutput.

    Parameters
    ----------
    api_key : str
        Sent with every request as the ``X-Pvoutput-Apikey`` header.
    system_id : str
        Sent with every request as the ``X-Pvoutput-SystemId`` header.
    """

    # Names assigned, in order, to the fields kept from each response row.
    # NOTE(review): PVOutput's published history layout for getstatus appears to
    # list energy consumption and power consumption *before* temperature and
    # voltage. If that applies to this response, the two fields kept here under
    # the names "temperature_c" and "voltage" are really the consumption fields,
    # and the two dropped fields are temperature and voltage. Confirm against
    # the API documentation before relying on those two columns.
    _COLUMNS = [
        "date",
        "time",
        "energy_kwh",
        "efficiency_kwh/kw",
        "power_w",
        "average_power_w",
        "normalised_output_kw/kw",
        "temperature_c",
        "voltage",
    ]
    # Every kept field except date and time is numeric.
    _VALUE_COLUMNS = _COLUMNS[2:]

    def __init__(self, api_key, system_id):
        self.api_key = api_key
        self.system_id = system_id

    @staticmethod
    def _parse_site_info_rows(request_response):
        """Turn the raw response text into a typed DataFrame, one row per reading.

        Rows are separated by ``;`` and fields by ``,``. The result has the
        float value columns followed by a ``datetime`` column built from the
        date and time fields.

        Raises ValueError when the response is empty or a row does not have the
        expected number of fields.
        """
        if not request_response or not request_response.strip():
            raise ValueError("PVOutput returned an empty response; nothing to parse")

        # Drop the last two fields of every row, they contain no data.
        rows = [row.split(",")[:-2] for row in request_response.split(";")]
        df = pd.DataFrame(data=rows, columns=PVOutput._COLUMNS)

        df["datetime"] = pd.to_datetime(
            df["date"] + " " + df["time"], format="%Y%m%d %H:%M"
        )
        df = df.drop(columns=["date", "time"])
        return df.astype({name: "float" for name in PVOutput._VALUE_COLUMNS})

    def _process_site_info_data(self, request_response):
        """Parse a response and return a gap-free frame sorted by ``datetime``.

        Timestamps missing from the day are added by ``add_missing_datetimes``
        and their values are filled from neighbouring readings.
        """
        df = self._parse_site_info_rows(request_response)
        df = add_missing_datetimes(df)
        df = df.sort_values("datetime").reset_index(drop=True)
        # Forward fill to fill small holes during the day and data at the end of the day.
        # (.ffill()/.bfill() replace the deprecated fillna(method=...) spelling.)
        df = df.ffill()
        # Backward fill to fill missing data at the beginning of the day.
        df = df.bfill()
        return df

    def get_site_info(self, pv_system_id, date, timeout=30):
        """Get site production data.

        pv_system_id: ID of the system to fetch data for
        date: the date to fetch data for in YYYYMMDD format, localtime of the PV system
        timeout: seconds to wait for the server before giving up; without a
            timeout a stalled connection would block the notebook indefinitely

        Returns the processed DataFrame for that day. Raises
        ``requests.HTTPError`` for an error status and ``ValueError`` when the
        response body cannot be parsed.
        """
        request_url = f"{BASE_URL}/service/r2/getstatus.jsp"
        headers = {
            # NOTE(review): presumably asks the service to report rate-limit
            # information with the response; confirm against the API docs.
            "X-Rate-Limit": "1",
            "X-Pvoutput-Apikey": self.api_key,
            "X-Pvoutput-SystemId": self.system_id,
        }
        params = {
            "d": date,
            "h": 1,  # We want historical data.
            "limit": 288,  # API limit is 288 (num of 5-min periods per day).
            "ext": 0,  # Extended data; we don't want extended data.
            "sid1": pv_system_id,
        }
        r = requests.get(request_url, headers=headers, params=params, timeout=timeout)
        r.raise_for_status()
        return self._process_site_info_data(r.text)
        

In [32]:
def plot_data(df, column="average_power_w"):
    """Draw ``column`` against ``datetime`` as a line plot on a new 15x8 figure.

    df: frame holding a ``datetime`` column and the column to plot
    column: name of the column shown on the y-axis
    """
    plt.figure(figsize=(15, 8))
    sns.lineplot(data=df, x="datetime", y=column)
    # Slant the x tick labels by 45 degrees.
    plt.xticks(rotation=45)

In [58]:
# Client built from the API key and system ID configured above; used for the downloads below.
pv = PVOutput(API_KEY, SYSTEM_ID)

In [60]:
# Download one CSV per calendar day in the range below (both ends included).
out_dir = Path("./out/raw_data")
out_dir.mkdir(parents=True, exist_ok=True)
# NOTE(review): out_dir is created here but never written to; the CSVs below
# land directly in ./out. Confirm which of the two locations is intended.
start_date = "2020-04-01"
end_date = "2020-04-30"
for date in pd.date_range(start_date, end_date):
    try:
        formatted_date = date.strftime("%Y%m%d")
        pv_data = pv.get_site_info(TARGET_SYSTEM_ID, formatted_date)
        csv_path = f"./out/{formatted_date}.csv"
        pv_data.to_csv(csv_path, index=False)
    except Exception as e:
        # Best effort: report the error and the affected day, then carry on
        # with the next day instead of aborting the whole download.
        print(e)
        print(formatted_date)
    