In [29]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import os 
import sys
sys.path.append(os.path.abspath('../..'))

# Merging PV Values and Weather Data together

In [34]:
def preprocess_weather(df):
    """
    Min-max scale the weather features and keep only the daytime rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw weather table. Must contain a ``validdate`` column that
        ``pd.to_datetime`` can parse. Every remaining column other than
        ``lat`` and ``lon`` is rescaled and is therefore expected to be numeric.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left unmodified) holding only the rows whose
        time of day lies between 08:00 and 16:00 inclusive. ``validdate`` is
        replaced by a ``datetime`` string column formatted as
        ``%Y-%m-%d %H:%M:%S``. Each feature column is rescaled to [0, 1] using
        the minimum and maximum of the rows that were kept; a feature that is
        constant over those rows becomes 0.0 instead of NaN.
    """
    # Work on a copy so the caller's frame does not gain helper columns or
    # have its ``validdate`` column converted behind its back.
    weather = df.copy()

    # Parse the timestamps once, then derive the string representations.
    weather["validdate"] = pd.to_datetime(weather["validdate"])
    weather["datetime"] = weather["validdate"].dt.strftime('%Y-%m-%d %H:%M:%S')
    weather["time"] = weather["validdate"].dt.strftime("%H:%M")

    # Zero-padded "HH:MM" strings order the same way as the times they encode,
    # so a plain string comparison selects the 08:00-16:00 window.
    in_window = (weather["time"] >= "08:00") & (weather["time"] <= "16:00")
    # .copy() makes the filtered result an independent frame, so the column
    # assignments below never write into a view of ``weather``.
    weather = weather.loc[in_window].drop(columns=["validdate", "time"]).copy()

    # Min-max normalisation of every feature column.
    for col in weather.columns:
        if col in ("datetime", "lat", "lon"):
            continue

        lowest = weather[col].min()
        span = weather[col].max() - lowest
        shifted = weather[col] - lowest
        if span == 0:
            # Constant column: dividing would produce 0/0 = NaN for every row.
            # Multiplying by 0.0 yields 0.0 while keeping pre-existing NaNs.
            weather[col] = shifted * 0.0
        else:
            weather[col] = shifted / span

    return weather

def load_weather_data():
    """
    Load and preprocess every weather file found in ``pv_data/weather_data``.

    The directory is resolved from the part of the current working directory
    that precedes the first occurrence of ``"Dissertation"``.

    Returns
    -------
    dict
        Maps each file name to the frame returned by ``preprocess_weather``
        for that file; entries are inserted in sorted file-name order.
    """
    # get path to directory
    weather_dir = os.getcwd().split("Dissertation")[0] + 'pv_data/weather_data'

    # Drop hidden entries (e.g. macOS ``.DS_Store``) and anything that is not a
    # regular file by inspecting each entry. ``os.listdir`` returns names in
    # arbitrary order, so removing "the first one" could discard a real data
    # file while still trying to parse ``.DS_Store``.
    files = sorted(
        name
        for name in os.listdir(weather_dir)
        if not name.startswith('.')
        and os.path.isfile(os.path.join(weather_dir, name))
    )

    df_dict = {}
    for file in files:
        path_to_file = os.path.join(weather_dir, file)
        # The weather exports are semicolon-separated.
        df_dict[file] = preprocess_weather(pd.read_csv(path_to_file, sep=';'))

    return df_dict

In [35]:
# Load and preprocess every weather file into a dict keyed by file name.
df_dict = load_weather_data()

In [36]:
# Stack the per-file weather frames row-wise into one table.
df_list = list(df_dict.values())
df_weather = pd.concat(df_list, axis=0)

In [37]:
# Preview the first rows of the combined weather table.
df_weather.head()

Unnamed: 0,lat,lon,t_2m:C,relative_humidity_2m:p,wind_speed_10m:ms,diffuse_rad:W,global_rad:W,effective_cloud_cover:octas,datetime
0,53.33,-2.99,0.304225,0.808659,0.22293,0.0,0.0,0.5,2018-01-01 08:00:00
1,53.33,-2.99,0.301408,0.835196,0.210191,0.007789,0.015471,0.75,2018-01-01 09:00:00
2,53.33,-2.99,0.329577,0.789106,0.242038,0.091372,0.041628,0.5,2018-01-01 10:00:00
3,53.33,-2.99,0.357746,0.744413,0.273885,0.072798,0.14972,0.375,2018-01-01 11:00:00
4,53.33,-2.99,0.385915,0.705307,0.305732,0.086579,0.178117,0.125,2018-01-01 12:00:00


In [38]:
# ---- Data parameters ----

# Size of the PV data set requested further down.
N_SYSTEMS = 500
N_DAYS = 365
DAY_INIT = 0

# Daily window (hours of the day) and sampling step (minutes).
DAY_MIN, DAY_MAX = 8, 16
MINUTE_INTERVAL = 5

# Number of MINUTE_INTERVAL-sized steps between DAY_MIN and DAY_MAX.
DAILY_DATA_POINTS = 60 * (DAY_MAX - DAY_MIN) // MINUTE_INTERVAL

# Four corner points of a polygon over the UK
# (presumably (latitude, longitude) pairs - TODO confirm the axis order).
POLY_COORDS = (
    (50, -6),
    (50.5, 1.9),
    (57.6, -5.5),
    (58, 1.9),
)

In [39]:
from data import PVDataGenerator

# Build the PV data generator from the data parameters and the cleaned
# PV / location CSV files in the ``pv_data`` folder.
generator = PVDataGenerator(
    # input files
    folder_name='pv_data',
    file_name_pv='pv_data_clean.csv',
    file_name_location='location_data_clean.csv',
    # temporal extent and resolution
    n_days=N_DAYS,
    day_init=DAY_INIT,
    day_min=DAY_MIN,
    day_max=DAY_MAX,
    minute_interval=MINUTE_INTERVAL,
    # system selection
    n_systems=N_SYSTEMS,
    coords=POLY_COORDS,
    distance_method='poly',
    drop_nan=False,
)

==> Loading data
==> Loaded data in: 0 m : 1 sec

==> Loading data
==> Loaded data in: 0 m : 0 sec



In [43]:
def get_season(month):
    """
    Map a month number to a season name.

    Returns 'winter' for 12, 1, 2; 'spring' for 3, 4, 5; 'summer' for 6, 7, 8;
    and 'fall' for every other value.
    """
    months_by_season = (
        ('winter', (12, 1, 2)),
        ('spring', (3, 4, 5)),
        ('summer', (6, 7, 8)),
    )
    for season, months in months_by_season:
        if month in months:
            return season
    # Anything not listed above falls through to 'fall'.
    return 'fall'

In [44]:
# Derive the PV table used for merging. ``assign`` returns a new frame, so the
# generator's own ``pv_series`` is not modified by adding the calendar columns
# (the previous version wrote ``month``/``season`` straight into it).
df_pv = (
    generator.pv_series
    .assign(
        month=lambda pv: pv['datetime'].dt.month,
        season=lambda pv: pv['datetime'].dt.month.map(get_season),
    )
    # These two columns are not needed in the merged output.
    .drop(columns=['epoch', 'farm'])
)

In [45]:

def merge_weather_and_pv(df_weather, df_pv,):
    """
    Join PV readings with weather features, one location at a time.

    For every distinct ``(lat, lon)`` pair in ``df_weather``, the PV rows with
    the same ``(latitude, longitude)`` are outer-joined with that location's
    weather rows on ``datetime``. Gaps created by the join are filled by linear
    interpolation (extended to both edges) in every column except
    ``latitude``, ``longitude`` and ``season``. The two coordinate columns are
    filled with the location's own coordinates; ``season`` is left as joined.

    Parameters
    ----------
    df_weather : pandas.DataFrame
        Needs ``lat``, ``lon`` and ``datetime`` columns plus the weather
        features. Not modified.
    df_pv : pandas.DataFrame
        Needs ``latitude``, ``longitude`` and ``datetime`` columns plus the PV
        columns. Not modified.

    Returns
    -------
    pandas.DataFrame
        The per-location merged frames concatenated with a fresh integer
        index. If ``df_weather`` has no rows, an empty frame with the merged
        column layout is returned.
    """
    # Parse timestamps on derived frames so the callers' frames keep their
    # original ``datetime`` columns.
    pv = df_pv.assign(datetime=pd.to_datetime(df_pv['datetime']))
    weather = df_weather.assign(datetime=pd.to_datetime(df_weather['datetime']))

    locations = weather[['lat', 'lon']].drop_duplicates()
    if locations.empty:
        # pd.concat raises on an empty list; return an empty frame that still
        # has the columns a non-empty result would have.
        return pv.iloc[:0].merge(
            weather.iloc[:0].drop(columns=['lat', 'lon']),
            how='outer',
            on='datetime',
        )

    not_interpolated = ('latitude', 'longitude', 'season')
    merged_dataframes = []

    for lat, lon in locations.itertuples(index=False, name=None):
        pv_here = pv[(pv['latitude'] == lat) & (pv['longitude'] == lon)]
        weather_here = weather[(weather['lat'] == lat) & (weather['lon'] == lon)]
        weather_here = weather_here.drop(columns=['lat', 'lon'])

        merged_df = pv_here.merge(weather_here, how='outer', on='datetime')

        # Timestamps present only in the weather data come out of the outer
        # join without coordinates. Every row in this group belongs to
        # (lat, lon), so fill them in rather than leaving NaN.
        merged_df['latitude'] = merged_df['latitude'].fillna(lat)
        merged_df['longitude'] = merged_df['longitude'].fillna(lon)

        # interpolate nan values
        for col in merged_df.columns:
            if col in not_interpolated:
                continue

            if merged_df[col].isnull().values.any():
                merged_df[col] = merged_df[col].interpolate(
                    method='linear', limit_direction='both'
                )
        merged_dataframes.append(merged_df)

    # concatenate all the merged data frames into a single data frame
    return pd.concat(merged_dataframes, ignore_index=True)



In [46]:
# Build the combined PV + weather table for every location in the weather data.
df = merge_weather_and_pv(df_weather=df_weather, df_pv=df_pv)

In [47]:
from data.utils import save_csv
# Persist the merged PV + weather table.
save_csv(df, folder_name='pv_data', file_name='pv_and_weather.csv')
# Also store the distinct (latitude, longitude) pairs present in the merged table.
unique_lat_lon = df[['latitude', 'longitude']].drop_duplicates()
save_csv(unique_lat_lon, folder_name='pv_data', file_name='location_data_weather_clean.csv')