# Programming for Data Analytics Project
# Analysis of Wind Speed Data

*Author: Eoghan Walsh*
***

## Import Modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.ticker import AutoMinorLocator
import seaborn as sns
import csv
import re
import requests
from io import StringIO
import geopandas as gpd
from shapely.geometry import Point

## Import Data

In [2]:
import os
import pandas as pd

# Folder containing CSV files.
csv_folder = './data/hourly/'

# Regex pattern to find start of data.
regex_pattern = "^date(?!:)"

# Initialize an empty list to store DataFrames
dataframes = []

# Loop through all files in the folder.
for csv_file in os.listdir(csv_folder):
    # Check if the file is a CSV
    if csv_file.endswith('.csv'):
        # Construct the full file path
        file_path = os.path.join(csv_folder, csv_file)

        with open(file_path) as f:
            reader = csv.reader(f)
            
            station_name = ",".join(next(reader)).split("Station Name: ")[1].lower()

            for row_number, row in enumerate(reader, start=0):
                if any(re.search(regex_pattern, string) for string in row):
                    break
        
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path, skiprows=row_number,low_memory=False)
        
        # Optionally, add a column to track the source file
        df['station_name'] = station_name
        df['source_file'] = csv_file

        # Append the DataFrame to the list
        dataframes.append(df)



In [3]:
final_dataframe = pd.concat(dataframes)

In [4]:
final_dataframe.head()

Unnamed: 0,date,ind,rain,ind.1,temp,ind.2,wetb,dewpt,vappr,rhum,...,ind.4,wddir,station_name,source_file,ww,w,sun,vis,clht,clamt
0,01-dec-1955 01:00,0,0.0,0,10.7,0,10.0,9.4,11.8,91,...,1.0,170,roches point,hly1075.csv,,,,,,
1,01-dec-1955 02:00,0,2.9,0,9.8,0,9.7,10.0,12.0,99,...,1.0,190,roches point,hly1075.csv,,,,,,
2,01-dec-1955 03:00,0,3.8,0,9.7,0,9.5,9.4,11.7,97,...,1.0,160,roches point,hly1075.csv,,,,,,
3,01-dec-1955 04:00,0,0.8,0,9.8,0,9.7,9.4,11.9,98,...,1.0,140,roches point,hly1075.csv,,,,,,
4,01-dec-1955 05:00,0,0.3,0,8.9,0,8.7,8.3,11.1,97,...,1.0,330,roches point,hly1075.csv,,,,,,


In [5]:
final_dataframe = final_dataframe[["date", "rain", "temp", "wdsp", "station_name", "source_file"]]

In [6]:
final_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8842594 entries, 0 to 448391
Data columns (total 6 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   date          object
 1   rain          object
 2   temp          object
 3   wdsp          object
 4   station_name  object
 5   source_file   object
dtypes: object(6)
memory usage: 472.2+ MB


In [7]:
final_dataframe.loc[:, 'date'] = pd.to_datetime(final_dataframe.loc[:, 'date'], format="%d-%b-%Y %H:%M")

In [8]:
final_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8842594 entries, 0 to 448391
Data columns (total 6 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   date          object
 1   rain          object
 2   temp          object
 3   wdsp          object
 4   station_name  object
 5   source_file   object
dtypes: object(6)
memory usage: 472.2+ MB


In [9]:
final_dataframe.loc[:, 'rain'] = pd.to_numeric(final_dataframe.loc[:, 'rain'], errors='coerce')

In [10]:
final_dataframe.loc[:, 'temp'] = pd.to_numeric(final_dataframe.loc[:, 'temp'], errors='coerce')

In [11]:
final_dataframe.loc[:, 'wdsp'] = pd.to_numeric(final_dataframe.loc[:, 'wdsp'], errors='coerce')

In [12]:
final_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8842594 entries, 0 to 448391
Data columns (total 6 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   date          object
 1   rain          object
 2   temp          object
 3   wdsp          object
 4   station_name  object
 5   source_file   object
dtypes: object(6)
memory usage: 472.2+ MB


In [13]:
# Function to import daily weather data to dataframes.
def daily_weather():

    # File containing csv URLs.
    txt_file = "./data/download-file-list-hourly.txt"

    # Regex pattern to find the start of useful data in CSV files.
    regex = r"^date(?!:)"

    # Columns to import to dataframe.
    columns = ["date", "wdsp", "rain", "temp"]

    # Set index in dataframe.
    index = "date"

    # Numeric columns.
    numeric_columns = ["wdsp", "rain", "temp"]

    # Convert index to datetime.
    date_format = "%d-%b-%Y %H:%M"

    # Dictionary to store the dataframes.
    dataframes = {}

    # Regex pattern to find the column header row in CSV files.
    regex_pattern = re.compile(regex, re.IGNORECASE)

    # Read the URLs from the txt file and strip newlines.
    with open(txt_file) as f:
        csv_urls = [line.strip() for line in f if line.strip()]

    # Loop through the URLs.
    for url in csv_urls:
        try:
            # Send GET request to the URL.
            response = requests.get(url)
            if response.status_code != 200:
                print(f"Failed to retrieve the CSV file. Status code: {response.status_code}")
                continue

            # Treat response content as file-like object.
            csv_content = response.text
            csv_file = StringIO(csv_content)

            # Find the row with colum headers.
            csv_reader = csv.reader(csv_file)
            for row_number, row in enumerate(csv_reader):
                row_string = ",".join(row)
                if regex_pattern.search(row_string):
                    #print(f"Skipped {row_number} rows in {url}")
                    break
            else:
                print(f"Error finding regex pattern: \"{regex_pattern.pattern}\" in {url}")
                continue

            # Reset the file pointer get weather station name.
            csv_file.seek(0)
            for row in csv_reader:
                row_string = ",".join(row)
                df_name = row_string.split("Station Name: ")[1].lower()
                break
            else:
                print(f"Could not extract Station Name from {url}")
                df_name = url.split("webdata/")[1].rstrip(".csv")
                continue
            
            # Reset the file pointer and import the data to DataFrame.
            csv_file.seek(0)
            dataframes[df_name] = pd.read_csv(csv_file, skiprows=row_number, index_col=index, usecols=columns, low_memory=False)
            
            # Convert index to datetime.
            dataframes[df_name].index = pd.to_datetime(dataframes[df_name].index, format=date_format)

            # Convert data to numeric.
            for col in numeric_columns:
                dataframes[df_name][col] = pd.to_numeric(dataframes[df_name][col], errors="coerce")
            print(f"Imported {url} as {df_name}")

        except Exception as e:
            print(f"An error occurred while processing {url} ({df_name}): {e}")

    return dataframes


# Call the function.
dataframes = daily_weather()

FileNotFoundError: [Errno 2] No such file or directory: './data/download-file-list-hourly.txt'

In [26]:
dataframes.count()

AttributeError: 'dict' object has no attribute 'count'

## Clean Data

## Look at the historical windspeed data.

* Hourly/Daily/Monthly/Yearly
* Wind ranges for windfarms to operate
* Analysis of time of day/year
* Power usage trends
* Wind generation 
* https://public.tableau.com/views/Electricity-EnhancedMonthlyPanels/6?:language=en-US&:increment_view_count=no&:embed=y&:sid=&:redirect=auth&:embed_code_version=3&:loadOrderID=0&:display_count=y&:tabs=n&:origin=viz_share_link
* https://www.seai.ie/data-and-insights/seai-statistics/monthly-energy-data/electricity-monthly#comp000064913259000000056f1221
* https://www.eirgrid.ie/grid/system-and-renewable-data-reports
- https://cms.eirgrid.ie/sites/default/files/publications/System-Data-Qtr-Hourly-December-2024.xlsx
- https://cms.eirgrid.ie/sites/default/files/publications/System-Data-Qtr-Hourly-2022-2023_0.xlsx
- https://cms.eirgrid.ie/sites/default/files/publications/System-Data-Qtr-Hourly-2020-2021.xlsx
- https://cms.eirgrid.ie/sites/default/files/publications/System-Data-Qtr-Hourly-2018-2019.xlsx
- https://cms.eirgrid.ie/sites/default/files/publications/System-Data-Qtr-Hourly-2016-2017.xlsx
- https://cms.eirgrid.ie/sites/default/files/publications/System-Data-Qtr-Hourly-2014-2015.xlsx


## Predict future windspeed trends.
* KNN
* Decision trees
* Power output of wind farms