# Install and Import Dependencies

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import time
import os
import shutil
import zipfile

import pandas as pd

# Load data

The dataset for this project will be obtained through web scraping from the Mendeley Data website.

## Web Crawling settings

In [None]:
# Configure Selenium to run
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")

# Initialize the browser
driver = webdriver.Chrome(options=options)

# Access Mendeley Data page
url = "https://data.mendeley.com/datasets/hfhwmn8w24/3"
driver.get(url)

# Accepting cookies
try:
    wait = WebDriverWait(driver, 10)  # 10 seconds sleep
    accept_cookies = wait.until(EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler")))
    accept_cookies.click()
    print("Cookies accepted.")
except Exception as e:
    print("No cookie popup found or error while accepting cookies:", e)

# Waiting for Download button to become available
download_button = wait.until(EC.presence_of_element_located((By.ID, "download-all")))
download_button.click()

print("Click on Download button was successful.")

# Donwloading dataset
time.sleep(5)

# Closing the browser
driver.quit()

Cookies accepted.
Click on Download button was successful.


## Adjusting dataset directory

In [4]:
# Define the source and target directories
source_folder = os.path.join(os.path.expanduser("~"), "Downloads")
target_folder = r"C:\\Users\\guilh\\OneDrive\\Documents\\GitHub\\ml-regression-models\\solar-energy\\dataset"
target_file_name = "data.csv"

# Find the downloaded zip file by its original name
original_file_name = "hfhwmn8w24-3.zip"
source_file_path = os.path.join(source_folder, original_file_name)
unzipped_file_name = "Pasion et al dataset.csv"

# Check if the source zip file exists
if os.path.exists(source_file_path):
    # Ensure the target directory exists
    os.makedirs(target_folder, exist_ok=True)
    
    # Unzip the file
    with zipfile.ZipFile(source_file_path, 'r') as zip_ref:
        zip_ref.extractall(target_folder)

    # Rename the extracted file to the target name
    extracted_file_path = os.path.join(target_folder, unzipped_file_name)
    final_file_path = os.path.join(target_folder, target_file_name)
    if os.path.exists(extracted_file_path):
        os.rename(extracted_file_path, final_file_path)
        print(f"File unzipped and renamed to '{final_file_path}'.")
    else:
        print(f"Extracted file '{unzipped_file_name}' not found in the target folder.")

    # Optionally remove the original zip file
    os.remove(source_file_path)
    print(f"Original zip file '{source_file_path}' removed.")
else:
    print(f"Zip file '{original_file_name}' not found in the Downloads folder.")

File unzipped and renamed to 'C:\\Users\\guilh\\OneDrive\\Documents\\GitHub\\ml-regression-models\\solar-energy\\dataset\data.csv'.
Original zip file 'C:\Users\guilh\Downloads\hfhwmn8w24-3.zip' removed.


In [5]:
df = pd.read_csv('../solar-energy/dataset/data.csv', sep=',')
df.shape

(21045, 17)

# Descriptive Analysis

In [7]:
df.head()

Unnamed: 0,Location,Date,Time,Latitude,Longitude,Altitude,YRMODAHRMI,Month,Hour,Season,Humidity,AmbientTemp,PolyPwr,Wind.Speed,Visibility,Pressure,Cloud.Ceiling
0,Camp Murray,20171203,1145,47.11,-122.57,84,201712000000.0,12,11,Winter,81.71997,12.86919,2.42769,5,10.0,1010.6,722
1,Camp Murray,20171203,1315,47.11,-122.57,84,201712000000.0,12,13,Winter,96.64917,9.66415,2.46273,0,10.0,1011.3,23
2,Camp Murray,20171203,1330,47.11,-122.57,84,201712000000.0,12,13,Winter,93.61572,15.44983,4.46836,5,10.0,1011.6,32
3,Camp Murray,20171204,1230,47.11,-122.57,84,201712000000.0,12,12,Winter,77.21558,10.36659,1.65364,5,2.0,1024.4,6
4,Camp Murray,20171204,1415,47.11,-122.57,84,201712000000.0,12,14,Winter,54.80347,16.85471,6.57939,3,3.0,1023.7,9
