# Tasks for laboratory assignment 1

In [2]:
# imports section

import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
from datetime import datetime
import csv
import numpy as np
import matplotlib.pyplot as plt

## Extract webpage data given the url

Create a Python script that performs basic web scraping on a page to extract all the information into text and returns it as a string.
String should not contain tags.

In [3]:
def parse_web_page(url):
    try:
        # Sending a GET request to fetch the content of the page
        response = requests.get(url)

        # Check if the request was successful (status code 200)
        response.raise_for_status()

        # If the request was successful, return the page content
        return response.text

    except requests.exceptions.HTTPError as err:
        print(f"HTTP error occurred: {err}")
        return None
    except Exception as err:
        print(f"An error occurred: {err}")
        return None


print(parse_web_page('https://fmi.chnu.edu.ua/')[:255])
print(parse_web_page('https://en.wikipedia.org/wiki/Web_scraping')[:255])


<!DOCTYPE html>
<html lang="uk" prefix="og: https://ogp.me/ns#">
<head>
    <meta charset="utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1">
        <title>&#x413;&#x43E;&#x43B;&#x43E;&#x432;&#x43D;&#x430; - &#x424;&#
<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 


## Get data from the API

Create a python script that performs basic request to API endpoint and saves that data to a JSON file `result.json`.

In [4]:
def parse_api(api_url):
    try:
        # Send a GET request to the API endpoint
        response = requests.get(api_url)

        # Raise an exception if the request was unsuccessful
        response.raise_for_status()

        # Parse the JSON response
        data = response.json()

        # Save the data to result.json
        with open('result.json', 'w') as f:
            json.dump(data, f, indent=4)

        print("Data successfully saved to result.json")

    except requests.exceptions.HTTPError as err:
        print(f"HTTP error occurred: {err}")
    except Exception as err:
        print(f"An error occurred: {err}")

# Call the function with the GitHub API URL
parse_api('https://api.github.com/')

Data successfully saved to result.json


## Parse the json file

Parse the `weather.json` file and return weather data for a specific date, that is given as a parameter. Return the data as an array.

In [5]:
def parse_json(target_date):
    try:
        # Load the data from the weather.json file in the './resources' folder
        with open('./resources/weather.json', 'r') as f:
            data = json.load(f)
        
        # Loop through the 'daily' list to find the weather data for the target date
        for record in data['daily']:
            if record['date'] == target_date:
                # Return the weather data for the target date as a list
                return [record]
        
        # If no data is found for the given date, return an empty list
        return []
    
    except FileNotFoundError:
        print("weather.json file not found in './resources'.")
        return []
    except json.JSONDecodeError:
        print("Error decoding JSON from weather.json.")
        return []
    except Exception as e:
        print(f"An error occurred: {e}")
        return []

# Example usage:
target_date = '2024-08-19'
print(parse_json(target_date))


[{'date': '2024-08-19', 'max_temperature': 30.0, 'min_temperature': 21.0, 'precipitation': 5.0, 'wind_speed': 10.0, 'humidity': 70, 'weather_description': 'Light rain'}]


## Parse the csv file

Parse the `weather.csv` file and return weather data for a specific date, that is given as a parameter. Return the data as an array.

In [6]:
def parse_csv(target_date):
    try:
        # Open the CSV file located at './resources/weather.csv'
        with open('./resources/weather.csv', mode='r') as file:
            reader = csv.DictReader(file)  # Read the CSV as dictionaries
            
            # Iterate through each row in the CSV
            for row in reader:
                if row['CET'] == target_date:  # Check if the date matches
                    # Return the weather data for the target date as a list
                    return [row]
        
        # If no matching date is found, return an empty list
        return []
    
    except FileNotFoundError:
        print("weather.csv file not found in './resources'.")
        return []
    except Exception as e:
        print(f"An error occurred: {e}")
        return []

# Example usage:
target_date = '1997-1-2'
print(parse_csv(target_date))

[{'CET': '1997-1-2', 'Max TemperatureC': '7', 'Mean TemperatureC': '3', 'Min TemperatureC': '0', 'Dew PointC': '6', 'MeanDew PointC': '3', 'Min DewpointC': '0', 'Max Humidity': '100', ' Mean Humidity': '92', ' Min Humidity': '71', ' Max Sea Level PressurehPa': '1007', ' Mean Sea Level PressurehPa': '1003', ' Min Sea Level PressurehPa': '997', ' Max VisibilityKm': '10', ' Mean VisibilityKm': '9', ' Min VisibilitykM': '4', ' Max Wind SpeedKm/h': '26', ' Mean Wind SpeedKm/h': '8', ' Max Gust SpeedKm/h': '47', 'Precipitationmm': '0.00', ' CloudCover': '5', ' Events': 'Rain', 'WindDirDegrees': '143'}]


## Visualize data

Visualize the `weather.csv` data using matplotlib. Choose your own approach to data visualization. Save the results (as `.png`, `.webp` files etc., your choise) in this repository. 

In [10]:
import csv
import matplotlib.pyplot as plt
from datetime import datetime

def visualize_data():
    # Read data from weather.csv using the csv module
    dates = []
    max_temps = []
    mean_temps = []
    min_temps = []
    precipitation = []
    max_humidity = []
    mean_humidity = []
    min_humidity = []
    
    with open('./resources/weather.csv', mode='r') as file:
        reader = csv.DictReader(file)  # Automatically handles the header row
        for row in reader:
            # Strip spaces from the column names to avoid KeyError
            row = {key.strip(): value for key, value in row.items()}
            
            # Only append data if all required fields are present and valid
            try:
                # Check if all necessary data exists
                date = row['CET']
                max_temp = row['Max TemperatureC']
                mean_temp = row['Mean TemperatureC']
                min_temp = row['Min TemperatureC']
                precip = row['Precipitationmm']
                max_hum = row['Max Humidity']
                mean_hum = row['Mean Humidity']
                min_hum = row['Min Humidity']

                # Ensure valid values (non-empty, non-None)
                if date and max_temp and mean_temp and min_temp and precip and max_hum and mean_hum and min_hum:
                    dates.append(datetime.strptime(date, '%Y-%m-%d'))
                    max_temps.append(float(max_temp))
                    mean_temps.append(float(mean_temp))
                    min_temps.append(float(min_temp))
                    precipitation.append(float(precip))
                    max_humidity.append(float(max_hum))
                    mean_humidity.append(float(mean_hum) if mean_hum else None)
                    min_humidity.append(float(min_hum))
                else:
                    print(f"Skipping row due to missing data: {row}")
            except (ValueError, KeyError) as e:
                # Handle rows with missing or malformed data
                print(f"Error processing row: {row} | Error: {e}")
                continue
    
    # Check that all lists have the same length
    print(f"Dates: {len(dates)}, Max Temps: {len(max_temps)}, Mean Temps: {len(mean_temps)}")
    
    # Visualization 1: Line plot of Max, Min, and Mean Temperatures over time
    plt.figure(figsize=(10, 6))
    plt.plot(dates, max_temps, label='Max Temperature', color='red')
    plt.plot(dates, mean_temps, label='Mean Temperature', color='blue')
    plt.plot(dates, min_temps, label='Min Temperature', color='green')
    plt.title('Temperature Trends Over Time')
    plt.xlabel('Date')
    plt.ylabel('Temperature (°C)')
    plt.legend()
    plt.xticks(rotation=45)  # Rotate date labels for better readability
    plt.tight_layout()  # Ensure the labels fit within the figure
    plt.savefig('./output/temperature_trends.png')  # Save the plot
    plt.close()  # Close the current plot
    
    # Visualization 2: Bar chart for Precipitation levels
    plt.figure(figsize=(10, 6))
    plt.bar(dates, precipitation, color='skyblue')
    plt.title('Daily Precipitation')
    plt.xlabel('Date')
    plt.ylabel('Precipitation (mm)')
    plt.xticks(rotation=45)  # Rotate date labels for better readability
    plt.tight_layout()  # Ensure the labels fit within the figure
    plt.savefig('./output/precipitation_levels.png')  # Save the plot
    plt.close()  # Close the current plot
    
    # Visualization 3: Line plot of Max, Mean, and Min Humidity over time
    plt.figure(figsize=(10, 6))
    plt.plot(dates, max_humidity, label='Max Humidity', color='orange')
    plt.plot(dates, mean_humidity, label='Mean Humidity', color='purple')
    plt.plot(dates, min_humidity, label='Min Humidity', color='brown')
    plt.title('Humidity Trends Over Time')
    plt.xlabel('Date')
    plt.ylabel('Humidity (%)')
    plt.legend()
    plt.xticks(rotation=45)  # Rotate date labels for better readability
    plt.tight_layout()  # Ensure the labels fit within the figure
    plt.savefig('./output/humidity_trends.png')  # Save the plot
    plt.close()  # Close the current plot
    
    print("Visualizations have been saved as images in the repository.")

# Run the function to generate and save the plots
visualize_data()



Skipping row due to missing data: {'CET': '2000-8-22', 'Max TemperatureC': '27', 'Mean TemperatureC': '', 'Min TemperatureC': '27', 'Dew PointC': '-1', 'MeanDew PointC': '-1', 'Min DewpointC': '-1', 'Max Humidity': '16', 'Mean Humidity': '16', 'Min Humidity': '16', 'Max Sea Level PressurehPa': '1015', 'Mean Sea Level PressurehPa': '1015', 'Min Sea Level PressurehPa': '1015', 'Max VisibilityKm': '', 'Mean VisibilityKm': '', 'Min VisibilitykM': '', 'Max Wind SpeedKm/h': '19', 'Mean Wind SpeedKm/h': '19', 'Max Gust SpeedKm/h': '', 'Precipitationmm': '0.00', 'CloudCover': '', 'Events': '', 'WindDirDegrees': '260'}
Skipping row due to missing data: {'CET': '2001-6-25', 'Max TemperatureC': '', 'Mean TemperatureC': '', 'Min TemperatureC': '', 'Dew PointC': '', 'MeanDew PointC': '', 'Min DewpointC': '', 'Max Humidity': '', 'Mean Humidity': '', 'Min Humidity': '', 'Max Sea Level PressurehPa': '1021', 'Mean Sea Level PressurehPa': '1019', 'Min Sea Level PressurehPa': '1018', 'Max VisibilityKm': 