In [2]:
import os
import subprocess
import zipfile
import csv
import fileinput
import pandas as pd
import shutil

# Collecting data from IMGW

In [3]:
# Remote listing to mirror and the local working directories used by later cells.
url = 'https://danepubliczne.imgw.pl/data/dane_pomiarowo_obserwacyjne/dane_meteorologiczne/dobowe/synop/'
raw_data_folder = 'imgw_data'
unzipped_folder = "unzipped/"

# Make sure the download target exists before wget writes into it.
if not os.path.exists(raw_data_folder):
    os.makedirs(raw_data_folder)

# Recursively fetch everything below `url`, keeping only .zip files.
wget_command = [
    'wget',
    '-r',                                         # recurse into sub-directories
    '--no-parent',                                # never climb above `url`
    '--no-host-directories',                      # do not create a directory named after the host
    '--cut-dirs=1',                               # drop the first remote path component locally
    '--directory-prefix=' + raw_data_folder,      # save everything under raw_data_folder
    '-A', 'zip',                                  # accept only files ending in "zip"
    '--no-clobber',                               # skip files that were already downloaded
    url,
]

# Left as the final expression so the cell displays wget's exit status.
subprocess.call(wget_command)

0

# Concatenating data about Tarnów from IMGW into a single CSV

In [4]:
# Extract every downloaded zip archive into one flat working folder.

if not os.path.exists(unzipped_folder):
    os.mkdir(unzipped_folder)

for current_dir, _subdirs, names in os.walk(raw_data_folder):
    for name in names:
        archive_path = os.path.join(current_dir, name)

        # Skip anything wget left behind that is not a valid zip archive.
        if not zipfile.is_zipfile(archive_path):
            continue

        with zipfile.ZipFile(archive_path) as archive:
            archive.extractall(unzipped_folder)

In [5]:
# Rewrite each extracted CSV in place so that it keeps only the rows whose
# second column (the station name) is exactly 'TARNÓW'.
# NOTE(review): the files are opened with the platform's default encoding, so the
# comparison with 'TARNÓW' depends on how that encoding decodes the source bytes —
# confirm against the encoding the files are actually published in.

for filename in os.listdir(unzipped_folder):
    if not filename.endswith('.csv'):
        continue

    filepath = os.path.join(unzipped_folder, filename)

    with open(filepath, 'r', newline='') as csvfile:
        # csv.reader yields [] for blank lines; such rows (and any row with fewer
        # than two fields) have no station column and are dropped rather than
        # raising IndexError on row[1].
        rows_to_keep = [
            row for row in csv.reader(csvfile)
            if len(row) > 1 and row[1] == 'TARNÓW'
        ]

    # Files with no matching rows end up empty (0 bytes).
    with open(filepath, 'w', newline='') as csvfile:
        csv.writer(csvfile).writerows(rows_to_keep)

In [6]:
# Delete the CSV files that are zero bytes long.

for entry in os.listdir(unzipped_folder):
    if not entry.endswith('.csv'):
        continue

    candidate = os.path.join(unzipped_folder, entry)
    is_empty = os.path.getsize(candidate) == 0
    if is_empty:
        os.remove(candidate)

In [8]:
# Concatenate every remaining CSV file into a single data.csv.
# The names are sorted so the row order of the output does not depend on the
# arbitrary order in which os.listdir() returns directory entries.
csv_files = sorted(file for file in os.listdir(unzipped_folder) if file.endswith(".csv"))

with open("data.csv", "w", newline='') as outfile:
    writer = csv.writer(outfile)
    for file in csv_files:
        path = os.path.join(unzipped_folder, file)
        # newline='' lets the csv module do its own newline handling, as its
        # documentation requires, and matches how the other cells open CSV files.
        with open(path, "r", newline='') as infile:
            writer.writerows(csv.reader(infile))

In [9]:
# Remove the temporary extraction folder now that data.csv has been written.
# Uses the unzipped_folder setting instead of repeating the literal path, so the
# cleanup always targets the folder the earlier cells extracted into.
shutil.rmtree(unzipped_folder)

In [10]:
# Put the rows of data.csv in chronological order.
# Columns 2, 3 and 4 are compared as integers, in that order.
with open('data.csv', 'r') as source:
    sorted_rows = list(csv.reader(source))

sorted_rows.sort(key=lambda record: tuple(int(record[position]) for position in (2, 3, 4)))

with open('data.csv', 'w', newline='') as target:
    csv.writer(target).writerows(sorted_rows)

In [11]:
# Prepend a header row to data.csv.
# Column names, in file order. Most measurement columns are followed by a
# matching "<NAME>_ST" column.
headers = [
    'CODE', 'NAME', 'YEAR', 'MONTH', 'DAY',
    'TMAX', 'TMAX_ST', 'TMIN', 'TMIN_ST', 'STD', 'STD_ST',
    'TMNG', 'TMNG_ST', 'SMDB', 'SMDB_ST', 'RO',
    'PKSN', 'PKSN_ST', 'RWSN', 'RWSN_ST', 'USL', 'USL_ST',
    'DESZ', 'DESZ_ST', 'SNEG', 'SNEG_ST', 'DISN', 'DISN_ST',
    'GRAD', 'GRAD_ST', 'MGLA', 'MGLA_ST', 'ZMGL', 'ZMGL_ST',
    'SADZ', 'SADZ_ST', 'GOLO', 'GOLO_ST', 'ZMNI', 'ZMNI_ST',
    'ZMWS', 'ZMWS_ST', 'ZMET', 'ZMET_ST',
    'FF10', 'FF10_ST', 'FF15', 'FF15_ST',
    'BRZA', 'BRZA_ST', 'ROSA', 'ROSA_ST', 'SZRO', 'SZRO_ST',
    'DZPS', 'DZPS_ST', 'DZBL', 'DZBL_ST', 'SG',
    'IZD', 'IZD_ST', 'IZG', 'IZG_ST', 'AKTN', 'AKTN_ST',
]

# Load the existing rows first; the file is truncated when reopened for writing.
with open('data.csv', 'r') as source:
    data = [record for record in csv.reader(source)]

with open('data.csv', 'w', newline='') as target:
    csv.writer(target).writerows([headers] + data)

In [12]:
# Drop the first two columns of data.csv (the station code and name).
input_file = 'data.csv'

# Read and trim every row before reopening the file for writing, so the file is
# never rewritten while it is still being parsed.
with open(input_file, 'r', newline='') as csv_file:
    trimmed_rows = [row[2:] for row in csv.reader(csv_file)]

# Write through csv.writer rather than ','.join(...) so a field containing a
# comma or a quote is quoted correctly instead of corrupting the row.
with open(input_file, 'w', newline='') as csv_file:
    csv.writer(csv_file).writerows(trimmed_rows)

Removed the station code and name columns, which are no longer needed.


In [13]:
# Replace the separate YEAR / MONTH / DAY columns with one leading DATE column.
date_parts = ["YEAR", "MONTH", "DAY"]

df = pd.read_csv("data.csv", low_memory=False)
df["DATE"] = pd.to_datetime(df[date_parts])

# DATE goes first; every other column keeps its relative order.
other_columns = [name for name in df.columns if name != "DATE" and name not in date_parts]
df = df[["DATE", *other_columns]]

# Datetime values are written in pandas' default text format.
df.to_csv("data.csv", index=False)