In [54]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

### All weather station data from 2015 - 2023/09

In [14]:
url = 'http://farmer.iyard.org/date/'

response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all the <a> tags on the page
    all_a_tags = soup.find_all('a')

    # Extract content inside each <a> tag ending with ".zip"
    zip_links = [a_tag['href'] for a_tag in all_a_tags if a_tag.get('href', '').endswith('.zip')]

    # Create a DataFrame
    df = pd.DataFrame(zip_links[:-1], columns=['zip_link'])

    base_url = "http://farmer.iyard.org"
    df['full_link'] = base_url + df['zip_link']

    df.to_csv('./DATA/file_links/weather_links.csv', index = False)
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")


In [None]:
weather_link = pd.read_csv('./DATA/file_links/weather_links.csv')

for index, row in weather_link.iterrows():
    link = row["full_link"]
    response = requests.get(link, stream=True)

    if response.status_code == 200:
        # Extract the filename from the URL
        filename = link.split("/")[-1]

        # Save the content to the file
        with open('./DATA/all_raw_data/' + filename, 'wb') as file:
            for chunk in response.iter_content(chunk_size=128):
                file.write(chunk)

        print(f"Downloaded: {filename}")
    else:
        print(f"Failed to download file from {link}. Status code: {response.status_code}")

### Taipei weather station data from 2015 - 2023/09

In [None]:
taipei_weather_station = pd.read_csv('./DATA/taipei_weather_station.csv')

taipei_weather_station

In [61]:
url = 'http://farmer.iyard.org/station/'

response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all the <a> tags on the page
    filter_values = taipei_weather_station['代號'].to_list()

    # Extract content inside each <a> tag ending with ".zip" and containing the filter values
    zip_links = [a_tag['href'] for a_tag in all_a_tags
                 if a_tag.get('href', '').endswith('.zip') and any(value in a_tag['href'] for value in filter_values)]

    # Create a DataFrame
    df = pd.DataFrame(zip_links, columns=['zip_link'])

    base_url = "http://farmer.iyard.org"
    df['full_link'] = base_url + df['zip_link']

    df.to_csv('./DATA/file_links/taipei_weather_links.csv', index = False)
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")


In [None]:
taipei_weather_link = pd.read_csv('./DATA/file_links/taipei_weather_links.csv')

for index, row in taipei_weather_link.iterrows():
    link = row["full_link"]
    response = requests.get(link, stream=True)

    if response.status_code == 200:
        # Extract the filename from the URL
        filename = link.split("/")[-1]

        # Save the content to the file
        with open('./DATA/taipei_raw_data/' + filename, 'wb') as file:
            for chunk in response.iter_content(chunk_size=128):
                file.write(chunk)

        print(f"Downloaded: {filename}")
    else:
        print(f"Failed to download file from {link}. Status code: {response.status_code}")

## Concatenate all bureau data

In [4]:
import zipfile
import pandas as pd
import datetime as dt

In [None]:
# Specify the list of folders
bureau_folders = ['466920', '466930', '466910', 'CAAH60', '466960', 'A0A460']

# Initialize an empty DataFrame to store the concatenated data
bureau_concatenated_data = pd.DataFrame(columns = list(range(1, 31)))

# Iterate through each folder
for folder_name in bureau_folders:
    # Construct the path to the zip file
    zip_path = f"./DATA/taipei_raw_data/{folder_name}.zip"

    
    # Open the zipped dataset
    with zipfile.ZipFile(zip_path) as z:
        # Find all files in the zip file
        files_in_zip = z.namelist()

        # Iterate through each file in the zip file
        for file_in_zip in files_in_zip:
            # Open the csv file in the dataset
            with z.open(file_in_zip) as f:
                # Read the dataset
                data = pd.read_csv(f, sep=',', encoding='big5', names = list(range(1, 31)))

                # Assuming your date column is at index 5
                data[5] = pd.to_datetime(data[5])
                data[5] = data[5].dt.strftime('%Y-%m-%d %H:%M')

                # Concatenate the data to the main DataFrame
                bureau_concatenated_data = pd.concat([bureau_concatenated_data, data], ignore_index=True)
                print(file_in_zip)

# Save the concatenated data to a new file or perform further analysis
bureau_concatenated_data.to_csv('./DATA/bureau_station_all_data.csv', index=False)


### Add column_name

In [49]:
bureau_column_version_1 = ['lat', 'lon', 'locationName', 'stationId', 'obsTime', '?', 'ELEV', 'WDIR', 'WDSD', 'TEMP',
                           'HUMD', 'PRES', '24R', 'H_FX', 'H_XD', 'H_FXT', 'H_F10', 'H_10D', 'H_F10T', 'CITY', 'CITY_SN',
                           'TOWN', 'TOWN_SN']

bureau_column_version_2 = ['lat', 'lon', 'locationName', 'stationId', 'obsTime', 'ELEV', 'WDIR', 'WDSD', 'TEMP', 'HUMD', 
                           'PRES', '24R', 'H_FX', 'H_XD', 'H_FXT', 'H_F10', 'H_10D', 'H_F10T', 'H_UVI', 'CITY', 'CITY_SN',
                           'TOWN', 'TOWN_SN']

bureau_column_version_3 = ['lat', 'lon', 'locationName', 'stationId', 'obsTime', 'ELEV', 'WDIR', 'WDSD', 'TEMP', 'HUMD', 
                           'PRES', '24R', 'H_FX', 'H_XD', 'H_FXT', 'H_F10', 'H_10D', 'H_F10T', 'H_UVI', 'D_TX', 'D_TXT',
                           'D_TN', 'D_TNT', 'D_TS', 'H_VIS', 'H_Weather', 'CITY', 'CITY_SN', 'TOWN', 'TOWN_SN']

col_wanted = ['lat', 'lon', 'locationName', 'stationId', 'obsTime', 'WDSD', 'TEMP', 'HUMD',
              '24R', 'TOWN']

In [None]:
# part1：2017-6-21 15:10之前
bureau_weather_part1 = bureau_concatenated_data[bureau_concatenated_data[5] < '2017-06-21 15:20']

bureau_weather_part1.dropna(how='all', axis=1, inplace=True)
bureau_weather_part1.columns = bureau_column_version_1
bureau_weather_part1_wanted_columns = bureau_weather_part1[col_wanted]

bureau_weather_part1_wanted_columns.to_csv('./DATA/organized_data/bureau_weather_part1.csv', index = False)

In [None]:
# part2：2019-10-21 10:10之前
bureau_weather_part2 = (
    bureau_concatenated_data[(bureau_concatenated_data[5] > '2017-06-21 15:10') & 
                             (bureau_concatenated_data[5] < '2019-10-21 10:20')]
    )

bureau_weather_part2.dropna(how='all', axis=1, inplace=True)
bureau_weather_part2.columns = bureau_column_version_2
bureau_weather_part2_wanted_columns = bureau_weather_part2[col_wanted]

bureau_weather_part2_wanted_columns

bureau_weather_part2_wanted_columns.to_csv('./DATA/organized_data/bureau_weather_part2.csv', index = False)

In [None]:
# part3：2019-10-21 10:20之後
bureau_weather_part3 = bureau_concatenated_data[bureau_concatenated_data[5] > '2019-10-21 10:10']

bureau_weather_part3.dropna(how='all', axis=1, inplace=True)
bureau_weather_part3.columns = bureau_column_version_3
bureau_weather_part3_wanted_columns = bureau_weather_part3[col_wanted]

bureau_weather_part3_wanted_columns.to_csv('./DATA/organized_data/bureau_weather_part3.csv', index = False)

## Concatenate all auto station data

In [None]:
# Specify the list of folders
auto_folders = ['C0A9E0','C0A9C0','CM0020','C0A980','C0AH40','C0A770','C0A9A0','C0A9F0',
                  'C0AC80','C0AI40','C0A9B0','C0AC40','C0AC70','C0A9G0','C0AH70','CAA040','CAA090']

# Initialize an empty DataFrame to store the concatenated data
auto_concatenated_data = pd.DataFrame(columns = list(range(1, 31)))

# Iterate through each folder
for folder_name in auto_folders:
    # Construct the path to the zip file
    zip_path = f"./DATA/taipei_raw_data/{folder_name}.zip"

    
    # Open the zipped dataset
    with zipfile.ZipFile(zip_path) as z:
        # Find all files in the zip file
        files_in_zip = z.namelist()

        # Iterate through each file in the zip file
        for file_in_zip in files_in_zip:
            # Open the csv file in the dataset
            with z.open(file_in_zip) as f:
                # Read the dataset
                data = pd.read_csv(f, sep=',', encoding='big5', names = list(range(1, 31)))

                # Assuming your date column is at index 5
                data[5] = pd.to_datetime(data[5])
                data[5] = data[5].dt.strftime('%Y-%m-%d %H:%M')

                # Concatenate the data to the main DataFrame
                auto_concatenated_data = pd.concat([auto_concatenated_data, data], ignore_index=True)
                print(file_in_zip)

# Save the concatenated data to a new file or perform further analysis
auto_concatenated_data.to_csv('./DATA/auto_station_all_data.csv', index=False)

### Add column_name

In [1]:
import pandas as pd

In [39]:
auto_column_version_1 = ['lat', 'lon', 'locationName', 'stationId', 'obsTime', 'ELEV', 'WDIR', 'WDSD', 'TEMP',
                         'HUMD', 'PRES', 'SUN', '24R','WS15M','WD15M','WS15T', 'CITY', 'CITY_SN',
                         'TOWN', 'TOWN_SN']

auto_column_version_2 = ['lat', 'lon', 'locationName', 'stationId', 'obsTime', 'ELEV', 'WDIR', 'WDSD', 'TEMP',
                         'HUMD', 'PRES', '24R', 'H_FX', 'H_XD', 'H_FXT', 'D_TX', 'CITY', 'CITY_SN',
                         'TOWN', 'TOWN_SN']

auto_column_version_3 = ['lat', 'lon', 'locationName', 'stationId', 'obsTime', 'ELEV', 'WDIR', 'WDSD', 'TEMP',
                         'HUMD', 'PRES', '24R', 'H_FX', 'H_XD', 'H_FXT', 'D_TX', 'D_TXT', 'D_TN', 'D_TNT', 
                         'CITY', 'CITY_SN', 'TOWN', 'TOWN_SN']

# only for CAA040 & CAA090
auto_column_version_4 = ['lat', 'lon', 'locationName', 'stationId', 'obsTime', 'ELEV', 'WDIR', 'WDSD', 'TEMP', 'HUMD', 
                         'PRES', '24R', 'H_FX', 'H_XD', 'H_FXT', 'H_F10', 'H_10D', 'H_F10T', 'H_UVI', 'D_TX', 'D_TXT',
                         'D_TN', 'D_TNT', 'D_TS', 'H_VIS', 'H_Weather', 'CITY', 'CITY_SN', 'TOWN', 'TOWN_SN']

col_wanted = ['lat', 'lon', 'locationName', 'stationId', 'obsTime', 'WDSD', 'TEMP', 'HUMD',
              '24R', 'TOWN']

exclude_weather_station = ['CAA040','CAA090']

In [None]:
# part1：2019-9-19 11:00之前 and is not 'CAA040','CAA090'
auto_weather_part1 = auto_concatenated_data[
    (auto_concatenated_data[5] < '2019-09-19 11:10') & 
    (~auto_concatenated_data[4].isin(exclude_weather_station))]

auto_weather_part1.dropna(how='all', axis=1, inplace=True)
auto_weather_part1.columns = auto_column_version_1
auto_weather_part1_wanted_columns = auto_weather_part1[col_wanted]

auto_weather_part1_wanted_columns.to_csv('./DATA/organized_data/auto_weather_part1.csv', index = False)

In [None]:
# part2：2019-9-19 11:00之後 and 2019-10-21 10:00之前 and is not 'CAA040','CAA090'
auto_weather_part2 = auto_concatenated_data[
    (auto_concatenated_data[5] > '2019-09-19 11:00') & 
    (auto_concatenated_data[5] < '2019-10-21 10:10') &
    (~auto_concatenated_data[4].isin(exclude_weather_station))]

auto_weather_part2.dropna(how='all', axis=1, inplace=True)
auto_weather_part2.columns = auto_column_version_2
auto_weather_part2_wanted_columns = auto_weather_part2[col_wanted]

auto_weather_part2_wanted_columns.to_csv('./DATA/organized_data/auto_weather_part2.csv', index = False)

In [None]:
# part3：2019-10-21 11:00之後 and is not 'CAA040','CAA090'
auto_weather_part3 = auto_concatenated_data[
    (auto_concatenated_data[5] > '2019-10-21 10:50') & 
    (~auto_concatenated_data[4].isin(exclude_weather_station))]

auto_weather_part3.dropna(how='all', axis=1, inplace=True)
auto_weather_part3.columns = auto_column_version_3
auto_weather_part3_wanted_columns = auto_weather_part3[col_wanted]

auto_weather_part3_wanted_columns.to_csv('./DATA/organized_data/auto_weather_part3.csv', index = False)

In [None]:
# part4：'CAA040','CAA090'
auto_weather_part4 = auto_concatenated_data[
    auto_concatenated_data[4].isin(exclude_weather_station)]

auto_weather_part4.dropna(how='all', axis=1, inplace=True)
auto_weather_part4.columns = auto_column_version_4
auto_weather_part4_wanted_columns = auto_weather_part4[col_wanted]

auto_weather_part4_wanted_columns.to_csv('./DATA/organized_data/auto_weather_part4.csv', index = False)

In [44]:
# part5：is'CAA040','CAA090' but only contains 23 columns >> auto_column_version_3
auto_weather_part5 = auto_concatenated_data[
    auto_concatenated_data[4].isin(exclude_weather_station)]

auto_weather_part5 = auto_weather_part5[auto_weather_part5.isnull().any(axis=1)]
auto_weather_part5.dropna(how='all', axis=1, inplace=True)
auto_weather_part5.columns = auto_column_version_3
auto_weather_part5_wanted_columns = auto_weather_part5[col_wanted]

auto_weather_part5_wanted_columns.to_csv('./DATA/organized_data/auto_weather_part5.csv', index = False)

### Concatenate bureau & auto station data

In [46]:
import pandas as pd
from glob import glob

In [50]:
def concatenate_data(folder_path, file_extension='csv'):
    global concatenated_data

    # Get a list of all files in the folder with the specified extension
    file_pattern = f"{folder_path}/*.{file_extension}"
    file_list = glob(file_pattern)

    concatenated_organized_data = pd.DataFrame(columns = col_wanted )

    # Loop through each file and concatenate its data to the global DataFrame
    for file_path in file_list:
        current_data = pd.read_csv(file_path)  # Assuming the files are in CSV format, adjust accordingly
        concatenated_organized_data = pd.concat([concatenated_organized_data, current_data], ignore_index=True)

    return concatenated_organized_data

# Call the function with the folder path


# Now, concatenated_data contains all the data from the files in the folder


In [59]:
folder_path = './DATA/organized_data'

concatenated_organized_data = concatenate_data(folder_path)

sorted_data = concatenated_organized_data.sort_values(by=['stationId', 'obsTime'])

sorted_data.to_csv('./DATA/all_weather_data_sort_by_station_time.csv', index = False)

  current_data = pd.read_csv(file_path)  # Assuming the files are in CSV format, adjust accordingly
  current_data = pd.read_csv(file_path)  # Assuming the files are in CSV format, adjust accordingly
