In [20]:
import csv
from calendar import monthrange

In [21]:
# Source text dump and destination path for the tidied CSV output.
input_file = 'data/weather-2016/IAD-weather-2016'
output_file = 'data/weather-2016/IAD-weather-2016-cleaned'

# Empty accumulator; not referenced by the parsing/writing cells shown in this notebook.
cleaned_data = list()

In [22]:
# Month abbreviation -> month number (1..12), in calendar order.
months = dict(zip(
    ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
     "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"),
    range(1, 13),
))

# Output columns: one independent list per CSV column, filled by the parsing cell.
month, days = [], []
temperature_max, temperature_avg, temperature_min = [], [], []
dew_point_max, dew_point_avg, dew_point_min = [], [], []
humidity_max, humidity_avg, humidity_min = [], [], []
wind_speed_max, wind_speed_avg, wind_speed_min = [], [], []
pressure_max, pressure_avg, pressure_min = [], [], []
precipitation = []

# Parser state.
current_section = current_month = None  # nothing seen yet
days_in_month = 31                      # placeholder until a month line is parsed
current_section_n = -1                  # index into sections_n; -1 = no Max/Avg/Min section selected

# Section title -> destination list(s). Every section maps to a (max, avg, min)
# tuple of lists, except precipitation, which maps to a single bare list.
sections = {
    "Temperature (C)": (temperature_max, temperature_avg, temperature_min),
    "Dew Point (C)": (dew_point_max, dew_point_avg, dew_point_min),
    "Humidity (%)": (humidity_max, humidity_avg, humidity_min),
    "Wind Speed (km/h)": (wind_speed_max, wind_speed_avg, wind_speed_min),
    "Pressure (hPa)": (pressure_max, pressure_avg, pressure_min),
    "Precipitation (mm)": precipitation
}

# [position, title] pairs in the insertion order of `sections`.
sections_n = [[position, title] for position, title in enumerate(sections)]

In [23]:
sections_n

[[0, 'Temperature (C)'],
 [1, 'Dew Point (C)'],
 [2, 'Humidity (%)'],
 [3, 'Wind Speed (km/h)'],
 [4, 'Pressure (hPa)'],
 [5, 'Precipitation (mm)']]

In [24]:
def get_days_in_month(month_name, year):
    """Return how many days the given month has in the given year.

    month_name is looked up in the module-level ``months`` mapping, so a name
    that is not a key there raises KeyError.
    """
    # monthrange returns (weekday of the 1st, number of days); only the latter is needed.
    _first_weekday, day_count = monthrange(year, months[month_name])
    return day_count

# Read the input file and parse the data.
#
# Line kinds recognised:
#   * a month abbreviation (a key of `months`) starts a new month and extends
#     the `month` / `days` columns; the lines that follow are skipped until the
#     first section header;
#   * a "Max Avg Min" header (any whitespace between the words) selects the
#     next Max/Avg/Min section, in the order given by `sections_n`;
#   * "Total" selects the precipitation section and rewinds the section index;
#   * any other line is a data row for the currently selected section.

# Start from a clean slate so that re-running this cell does not append a
# second copy of every row to the module-level column lists.
for column in (month, days):
    column.clear()
for target in sections.values():
    # Sections map to a (max, avg, min) tuple of lists, except precipitation (a bare list).
    for column in (target if isinstance(target, tuple) else (target,)):
        column.clear()
current_section = None
current_month = None
current_section_n = -1

with open(input_file, 'r') as infile:
    for line_number, line in enumerate(infile, start=1):
        line = line.strip()

        # Blank lines carry no data; without this guard the empty field list
        # below would raise IndexError on fields[0].
        if not line:
            continue

        # Detect the month name (e.g., Jan, Feb, etc.)
        if line in months:
            current_month = line
            # Fixed year 2016, matching the year in the input file name.
            days_in_month = get_days_in_month(current_month, 2016)
            month += [str(months[current_month])] * days_in_month
            days += [str(day) for day in range(1, days_in_month + 1)]
            # Set the current section to an empty string to skip day rows.
            current_section = ''
            continue

        fields = line.split()

        # Detect the "Max Avg Min" header regardless of tab/space separators.
        if fields == ["Max", "Avg", "Min"]:
            current_section_n += 1
            current_section = sections_n[current_section_n][1]
            continue

        if line == "Total":
            current_section = 'Precipitation (mm)'
            current_section_n = -1
            continue

        # No section selected yet (before the first month, or in the day rows).
        if not current_section:
            continue

        # Add data to the appropriate section
        if current_section == "Precipitation (mm)":
            # Precipitation has only one value per line
            precipitation.append(fields[0])
        else:
            # Validate before appending so a short row cannot leave the
            # max/avg/min columns with different lengths.
            if len(fields) < 3:
                raise ValueError(
                    f"{input_file}, line {line_number}: expected Max, Avg and Min "
                    f"values in section {current_section!r}, got {line!r}"
                )
            max_column, avg_column, min_column = sections[current_section]
            max_column.append(fields[0])
            avg_column.append(fields[1])
            min_column.append(fields[2])

In [25]:
# Write data to CSV: one header row, then one row per entry of `month`.
csv_header = [
    "Month", "Day", "Temperature Max (C)", "Temperature Avg (C)", "Temperature Min (C)",
    "Dew Point Max (C)", "Dew Point Avg (C)", "Dew Point Min (C)",
    "Humidity Max (%)", "Humidity Avg (%)", "Humidity Min (%)",
    "Wind Speed Max (km/h)", "Wind Speed Avg (km/h)", "Wind Speed Min (km/h)",
    "Pressure Max (hPa)", "Pressure Avg (hPa)", "Pressure Min (hPa)",
    "Precipitation (mm)"
]

# Column lists in exactly the same order as csv_header.
csv_columns = [
    month, days,
    temperature_max, temperature_avg, temperature_min,
    dew_point_max, dew_point_avg, dew_point_min,
    humidity_max, humidity_avg, humidity_min,
    wind_speed_max, wind_speed_avg, wind_speed_min,
    pressure_max, pressure_avg, pressure_min,
    precipitation,
]

# newline='' lets the csv module control line endings itself.
with open(output_file, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(csv_header)

    # Any column shorter than `month` yields an empty cell for the missing
    # rows. Previously only precipitation was guarded this way; a short value
    # in any other column raised IndexError mid-write and left a truncated file.
    for i in range(len(month)):
        csvwriter.writerow([
            column[i] if i < len(column) else ''
            for column in csv_columns
        ])