Initial commit of data and code for weather history visualization
rhiever committed Jul 21, 2015
1 parent d49c751 commit 83bac8d
Showing 13 changed files with 3,928 additions and 0 deletions.
366 changes: 366 additions & 0 deletions us-weather-history/KCLT.csv

Large diffs are not rendered by default.

366 changes: 366 additions & 0 deletions us-weather-history/KCQT.csv

Large diffs are not rendered by default.

366 changes: 366 additions & 0 deletions us-weather-history/KHOU.csv

Large diffs are not rendered by default.

366 changes: 366 additions & 0 deletions us-weather-history/KIND.csv

Large diffs are not rendered by default.

366 changes: 366 additions & 0 deletions us-weather-history/KJAX.csv

Large diffs are not rendered by default.

366 changes: 366 additions & 0 deletions us-weather-history/KMDW.csv

Large diffs are not rendered by default.

366 changes: 366 additions & 0 deletions us-weather-history/KNYC.csv

Large diffs are not rendered by default.

366 changes: 366 additions & 0 deletions us-weather-history/KPHL.csv

Large diffs are not rendered by default.

366 changes: 366 additions & 0 deletions us-weather-history/KPHX.csv

Large diffs are not rendered by default.

366 changes: 366 additions & 0 deletions us-weather-history/KSEA.csv

Large diffs are not rendered by default.
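Each of the ten station CSVs above holds one header line plus one row per day for July 1, 2014 through June 30, 2015, with the columns written out by wunderground_parser.py further down. A quick sanity check, assuming pandas is installed and the paths in this commit:

import pandas as pd

# Columns match the header line written by wunderground_parser.py
kclt = pd.read_csv('us-weather-history/KCLT.csv', parse_dates=['date'])
print(kclt.columns.tolist())
# ['date', 'actual_mean_temp', 'actual_min_temp', 'actual_max_temp',
#  'average_min_temp', 'average_max_temp', 'record_min_temp',
#  'record_max_temp', 'record_min_temp_year', 'record_max_temp_year',
#  'actual_precipitation', 'average_precipitation', 'record_precipitation']
print(len(kclt))  # 365 data rows, one per day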

106 changes: 106 additions & 0 deletions us-weather-history/visualize_weather.py
@@ -0,0 +1,106 @@
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime

'''
This is an example to generate the Philadelphia, PA weather chart.
If you want to make the chart for another city, you will have to modify
this code slightly to read that city's data in, change the title, and
likely change the y-axis of the chart to fit your city's temperature range.

I also use a custom matplotlib style as the basis for these charts, which you
can find here: https://gist.githubusercontent.com/rhiever/d0a7332fe0beebfdc3d5/raw/223d70799b48131d5ce2723cd5784f39d7a3a653/tableau10.mplstyle
'''

weather_data = pd.read_csv('KPHL.csv', parse_dates=['date'])
print(weather_data.describe())

# Generate a histogram of every column to make sure that all of the data
# falls in the expected ranges.
with plt.style.context('https://gist.githubusercontent.com/rhiever/d0a7332fe0beebfdc3d5/raw/223d70799b48131d5ce2723cd5784f39d7a3a653/tableau10.mplstyle'):
    for column in weather_data.columns:
        if column in ['date']:
            continue
        plt.figure()
        plt.hist(weather_data[column].values)
        plt.title(column)
        plt.savefig('{}.png'.format(column))

    # Make sure we're only plotting temperatures for July 2014 - June 2015
    weather_data_subset = weather_data[weather_data['date'] >= datetime(year=2014, month=7, day=1)]
    weather_data_subset = weather_data_subset[weather_data_subset['date'] < datetime(year=2015, month=7, day=1)].copy()
    weather_data_subset['day_order'] = range(len(weather_data_subset))

    day_order = weather_data_subset['day_order']
    record_max_temps = weather_data_subset['record_max_temp'].values
    record_min_temps = weather_data_subset['record_min_temp'].values
    average_max_temps = weather_data_subset['average_max_temp'].values
    average_min_temps = weather_data_subset['average_min_temp'].values
    actual_max_temps = weather_data_subset['actual_max_temp'].values
    actual_min_temps = weather_data_subset['actual_min_temp'].values

    fig, ax1 = plt.subplots(figsize=(15, 7))

    # Create the bars showing all-time record highs and lows
    plt.bar(day_order, record_max_temps - record_min_temps, bottom=record_min_temps,
            edgecolor='none', color='#C3BBA4', width=1)

    # Create the bars showing average highs and lows
    plt.bar(day_order, average_max_temps - average_min_temps, bottom=average_min_temps,
            edgecolor='none', color='#9A9180', width=1)

    # Create the bars showing this year's highs and lows
    plt.bar(day_order, actual_max_temps - actual_min_temps, bottom=actual_min_temps,
            edgecolor='black', linewidth=0.5, color='#5A3B49', width=1)

    # Find the days that tied or set a new record high or low this year
    new_max_records = weather_data_subset[weather_data_subset.record_max_temp <= weather_data_subset.actual_max_temp]
    new_min_records = weather_data_subset[weather_data_subset.record_min_temp >= weather_data_subset.actual_min_temp]

    # Create the dots marking record highs and lows for the year
    plt.scatter(new_max_records['day_order'].values + 0.5,
                new_max_records['actual_max_temp'].values + 0.75,
                s=15, zorder=10, color='#d62728', alpha=0.75, linewidth=0)

    plt.scatter(new_min_records['day_order'].values + 0.5,
                new_min_records['actual_min_temp'].values - 0.75,
                s=15, zorder=10, color='#1f77b4', alpha=0.75, linewidth=0)

    plt.ylim(-15, 111)
    plt.xlim(-5, 370)

    plt.yticks(range(-10, 111, 10), [r'{}$^\circ$'.format(x)
                                     for x in range(-10, 111, 10)], fontsize=10)
    plt.ylabel(r'Temperature ($^\circ$F)', fontsize=12)

    # Label the x-axis at the first day of each month
    month_beginning_df = weather_data_subset[weather_data_subset['date'].apply(lambda x: x.day == 1)]
    month_beginning_indeces = list(month_beginning_df['day_order'].values)
    month_beginning_names = list(month_beginning_df['date'].apply(lambda x: x.strftime('%B')).values)
    month_beginning_names[0] += '\n\'14'
    month_beginning_names[6] += '\n\'15'

    # Add the last month label manually
    month_beginning_indeces += [weather_data_subset['day_order'].values[-1]]
    month_beginning_names += ['July']

    plt.xticks(month_beginning_indeces,
               month_beginning_names,
               fontsize=10)

    # Mirror the month labels along the top of the chart
    ax2 = ax1.twiny()
    plt.xticks(month_beginning_indeces,
               month_beginning_names,
               fontsize=10)

    plt.xlim(-5, 370)
    plt.grid(False)

    # Mirror the temperature scale along the right side of the chart
    ax3 = ax1.twinx()
    plt.yticks(range(-10, 111, 10), [r'{}$^\circ$'.format(x)
                                     for x in range(-10, 111, 10)], fontsize=10)
    plt.ylim(-15, 111)
    plt.grid(False)

    plt.title('Philadelphia, PA\'s weather, July 2014 - June 2015\n\n', fontsize=20)

    plt.savefig('philadelphia-weather-july14-june15.png')
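As the docstring notes, porting the chart to another city means swapping the input CSV, the title, and the y-axis range. A minimal sketch of folding those three edits into one parameterized helper; the plot_city function and the Seattle y-limits below are illustrative assumptions, not part of this commit, and the record-day dots and mirrored axes are omitted for brevity:

import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime


def plot_city(csv_file, city_label, out_file, y_min=-15, y_max=111):
    # Read the station's CSV and keep only July 2014 - June 2015
    data = pd.read_csv(csv_file, parse_dates=['date'])
    data = data[(data['date'] >= datetime(2014, 7, 1)) &
                (data['date'] < datetime(2015, 7, 1))].copy()
    data['day_order'] = range(len(data))

    plt.figure(figsize=(15, 7))
    # The same three stacked bar layers as above: records, averages, actuals
    plt.bar(data['day_order'], data['record_max_temp'] - data['record_min_temp'],
            bottom=data['record_min_temp'], edgecolor='none', color='#C3BBA4', width=1)
    plt.bar(data['day_order'], data['average_max_temp'] - data['average_min_temp'],
            bottom=data['average_min_temp'], edgecolor='none', color='#9A9180', width=1)
    plt.bar(data['day_order'], data['actual_max_temp'] - data['actual_min_temp'],
            bottom=data['actual_min_temp'], edgecolor='black', linewidth=0.5,
            color='#5A3B49', width=1)

    plt.ylim(y_min, y_max)
    plt.title('{}\'s weather, July 2014 - June 2015'.format(city_label), fontsize=20)
    plt.savefig(out_file)


# Seattle stays in a narrower temperature band than Philadelphia
plot_city('KSEA.csv', 'Seattle, WA', 'seattle-weather-july14-june15.png',
          y_min=0, y_max=101)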
109 changes: 109 additions & 0 deletions us-weather-history/wunderground_parser.py
@@ -0,0 +1,109 @@
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
from urllib.request import urlopen


def parse_pages(station):
    '''
    This function parses the web pages downloaded from wunderground.com
    into a flat CSV file for the station you provide it.

    Make sure to run the wunderground scraper first so you have the web
    pages downloaded.
    '''

    # Parse between July 1, 2014 and July 1, 2015
    # You can change the dates here if you prefer to parse a different range
    current_date = datetime(year=2014, month=7, day=1)
    end_date = datetime(year=2015, month=7, day=1)

    with open('{}.csv'.format(station), 'w') as out_file:
        out_file.write('date,actual_mean_temp,actual_min_temp,actual_max_temp,'
                       'average_min_temp,average_max_temp,'
                       'record_min_temp,record_max_temp,'
                       'record_min_temp_year,record_max_temp_year,'
                       'actual_precipitation,average_precipitation,'
                       'record_precipitation\n')

        while current_date != end_date:
            try_again = False
            with open('{}/{}-{}-{}.html'.format(station,
                                                current_date.year,
                                                current_date.month,
                                                current_date.day)) as in_file:
                soup = BeautifulSoup(in_file.read(), 'html.parser')

                weather_data = soup.find(id='historyTable').find_all('span', class_='wx-value')
                weather_data_units = soup.find(id='historyTable').find_all('td')

                try:
                    actual_mean_temp = weather_data[0].text
                    actual_max_temp = weather_data[2].text
                    average_max_temp = weather_data[3].text
                    record_max_temp = weather_data[4].text
                    actual_min_temp = weather_data[5].text
                    average_min_temp = weather_data[6].text
                    record_min_temp = weather_data[7].text
                    record_max_temp_year = weather_data_units[9].text.split('(')[-1].strip(')')
                    record_min_temp_year = weather_data_units[13].text.split('(')[-1].strip(')')

                    actual_precipitation = weather_data[9].text
                    # Trace amounts of precipitation are recorded as 'T'
                    if actual_precipitation == 'T':
                        actual_precipitation = '0.0'
                    average_precipitation = weather_data[10].text
                    record_precipitation = weather_data[11].text

                    # Verify that the parsed data is valid: record temperatures must
                    # bound the actual and average temperatures, and no precipitation
                    # value may exceed the record precipitation.
                    if (record_max_temp_year == '-1' or record_min_temp_year == '-1' or
                            int(record_max_temp) < max(int(actual_max_temp), int(average_max_temp)) or
                            int(record_min_temp) > min(int(actual_min_temp), int(average_min_temp)) or
                            float(actual_precipitation) > float(record_precipitation) or
                            float(average_precipitation) > float(record_precipitation)):
                        raise Exception('Parsed data failed the sanity checks')

                    out_file.write('{}-{}-{},'.format(current_date.year, current_date.month, current_date.day))
                    out_file.write(','.join([actual_mean_temp, actual_min_temp, actual_max_temp,
                                             average_min_temp, average_max_temp,
                                             record_min_temp, record_max_temp,
                                             record_min_temp_year, record_max_temp_year,
                                             actual_precipitation, average_precipitation,
                                             record_precipitation]))
                    out_file.write('\n')
                    current_date += timedelta(days=1)
                except Exception:
                    # If the web page is formatted improperly, signal that the page
                    # may need to be downloaded again.
                    try_again = True

            # If the web page needs to be downloaded again, re-download it from
            # wunderground.com. Because current_date was not advanced, the loop
            # will retry the same date on the next pass.

            # If the parser gets stuck on a certain date, you may need to investigate
            # the page to find out what is going on. Sometimes data is missing, in
            # which case the parser will get stuck. You can manually put in the data
            # yourself in that case, or just tell the parser to skip this day.
            if try_again:
                print('Error with date {}'.format(current_date))

                lookup_URL = 'http://www.wunderground.com/history/airport/{}/{}/{}/{}/DailyHistory.html'
                formatted_lookup_URL = lookup_URL.format(station,
                                                         current_date.year,
                                                         current_date.month,
                                                         current_date.day)
                html = urlopen(formatted_lookup_URL).read().decode('utf-8')

                page_file_name = '{}/{}-{}-{}.html'.format(station,
                                                           current_date.year,
                                                           current_date.month,
                                                           current_date.day)

                # Use a distinct handle name so the CSV's out_file above is not
                # shadowed (and left closed) by this block
                with open(page_file_name, 'w') as page_file:
                    page_file.write(html)


# Parse the stations used in this article
for station in ['KCLT', 'KCQT', 'KHOU', 'KIND', 'KJAX',
                'KMDW', 'KNYC', 'KPHL', 'KPHX', 'KSEA']:
    parse_pages(station)
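The validation block above encodes the invariants every finished row should satisfy: the record temperatures bound both the actual and average values, and nothing exceeds the record precipitation. A minimal sketch of re-checking a finished CSV in bulk with pandas; the check_csv helper is a hypothetical addition, not part of this commit:

import pandas as pd


def check_csv(csv_file):
    # Re-apply the parser's sanity checks to every row of a finished CSV
    data = pd.read_csv(csv_file, parse_dates=['date'])
    bad_rows = data[
        (data['record_max_temp'] < data[['actual_max_temp', 'average_max_temp']].max(axis=1)) |
        (data['record_min_temp'] > data[['actual_min_temp', 'average_min_temp']].min(axis=1)) |
        (data['actual_precipitation'] > data['record_precipitation']) |
        (data['average_precipitation'] > data['record_precipitation'])
    ]
    print('{}: {} rows, {} failed the sanity checks'.format(
        csv_file, len(data), len(bad_rows)))
    return bad_rows


check_csv('KPHL.csv')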
53 changes: 53 additions & 0 deletions us-weather-history/wunderground_scraper.py
@@ -0,0 +1,53 @@
# coding: utf-8

from datetime import datetime, timedelta
from urllib.request import urlopen
import os


def scrape_station(station):
    '''
    This function scrapes the weather data web pages from wunderground.com
    for the station you provide it.

    You can look up your city's weather station by performing a search for
    it on wunderground.com, then clicking on the "History" section.
    The 4-letter name of the station will appear on that page.
    '''

    # Scrape between July 1, 2014 and July 1, 2015
    # You can change the dates here if you prefer to scrape a different range
    current_date = datetime(year=2014, month=7, day=1)
    end_date = datetime(year=2015, month=7, day=1)

    # Make sure a directory exists for the station web pages
    os.makedirs(station, exist_ok=True)

    # Use .format(station, YYYY, M, D)
    lookup_URL = 'http://www.wunderground.com/history/airport/{}/{}/{}/{}/DailyHistory.html'

    while current_date != end_date:

        # Print the date at the start of each month to track progress
        if current_date.day == 1:
            print(current_date)

        formatted_lookup_URL = lookup_URL.format(station,
                                                 current_date.year,
                                                 current_date.month,
                                                 current_date.day)
        html = urlopen(formatted_lookup_URL).read().decode('utf-8')

        out_file_name = '{}/{}-{}-{}.html'.format(station, current_date.year,
                                                  current_date.month,
                                                  current_date.day)

        with open(out_file_name, 'w') as out_file:
            out_file.write(html)

        current_date += timedelta(days=1)


# Scrape the stations used in this article
for station in ['KCLT', 'KCQT', 'KHOU', 'KIND', 'KJAX',
                'KMDW', 'KNYC', 'KPHL', 'KPHX', 'KSEA']:
    scrape_station(station)
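scrape_station hard-codes the article's date window. A minimal variation, assuming you want a different station or range, that takes the dates as parameters and pauses briefly between requests; the scrape_station_range name, the one-second delay, and the KBOS example are illustrative assumptions, not part of this commit:

import os
import time
from datetime import datetime, timedelta
from urllib.request import urlopen


def scrape_station_range(station, start, end, delay_seconds=1):
    # The same download loop as scrape_station(), with the date range
    # passed in and a courtesy delay between requests
    os.makedirs(station, exist_ok=True)
    lookup_URL = 'http://www.wunderground.com/history/airport/{}/{}/{}/{}/DailyHistory.html'

    current_date = start
    while current_date != end:
        html = urlopen(lookup_URL.format(station, current_date.year,
                                         current_date.month,
                                         current_date.day)).read().decode('utf-8')
        with open('{}/{}-{}-{}.html'.format(station, current_date.year,
                                            current_date.month,
                                            current_date.day), 'w') as out_file:
            out_file.write(html)
        time.sleep(delay_seconds)  # don't hammer the server
        current_date += timedelta(days=1)


# For example, Boston's Logan Airport station over the same window:
scrape_station_range('KBOS', datetime(2014, 7, 1), datetime(2015, 7, 1))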
