Initial commit of data and code for weather history visualization
rhiever committed Jul 21, 2015
1 parent d49c751 commit 83bac8d
Showing 13 changed files with 3,928 additions and 0 deletions.
366 changes: 366 additions & 0 deletions us-weather-history/KCLT.csv

Large diffs are not rendered by default.

366 changes: 366 additions & 0 deletions us-weather-history/KCQT.csv

Large diffs are not rendered by default.

366 changes: 366 additions & 0 deletions us-weather-history/KHOU.csv

Large diffs are not rendered by default.

366 changes: 366 additions & 0 deletions us-weather-history/KIND.csv

Large diffs are not rendered by default.

366 changes: 366 additions & 0 deletions us-weather-history/KJAX.csv

Large diffs are not rendered by default.

366 changes: 366 additions & 0 deletions us-weather-history/KMDW.csv

Large diffs are not rendered by default.

366 changes: 366 additions & 0 deletions us-weather-history/KNYC.csv

Large diffs are not rendered by default.

366 changes: 366 additions & 0 deletions us-weather-history/KPHL.csv

Large diffs are not rendered by default.

366 changes: 366 additions & 0 deletions us-weather-history/KPHX.csv

Large diffs are not rendered by default.

366 changes: 366 additions & 0 deletions us-weather-history/KSEA.csv

Large diffs are not rendered by default.
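Each of the ten station CSVs above holds one header line plus one row per day for July 1, 2014 through June 30, 2015, with the columns written out by wunderground_parser.py further down. A quick sanity check, assuming pandas is installed and the paths in this commit:

import pandas as pd

# Columns match the header line written by wunderground_parser.py
kclt = pd.read_csv('us-weather-history/KCLT.csv', parse_dates=['date'])
print(kclt.columns.tolist())
# ['date', 'actual_mean_temp', 'actual_min_temp', 'actual_max_temp',
#  'average_min_temp', 'average_max_temp', 'record_min_temp',
#  'record_max_temp', 'record_min_temp_year', 'record_max_temp_year',
#  'actual_precipitation', 'average_precipitation', 'record_precipitation']
print(len(kclt))  # 365 data rows, one per day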

106 changes: 106 additions & 0 deletions us-weather-history/visualize_weather.py
@@ -0,0 +1,106 @@
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime

'''
This is an example to generate the Philadelphia, PA weather chart.
If you want to make the chart for another city, you will have to modify
this code slightly to read that city's data in, change the title, and
likely change the y-axis of the chart to fit your city's temperature range.

I also use a custom matplotlib style as the basis for these charts, which you
can find here: https://gist.githubusercontent.com/rhiever/d0a7332fe0beebfdc3d5/raw/223d70799b48131d5ce2723cd5784f39d7a3a653/tableau10.mplstyle
'''

weather_data = pd.read_csv('KPHL.csv', parse_dates=['date'])
print(weather_data.describe())

# Generate a histogram of every column to make sure that all of the data
# falls in the expected ranges.
with plt.style.context('https://gist.githubusercontent.com/rhiever/d0a7332fe0beebfdc3d5/raw/223d70799b48131d5ce2723cd5784f39d7a3a653/tableau10.mplstyle'):
    for column in weather_data.columns:
        if column in ['date']:
            continue
        plt.figure()
        plt.hist(weather_data[column].values)
        plt.title(column)
        plt.savefig('{}.png'.format(column))

    # Make sure we're only plotting temperatures for July 2014 - June 2015
    weather_data_subset = weather_data[weather_data['date'] >= datetime(year=2014, month=7, day=1)]
    weather_data_subset = weather_data_subset[weather_data_subset['date'] < datetime(year=2015, month=7, day=1)].copy()
    weather_data_subset['day_order'] = range(len(weather_data_subset))

    day_order = weather_data_subset['day_order']
    record_max_temps = weather_data_subset['record_max_temp'].values
    record_min_temps = weather_data_subset['record_min_temp'].values
    average_max_temps = weather_data_subset['average_max_temp'].values
    average_min_temps = weather_data_subset['average_min_temp'].values
    actual_max_temps = weather_data_subset['actual_max_temp'].values
    actual_min_temps = weather_data_subset['actual_min_temp'].values

    fig, ax1 = plt.subplots(figsize=(15, 7))

    # Create the bars showing all-time record highs and lows
    plt.bar(day_order, record_max_temps - record_min_temps, bottom=record_min_temps,
            edgecolor='none', color='#C3BBA4', width=1)

    # Create the bars showing average highs and lows
    plt.bar(day_order, average_max_temps - average_min_temps, bottom=average_min_temps,
            edgecolor='none', color='#9A9180', width=1)

    # Create the bars showing this year's highs and lows
    plt.bar(day_order, actual_max_temps - actual_min_temps, bottom=actual_min_temps,
            edgecolor='black', linewidth=0.5, color='#5A3B49', width=1)

    # Find the days that tied or set a new record high or low this year
    new_max_records = weather_data_subset[weather_data_subset.record_max_temp <= weather_data_subset.actual_max_temp]
    new_min_records = weather_data_subset[weather_data_subset.record_min_temp >= weather_data_subset.actual_min_temp]

    # Create the dots marking record highs and lows for the year
    plt.scatter(new_max_records['day_order'].values + 0.5,
                new_max_records['actual_max_temp'].values + 0.75,
                s=15, zorder=10, color='#d62728', alpha=0.75, linewidth=0)

    plt.scatter(new_min_records['day_order'].values + 0.5,
                new_min_records['actual_min_temp'].values - 0.75,
                s=15, zorder=10, color='#1f77b4', alpha=0.75, linewidth=0)

    plt.ylim(-15, 111)
    plt.xlim(-5, 370)

    plt.yticks(range(-10, 111, 10), [r'{}$^\circ$'.format(x)
                                     for x in range(-10, 111, 10)], fontsize=10)
    plt.ylabel(r'Temperature ($^\circ$F)', fontsize=12)

    # Label the x-axis at the first day of each month
    month_beginning_df = weather_data_subset[weather_data_subset['date'].apply(lambda x: x.day == 1)]
    month_beginning_indeces = list(month_beginning_df['day_order'].values)
    month_beginning_names = list(month_beginning_df['date'].apply(lambda x: x.strftime('%B')).values)
    month_beginning_names[0] += '\n\'14'
    month_beginning_names[6] += '\n\'15'

    # Add the last month label manually
    month_beginning_indeces += [weather_data_subset['day_order'].values[-1]]
    month_beginning_names += ['July']

    plt.xticks(month_beginning_indeces,
               month_beginning_names,
               fontsize=10)

    # Mirror the month labels along the top of the chart
    ax2 = ax1.twiny()
    plt.xticks(month_beginning_indeces,
               month_beginning_names,
               fontsize=10)

    plt.xlim(-5, 370)
    plt.grid(False)

    # Mirror the temperature scale along the right side of the chart
    ax3 = ax1.twinx()
    plt.yticks(range(-10, 111, 10), [r'{}$^\circ$'.format(x)
                                     for x in range(-10, 111, 10)], fontsize=10)
    plt.ylim(-15, 111)
    plt.grid(False)

    plt.title('Philadelphia, PA\'s weather, July 2014 - June 2015\n\n', fontsize=20)

    plt.savefig('philadelphia-weather-july14-june15.png')
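As the docstring notes, porting the chart to another city means swapping the input CSV, the title, and the y-axis range. A minimal sketch of folding those three edits into one parameterized helper; the plot_city function and the Seattle y-limits below are illustrative assumptions, not part of this commit, and the record-day dots and mirrored axes are omitted for brevity:

import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime


def plot_city(csv_file, city_label, out_file, y_min=-15, y_max=111):
    # Read the station's CSV and keep only July 2014 - June 2015
    data = pd.read_csv(csv_file, parse_dates=['date'])
    data = data[(data['date'] >= datetime(2014, 7, 1)) &
                (data['date'] < datetime(2015, 7, 1))].copy()
    data['day_order'] = range(len(data))

    plt.figure(figsize=(15, 7))
    # The same three stacked bar layers as above: records, averages, actuals
    plt.bar(data['day_order'], data['record_max_temp'] - data['record_min_temp'],
            bottom=data['record_min_temp'], edgecolor='none', color='#C3BBA4', width=1)
    plt.bar(data['day_order'], data['average_max_temp'] - data['average_min_temp'],
            bottom=data['average_min_temp'], edgecolor='none', color='#9A9180', width=1)
    plt.bar(data['day_order'], data['actual_max_temp'] - data['actual_min_temp'],
            bottom=data['actual_min_temp'], edgecolor='black', linewidth=0.5,
            color='#5A3B49', width=1)

    plt.ylim(y_min, y_max)
    plt.title('{}\'s weather, July 2014 - June 2015'.format(city_label), fontsize=20)
    plt.savefig(out_file)


# Seattle stays in a narrower temperature band than Philadelphia
plot_city('KSEA.csv', 'Seattle, WA', 'seattle-weather-july14-june15.png',
          y_min=0, y_max=101)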
109 changes: 109 additions & 0 deletions us-weather-history/wunderground_parser.py
@@ -0,0 +1,109 @@
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
from urllib.request import urlopen


def parse_pages(station):
    '''
    This function parses the web pages downloaded from wunderground.com
    into a flat CSV file for the station you provide it.

    Make sure to run the wunderground scraper first so you have the web
    pages downloaded.
    '''

    # Parse between July 1, 2014 and July 1, 2015
    # You can change the dates here if you prefer to parse a different range
    current_date = datetime(year=2014, month=7, day=1)
    end_date = datetime(year=2015, month=7, day=1)

    with open('{}.csv'.format(station), 'w') as out_file:
        out_file.write('date,actual_mean_temp,actual_min_temp,actual_max_temp,'
                       'average_min_temp,average_max_temp,'
                       'record_min_temp,record_max_temp,'
                       'record_min_temp_year,record_max_temp_year,'
                       'actual_precipitation,average_precipitation,'
                       'record_precipitation\n')

        while current_date != end_date:
            try_again = False
            with open('{}/{}-{}-{}.html'.format(station,
                                                current_date.year,
                                                current_date.month,
                                                current_date.day)) as in_file:
                soup = BeautifulSoup(in_file.read(), 'html.parser')

                weather_data = soup.find(id='historyTable').find_all('span', class_='wx-value')
                weather_data_units = soup.find(id='historyTable').find_all('td')

                try:
                    actual_mean_temp = weather_data[0].text
                    actual_max_temp = weather_data[2].text
                    average_max_temp = weather_data[3].text
                    record_max_temp = weather_data[4].text
                    actual_min_temp = weather_data[5].text
                    average_min_temp = weather_data[6].text
                    record_min_temp = weather_data[7].text
                    record_max_temp_year = weather_data_units[9].text.split('(')[-1].strip(')')
                    record_min_temp_year = weather_data_units[13].text.split('(')[-1].strip(')')

                    actual_precipitation = weather_data[9].text
                    # Trace amounts of precipitation are recorded as 'T'
                    if actual_precipitation == 'T':
                        actual_precipitation = '0.0'
                    average_precipitation = weather_data[10].text
                    record_precipitation = weather_data[11].text

                    # Verify that the parsed data is valid: record temperatures must
                    # bound the actual and average temperatures, and no precipitation
                    # value may exceed the record precipitation.
                    if (record_max_temp_year == '-1' or record_min_temp_year == '-1' or
                            int(record_max_temp) < max(int(actual_max_temp), int(average_max_temp)) or
                            int(record_min_temp) > min(int(actual_min_temp), int(average_min_temp)) or
                            float(actual_precipitation) > float(record_precipitation) or
                            float(average_precipitation) > float(record_precipitation)):
                        raise Exception('Parsed data failed the sanity checks')

                    out_file.write('{}-{}-{},'.format(current_date.year, current_date.month, current_date.day))
                    out_file.write(','.join([actual_mean_temp, actual_min_temp, actual_max_temp,
                                             average_min_temp, average_max_temp,
                                             record_min_temp, record_max_temp,
                                             record_min_temp_year, record_max_temp_year,
                                             actual_precipitation, average_precipitation,
                                             record_precipitation]))
                    out_file.write('\n')
                    current_date += timedelta(days=1)
                except Exception:
                    # If the web page is formatted improperly, signal that the page
                    # may need to be downloaded again.
                    try_again = True

            # If the web page needs to be downloaded again, re-download it from
            # wunderground.com. Because current_date was not advanced, the loop
            # will retry the same date on the next pass.

            # If the parser gets stuck on a certain date, you may need to investigate
            # the page to find out what is going on. Sometimes data is missing, in
            # which case the parser will get stuck. You can manually put in the data
            # yourself in that case, or just tell the parser to skip this day.
            if try_again:
                print('Error with date {}'.format(current_date))

                lookup_URL = 'http://www.wunderground.com/history/airport/{}/{}/{}/{}/DailyHistory.html'
                formatted_lookup_URL = lookup_URL.format(station,
                                                         current_date.year,
                                                         current_date.month,
                                                         current_date.day)
                html = urlopen(formatted_lookup_URL).read().decode('utf-8')

                page_file_name = '{}/{}-{}-{}.html'.format(station,
                                                           current_date.year,
                                                           current_date.month,
                                                           current_date.day)

                # Use a distinct handle name so the CSV's out_file above is not
                # shadowed (and left closed) by this block
                with open(page_file_name, 'w') as page_file:
                    page_file.write(html)


# Parse the stations used in this article
for station in ['KCLT', 'KCQT', 'KHOU', 'KIND', 'KJAX',
                'KMDW', 'KNYC', 'KPHL', 'KPHX', 'KSEA']:
    parse_pages(station)
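The validation block above encodes the invariants every finished row should satisfy: the record temperatures bound both the actual and average values, and nothing exceeds the record precipitation. A minimal sketch of re-checking a finished CSV in bulk with pandas; the check_csv helper is a hypothetical addition, not part of this commit:

import pandas as pd


def check_csv(csv_file):
    # Re-apply the parser's sanity checks to every row of a finished CSV
    data = pd.read_csv(csv_file, parse_dates=['date'])
    bad_rows = data[
        (data['record_max_temp'] < data[['actual_max_temp', 'average_max_temp']].max(axis=1)) |
        (data['record_min_temp'] > data[['actual_min_temp', 'average_min_temp']].min(axis=1)) |
        (data['actual_precipitation'] > data['record_precipitation']) |
        (data['average_precipitation'] > data['record_precipitation'])
    ]
    print('{}: {} rows, {} failed the sanity checks'.format(
        csv_file, len(data), len(bad_rows)))
    return bad_rows


check_csv('KPHL.csv')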
53 changes: 53 additions & 0 deletions us-weather-history/wunderground_scraper.py
@@ -0,0 +1,53 @@
# coding: utf-8

from datetime import datetime, timedelta
from urllib.request import urlopen
import os


def scrape_station(station):
    '''
    This function scrapes the weather data web pages from wunderground.com
    for the station you provide it.

    You can look up your city's weather station by performing a search for
    it on wunderground.com, then clicking on the "History" section.
    The 4-letter name of the station will appear on that page.
    '''

    # Scrape between July 1, 2014 and July 1, 2015
    # You can change the dates here if you prefer to scrape a different range
    current_date = datetime(year=2014, month=7, day=1)
    end_date = datetime(year=2015, month=7, day=1)

    # Make sure a directory exists for the station web pages
    os.makedirs(station, exist_ok=True)

    # Use .format(station, YYYY, M, D)
    lookup_URL = 'http://www.wunderground.com/history/airport/{}/{}/{}/{}/DailyHistory.html'

    while current_date != end_date:

        # Print the date at the start of each month to track progress
        if current_date.day == 1:
            print(current_date)

        formatted_lookup_URL = lookup_URL.format(station,
                                                 current_date.year,
                                                 current_date.month,
                                                 current_date.day)
        html = urlopen(formatted_lookup_URL).read().decode('utf-8')

        out_file_name = '{}/{}-{}-{}.html'.format(station, current_date.year,
                                                  current_date.month,
                                                  current_date.day)

        with open(out_file_name, 'w') as out_file:
            out_file.write(html)

        current_date += timedelta(days=1)


# Scrape the stations used in this article
for station in ['KCLT', 'KCQT', 'KHOU', 'KIND', 'KJAX',
                'KMDW', 'KNYC', 'KPHL', 'KPHX', 'KSEA']:
    scrape_station(station)
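scrape_station hard-codes the article's date window. A minimal variation, assuming you want a different station or range, that takes the dates as parameters and pauses briefly between requests; the scrape_station_range name, the one-second delay, and the KBOS example are illustrative assumptions, not part of this commit:

import os
import time
from datetime import datetime, timedelta
from urllib.request import urlopen


def scrape_station_range(station, start, end, delay_seconds=1):
    # The same download loop as scrape_station(), with the date range
    # passed in and a courtesy delay between requests
    os.makedirs(station, exist_ok=True)
    lookup_URL = 'http://www.wunderground.com/history/airport/{}/{}/{}/{}/DailyHistory.html'

    current_date = start
    while current_date != end:
        html = urlopen(lookup_URL.format(station, current_date.year,
                                         current_date.month,
                                         current_date.day)).read().decode('utf-8')
        with open('{}/{}-{}-{}.html'.format(station, current_date.year,
                                            current_date.month,
                                            current_date.day), 'w') as out_file:
            out_file.write(html)
        time.sleep(delay_seconds)  # don't hammer the server
        current_date += timedelta(days=1)


# For example, Boston's Logan Airport station over the same window:
scrape_station_range('KBOS', datetime(2014, 7, 1), datetime(2015, 7, 1))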
