Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Initial commit of data and code for weather history visualization
- Loading branch information
Showing
13 changed files
with
3,928 additions
and
0 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime

'''
Example: generate the Philadelphia, PA weather chart.

To chart another city, read that city's data file instead, change the
title, and adjust the y-axis to suit the local temperature range. The
chart uses a custom matplotlib style as its basis, available here:
https://gist.githubusercontent.com/rhiever/d0a7332fe0beebfdc3d5/raw/223d70799b48131d5ce2723cd5784f39d7a3a653/tableau10.mplstyle
'''

weather_data = pd.read_csv('KPHL.csv', parse_dates=['date'])
print(weather_data.describe())

# Save a histogram of every non-date column as a sanity check that all
# of the values fall in an expected range.
with plt.style.context('https://gist.githubusercontent.com/rhiever/d0a7332fe0beebfdc3d5/raw/223d70799b48131d5ce2723cd5784f39d7a3a653/tableau10.mplstyle'):
    for column in weather_data.columns:
        if column == 'date':
            continue
        plt.figure()
        plt.hist(weather_data[column].values)
        plt.title(column)
        plt.savefig('{}.png'.format(column))

# Keep only the temperatures for July 2014 - June 2015.
window_start = datetime(year=2014, month=7, day=1)
window_end = datetime(year=2015, month=7, day=1)
in_window = ((weather_data['date'] >= window_start) &
             (weather_data['date'] < window_end))
weather_data_subset = weather_data[in_window].copy()
weather_data_subset['day_order'] = range(len(weather_data_subset))

day_order = weather_data_subset['day_order']
record_max_temps = weather_data_subset['record_max_temp'].values
record_min_temps = weather_data_subset['record_min_temp'].values
average_max_temps = weather_data_subset['average_max_temp'].values
average_min_temps = weather_data_subset['average_min_temp'].values
actual_max_temps = weather_data_subset['actual_max_temp'].values
actual_min_temps = weather_data_subset['actual_min_temp'].values

fig, ax1 = plt.subplots(figsize=(15, 7))

# Background bars: the all-time record highs and lows.
plt.bar(day_order, record_max_temps - record_min_temps,
        bottom=record_min_temps, edgecolor='none', color='#C3BBA4', width=1)

# Middle bars: the average highs and lows.
plt.bar(day_order, average_max_temps - average_min_temps,
        bottom=average_min_temps, edgecolor='none', color='#9A9180', width=1)

# Foreground bars: this year's highs and lows.
plt.bar(day_order, actual_max_temps - actual_min_temps,
        bottom=actual_min_temps, edgecolor='black', linewidth=0.5,
        color='#5A3B49', width=1)

# Days on which this year tied or broke the all-time records.
new_max_records = weather_data_subset[weather_data_subset.record_max_temp <= weather_data_subset.actual_max_temp]
new_min_records = weather_data_subset[weather_data_subset.record_min_temp >= weather_data_subset.actual_min_temp]

# Dots marking the record highs and lows set this year, nudged just
# outside the bars so they stay visible.
plt.scatter(new_max_records['day_order'].values + 0.5,
            new_max_records['actual_max_temp'].values + 0.75,
            s=15, zorder=10, color='#d62728', alpha=0.75, linewidth=0)

plt.scatter(new_min_records['day_order'].values + 0.5,
            new_min_records['actual_min_temp'].values - 0.75,
            s=15, zorder=10, color='#1f77b4', alpha=0.75, linewidth=0)

plt.ylim(-15, 111)
plt.xlim(-5, 370)

degree_ticks = range(-10, 111, 10)
degree_labels = [r'{}$^\circ$'.format(tick) for tick in degree_ticks]
plt.yticks(degree_ticks, degree_labels, fontsize=10)
plt.ylabel(r'Temperature ($^\circ$F)', fontsize=12)

# Label the x-axis with the month that begins at each tick position.
month_starts = weather_data_subset[weather_data_subset['date'].apply(lambda d: d.day == 1)]
month_positions = list(month_starts['day_order'].values)
month_labels = list(month_starts['date'].apply(lambda d: d.strftime("%B")).values)
month_labels[0] += '\n\'14'
month_labels[6] += '\n\'15'

# Add the final month label by hand.
month_positions += [weather_data_subset['day_order'].values[-1]]
month_labels += ['July']

plt.xticks(month_positions, month_labels, fontsize=10)

# Mirror the month labels along the top edge of the chart.
ax2 = ax1.twiny()
plt.xticks(month_positions, month_labels, fontsize=10)

plt.xlim(-5, 370)
plt.grid(False)

# Mirror the temperature labels along the right edge of the chart.
ax3 = ax1.twinx()
plt.yticks(degree_ticks, degree_labels, fontsize=10)
plt.ylim(-15, 111)
plt.grid(False)

plt.title('Philadelphia, PA\'s weather, July 2014 - June 2015\n\n', fontsize=20)

plt.savefig('philadelphia-weather-july14-june15.png')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
from datetime import datetime, timedelta | ||
from bs4 import BeautifulSoup | ||
from urllib.request import urlopen | ||
|
||
|
||
def parse_pages(station):
    '''
    Parse the wunderground.com web pages already downloaded for `station`
    into a flat CSV file named <station>.csv.

    Run the wunderground scraper first so that the daily HTML pages exist
    in the <station>/ directory. When a page is malformed or its values
    are implausible, it is re-downloaded from wunderground.com and the
    same date is retried on the next pass through the loop.
    '''

    # Parse between July 1, 2014 and July 1, 2015.
    # You can change the dates here if you prefer to parse a different range.
    current_date = datetime(year=2014, month=7, day=1)
    end_date = datetime(year=2015, month=7, day=1)

    with open('{}.csv'.format(station), 'w') as out_file:
        # CSV header row.
        out_file.write('date,actual_mean_temp,actual_min_temp,actual_max_temp,'
                       'average_min_temp,average_max_temp,'
                       'record_min_temp,record_max_temp,'
                       'record_min_temp_year,record_max_temp_year,'
                       'actual_precipitation,average_precipitation,'
                       'record_precipitation\n')

        while current_date != end_date:
            try_again = False
            with open('{}/{}-{}-{}.html'.format(station,
                                                current_date.year,
                                                current_date.month,
                                                current_date.day)) as in_file:
                soup = BeautifulSoup(in_file.read(), 'html.parser')

            weather_data = soup.find(id='historyTable').find_all('span', class_='wx-value')
            weather_data_units = soup.find(id='historyTable').find_all('td')

            try:
                actual_mean_temp = weather_data[0].text
                actual_max_temp = weather_data[2].text
                average_max_temp = weather_data[3].text
                record_max_temp = weather_data[4].text
                actual_min_temp = weather_data[5].text
                average_min_temp = weather_data[6].text
                record_min_temp = weather_data[7].text
                # Record years appear as "(YYYY)" inside the units cells.
                record_max_temp_year = weather_data_units[
                    9].text.split('(')[-1].strip(')')
                record_min_temp_year = weather_data_units[
                    13].text.split('(')[-1].strip(')')

                actual_precipitation = weather_data[9].text
                # 'T' denotes a trace of precipitation; treat it as zero.
                if actual_precipitation == 'T':
                    actual_precipitation = '0.0'
                average_precipitation = weather_data[10].text
                record_precipitation = weather_data[11].text

                # Verify that the parsed data is self-consistent; a failed
                # check falls through to the re-download path below.
                if (record_max_temp_year == '-1' or record_min_temp_year == '-1' or
                        int(record_max_temp) < max(int(actual_max_temp), int(average_max_temp)) or
                        int(record_min_temp) > min(int(actual_min_temp), int(average_min_temp)) or
                        float(actual_precipitation) > float(record_precipitation) or
                        float(average_precipitation) > float(record_precipitation)):
                    raise ValueError('inconsistent data for {}'.format(current_date))

                out_file.write('{}-{}-{},'.format(current_date.year, current_date.month, current_date.day))
                out_file.write(','.join([actual_mean_temp, actual_min_temp, actual_max_temp,
                                         average_min_temp, average_max_temp,
                                         record_min_temp, record_max_temp,
                                         record_min_temp_year, record_max_temp_year,
                                         actual_precipitation, average_precipitation,
                                         record_precipitation]))
                out_file.write('\n')
                # Only advance once the row is written; parse failures
                # retry the same date after re-downloading the page.
                current_date += timedelta(days=1)
            except Exception:
                # The web page is formatted improperly (or the values are
                # implausible); signal that it needs to be downloaded again.
                # BUG FIX: narrowed from a bare `except:` so that
                # KeyboardInterrupt/SystemExit still propagate.
                try_again = True

            # If the parser gets stuck on a certain date, investigate the
            # page to find out what is going on. Sometimes data is missing,
            # in which case the parser will get stuck; you can manually put
            # in the data yourself or tell the parser to skip that day.
            if try_again:
                print('Error with date {}'.format(current_date))

                lookup_URL = 'http://www.wunderground.com/history/airport/{}/{}/{}/{}/DailyHistory.html'
                formatted_lookup_URL = lookup_URL.format(station,
                                                         current_date.year,
                                                         current_date.month,
                                                         current_date.day)
                html = urlopen(formatted_lookup_URL).read().decode('utf-8')

                page_file_name = '{}/{}-{}-{}.html'.format(station,
                                                           current_date.year,
                                                           current_date.month,
                                                           current_date.day)

                # BUG FIX: the original bound this handle to `out_file`,
                # shadowing the open CSV handle; after the inner `with`
                # exited, `out_file` pointed at a *closed* file and every
                # later CSV write raised "I/O operation on closed file".
                with open(page_file_name, 'w') as page_file:
                    page_file.write(html)
|
||
|
||
# Parse the stations used in this article.
# BUG FIX: the original called parse_station(), which is not defined
# anywhere in this file -- the parser above is named parse_pages(), so
# running the script raised NameError before parsing anything.
for station in ['KCLT', 'KCQT', 'KHOU', 'KIND', 'KJAX',
                'KMDW', 'KNYC', 'KPHL', 'KPHX', 'KSEA']:
    parse_pages(station)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
# coding: utf-8 | ||
|
||
from datetime import datetime, timedelta | ||
from urllib.request import urlopen | ||
import os | ||
|
||
|
||
def scrape_station(station):
    '''
    Download the daily weather-history pages from wunderground.com for
    the given 4-letter station code, saving each page to
    <station>/<year>-<month>-<day>.html.

    You can look up your city's weather station by performing a search
    for it on wunderground.com then clicking on the "History" section;
    the 4-letter name of the station will appear on that page.
    '''

    # Scrape between July 1, 2014 and July 1, 2015.
    # You can change the dates here if you prefer to scrape a different range.
    current_date = datetime(year=2014, month=7, day=1)
    end_date = datetime(year=2015, month=7, day=1)

    # Make sure a directory exists for the station web pages.
    # BUG FIX: os.mkdir() raised FileExistsError on any re-run of the
    # scraper; makedirs(..., exist_ok=True) makes this call idempotent.
    os.makedirs(station, exist_ok=True)

    # Use .format(station, YYYY, M, D)
    lookup_URL = 'http://www.wunderground.com/history/airport/{}/{}/{}/{}/DailyHistory.html'

    while current_date != end_date:

        # Progress marker at the start of each month.
        if current_date.day == 1:
            print(current_date)

        formatted_lookup_URL = lookup_URL.format(station,
                                                 current_date.year,
                                                 current_date.month,
                                                 current_date.day)
        html = urlopen(formatted_lookup_URL).read().decode('utf-8')

        out_file_name = '{}/{}-{}-{}.html'.format(station, current_date.year,
                                                  current_date.month,
                                                  current_date.day)

        with open(out_file_name, 'w') as out_file:
            out_file.write(html)

        current_date += timedelta(days=1)
|
||
|
||
# Kick off a scrape for every station covered in this article.
stations_to_scrape = ('KCLT', 'KCQT', 'KHOU', 'KIND', 'KJAX',
                      'KMDW', 'KNYC', 'KPHL', 'KPHX', 'KSEA')
for station in stations_to_scrape:
    scrape_station(station)