Skip to content

Commit

Permalink
Fork of code from ScraperWiki at https://classic.scraperwiki.com/scra…
Browse files Browse the repository at this point in the history
  • Loading branch information
fcecinati committed Sep 26, 2018
0 parents commit d0e8a9c
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
# Ignore output of scraper
data.sqlite
13 changes: 13 additions & 0 deletions README.textile
@@ -0,0 +1,13 @@
Hourly Weather Observation Data collected from http://www.imd.gov.in/section/nhac/aws/aws.htm.

Abbreviations Used :
MSLP : Mean Sea Level Pressure in hPa
Ptend : Pressure tendency in last 24 hours in hPa
TMax : Maximum Temperature in °C (reported at 12 UTC only)
TMin : Minimum Temperature in °C (reported at 03 UTC only)
DBT : Dry bulb Temperature in °C
DPT : Dew point Temperature in °C
RRR : Total rainfall in mm since 03 UTC
SSHM : Bright sunshine duration in hours and minutes
DDD : Wind direction on a 36-point compass
FF : Wind speed in knots, measured by anemometer
40 changes: 40 additions & 0 deletions scraper.py
@@ -0,0 +1,40 @@
"""Hourly Weather Observation Data collected from http://www.imd.gov.in/section/nhac/aws/aws.htm
"""

import scraperwiki
import lxml.html
import dateutil.parser

# Import the shared "utils" helper module via ScraperWiki's swimport
# (presumably fetches a scraper named "utils" hosted on ScraperWiki — verify).
utils = scraperwiki.utils.swimport("utils")
# Saved rows are deduplicated/upserted on these three columns by utils.save.
utils.save.unique_keys = ['observed_date', 'station_name', 'state']

# Index on the same three columns so lookups against the swdata table are fast.
scraperwiki.sqlite.execute("create index if not exists idx1 on swdata (observed_date, station_name, state)")


def get_hourly_data(i):
    """Scrape the AWS observation page for hour *i* (0-23 UTC) and save one
    record per station row via utils.save."""
    page_url = 'http://www.imd.gov.in/section/nhac/aws/aws%02d.htm' % i
    page = scraperwiki.scrape(page_url)
    # The pages are littered with bare "&nbsp" entities; strip them up front.
    tree = lxml.html.fromstring(page.replace('&nbsp', ''))
    # The first paragraph ends with the observation date after the last '/'.
    date_text = tree.cssselect('p')[0].text_content().split('/')[-1]
    observed_date = dateutil.parser.parse(date_text + ' %02d:00' % i)
    all_rows = tree.cssselect('table')[0].cssselect('tr')
    header_row, data_rows = all_rows[0], all_rows[1:]
    column_names = [cell.text_content() for cell in header_row.cssselect('td')]
    for data_row in data_rows:
        values = [cell.text_content() for cell in data_row.cssselect('td')]
        record = dict(zip(column_names, values))
        record['observed_date'] = observed_date
        # Rename the station column and drop the serial number before saving.
        record['station_name'] = record.pop('Name')
        del record['S.No']
        utils.save(record)


def main():
    """Fetch and store observations for every hour of the day (00-23 UTC)."""
    for hour in range(24):
        get_hourly_data(hour)


main()

0 comments on commit d0e8a9c

Please sign in to comment.