Skip to content

Commit

Permalink
Fork of code from ScraperWiki at https://classic.scraperwiki.com/scra…
Browse files Browse the repository at this point in the history
  • Loading branch information
fcecinati committed Sep 26, 2018
0 parents commit d0e8a9c
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
# Ignore output of scraper
data.sqlite
13 changes: 13 additions & 0 deletions README.textile
@@ -0,0 +1,13 @@
Hourly Weather Observation Data collected from http://www.imd.gov.in/section/nhac/aws/aws.htm.

Abbreviations Used :
MSLP : Mean Sea Level Pressure in hPa
Ptend : Pressure tendency in last 24 hours in hPa
TMax : Maximum Temperature in °C (reported at 12 UTC only)
TMin : Minimum Temperature in °C (reported at 03 UTC only)
DBT : Dry bulb Temperature in °C
DPT : Dew point Temperature in °C
RRR : Total rainfall in mm since 03 UTC
SSHM : Bright sunshine duration in hours and minutes
DDD : Wind direction on a 36-point compass
FF : Wind speed in knots, measured by anemometer
40 changes: 40 additions & 0 deletions scraper.py
@@ -0,0 +1,40 @@
"""Hourly Weather Observation Data collected from http://www.imd.gov.in/section/nhac/aws/aws.htm
"""

import scraperwiki
import lxml.html
import dateutil.parser

# Import the shared "utils" helper module via ScraperWiki's swimport
# (presumably fetches a scraper named "utils" hosted on ScraperWiki — verify).
utils = scraperwiki.utils.swimport("utils")
# Saved rows are deduplicated/upserted on these three columns by utils.save.
utils.save.unique_keys = ['observed_date', 'station_name', 'state']

# Index on the same three columns so lookups against the swdata table are fast.
scraperwiki.sqlite.execute("create index if not exists idx1 on swdata (observed_date, station_name, state)")


def get_hourly_data(i):
    """Scrape the AWS observation page for hour *i* (0-23 UTC) and save one
    record per station row via utils.save."""
    page_url = 'http://www.imd.gov.in/section/nhac/aws/aws%02d.htm' % i
    page = scraperwiki.scrape(page_url)
    # The pages are littered with bare "&nbsp" entities; strip them up front.
    tree = lxml.html.fromstring(page.replace('&nbsp', ''))
    # The first paragraph ends with the observation date after the last '/'.
    date_text = tree.cssselect('p')[0].text_content().split('/')[-1]
    observed_date = dateutil.parser.parse(date_text + ' %02d:00' % i)
    all_rows = tree.cssselect('table')[0].cssselect('tr')
    header_row, data_rows = all_rows[0], all_rows[1:]
    column_names = [cell.text_content() for cell in header_row.cssselect('td')]
    for data_row in data_rows:
        values = [cell.text_content() for cell in data_row.cssselect('td')]
        record = dict(zip(column_names, values))
        record['observed_date'] = observed_date
        # Rename the station column and drop the serial number before saving.
        record['station_name'] = record.pop('Name')
        del record['S.No']
        utils.save(record)


def main():
    """Fetch and store observations for every hour of the day (00-23 UTC)."""
    for hour in range(24):
        get_hourly_data(hour)


main()

0 comments on commit d0e8a9c

Please sign in to comment.