In [1]:
# import libraries
%matplotlib inline
import numpy as np
import csv
import matplotlib.pyplot as plt
import pandas as pd
import glob
import ulmo
import os
import scipy.spatial

In [200]:
ghcn = pd.read_fwf('data/ghcnd-stations.txt', colspecs = [(0,10), (12,19), (21,29), (31,36),(38,40), (41,70), (72,74),(76,78),(80,85)], header = None) 
colnames = ['GHCN ID', 'lat', 'lon', 'elevation', 'state', 'name', 'gsn flag', 'HCN/CRN FLAG', 'WMO ID']
ghcn.columns = colnames

# from http://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt
# FORMAT OF "ghcnd-stations.txt"
#
# ------------------------------
# Variable   Columns   Type
# ------------------------------
# ID            1-11   Character
# LATITUDE     13-20   Real
# LONGITUDE    22-30   Real
# ELEVATION    32-37   Real
# STATE        39-40   Character
# NAME         42-71   Character
# GSN FLAG     73-75   Character
# HCN/CRN FLAG 77-79   Character
# WMO ID       81-85   Character
# ------------------------------

# These variables have the following definitions:

# ID         is the station identification code.  Note that the first two
#            characters denote the FIPS  country code, the third character 
#            is a network code that identifies the station numbering system 
#            used, and the remaining eight characters contain the actual 
#            station ID. 

#            See "ghcnd-countries.txt" for a complete list of country codes.
# 	   See "ghcnd-states.txt" for a list of state/province/territory codes.

#            The network code  has the following five values:

#            0 = unspecified (station identified by up to eight 
# 	       alphanumeric characters)
# 	   1 = Community Collaborative Rain, Hail,and Snow (CoCoRaHS)
# 	       based identification number.  To ensure consistency with
# 	       with GHCN Daily, all numbers in the original CoCoRaHS IDs
# 	       have been left-filled to make them all four digits long. 
# 	       In addition, the characters "-" and "_" have been removed 
# 	       to ensure that the IDs do not exceed 11 characters when 
# 	       preceded by "US1". For example, the CoCoRaHS ID 
# 	       "AZ-MR-156" becomes "US1AZMR0156" in GHCN-Daily
#            C = U.S. Cooperative Network identification number (last six 
#                characters of the GHCN-Daily ID)
# 	   E = Identification number used in the ECA&D non-blended
# 	       dataset
# 	   M = World Meteorological Organization ID (last five
# 	       characters of the GHCN-Daily ID)
# 	   N = Identification number used in data supplied by a 
# 	       National Meteorological or Hydrological Center
# 	   R = U.S. Interagency Remote Automatic Weather Station (RAWS)
# 	       identifier
# 	   S = U.S. Natural Resources Conservation Service SNOwpack
# 	       TELemtry (SNOTEL) station identifier
#            W = WBAN identification number (last five characters of the 
#                GHCN-Daily ID)

# LATITUDE   is latitude of the station (in decimal degrees).

# LONGITUDE  is the longitude of the station (in decimal degrees).

# ELEVATION  is the elevation of the station (in meters, missing = -999.9).


# STATE      is the U.S. postal code for the state (for U.S. stations only).

# NAME       is the name of the station.

# GSN FLAG   is a flag that indicates whether the station is part of the GCOS
#            Surface Network (GSN). The flag is assigned by cross-referencing 
#            the number in the WMOID field with the official list of GSN 
#            stations. There are two possible values:

#            Blank = non-GSN station or WMO Station number not available
#            GSN   = GSN station 

# HCN/      is a flag that indicates whether the station is part of the U.S.
# CRN FLAG  Historical Climatology Network (HCN).  There are three possible 
#           values:

#            Blank = Not a member of the U.S. Historical Climatology 
# 	           or U.S. Climate Reference Networks
#            HCN   = U.S. Historical Climatology Network station
# 	   CRN   = U.S. Climate Reference Network or U.S. Regional Climate 
# 	           Network Station

# WMO ID     is the World Meteorological Organization (WMO) number for the
#            station.  If the station has no WMO number (or one has not yet 
# 	   been matched to this station), then the field is blank.

# --------------------------------------------------------------------------------
# --------------------------------------------------------------------------------
giss = pd.read_fwf('data/v3.temperature.inv.txt',skiprows = 39, header = None,
                  colspecs=[(0,3),(3,8),(8,11), (12,44),(44,49), (52,58), (58,63), (63,67), (67,68), (69,73), (73,75), (75, 77), (78,79), (79,81), (81,82),(82,84), (84,100), (100,102), (103,106)])
colnames = ['icc country code', 'WMO ID', '3 digit modifier', 'name','lat', 'lon', 'elevation', 'TELe', 'P', 'Pop', 'Tp', 'V', 'Lo', 'Co', 'Airport', 'ds', 'Vege', 'bi', 'BI']
giss.columns = colnames

# LEGEND  
# ======
# icc  =3 digit country code; the first digit represents WMO region/continent
# WMO_#=5 digit WMO station number
# ...  =3 digit modifier; 000 means the station is probably the WMO
#       station; 001, etc. mean the station is near that WMO station
# Name =30 character station name
# Lat  =latitude in degrees, negative = South of Equator
# Lon  =longitude in degrees, negative = West of Greenwich (England)
# Elev =station elevation in meters, missing is -999
# TEle =station elevation interpolated from TerrainBase gridded data set
# P    =R if rural (not associated with a town of >10,000 population)
#       S if associated with a small town (10,000-50,000 population)
#       U if associated with an urban area (>50,000 population)
# Pop  =population of the small town or urban area in 1000s
#       If rural, no analysis:  -9.
# Tp   =general topography around the station:  FL flat; HI hilly,
#       MT mountain top; MV mountainous valley or at least not on the top
#       of a mountain.
# V    =general vegetation near the station based on Operational
#       Navigation Charts;  MA marsh; FO forested; IC ice; DE desert;
#       CL clear or open;  xx information not provided
# Lo   =CO if station is within 30 km from the coast
#       LA if station is next to a large (> 25 km**2) lake
#       no if neither of the above
#       Note: Stations which are both CO and LA will be marked CO
# Co   =distance in km to the coast if Lo=CO, else -9
# A    =A if the station is at an airport; else x
# ds   =distance in km from the airport to its associated
#       small town or urban center (not relevant for rural airports
#       or non airport stations in which case ds=-9)
# Vege =gridded vegetation for the 0.5x0.5 degree grid point closest
#       to the station from a gridded vegetation data base. 16 characters.
# bi   =brightness index    A=dark B=dim C=bright   (comment added by R.Ruedy)
# BI   =brightness index    0=dark -> 256 =bright   (based on satellite night light data)

see: http://stackoverflow.com/questions/35296935/python-calculate-lots-of-distances-quickly

In [201]:
# compute distances between all stations
tree = scipy.spatial.cKDTree(giss[['lon', 'lat']].values, leafsize=100)

# query the closest point 
closestInd = tree.query(giss[['lon', 'lat']].values[11,:], k =2, distance_upper_bound=6)[1][1]

In [5]:
# find the synoptic station and pair it with the closest stations
city = 'MADISON'
synopticID = ghcn[ghcn['name'].str.contains(city) & ~np.isnan(ghcn['WMO ID'])]['WMO ID'].iloc[0]
closeststations = tree.query(giss[giss['WMO ID']==synopticID][['lon', 'lat']].values , k =5, distance_upper_bound=5)

In [6]:
# find the synoptic station and pair it with the closest stations
#city = 'MADISON'
frames = []
selectedCities = ['MADISON', 'NEW YORK', 'BIRMINGHAM', 'MINNEAPOLIS', 'HOUSTON', 'ATLANTA', 'SEATTLE', 'LOS ANGELES', # US cities
                  'MEXICO CITY', 'MONTERREY', 'MERIDA', 'PUEBLA', # MEXICO
                  'PARIS', 'LYON', 'NICE', 'BORDEAUX', 'MARSEILLE', # French cities
                  'LONDON', 'BIRMINGHAM', 'MANCHESTER', 'YORK', #UK
                   'MUNICH', 'BERLIN', 'FRANKFURT', 'HAMBURG', 'BREMEN', 'HANOVER',#eu
                  'MOSCOW', 'VLADIVOSTOK', 'ST PETERSBURG', 'SAMARA', #russia
                  'CASABLANCA', 'RABAT', 'TANGIER', # morocco 
                 'CAIRO', 'DUBAI', 'AMMAN', 'BEIRUT','BAALBEK', 'JERUSALEM', 'TEL AVIV', 'JEDDAH', # Other arab cities]
                'NAIROBI', 'DAR ES SALAAM', 'OUGADOUGOU', 'CAPETOWN', 'PRETORIA', 'MAPUTO', 'WINDHOEK', 'LUANDA',# sub-saharan
                  'DELHI', 'TASHKENT', 'KARACHI', 'LAHORE', 'KABUL', 'TEHRAN',# Central Asia
                  'HANOI', 'PHNOM PENH', 'BANGKOK', 'XIAN', 'ZHENDOU', 'SHANGHAI', 'SHENYANG', 'SEOUL', 'INCHEON', 'TOKYO','KYOTO', 'OSAKA',] 
                  
for city in selectedCities: #['LONDON', 'NEW YORK','MEXICO CITY', 'PARIS', 'MADISON']:
    # first check that there is a WMO station 
    if ghcn[ghcn['name'].str.contains(city)].values.shape[0] ==0: 
        print 'No station found for %s'%city
    else : 
        synopticID = ghcn[ghcn['name'].str.contains(city) & ~np.isnan(ghcn['WMO ID'])]['WMO ID']
        if synopticID.shape[0] == 0 :
            print 'No synoptic station found for %s'%city
        else: 
            synopticID = synopticID.iloc[0].astype(int)
            closeststations = tree.query(giss[giss['WMO ID']==synopticID][['lon', 'lat']].values , k =5, distance_upper_bound=5)
            #print city, synopticID

            # Check that there is an entry for the city
            if giss[giss['WMO ID'] == synopticID].values.shape[0]==0:
                print 'No GISS data found for synoptic station found for %s, %s'%(city, synopticID)
            else: 
                #print 'Station found for %s'%city
                # Now classify the synoptic station as urban/rural and pair it with the appropriate station
                synopticBrightness = giss[giss['WMO ID'] == synopticID]['BI'].iloc[0]

                if synopticBrightness > 20 :
                # if the synoptic station is urban, then search for rural station

                    urbanID = synopticID
                    ruralID = giss.iloc[giss.iloc[closeststations[1][0]]['BI'].argmin()]['WMO ID']
                    frames.append([city,urbanID, ruralID])
                else : 
                # if the synoptic station is rural, then search for a more urban station

                    ruralID = synopticBrightness
                    urbanID = giss.iloc[giss.iloc[closeststations[1][0]]['BI'].argmax()]['WMO ID']
                    frames.append([city, urbanID, ruralID])

No GISS data found for synoptic station found for HOUSTON, 72244
No station found for PUEBLA
No GISS data found for synoptic station found for NICE, 71712
No synoptic station found for MUNICH
No station found for FRANKFURT
No synoptic station found for HANOVER
No synoptic station found for ST PETERSBURG
No station found for CASABLANCA
No GISS data found for synoptic station found for RABAT, 38049
No synoptic station found for CAIRO
No GISS data found for synoptic station found for DUBAI, 41194
No synoptic station found for AMMAN
No station found for BEIRUT
No station found for BAALBEK
No synoptic station found for JERUSALEM
No synoptic station found for TEL AVIV
No GISS data found for synoptic station found for JEDDAH, 41024
No station found for OUGADOUGOU
No station found for CAPETOWN
No GISS data found for synoptic station found for MAPUTO, 67331
No GISS data found for synoptic station found for DELHI, 71573
No station found for KARACHI
No station found for HANOI
No station found for

In [7]:
frames

[['MADISON', 72641, 74357],
 ['NEW YORK', 72503, 74486],
 ['BIRMINGHAM', 72228, 72229],
 ['MINNEAPOLIS', 72658, 219],
 ['ATLANTA', 72219, 96],
 ['SEATTLE', 72793, 72793],
 ['LOS ANGELES', 72295, 68816],
 ['MEXICO CITY', 76680, 76680],
 ['MONTERREY', 76390, 14],
 ['MERIDA', 80097, 17],
 ['PARIS', 58424, 0],
 ['LYON', 7480, 14],
 ['BORDEAUX', 7510, 399],
 ['MARSEILLE', 7650, 7586],
 ['LONDON', 71623, 14],
 ['BIRMINGHAM', 72228, 72229],
 ['MANCHESTER', 3334, 3329],
 ['YORK', 76055, 0],
 ['BERLIN', 10381, 10379],
 ['HAMBURG', 10147, 71871],
 ['BREMEN', 10224, 71133],
 ['MOSCOW', 27612, 31168],
 ['VLADIVOSTOK', 31960, 17022],
 ['SAMARA', 28900, 9],
 ['TANGIER', 60320, 0],
 ['NAIROBI', 63742, 17],
 ['DAR ES SALAAM', 63894, 13],
 ['PRETORIA', 68262, 68263],
 ['WINDHOEK', 68110, 68112],
 ['LUANDA', 66160, 61902],
 ['TASHKENT', 38457, 87774],
 ['LAHORE', 41640, 41600],
 ['KABUL', 87418, 0],
 ['TEHRAN', 40754, 94940],
 ['PHNOM PENH', 96995, 7],
 ['BANGKOK', 48455, 65501],
 ['XIAN', 72248, 0],
 [

In [29]:
giss[giss['WMO ID']==66160]

Unnamed: 0,icc country code,WMO ID,3 digit modifier,name,lat,lon,elevation,TELe,P,Pop,Tp,V,Lo,Co,Airport,ds,Vege,bi,BI
53,102,66160,0,LUANDA,-8.85,13.23,70,70,U,475,FL,xx,O,2,A,2,WATER,C,26


In [30]:
giss[giss['WMO ID']==61902]

Unnamed: 0,icc country code,WMO ID,3 digit modifier,name,lat,lon,elevation,TELe,P,Pop,Tp,V,Lo,Co,Airport,ds,Vege,bi,BI
649,158,61902,0,WIDE AWAKE FI,-7.97,14.4,79,0,R,-9,HI,xx,O,1,x,-9,WATER,B,0


In [8]:
# convert WMO codes to GHCN in order to automatically pull data
# from http://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt: World Meteorological Organization ID (last five characters of the GHCN-Daily ID)
# for foreign stations, this means the GHCN ID contains 'Mxxxxx' where xxxxx is the WMO ID
# for US stations, 

def WMOtoGHCN(id): 
    try: 
        id = ghcn[ghcn['WMO ID']== id]['GHCN ID'].iloc[0]
    except IndexError : 
        id = ghcn[ghcn['GHCN ID'].str.contains('M' +str(id))]['GHCN ID']
        if id.size > 0 :
            id = id.iloc[0]
        else :
            id = None
    return id 

for line in frames: 
    try: 
        print WMOtoGHCN(line[1]), WMOtoGHCN(line[2]) 
    except IndexError : 
        print ghcn[ghcn['GHCN ID'].str.contains(id)]
        #print ' no GHCN match'
    #try looking up id using 

USW0001483 None
USW0001473 USW0009478
USW0001387 USW0000388
USW0001492 None
USW0001387 None
USW0002423 USW0002423
USW0002317 SFM0006881
MX00076680 MX00076680
MX00000504 US1CHARM14
COM0008009 US1CHARM17
CHM0005842 AEM0004119
FR06902900 US1CHARM14
FR00000751 None
FR00000765 None
CA00614447 US1CHARM14
USW0001387 USW0000388
UKM0000333 None
MXM0007605 AEM0004119
GM00000331 GM00000334
GM00001014 CA00301395
GM00000147 CA00406772
RSM0002761 RSM0003116
RSM0003196 TUM0001702
RSM0002890 CA00302M92
SPE0012001 AEM0004119
None US1CHARM17
TZ00006389 USS0005M13
SFM0006826 SFM0006826
WA00740154 WA00784839
AO00006616 SHM0006190
UZM0003845 ARM0008777
PK00004164 PKM0004160
AR00008741 AEM0004119
IR00040754 ASM0009494
KT00009699 CA00119M72
TH00004845 UV00006550
USW0001395 AEM0004119
CHM0005836 SF00113025
CHM0005434 None
KSM0004710 GR00016723
KS00004711 GR00016723
JA00004766 None
JA00004775 None
JAM0004777 None


In [35]:
# new idea
# find a city station 
# fidn the nearby synoptic stations
# classify as urban or rural

for city in ['LONDON', 'NEW YORK','MEXICO CITY', 'PARIS', 'MADISON']:
    # first check that there is a WMO station 
    if ghcn[ghcn['name'].str.contains(city)].values.shape[0] ==0: 
        print 'No station found for %s'%city
    else : 
        id = ghcn[ghcn['name'].str.contains(city)]['GHCN ID'].iloc[0]
        print city, id 
        # find closest couple stations
        closeststations = tree.query(giss[giss['WMO ID']==synopticID][['lon', 'lat']].values , k =5, distance_upper_bound=5)


LONDON ASN0006912
NEW YORK USC0021590
MEXICO CITY MX00076680
PARIS ASN0000207
MADISON US10stan00


In [228]:
dictionary = {651: 'LONDON', 'MANCHESTER', 'YORK', 'BIRMINGHAM';
             425: 'MADISON', 'NEW YORK', 'BIRMINGHAM', 'MINNEAPOLIS', 'HOUSTON', 'ATLANTA', 'SEATTLE', 'LOS ANGELES'}

SyntaxError: invalid syntax (<ipython-input-228-ca34f36d1902>, line 1)

In [227]:
city = 'NEW YORK'
# city = 'LONDON'
# country = 425#651 #425 US, 651 UK
cities = 
countries = 425
frames = []
for (city, country) in zip(cities,countries): 
    ids = giss[(giss['name'].str.contains(city)) & (giss['icc country code']==country)]
    if ids.shape[0]==0 : 
        print 'no stations found for %s'%city
    elif ids.shape[0]==1 :
        id = ids['WMO ID'].iloc[0]
    else: 
        print 'picking the most synoptic station'
        id = ids.loc[ids['3 digit modifier'].argmin()]['WMO ID']
        #id = 
    # find closest couple stations
    closeststations = tree.query(giss[giss['WMO ID']==id ].iloc[0][['lon', 'lat']].values, k =5, distance_upper_bound=.5)

    urbanID = giss.loc[giss.iloc[closeststations[1]]['BI'].argmax()]['WMO ID']
    ruralID = giss.loc[giss.iloc[closeststations[1]]['BI'].argmin()]['WMO ID']
    #print closeststations[1]
    #print city, id 
    frames.append([city, urbanID, ruralID])

picking the most synoptic station


In [212]:
closeststations[1]

array([5173, 4074, 4962, 4961, 5174, 4003, 4055, 4871, 7364, 7364])

In [222]:
giss.iloc[closeststations[1]]

Unnamed: 0,icc country code,WMO ID,3 digit modifier,name,lat,lon,elevation,TELe,P,Pop,Tp,V,Lo,Co,Airport,ds,Vege,bi,BI
5173,425,74486,1,NEW YORK/FLOYD BENNETT FIELD,40.58,73.88,10,3,U,9342,FL,xx,O,3,A,1,WARM CROPS,C,30
4074,425,305,801,NY CITY CNTRL PARK,40.78,73.97,40,18,U,9342,FL,xx,O,5,x,-9,WARM CROPS,C,146
4962,425,72503,2,NEW YORK/FORT TOTTEN,40.78,73.77,7,22,U,9342,FL,xx,O,1,x,-9,WARM CROPS,C,92
4961,425,72502,3,NEWARK/INT'L ARPT,40.7,74.17,8,20,U,9342,FL,xx,O,7,A,1,WARM CROPS,C,139
5174,425,74486,2,HEMPSTEAD/MITCHELL FLD AFB,40.73,73.6,38,24,U,9342,FL,xx,O,12,A,1,WARM CROPS,C,105


In [224]:
urbanID = giss.loc[giss.iloc[closeststations[1]]['BI'].argmax()]['WMO ID']
ruralID = giss.loc[giss.iloc[closeststations[1]]['BI'].argmin()]['WMO ID']