In [1]:
# import libraries
%matplotlib inline
import numpy as np
#import csv
import matplotlib.pyplot as plt
import pandas as pd
import glob
import ulmo
import os
import scipy.spatial
import scipy.stats
import matplotlib
matplotlib.rcParams.update({'font.size': 18})
from scipy.odr import Model, Data, ODR
from scipy.stats import linregress
def f(B, x):
    '''Linear function y = m*x + b'''
    # B is a vector of the parameters.
    # x is an array of the current x values.
    # x is in the same format as the x passed to Data or RealData.
    #
    # Return an array in the same format as y passed to Data or RealData.
    return B[0]*x + B[1]

In [2]:
# plotting utilities
def lin_trend_plot(start_x, y,title, xlabel, ylabel) : 
# plots x,y (need to be np array) and calculates and prints their best fit line
    ind = ~np.isnan(y.values)# & ~np.isnan(x) # subset values that aren't NaNs
    x = np.arange(0,y.shape[0])
    m, b, r_value, p, std_err = scipy.stats.linregress(x[ind],y[ind])
    plt.scatter(x+start_x,y)
    plt.plot(x+start_x, m*x+b, color = 'black')
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel(xlabel)
    # annotate the linear reqression, y = mx+b
    plt.annotate('y = %.2f x + %.2f'%(m,b), xy=(.5, .9), xycoords='axes fraction',  horizontalalignment='left', verticalalignment='bottom')
    plt.annotate('r = %.2f'%(r_value), xy=(.5, .85), xycoords='axes fraction',  horizontalalignment='left', verticalalignment='bottom')
    plt.annotate('p = %.2f'%(p), xy=(.5, .8), xycoords='axes fraction',  horizontalalignment='left', verticalalignment='bottom')
    plt.annotate('N = %i'%(ind.shape), xy=(.5, .75), xycoords='axes fraction',  horizontalalignment='left', verticalalignment='bottom')
    return m, b, r_value, p, std_err

def easy_scatter(x,y,title, xlabel, ylabel) : 
# plots x,y (need to be np array) and calculates and prints their best fit line
    ind = ~np.isnan(y) & ~np.isnan(x) # subset values that aren't NaNs
    m,b = np.polyfit(x[ind],y[ind],1)
    r, p = scipy.stats.pearsonr(x[ind], y[ind]) #np.corrcoef(x[ind],y[ind])[0,1]
    plt.scatter(x,y)
    plt.plot(x, m*x+b, color = 'black')
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel(xlabel)
    # annotate the linear reqression, y = mx+b
    plt.annotate('y = %.2f x + %.2f'%(m,b), xy=(.5, .9), xycoords='axes fraction',  horizontalalignment='left', verticalalignment='bottom')
    plt.annotate('r = %.2f'%(r), xy=(.5, .85), xycoords='axes fraction',  horizontalalignment='left', verticalalignment='bottom')
    plt.annotate('p = %.2f'%(p), xy=(.5, .8), xycoords='axes fraction',  horizontalalignment='left', verticalalignment='bottom')
    plt.annotate('N = %i'%(ind.shape), xy=(.5, .75), xycoords='axes fraction',  horizontalalignment='left', verticalalignment='bottom')
    return m, r, p

def how_UHI_varies(mean_UHIs, closeststations_distance): 
    plt.figure(figsize= [12,4])
    plt.subplot(1,3,1)
    data = mean_UHIs[~np.isnan(mean_UHIs)]
    plt.hist(data)
    plt.xlabel('$\Delta T$ ($\Delta ^\circ$C)')
    plt.ylabel('Count')
    plt.annotate('min = %.2f'%(data.min()), xy=(0, .94), xycoords='axes fraction',  horizontalalignment='left', verticalalignment='bottom')
    plt.annotate('max = %.2f'%(data.max()), xy=(0, .89), xycoords='axes fraction',  horizontalalignment='left', verticalalignment='bottom')
    plt.annotate('$\mu$ = %.2f'%(data.mean()), xy=(0, .84), xycoords='axes fraction',  horizontalalignment='left', verticalalignment='bottom')
    plt.annotate('$\sigma$ = %.2f'%(data.std()), xy=(0, .79), xycoords='axes fraction',  horizontalalignment='left', verticalalignment='bottom')
    plt.title('Histogram of $\Delta T$')

    plt.subplot(1,3,2)
    m,r,p = easy_scatter(closeststations_distance, mean_UHIs, 
                 'Distance to rural station vs. $\Delta T$ ', 
             'Distance to rural station (degrees lat/lon)', '$\Delta T$ ($\Delta ^\circ$C)',
             )

    plt.subplot(1,3,3)
    m2,r2,p2 = easy_scatter(ghcn.loc[rural_station_list].Brightness.values, mean_UHIs, 
                 'Brightness versus $\Delta T$', 
                 'Satellite Brightness Index', '$\Delta T$ ($\Delta ^\circ$C)',
                 )
    return m,r,p

In [3]:
# readin in GHCN data 
ghcn = pd.read_fwf('data/ghcnd-stations.txt', colspecs = [(0,11), (12,19), (21,29), (31,36),(38,40), (41,70), (72,74),(76,78),(80,85)], header = None) 
colnames = ['GHCN ID', 'lat', 'lon', 'elevation', 'state', 'name', 'gsn flag', 'HCN/CRN FLAG', 'WMO ID']
ghcn.columns = colnames

# append the brightness index 
BI = np.load('data/brightnessGHCN.npy')
ghcn['Brightness'] = BI

currentstations = ulmo.ncdc.ghcn_daily.get_stations(start_year=1985, end_year = 2016, elements = ['TMIN', 'TMAX', 'AWND'], as_dataframe=True, update=False)
currentGHCNstations = np.intersect1d(currentstations.id, ghcn['GHCN ID'].values)
ghcnSubset = ghcn.set_index('GHCN ID').loc[currentstations.id.values]
# at this point, ghcn must have the station id set as the index 

ghcn_noairport = ghcn[~ghcn.name.str.contains('INTL')]
ghcn_noairport = ghcn_noairport[~ghcn_noairport.name.str.contains(' AP')]
ghcn_noairport = ghcn_noairport[~ghcn_noairport.name.str.contains('AIRPORT')]
ghcn = ghcn_noairport.set_index('GHCN ID').loc[currentstations.id.values]

# generate atlas of cities and k-d tree 
atlas = pd.read_csv('data/world_cities.csv')
atlas = atlas[(atlas['country'] == 'United States of America')]# & (atlas['pop']>100000)]
atlas = atlas.set_index('city')
tree = scipy.spatial.cKDTree(ghcn[['lon', 'lat']].values, leafsize=100)
#atlas = atlas[atlas['pop']> 500000]

paired_df = pd.read_csv('USpairs2005-2015.csv').set_index('City', drop = False)
paired_df = paired_df[paired_df['Urban distance'] < 0.25]

results = np.ones([paired_df.shape[0],11])*np.nan # save out min,max, mean, std, m,r,p
results_filepath = 'plots/version7/errorbars/'

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike


In [4]:
!ls plots/s/version7/errorbars

[34mslopes[m[m [34mtrend[m[m  [34mvalues[m[m


In [4]:
!mkdir plots/version6/errorbars
!mkdir plots/version6/errorbars/trend
!mkdir plots/version6/errorbars/slopes
!mkdir plots/version6/errorbars/values

In [5]:
city = 'Mesa'
### Set variables
print(city)
urbanID = paired_df.loc[city]['Urban station']
urbandata = ulmo.ncdc.ghcn_daily.get_data(urbanID,
                                     as_dataframe=True, update = False)
urban_tmin = pd.to_numeric(urbandata['TMIN']['2000-01-01':].value/10.) 
urban_tmin.loc[urbandata['TMIN']['2000-01-01':]['qflag'].dropna().index] = np.nan
urban_summer = urban_tmin[(urban_tmin.index.month >= 6) & (urban_tmin.index.month <= 8)]
# set lat/lon
try: # if  
    if atlas[atlas.index==city].shape[0]>1 : # if more than one hit for the city, eg, Kansas City (MO and KS)
            lat = atlas.loc[city]['lat'][0]
            lon = atlas.loc[city]['lng'] [1]
    else: 
        lat = atlas[atlas['pop']> 300000].loc[city]['lat']
        lon = atlas[atlas['pop']> 300000].loc[city]['lng']        
# try to fix if there are two such cities in the atlas; take the second one
except IndexError: 
    lat = atlas[atlas['pop']> 300000].loc[city]['lat'][1]
    lon = atlas[atlas['pop']> 300000].loc[city]['lng'][1]
# find the closest stations
closeststations = tree.query([lon,lat], k =35, distance_upper_bound=1.5) #used in pairing algorithm: .5
cols = ['Distance', 'Index', 'GHCNID', 'Brightness']

Mesa


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  axis=1)
.resample() is now a deferred operation
You called index(...) on this deferred object which materialized it into a dataframe
by implicitly taking the mean.  Use .resample(...).mean() instead
  daily_index = element_df.resample('D').index.copy()


In [8]:
lat, lon

(33.423914609999997, -111.73608440000001)

In [7]:
closeststations

(array([ 0.08292739,  0.09221545,  0.15282465,  0.17957532,  0.18491879,
         0.22573856,  0.24113618,  0.2579119 ,  0.3215709 ,  0.32743828,
         0.33666198,  0.34675309,  0.35318386,  0.35533968,  0.38165245,
         0.39480938,  0.42021496,  0.42535645,  0.42784842,  0.47419441,
         0.47860788,  0.52119142,  0.53750026,  0.59025405,  0.60893941,
         0.62185774,  0.63139606,  0.63681901,  0.64329358,  0.64584151,
         0.65098671,  0.67889616,  0.69537766,  0.83784176,  0.84701399]),
 array([11159, 11113, 11211, 11121, 11230, 11096, 11223, 11062, 11164,
        11187, 11221, 22627, 11196, 22404, 11108, 11069, 11149, 11090,
        11154, 11092, 22407, 11117, 11091, 11259, 11192, 11194, 11151,
        11195, 19291, 19368, 11227, 11115, 11064, 11094, 11184]))

In [6]:
brightness_threshold = 25
#redo_cities = np.where(np.isnan(results[:,0]))[0]
for city in paired_df.index: # probalby need to redo 1-15
    ### Set variables
    print(city)
    
    urbanID = paired_df.loc[city]['Urban station']
    urbandata = ulmo.ncdc.ghcn_daily.get_data(urbanID,
                                         as_dataframe=True, update = False)
    urban_tmin = pd.to_numeric(urbandata['TMIN']['2000-01-01':].value/10.) 
    urban_tmin.loc[urbandata['TMIN']['2000-01-01':]['qflag'].dropna().index] = np.nan
    urban_summer = urban_tmin[(urban_tmin.index.month >= 6) & (urban_tmin.index.month <= 8)]
    # set lat/lon
    try: # if  
        if city == 'Grand Prairie': 
                lat = atlas.loc['Arlington']['lat']
                lon = atlas.loc['Arlington']['lng']
        elif atlas[atlas.index==city].shape[0]>1 : # if more than one hit for the city, eg, Kansas City (MO and KS)
                lat = atlas.loc[city]['lat'][0]
                lon = atlas.loc[city]['lng'][1]
        else: 
            lat = atlas[atlas['pop']> 300000].loc[city]['lat']
            lon = atlas[atlas['pop']> 300000].loc[city]['lng']        
    # try to fix if there are two such cities in the atlas; take the second one
    except IndexError: 
        lat = atlas[atlas['pop']> 300000].loc[city]['lat'][1]
        lon = atlas[atlas['pop']> 300000].loc[city]['lng'][1]

    # find the closest stations
    closeststations = tree.query([lon,lat], k =35, distance_upper_bound=1.5) #used in pairing algorithm: .5
    cols = ['Distance', 'Index', 'GHCNID', 'Brightness']
    try: 
        # get station ids
        stations = pd.DataFrame(np.array([closeststations[0][~np.isinf(closeststations[0])].astype(float),
                                          closeststations[1][~np.isinf(closeststations[0])],
                                          ghcn.iloc[closeststations[1][~np.isinf(closeststations[0])]]['Brightness'].index,
                                          ghcn.iloc[closeststations[1][~np.isinf(closeststations[0])]]['Brightness'].values]).T, 
                    columns = cols).set_index('GHCNID').drop(urbanID)
        stations = stations[~np.isinf(stations['Distance'].values.astype(float))]
    except IndexError: 
        closeststations = closeststations[0]
        stations = pd.DataFrame(np.array([closeststations[0].astype(float),closeststations[1], ghcn.iloc[closeststations[1]]['Brightness'].index, ghcn.iloc[closeststations[1]]['Brightness'].values]).T, 
                columns = cols).set_index('GHCNID').drop(urbanID)
        stations = stations[~np.isinf(stations['Distance'].values.astype(float))]
    except ValueError: # got this when urban ID wasn't in the stations for Jacksonville, should maybe check
        stations = pd.DataFrame(np.array([closeststations[0].astype(float),closeststations[1], ghcn.iloc[closeststations[1]]['Brightness'].index, ghcn.iloc[closeststations[1]]['Brightness'].values]).T, 
            columns = cols).set_index('GHCNID')#.drop(urbanID)
        stations = stations[~np.isinf(stations['Distance'].values.astype(float))]
    #rural_stations = stations.index
    # select only dim stations
    rural_station_list = stations[stations.Brightness < 25].index
    rural_distance = stations[stations.Brightness < 25].Distance.values.astype(float)
    # if there are rural stations available, do analysis
    if len(rural_station_list) > 0 :# & (ind.sum() > 0): 
        # preallocate
        mean_rural = np.ones(len(rural_station_list))*np.nan
        mean_ms = np.ones(len(rural_station_list))*np.nan
        mean_rs = np.ones(len(rural_station_list))*np.nan
        mean_ps = np.ones(len(rural_station_list))*np.nan
        mean_Trslopes =  np.ones(len(rural_station_list))*np.nan
        
        ii = 0 
        # loop over rural stations
        for rural_id in rural_station_list: #[2:] : 
            # read in rural data
            ruraldata = ulmo.ncdc.ghcn_daily.get_data(rural_id, as_dataframe=True, update=False)
            ######
            ###### Import data
            ######
            rural_tmin = pd.to_numeric(ruraldata['TMIN']['2000-01-01':].value/10.) #rural tmin
            # drop data with flags here
            rural_tmin.loc[ruraldata['TMIN']['2000-01-01':]['qflag'].dropna().index] = np.nan
            # extract summertime data 
            rural_summer = rural_tmin[(rural_tmin.index.month >= 6) & (rural_tmin.index.month <= 8)]            
            ######
            ###### calculate mean rural temp
            ######  
            mean_rural[ii] = np.nanmean(rural_summer)
            ######
            ###### calculate slope 
            ######
            ind = ~np.isnan(rural_summer) & ~np.isnan(urban_summer[rural_summer.index])
            if ind.sum() > 0 :
                if urban_summer.shape[0] < rural_summer.shape[0] : 
                    x = rural_summer[urban_summer.index]
                    y = urban_summer
                    ind = ~np.isnan(x) & ~np.isnan(y)
                else: 
                    x = rural_summer
                    y = urban_summer[rural_summer.index]
                    ind = ~np.isnan(x) & ~np.isnan(y)
                x = rural_summer
                y = urban_summer
                linear = Model(f)
                ind = ~np.isnan(y) & ~np.isnan(x)
                linreg = scipy.stats.linregress(x[ind], y[ind])
                mydata = Data(x[ind], y[ind])
                myodr = ODR(mydata, linear, beta0=linreg[0:2]) # beta0 are initial guesses for paremeter values, ie, intercept and slope
                myoutput = myodr.run()
                m = myoutput.beta[0]
#                m, b, r, p, std_err = scipy.stats.linregress(x[ind],y[ind])
                #print(m)
                mean_ms[ii] = m
#                 mean_rs[ii] = r
#                 mean_ps[ii] = p
                ######
                ###### calculate rural trend of annual means (to solve for data availability and to get meaningful units)
                ######
#                 try: 
#                     #y = np.nanmean(rural_summer.groupby(rural_summer.index.year)).dropna()
#                     #x = y.index - y.index[0]
#                     y = rural_summer.groupby(rural_summer.index.year).mean()
#                     m, b, r, p, std_err = scipy.stats.linregress(y.dropna().index, y.dropna())#scipy.stats.linregress(x,y)
#                     mean_Trslopes[ii] = m
#                 except Error: 
#                     print('investigate error')
#                 except IndexError: 
#                     mean_Trslopes[ii] = np.nan
#                 except ValueError:
#                     mean_Trslopes[ii] = np.nan
#                 except KeyError: 
#                     print('missing data from station', rural_id)
#                 except TypeError: 
#                     print('probably not enough data')
                #except: 
                #    print 'unkown error'
            ii = ii+1
        print(mean_ms)
        #np.save(results_filepath+'trend/'+ city.replace(" ", "")[0:5]+'Trtrend.npy', mean_Trslopes)
        np.save(results_filepath+'slopes/'+ city.replace(" ", "")[0:5]+'m_rural.npy', mean_ms)
#         np.save(results_filepath+'slopes/'+ city.replace(" ", "")[0:5]+'r.npy', mean_ms)
#         np.save(results_filepath+'slopes/'+ city.replace(" ", "")[0:5]+'p.npy', mean_ps)
#         np.save(results_filepath+'values/'+ city.replace(" ", "")[0:5]+'rural.npy', mean_rural)
    else: 
        print( 'Not enough rural stations for %s'%city)
#     if np.mod(i,10) == 0 : 
#         np.savetxt(results_filepath+'results.csv', results, delimiter = ',')
        
#np.savetxt(results_filepath+'results.csv', results, delimiter = ',')

Mesa


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  axis=1)
.resample() is now a deferred operation
You called index(...) on this deferred object which materialized it into a dataframe
by implicitly taking the mean.  Use .resample(...).mean() instead
  daily_index = element_df.resample('D').index.copy()


[ 1.02294651  0.99073426  1.02384605  0.84159405  0.85050474  0.98667851
  1.20483194         nan  1.09744474  1.02478634  1.05911625  1.09232545
  1.10323426]
Grand Prairie
[ 0.88958363  1.00545357         nan  1.05268351  1.18986988  1.10902873
  0.94887205  0.72398674  0.79853252]
Irvine


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  dataframe[column_name][dates] = element_df[col][months]


[ 0.29240393  0.8650078   0.48153895  0.54888717  0.23547971  0.18525485
         nan  0.27829222]
Fort Lauderdale
[ 1.72715821  0.75075773  1.34062965  0.66454904  0.63870218  1.09675602
  1.53471111  1.30117439  0.53613561  1.39923549  0.64588618  0.62917752]
Long Beach
[ 0.28471445  0.24341374         nan  0.22572932         nan  0.25566555]
Hartford
[ 1.05455707         nan  0.95739633         nan         nan         nan
         nan  1.08485303         nan         nan  0.9588014          nan
  0.98432504  0.98869272         nan  1.04778304  0.95791857  0.87544572
  0.96995058]
Providence
[ 1.10294286         nan  1.03828077  1.25691276  1.08111699  1.05666664]
Virginia Beach
[ 1.3029686   1.25121115  0.92558287         nan  0.98981312  0.73963964
  0.9458948          nan  0.85072386  0.99115076  0.89195559  0.90346237
         nan  1.57276131  1.02457035  0.89869785  0.89216087  1.00674912
  1.11598313  0.95669845  0.91925651  0.92284343]
Oakland
[        nan  0.11121076  0.658712

# 