# Descarga del fichero

In [1]:
import numpy as np
import pandas as pd

In [2]:
import os
import urllib.request

def dl_data(url, output):
    """Download ``url`` and write the response body to the file ``output``.

    Missing parent directories of ``output`` are created. A failed request
    is reported on stdout rather than raised, so the function always
    returns ``None``.

    Parameters
    ----------
    url : str
        Address to fetch; anything ``urllib.request.urlopen`` accepts.
    output : str
        Destination file path. A bare file name (no directory part) is
        written to the current working directory.
    """
    # Imported locally: the notebook only imports ``urllib.request``, so the
    # bare name ``URLError`` was previously undefined when an error occurred.
    from urllib.error import URLError

    try:
        # The context manager closes the connection even if writing fails.
        with urllib.request.urlopen(url) as response:
            print ("Downloading " + url)

            # os.makedirs('') raises, so only create a directory when the
            # output path actually contains one.
            target_dir = os.path.dirname(output)
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)
            with open(output, "wb") as local_file:
                local_file.write(response.read())

    except URLError:
        print ("Error", url)

In [2]:
# Commented out so it is not run by accident: the download takes a long time.

#url = "https://data.seattle.gov/api/views/egc4-d24i/rows.csv?accessType=DOWNLOAD&bom=true&format=true"
#output1 = './data/Road_Weather_Information_Stations.csv'
#dl_data(url,output1)

In [3]:
# Load the raw station readings; the DateTime column is parsed into timestamps on read.
df = pd.read_csv(
    './data/Road_Weather_Information_Stations.csv',
    parse_dates=['DateTime'],
)

In [4]:
# Preview the first five rows of the raw readings.
df.head(5)

Unnamed: 0,StationName,StationLocation,DateTime,RecordId,RoadSurfaceTemperature,AirTemperature
0,35thAveSW_SWMyrtleSt,"(47.53918, -122.37658)",2014-03-03 12:42:00,672560,53.88,53.88
1,35thAveSW_SWMyrtleSt,"(47.53918, -122.37658)",2014-03-03 12:43:00,672561,54.05,54.05
2,35thAveSW_SWMyrtleSt,"(47.53918, -122.37658)",2014-03-03 12:44:00,672562,54.21,54.21
3,35thAveSW_SWMyrtleSt,"(47.53918, -122.37658)",2014-03-03 12:45:00,672563,54.38,54.38
4,35thAveSW_SWMyrtleSt,"(47.53918, -122.37658)",2014-03-03 12:46:00,672564,54.54,54.54


In [5]:
# Discard the two columns that the rest of the notebook does not use.
df = df.drop(['StationName', 'RecordId'], axis=1)

In [6]:
# Report column dtypes and memory usage; 'deep' asks pandas to measure object columns fully.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31642625 entries, 0 to 31642624
Data columns (total 4 columns):
StationLocation           object
DateTime                  datetime64[ns]
RoadSurfaceTemperature    float64
AirTemperature            float64
dtypes: datetime64[ns](1), float64(2), object(1)
memory usage: 3.1 GB


In [7]:
# List the calendar years present in the readings.
df['DateTime'].dt.year.unique()

array([2014, 2017, 2015, 2016, 2018])

# Filtrado año 2016 de 08:00 a 20:00h

In [8]:
# Keep only the readings taken during 2016 ...
df = df[df['DateTime'].dt.year == 2016]
# ... and list which hours of the day appear in that subset.
df['DateTime'].dt.hour.unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23])

In [9]:
# Keep readings whose hour is 8..19 inclusive, i.e. from 08:00 up to 19:59.
df = df[df['DateTime'].dt.hour.between(8, 19)]

In [10]:
# Truncate every timestamp to the start of its hour (drop minutes and seconds).
# `dt.floor` is vectorised, unlike a per-row `apply`, and `assign` returns a new
# frame, so the filtered slice from the previous cell is not modified in place
# (which would trigger SettingWithCopyWarning).
df = df.assign(DateTime=df.DateTime.dt.floor('h'))

In [11]:
# Hourly mean of both temperatures per station.
# The columns are selected with a list (double brackets): selecting several
# columns with a bare tuple is deprecated and raises in recent pandas versions.
dfg = df.groupby(['StationLocation','DateTime'])[['RoadSurfaceTemperature', 'AirTemperature']].mean()

In [12]:
# Turn the (StationLocation, DateTime) group keys back into ordinary columns.
dfg = dfg.reset_index(drop=False)

In [13]:
# Preview the hourly averages.
dfg.head(5)

Unnamed: 0,StationLocation,DateTime,RoadSurfaceTemperature,AirTemperature
0,"(47.53918, -122.37658)",2016-01-01 08:00:00,35.524302,35.524302
1,"(47.53918, -122.37658)",2016-01-01 09:00:00,35.508429,35.508429
2,"(47.53918, -122.37658)",2016-01-01 10:00:00,35.651985,35.651985
3,"(47.53918, -122.37658)",2016-01-01 11:00:00,35.857643,35.857643
4,"(47.53918, -122.37658)",2016-01-01 12:00:00,36.275429,36.275429


In [14]:
# Save the hourly averages without the row index.
dfg.to_csv(
    './data/Road_Weather_Information_Stations_2016_08AM_20PM.csv',
    index=False,
)

In [15]:
# Number of distinct station locations.
len(dfg['StationLocation'].unique())

10

In [16]:
# Give every distinct location an integer code to use as the station id.
dfg['Station'] = pd.Categorical(dfg['StationLocation']).codes

In [17]:
# Split the "(lat, lon)" text into two numeric columns.
df_lat_lon = (
    dfg['StationLocation']
    .str.strip('()')
    .str.split(', ', expand=True)
    .rename(columns={0: 'latitude', 1: 'longitude'})
    .astype(float)
)
df_lat_lon.head(5)

Unnamed: 0,latitude,longitude
0,47.53918,-122.37658
1,47.53918,-122.37658
2,47.53918,-122.37658
3,47.53918,-122.37658
4,47.53918,-122.37658


In [18]:
# Place the parsed coordinates next to the hourly averages (aligned on the row index).
dfpois = pd.concat([dfg, df_lat_lon], axis='columns', sort=True)
dfpois.head(5)

Unnamed: 0,StationLocation,DateTime,RoadSurfaceTemperature,AirTemperature,Station,latitude,longitude
0,"(47.53918, -122.37658)",2016-01-01 08:00:00,35.524302,35.524302,0,47.53918,-122.37658
1,"(47.53918, -122.37658)",2016-01-01 09:00:00,35.508429,35.508429,0,47.53918,-122.37658
2,"(47.53918, -122.37658)",2016-01-01 10:00:00,35.651985,35.651985,0,47.53918,-122.37658
3,"(47.53918, -122.37658)",2016-01-01 11:00:00,35.857643,35.857643,0,47.53918,-122.37658
4,"(47.53918, -122.37658)",2016-01-01 12:00:00,36.275429,36.275429,0,47.53918,-122.37658


In [21]:
# The textual location is redundant once latitude/longitude exist as columns.
dfpois = dfpois.drop('StationLocation', axis=1)

In [19]:
# Check which hours of the day remain after filtering.
dfpois['DateTime'].dt.hour.unique()

array([ 8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])

# Conversión de temperaturas de Fahrenheit a Celsius

In [25]:
def _fahrenheit_to_celsius(fahrenheit):
    """Convert a Series from Fahrenheit to Celsius, kept to two decimals.

    Uses (Fahrenheit - 32) * 5/9; the two-decimal rounding is done by
    formatting each value as text and parsing it back to float.
    """
    celsius = (fahrenheit - 32) * 5.0/9.0
    return celsius.map("{0:.2f}".format).astype(float)

dfpois['RoadSurfaceTemperature'] = _fahrenheit_to_celsius(dfpois['RoadSurfaceTemperature'])
dfpois['AirTemperature'] = _fahrenheit_to_celsius(dfpois['AirTemperature'])

In [26]:
dfpois.head()

Unnamed: 0,DateTime,RoadSurfaceTemperature,AirTemperature,Station,latitude,longitude
0,2016-01-01 08:00:00,1.96,1.96,0,47.53918,-122.37658
1,2016-01-01 09:00:00,1.96,1.96,0,47.53918,-122.37658
2,2016-01-01 10:00:00,2.03,2.03,0,47.53918,-122.37658
3,2016-01-01 11:00:00,2.14,2.14,0,47.53918,-122.37658
4,2016-01-01 12:00:00,2.37,2.37,0,47.53918,-122.37658


In [31]:
# Rename by explicit mapping instead of assigning a positional list: a positional
# assignment silently mislabels the data (or raises) if the column order or count
# differs from what is expected. latitude/longitude keep their names.
dfpois = dfpois.rename(columns={
    'DateTime': 'timestamp',
    'RoadSurfaceTemperature': 'road_temp',
    'AirTemperature': 'air_temp',
    'Station': 'station_closest',
})

In [32]:
# Save the converted, renamed table without the row index.
dfpois.to_csv(
    './data/Road_Weather_Information_Stations_2016_08AM_20PM_FINAL.csv',
    index=False,
)

# Ubicación de las 10 estaciones de medida

In [28]:
# One row per station with its coordinates and number of hourly records.
# The station id column is called 'Station' before the column-rename cell and
# 'station_closest' after it; accept either so the notebook runs top to bottom,
# and always expose it as 'Station', which later cells expect.
station_col = 'station_closest' if 'station_closest' in dfpois.columns else 'Station'
stations = (
    dfpois.groupby([station_col, 'latitude', 'longitude'])
          .size()
          .reset_index(name='count')
          .rename(columns={station_col: 'Station'})
)
stations

Unnamed: 0,Station,latitude,longitude,count
0,0,47.53918,-122.37658,4358
1,1,47.547426,-122.314114,4304
2,2,47.571389,-122.351743,4368
3,3,47.571695,-122.370873,4360
4,4,47.59653,-122.317301,4324
5,5,47.598438,-122.335612,4323
6,6,47.633454,-122.387341,3229
7,7,47.643174,-122.347278,3683
8,8,47.66127,-122.301929,4308
9,9,47.692098,-122.31765,3794


In [33]:
# Print, for station ids 0..9, how many distinct timestamps each one has.
for station_id in range(10):
    station_times = dfpois.loc[dfpois['station_closest'] == station_id, 'timestamp']
    print(len(station_times.unique()))

4358
4304
4368
4360
4324
4323
3229
3683
4308
3794


# Estación más próxima a cada parquímetro

In [86]:
# Load the parking-meter coordinates (columns shown below: element_key, latitude, longitude).
coord = pd.read_csv('./data/Coord_EK.csv')

In [87]:
# Preview the parking-meter coordinates.
coord.head(5)

Unnamed: 0,element_key,latitude,longitude
0,1001,47.602862,-122.334703
1,1002,47.602997,-122.334538
2,1005,47.603602,-122.335382
3,1006,47.603725,-122.335171
4,1009,47.60501,-122.336669


In [68]:
from math import radians, cos, sin, asin, sqrt, acos

def haversine(lon1, lat1, lon2, lat2, radius=6373):
    """Return the great-circle distance between two points (haversine formula).

    Parameters
    ----------
    lon1, lat1 : float
        Longitude and latitude of the first point, in decimal degrees.
    lon2, lat2 : float
        Longitude and latitude of the second point, in decimal degrees.
    radius : float, optional
        Sphere radius; the result is expressed in the same unit. The default
        6373 is presumably an Earth radius in kilometres.

    Returns
    -------
    float
        Central angle between the points multiplied by ``radius``.
    """
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    # Rounding can push `a` a hair above 1 for (near-)antipodal points, which
    # would make asin raise a domain error; clamp it to the valid range.
    a = min(1.0, a)
    c = 2 * asin(sqrt(a))
    return c * radius

In [88]:
# Assign to every parking meter the id of its nearest weather station.
# The id is read from the `Station` column rather than taken from the row
# position inside `stations`, so the result stays correct even if `stations`
# is filtered or re-ordered.
station_points = list(zip(stations.Station, stations.latitude, stations.longitude))

def _closest_station(longitude, latitude):
    """Return the `Station` id with the smallest haversine distance to the point."""
    # min() keeps the first station on ties, like list.index(min(...)) did.
    nearest = min(station_points,
                  key=lambda st: haversine(longitude, latitude, st[2], st[1]))
    return int(nearest[0])

coord['station_closest'] = pd.Series(
    [_closest_station(lon, lat) for lon, lat in zip(coord.longitude, coord.latitude)],
    index=coord.index,
    dtype='int64',
)

In [89]:
# Preview the parking meters together with their assigned station.
coord.head(5)

Unnamed: 0,element_key,latitude,longitude,station_closest
0,1001,47.602862,-122.334703,5
1,1002,47.602997,-122.334538,5
2,1005,47.603602,-122.335382,5
3,1006,47.603725,-122.335171,5
4,1009,47.60501,-122.336669,5


In [90]:
# How many parking meters are assigned to each station.
coord['station_closest'].value_counts()

5    864
7    275
4    222
8     83
6     44
9     29
Name: station_closest, dtype: int64

In [97]:
# Save the parking meters with their nearest station, without the row index.
coord.to_csv(
    './data/Coord_EK_stations.csv',
    index=False,
)

# Visualización sobre un mapa

In [35]:
import seaborn as sns
import folium
%matplotlib inline

In [51]:
# One colour per station id; only the first ten entries are used below.
colors = ['gold','magenta','yellowgreen','red','green','blue',
          'tan','black','dodgerblue','plum','grey','purple',
         'darkorange','darkcyan','limegreen','blueviolet','skyblue','maroon']

this_map = folium.Map(prefer_canvas=True, max_bounds=False)

def plotDot(df, radius=None, color=None):
    """Add one circle marker to `this_map` for a row with `latitude`/`longitude`.

    Parameters
    ----------
    df : row-like object exposing `latitude` and `longitude` attributes
        (here: a row passed by `DataFrame.apply(..., axis=1)`).
    radius : number, optional
        Marker radius. When omitted, the module-level `r` is used, as before.
    color : str, optional
        Marker fill colour. When omitted, the module-level `usedColor` is
        used, as before.
    """
    # Explicit arguments replace the hidden dependency on mutable globals;
    # the fallback keeps existing `plotDot(row)` calls working.
    if radius is None:
        radius = r
    if color is None:
        color = usedColor
    folium.CircleMarker(location=[df.latitude, df.longitude],
                        radius=radius,
                        fill=True,
                        fill_opacity=0.8,
                        fill_color=color,
                        color='whitesmoke',
                        weight=0.5).add_to(this_map)

# Large dot for each station, small dots in the same colour for the parking
# meters assigned to it. Extra keyword arguments of `apply` are forwarded to plotDot.
for i in range(0,10):
    stations.loc[stations.Station == i].apply(plotDot, axis=1, radius=12, color=colors[i])
    coord.loc[coord.station_closest == i].apply(plotDot, axis=1, radius=5, color=colors[i])


# Zoom the map to the plotted markers and record the bounds on the map object.
map_bounds = this_map.get_bounds()
this_map.fit_bounds(map_bounds, max_zoom=20)
this_map.max_lat = map_bounds[1][0]
this_map.min_lat = map_bounds[0][0]
this_map.max_lon = map_bounds[1][1]
this_map.min_lon = map_bounds[0][1]

this_map.save('road_weather_info_stations.html')
this_map

# Completando la serie

In [70]:
# Keep only the stations that are the nearest one for at least one parking meter,
# then save that subset.
stations_list = coord['station_closest'].unique()
dfpois = dfpois[dfpois['station_closest'].isin(stations_list)]
dfpois.to_csv('./data/RWIS_filtered.csv', index=False)

In [56]:
# Count the distinct days of the year that have data.
len(dfpois['timestamp'].dt.dayofyear.unique())  # day 258 is missing

365

In [66]:
# Derive the day of the year and the hour from the timestamp.
# `assign` returns a new frame: `dfpois` is a filtered slice at this point, and
# setting columns on it directly triggers SettingWithCopyWarning.
dfpois = dfpois.assign(day_year=dfpois.timestamp.dt.dayofyear,
                       hour=dfpois.timestamp.dt.hour)

In [76]:
# Select the two days adjacent to day 258 (days 257 and 259).
dfpois257_259 = dfpois[dfpois['day_year'].isin([257, 259])]

In [75]:
# Preview the station readings.
dfpois.head(5)

Unnamed: 0,road_temp,air_temp,station_closest,day_year,hour
17390,-3.48,-1.17,4,1,8
17391,-2.61,0.21,4,1,9
17392,-0.98,1.33,4,1,10
17393,1.11,2.97,4,1,11
17394,3.09,6.01,4,1,12


In [84]:
# Drop the columns that must not take part in the later concat/merge. The merge
# target already carries `timestamp`, so leaving it here would produce
# `timestamp_x`/`timestamp_y` instead of the single `timestamp` column.
# errors='ignore' keeps the cell re-runnable once the columns are gone.
dfpois = dfpois.drop(columns=['timestamp', 'latitude', 'longitude'], errors='ignore')

# Estimate day 258 (reported as missing) as the per-station, per-hour mean of
# days 257 and 259. Columns are selected with a list: tuple-style selection on a
# groupby is deprecated and raises in recent pandas versions.
dfpois258 = dfpois257_259.groupby(['station_closest', 'hour'])[['road_temp', 'air_temp']].mean()
dfpois258 = dfpois258.reset_index()
dfpois258['day_year'] = 258
dfpois258.head()

Unnamed: 0,station_closest,hour,road_temp,air_temp,day_year
0,4,8,16.41,15.08,258
1,4,9,18.03,16.56,258
2,4,10,21.45,17.49,258
3,4,11,25.18,20.05,258
4,4,12,28.24,21.99,258


In [57]:
# Hourly records available per station.
dfpois['station_closest'].value_counts()  # 365 * 12 = 4380, every station is missing data

4    4324
5    4323
8    4308
9    3794
7    3683
6    3229
Name: station_closest, dtype: int64

In [86]:
# Every hour of 2016 (a leap year: 366 days), from the first hour to the last.
anyo16 = pd.date_range(start='2016-01-01', end='2016-12-31 23:00:00', freq='H')

In [87]:
# Display the generated hourly index.
anyo16

DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 01:00:00',
               '2016-01-01 02:00:00', '2016-01-01 03:00:00',
               '2016-01-01 04:00:00', '2016-01-01 05:00:00',
               '2016-01-01 06:00:00', '2016-01-01 07:00:00',
               '2016-01-01 08:00:00', '2016-01-01 09:00:00',
               ...
               '2016-12-31 14:00:00', '2016-12-31 15:00:00',
               '2016-12-31 16:00:00', '2016-12-31 17:00:00',
               '2016-12-31 18:00:00', '2016-12-31 19:00:00',
               '2016-12-31 20:00:00', '2016-12-31 21:00:00',
               '2016-12-31 22:00:00', '2016-12-31 23:00:00'],
              dtype='datetime64[ns]', length=8784, freq='H')

In [88]:
# Turn the hourly index into a single-column DataFrame.
anyo16 = pd.DataFrame(anyo16.to_pydatetime())

In [89]:
# Name the column and keep only the hours 8..19 inclusive (08:00 up to 19:00).
anyo16.columns = ['timestamp']
anyo16 = anyo16[anyo16['timestamp'].dt.hour.between(8, 19)]
anyo16.head(5)

Unnamed: 0,timestamp
8,2016-01-01 08:00:00
9,2016-01-01 09:00:00
10,2016-01-01 10:00:00
11,2016-01-01 11:00:00
12,2016-01-01 12:00:00


In [90]:
# Add the merge keys derived from the timestamp.
# `assign` returns a new frame: `anyo16` is a filtered slice at this point, and
# setting columns on it directly triggers SettingWithCopyWarning.
anyo16 = anyo16.assign(day_year=anyo16.timestamp.dt.dayofyear,
                       hour=anyo16.timestamp.dt.hour)

In [95]:
# Build the complete (station, hour) grid: one copy of the 2016 calendar per station.
# All copies are created first and concatenated once, instead of growing a frame
# inside a loop; `anyo16` itself is no longer modified as a side effect.
station_calendars = [anyo16.assign(station_closest=sl) for sl in stations_list]
if station_calendars:
    anyo16_stations = pd.concat(station_calendars, sort=True)
else:
    # pd.concat raises on an empty list; keep the previous empty-frame result.
    anyo16_stations = pd.DataFrame()

In [98]:
# Append the estimated day-258 rows to the measured data.
dfpois = pd.concat([dfpois, dfpois258], axis=0, sort=True)

In [106]:
# Left-join the measurements onto the complete grid so that hours without data
# appear as rows with missing temperatures; `validate` checks that each
# (day, hour, station) key occurs at most once in the measurements.
dfpois_complete = anyo16_stations.merge(
    dfpois,
    how='left',
    on=['day_year', 'hour', 'station_closest'],
    validate='many_to_one',
)

In [136]:
# Count the missing values per column.
dfpois_complete.isna().sum()

day_year           0
hour               0
station_closest    0
timestamp          0
air_temp           0
road_temp          0
dtype: int64

In [135]:
# Fill the missing temperatures by linear interpolation, separately per station.
# The rows of all stations are stacked one after another, so a frame-wide
# interpolation would blend the end of one station's series with the start of
# the next one. limit_direction='both' also fills gaps at the very beginning
# of a station's series.
temp_cols = ['air_temp', 'road_temp']
dfpois_complete[temp_cols] = (
    dfpois_complete.groupby('station_closest')[temp_cols]
                   .transform(lambda s: s.interpolate(limit_direction='both'))
)

In [139]:
# Preview the completed series.
dfpois_complete.head(5)

Unnamed: 0,day_year,hour,station_closest,timestamp,air_temp,road_temp
0,1,8,5,2016-01-01 08:00:00,2.03,-2.63
1,1,9,5,2016-01-01 09:00:00,1.99,-2.27
2,1,10,5,2016-01-01 10:00:00,2.08,-0.89
3,1,11,5,2016-01-01 11:00:00,2.28,2.44
4,1,12,5,2016-01-01 12:00:00,2.57,4.87


In [143]:
# The helper merge keys are no longer needed once the series is complete.
dfpois_complete = dfpois_complete.drop(['day_year', 'hour'], axis=1)

In [144]:
# Save the completed series without the row index.
dfpois_complete.to_csv(
    './data/RWIS_completed.csv',
    index=False,
)