# Daily Traffic Data

Data here is sourced from the <a href="https://dtdapps.coloradodot.info/otis/TrafficData">Colorado Dept of Transportation's Online Transportation Information System.</a>  

### The daily traffic data is returned as an HTML table via the following URL:

<blockquote>https://dtdapps.coloradodot.info/otis/TrafficData/GetDailyTrafficVolumeForStationByMonth/{stationID}/true/{year}/{mm}</blockquote>

### Where mm is the two-digit month.  Here we loop through all months from 1992 - 2022 for each of the stations we're interested in and parse the HTML tables using BeautifulSoup.

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Station IDs are kept as zero-padded strings because they are interpolated directly into the URL.
stationIds = ['000106','000308','000240','000219', '000120', '000236', '000119', '000126']
years = range(1992,2023)
months = range(1,13)
# Abandon a single request after this many seconds instead of letting one stalled
# connection hang the whole scrape.
REQUEST_TIMEOUT_SECONDS = 30

rowList = []    # one dict per HTML table row, keyed by the table headers plus 'station_id'
errorList = []  # one dict per station/year/month that failed or returned no data
for year in years:
    for month in months:
        # Convert month to two-digit equivalent
        mm = str(month).zfill(2)
        for station in stationIds:
            # Overwrite the same output line to show progress
            print (f'{station} {year}/{mm}', end='\r')
            url = f'https://dtdapps.coloradodot.info/otis/TrafficData/GetDailyTrafficVolumeForStationByMonth/{station}/true/{year}/{mm}'
            try:
                response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
            except requests.RequestException as exc:
                # A network failure on one page should be recorded, not abort the remaining requests
                errorList.append({'error_code': f'Request failed: {type(exc).__name__}',
                                  'station_id': station, 'year': year, 'month': mm})
                continue

            # If the status code isn't 200, something went wrong with the request
            if response.status_code != 200:
                errorList.append({'error_code': f'HTTP {response.status_code}',
                                  'station_id': station, 'year': year, 'month': mm})
                continue

            # Name the parser explicitly so the result does not depend on which optional
            # parsers happen to be installed (and to avoid bs4's "guessed at parser" warning).
            soup = BeautifulSoup(response.text, 'html.parser')

            # If there are no table details, then there's no data for that particular station/year/month
            if len(soup.find_all('td')) == 0:
                errorList.append({'error_code': 'No data found',
                                  'station_id': station, 'year': year, 'month': mm})
                continue

            # There's only one table on the page, so we'll take the <tr> tags from that table
            table_rows = soup.find('table').find_all('tr')
            # The headers are in the first row
            headers = [h.text for h in table_rows[0].find_all('th')]
            # For the remaining rows, we extract the text from the <td> tags
            for table_row in table_rows[1:]:
                values = [val.text for val in table_row.find_all('td')]
                # Combine the <td> text with the header row to produce a DataFrame row
                record = dict(zip(headers, values))
                # Add the station ID
                record['station_id'] = station
                rowList.append(record)

000126 2022/12

The table is arranged such that each row is a date and the columns are the traffic counts per hour (labelled as 0h, 1h, 2h ... 23h).  Here we'll reshape the table to a more normalized form and aggregate up to each day.

In [86]:
# Build one wide frame from the scraped rows: one row per station/date/direction,
# with the remaining columns holding the hourly counts.
df = pd.DataFrame(rowList)

# Convert the hourly column headers to integers
# Not used currently, but leaving here in case we decide to go more granular ...
# df.columns = [c.replace('h', '') for c in df.columns]

# Reshape the data so that each row is an hourly count per station/date
df = pd.melt(df, id_vars=['station_id','Count Date','Dir'], var_name='hour', value_name='count')

# Convert the integer columns to integers
# Not used currently, but leaving here in case we decide to go more granular ...
# df['hour'] = df['hour'].astype(int)
df['count'] = df['count'].astype(int)
df['station_id'] = df['station_id'].astype(int)

# Convert the date column to an actual date and rename it to database format
df['Count Date'] = pd.to_datetime(df['Count Date'])
df = df.rename(columns={'Count Date':'date', 'Dir':'direction'})

# Aggregate the data up to each day.  Select 'count' *before* summing so the string
# 'hour' column is never aggregated (summing it is wasted work and its handling
# differs between pandas versions).
df = df.groupby(['station_id','date','direction'])['count'].sum().reset_index()

In [88]:
# Each station reports a primary ('P') and a secondary ('S') direction.  Which compass
# direction those correspond to depends on which way the road at that station runs.
east_west_stations = (236, 119, 126, 106, 120)
north_south_stations = (308, 240, 219)
directions = {
    **{station: {'P': 'east', 'S': 'west'} for station in east_west_stations},
    **{station: {'P': 'north', 'S': 'south'} for station in north_south_stations},
}

# Per station and compass direction: is that direction headed towards one or more ski
# resorts, or away from them?  For some stations both directions lead towards a resort.
relative_resort_dir = {
    308: dict(north='towards', south='away'),
    240: dict(north='towards', south='towards'),
    219: dict(north='towards', south='towards'),
    236: dict(east='towards', west='away'),
    119: dict(east='towards', west='towards'),
    126: dict(east='towards', west='away'),
    106: dict(east='towards', west='towards'),
    120: dict(east='away', west='towards'),
}

In [89]:
# Translate each row's primary/secondary code ('P'/'S') into the compass direction for
# its station.  Walking the two columns in lockstep avoids DataFrame.apply(axis=1),
# which builds a Series for every row.  An unknown station or code still raises KeyError.
df['direction'] = [
    directions[station_id][code]
    for station_id, code in zip(df['station_id'], df['direction'])
]

In [90]:
# Flag each row as heading 'towards' or 'away' from the ski resorts, looked up by station
# and compass direction.  Walking the two columns in lockstep avoids
# DataFrame.apply(axis=1), which builds a Series for every row.
df['relative_resort_direction'] = [
    relative_resort_dir[station_id][compass]
    for station_id, compass in zip(df['station_id'], df['direction'])
]

### Normalize the traffic counts based on population growth in the Denver Metro Area

Population data sourced from <a href="https://www.metrodenver.org/regional-data/demographics/population">here.</a>

In [77]:
# Connect to Database
import getpass
from sqlalchemy.engine.url import URL
from sqlalchemy import create_engine
%reload_ext sql

mypasswd = getpass.getpass()
username = 'dgyw5' # Replace with your pawprint
host = 'pgsql.dsa.lan'
database = 'caponl_22g2'

postgres_db = {'drivername': 'postgres',
               'username': username,
               'password': mypasswd,
               'host': host,
               'database': database}
engine = create_engine(URL(**postgres_db), echo=False)

connection_string = f'postgres://{username}:{mypasswd}@{host}/{database}'
%sql $connection_string
del mypasswd, connection_string

········


In [91]:
# Load the CO population data
co_population_data = %sql select * from co_population_data
co_population_data = co_population_data.DataFrame()

 * postgres://dgyw5:***@pgsql.dsa.lan/caponl_22g2
1152 rows affected.


In [92]:
# Derive the calendar year and month of every observation; these two columns are the
# keys used to join the traffic data to the CO population data
observation_dates = df['date'].dt
df['year'] = observation_dates.year
df['month'] = observation_dates.month

In [93]:
# Attach the population columns to every traffic row by matching on (year, month)
join_keys = ['year', 'month']
traffic_by_period = df.set_index(join_keys)
population_by_period = co_population_data.set_index(join_keys)
df = traffic_by_period.join(population_by_period).reset_index()

In [94]:
# Scale the raw daily count by the Denver metro area population factor to get a
# growth-adjusted traffic figure
df['dma_adj_traffic'] = df['count'].div(df['dma_pop_frac2020'])

In [95]:
# Keep only the columns we want, in the order they should appear
output_columns = [
    'station_id',
    'date',
    'year',
    'month',
    'direction',
    'relative_resort_direction',
    'count',
    'dma_adj_traffic',
]
df = df[output_columns]

### Get Station data

In [96]:
# Download the metadata CSV for every station present in the traffic data and stack the
# results into a single DataFrame.
station_frames = []
for station_id in df.station_id.unique():
    # The API expects the six-digit, zero-padded station ID
    url = f'https://dtdapps.coloradodot.info/otis/API/TRANSYS/GetTrafficStationById/{str(station_id).zfill(6)}.csv'
    r = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page as if it were CSV
    r.raise_for_status()
    # Parse the csv text into a dataframe
    # NOTE(review): a plain split(',') does not understand quoted fields; confirm the
    # API never returns values containing commas.
    temp = pd.DataFrame([t.split(',') for t in r.text.split('\r\n')])
    # column names are in the first row
    temp.columns = temp.iloc[0]
    # drop the first row
    temp = temp.drop(0)
    # drop the extraneous blank row
    temp = temp.dropna()
    station_frames.append(temp)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0, and growing a
# frame inside the loop copies it on every iteration.
stations = pd.concat(station_frames) if station_frames else pd.DataFrame()

In [97]:
# Map the source column names onto the database column names.  The order of this
# dict is also the order the columns are arranged in below.
colNames = {
    'STATIONID': 'station_id',
    'ROUTE': 'route',
    'REFPT': 'begin_ref_point',
    'ENDREFPT': 'end_ref_point',
    'FIPSCITY': 'city_fips',
    'CITY': 'city_name',
    'FIPSCOUNTY': 'county_fips',
    'COUNTY': 'county_name',
    'LOCATION': 'description',
    'COUNTSTATIONFACILITY': 'count_station_facility',
}

# Rename, then keep only the mapped columns in mapping order
stations = stations.rename(columns=colNames)
stations = stations[list(colNames.values())]

## Load the data to the database

### Create the tables

In [98]:
%%sql

-- Rebuild the table from scratch so this cell can be re-run
DROP TABLE IF EXISTS traffic_stations CASCADE;

-- One row per traffic count station, keyed by station_id
CREATE TABLE traffic_stations (
    station_id INT,
    route VARCHAR(4),
    begin_ref_point FLOAT,
    end_ref_point FLOAT,
    city_fips VARCHAR(5),
    city_name VARCHAR(25),
    county_fips VARCHAR(3),
    county_name VARCHAR(15),
    description VARCHAR(100),
    count_station_facility INT,
    CONSTRAINT pk_traffic_stations PRIMARY KEY (station_id)
);

-- Give the listed users full access to the new table
GRANT ALL PRIVILEGES ON traffic_stations TO nnfd2, dgyw5, jwcp64, gfdbq;

 * postgres://dgyw5:***@pgsql.dsa.lan/caponl_22g2
Done.
Done.
Done.


[]

In [100]:
%%sql

-- Rebuild the table from scratch so this cell can be re-run
DROP TABLE IF EXISTS daily_traffic_data CASCADE;

-- One row per station, date and direction. Each row must reference a known station.
CREATE TABLE daily_traffic_data (
    station_id INT,
    date DATE,
    year INT,
    month INT,
    direction VARCHAR(20),
    relative_resort_direction VARCHAR(20),
    count INT,
    dma_adj_traffic FLOAT,
    CONSTRAINT pk_daily_traffic_data PRIMARY KEY (station_id, date, direction),
    CONSTRAINT fk_daily_traffic_data FOREIGN KEY (station_id) REFERENCES traffic_stations(station_id)
);

-- Give the listed users full access to the new table
GRANT ALL PRIVILEGES ON daily_traffic_data TO nnfd2, dgyw5, jwcp64, gfdbq;

 * postgres://dgyw5:***@pgsql.dsa.lan/caponl_22g2
Done.
Done.
Done.


[]

### Load the data

In [101]:
# Append the station rows to the pre-created table.  Stations are loaded first because
# daily_traffic_data has a foreign key to traffic_stations.
stations.to_sql(
    name='traffic_stations',
    con=engine,
    if_exists='append',
    index=False,
)

In [102]:
# Append the daily traffic rows to the pre-created table, leaving out the DataFrame index
df.to_sql(
    name='daily_traffic_data',
    con=engine,
    index=False,
    if_exists='append',
)

In [103]:
%sql select count(*) from traffic_stations

 * postgres://dgyw5:***@pgsql.dsa.lan/caponl_22g2
1 rows affected.


count
8


In [104]:
%sql select count(*) from daily_traffic_data

 * postgres://dgyw5:***@pgsql.dsa.lan/caponl_22g2
1 rows affected.


count
130273
