In [1]:
"""Importing Necessary Libraries"""

import pandas as pd
pd.set_option('max_columns',2000)
import numpy as np
import sqlite3
import datetime
import holidays
from meteostat import Stations, Daily
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [2]:
"""Importing US Traffic 2015 datasets"""

dot_traffic_2015=pd.read_csv('.\Data\dot_traffic_2015.txt.gz',compression='gzip',sep=',')
dot_traffic_stations_2015=pd.read_csv('.\Data\dot_traffic_stations_2015.txt.gz',compression='gzip',sep=',')

## Checking Data Types and viewing snapshot of as received files

In [3]:
"""Checking for Row and Column Counts of the datasets"""

print("Shape of dot_traffic_2015: ")
print(dot_traffic_2015.shape)
print("\n")
print("Shape of dot_traffic_stations_2015: ")
print(dot_traffic_stations_2015.shape)

Shape of dot_traffic_2015: 
(7140391, 38)


Shape of dot_traffic_stations_2015: 
(28466, 55)


In [4]:
"""Checking for existence of duplicate rows"""

print("Number of distinct rows in dot_traffic_2015: "+str(dot_traffic_2015.drop_duplicates().shape[0]))
print("Number of distinct rows in dot_traffic_stations_2015: "+str(dot_traffic_stations_2015.drop_duplicates().shape[0]))

Number of distinct rows in dot_traffic_2015: 6396747
Number of distinct rows in dot_traffic_stations_2015: 28466


## The dot_traffic_2015 dataset contains duplicate rows (6396747 unique rows vs 7140391 total number of rows), and hence the dataset will be deduped before proceeding forward for subsequent analysis

In [5]:
"""Removing duplicates from the dot_traffic_2015 dataset"""

dot_traffic_2015=dot_traffic_2015.drop_duplicates()

In [6]:
"""Checking for data types of dot_traffic_2015"""

print(dot_traffic_2015.dtypes)

date                                          object
day_of_data                                    int64
day_of_week                                    int64
direction_of_travel                            int64
direction_of_travel_name                      object
fips_state_code                                int64
functional_classification                     object
functional_classification_name                object
lane_of_travel                                 int64
month_of_data                                  int64
record_type                                    int64
restrictions                                 float64
station_id                                    object
traffic_volume_counted_after_0000_to_0100      int64
traffic_volume_counted_after_0100_to_0200      int64
traffic_volume_counted_after_0200_to_0300      int64
traffic_volume_counted_after_0300_to_0400      int64
traffic_volume_counted_after_0400_to_0500      int64
traffic_volume_counted_after_0500_to_0600     

In [7]:
"""Checking for a snapshot of the dot_traffic_2015"""

dot_traffic_2015.head()

Unnamed: 0,date,day_of_data,day_of_week,direction_of_travel,direction_of_travel_name,fips_state_code,functional_classification,functional_classification_name,lane_of_travel,month_of_data,record_type,restrictions,station_id,traffic_volume_counted_after_0000_to_0100,traffic_volume_counted_after_0100_to_0200,traffic_volume_counted_after_0200_to_0300,traffic_volume_counted_after_0300_to_0400,traffic_volume_counted_after_0400_to_0500,traffic_volume_counted_after_0500_to_0600,traffic_volume_counted_after_0600_to_0700,traffic_volume_counted_after_0700_to_0800,traffic_volume_counted_after_0800_to_0900,traffic_volume_counted_after_0900_to_1000,traffic_volume_counted_after_1000_to_1100,traffic_volume_counted_after_1100_to_1200,traffic_volume_counted_after_1200_to_1300,traffic_volume_counted_after_1300_to_1400,traffic_volume_counted_after_1400_to_1500,traffic_volume_counted_after_1500_to_1600,traffic_volume_counted_after_1600_to_1700,traffic_volume_counted_after_1700_to_1800,traffic_volume_counted_after_1800_to_1900,traffic_volume_counted_after_1900_to_2000,traffic_volume_counted_after_2000_to_2100,traffic_volume_counted_after_2100_to_2200,traffic_volume_counted_after_2200_to_2300,traffic_volume_counted_after_2300_to_2400,year_of_data
0,2015-04-07,7,3,1,North,56,3R,Rural: Principal Arterial - Other,1,4,3,,000084,4,3,2,4,43,78,116,144,132,115,150,184,169,136,129,89,122,124,110,69,73,28,12,6,15
1,2015-09-26,26,7,7,West,21,1U,Urban: Principal Arterial - Interstate,2,9,3,,056P94,381,252,218,194,220,348,453,679,826,962,1158,1379,1376,1383,1453,1617,1669,1308,1068,928,885,798,650,613,15
2,2015-06-16,16,3,3,East,6,1U,Urban: Principal Arterial - Interstate,0,6,3,,077590,585,408,328,364,696,1929,4228,5634,5673,4636,3925,3827,4049,3954,4077,4244,4405,4609,4361,3272,2243,2050,1453,892,15
3,2015-04-26,26,1,5,South,55,1U,Urban: Principal Arterial - Interstate,1,4,3,,450001,105,73,68,66,77,113,254,367,487,668,870,996,1003,1000,1043,1011,959,851,708,559,457,297,207,110,15
4,2015-05-23,23,7,3,East,4,4R,Rural: Minor Arterial,0,5,3,,102210,6,4,2,3,1,10,17,52,64,68,82,96,99,87,87,83,61,55,35,29,21,23,9,7,15


In [8]:
"""Checking for data types of dot_traffic_stations_2015"""

print(dot_traffic_stations_2015.dtypes)

algorithm_of_vehicle_classification                  object
algorithm_of_vehicle_classification_name             object
calibration_of_weighing_system                       object
calibration_of_weighing_system_name                  object
classification_system_for_vehicle_classification      int64
concurrent_route_signing                              int64
concurrent_signed_route_number                       object
direction_of_travel                                   int64
direction_of_travel_name                             object
fips_county_code                                      int64
fips_state_code                                       int64
functional_classification                            object
functional_classification_name                       object
hpms_sample_identifier                               object
hpms_sample_type                                     object
lane_of_travel                                        int64
lane_of_travel_name                     

In [9]:
"""Checking for a snapshot of dot_traffic_stations_2015"""

dot_traffic_stations_2015.head()

Unnamed: 0,algorithm_of_vehicle_classification,algorithm_of_vehicle_classification_name,calibration_of_weighing_system,calibration_of_weighing_system_name,classification_system_for_vehicle_classification,concurrent_route_signing,concurrent_signed_route_number,direction_of_travel,direction_of_travel_name,fips_county_code,fips_state_code,functional_classification,functional_classification_name,hpms_sample_identifier,hpms_sample_type,lane_of_travel,lane_of_travel_name,latitude,longitude,lrs_identification,lrs_location_point,method_of_data_retrieval,method_of_data_retrieval_name,method_of_traffic_volume_counting,method_of_traffic_volume_counting_name,method_of_truck_weighing,method_of_truck_weighing_name,method_of_vehicle_classification,method_of_vehicle_classification_name,national_highway_system,number_of_lanes_in_direction_indicated,number_of_lanes_monitored_for_traffic_volume,number_of_lanes_monitored_for_truck_weight,number_of_lanes_monitored_for_vehicle_class,posted_route_signing,posted_signed_route_number,previous_station_id,primary_purpose,primary_purpose_name,record_type,sample_type_for_traffic_volume,sample_type_for_traffic_volume_name,sample_type_for_truck_weight,sample_type_for_truck_weight_name,sample_type_for_vehicle_classification,sample_type_for_vehicle_classification_name,second_type_of_sensor,shrp_site_identification,station_id,station_location,type_of_sensor,type_of_sensor_name,year_of_data,year_station_discontinued,year_station_established
0,,,,,13,3,91.0,7,West,59,6,2U,Urban: Principal Arterial - Other Freeways or ...,,N,4,Other lanes,33.850898,117.814391,00000000091R,,2,Automated (telemetry),3,Permanent automatic traffic recorder (ATR),0,,0,,Y,5,5,0,0,3,91,,P,Planning or traffic statistics purposes,S,T,Station used for Traffic Volume Trends,,,N,Station not used for Heavy Vehicle Travel Info...,N,,129130,LAKEVIEW AVENUE ORA91R10.091,L,Inductance loop,15,0,97
1,,,,,13,3,99.0,5,South,77,6,3R,Rural: Principal Arterial - Other,,N,1,Outside (rightmost) lane,37.874697,121.21959,00000000099R,248336.0,2,Automated (telemetry),3,Permanent automatic traffic recorder (ATR),0,,0,,Y,2,2,0,0,3,99,,P,Planning or traffic statistics purposes,S,T,Station used for Traffic Volume Trends,,,N,Station not used for Heavy Vehicle Travel Info...,N,,100190,LITTLE JOHN CREEK SJ9912.526,L,Inductance loop,15,0,97
2,G,Axle spacing with Scheme F modified,,,15,1,5.0,1,North,93,6,1R,Rural: Principal Arterial - Interstate,,N,2,Other lanes,41.441777,122.43501,00000000005R,750293.0,2,Automated (telemetry),3,Permanent automatic traffic recorder (ATR),0,,3,Permanent vehicle classification device,Y,2,2,0,2,1,5,,P,Planning or traffic statistics purposes,S,T,Station used for Traffic Volume Trends,,,H,Station used for Heavy Vehicle Travel Informat...,N,,022940,EDGEWOOD SIS5R22.999,P,Piezoelectric,15,0,69
3,D,Vehicle length classification,M,Moving average of the steering axle of 3S2s,13,0,,5,South,35,49,1U,Urban: Principal Arterial - Interstate,A00015293910,Y,1,Outside (rightmost) lane,40.5165,111.89152,000000001500,290600.0,2,Automated (telemetry),3,Permanent automatic traffic recorder (ATR),4,Portable weigh-in-motion system,3,Permanent vehicle classification device,Y,5,5,5,5,1,15,,P,Planning or traffic statistics purposes,S,T,Station used for Traffic Volume Trends,B,Station used for TMG sample and Strategic High...,N,Station not used for Heavy Vehicle Travel Info...,,,000302,I 15 12900 South M.P. 290.6,X,Radio wave,15,0,11
4,G,Axle spacing with Scheme F modified,0,,14,1,0.0,7,West,27,34,1U,Urban: Principal Arterial - Interstate,,N,4,Other lanes,40.892373,74.484206,,,2,Automated (telemetry),2,Portable traffic recording device,0,,3,Permanent vehicle classification device,Y,4,4,4,4,1,80,,P,Planning or traffic statistics purposes,S,T,Station used for Traffic Volume Trends,N,Station not used for any of the above,N,Station not used for Heavy Vehicle Travel Info...,,,W01136,E. of Franklin Rd Underpass,L,Inductance loop,15,0,95


## We need to Identify the Level at which the observations of the datasets are Unique (The group of variables which make every observation unique in a dataset)

## For the dot_traffic_2015 dataset it has been mentioned in Kaggle that the dataset contains daily observation of traffic volumes split into 24 hourly bins, with respect to station_id, locational information (geographical place), traffic flow direction and type of road.

## So the following variables were identified which can make the dataset unique:
## ['date','direction_of_travel','fips_state_code','functional_classification','lane_of_travel', 'station_id'] 

In [10]:
"""Checking the number of distinct rows with respect to the aforementioned variables"""

print("Number of distinct rows in dot_traffic_2015 dataset with respect to the aforementioned variables : "+str(dot_traffic_2015[['date','direction_of_travel','fips_state_code','functional_classification','lane_of_travel','station_id']].drop_duplicates().shape[0]))

Number of distinct rows in dot_traffic_2015 dataset with respect to the aforementioned variables : 5784441


## Number of unique Rows in the dot_traffic_2015 dataset: 6396747
## Number of unique rows with respect to the above variable combination: 5784441

## Further analysis is required to identify the missed-out variables which can make each observation unique

In [11]:
"""Checking the distribution of the number of rows with respect to the unique combinations of the aforementioned variables"""

dot_traffic_2015_row_count_agg=dot_traffic_2015.groupby(['date','direction_of_travel','fips_state_code','functional_classification','lane_of_travel','station_id']).size().to_frame().reset_index()
dot_traffic_2015_row_count_agg.columns=['date','direction_of_travel','fips_state_code','functional_classification','lane_of_travel','station_id','Counts']
print("Distribution of Number of Rows for each combination: \n"+str(dot_traffic_2015_row_count_agg['Counts'].value_counts()))

Distribution of Number of Rows for each combination: 
1    5172135
2     612306
Name: Counts, dtype: int64


## With the chosen variables, 5172135 combinations were unique while 612306 combinations had 2 rows each. Hence a manual review of few specific cases was performed to identify the reason behind the same

In [12]:
"""Printing few combinations of the aforementioned variables, where there are more than 1 row"""

dot_traffic_2015_row_count_agg[dot_traffic_2015_row_count_agg['Counts']>1].head()

Unnamed: 0,date,direction_of_travel,fips_state_code,functional_classification,lane_of_travel,station_id,Counts
470,2015-01-01,1,8,5R,1,102801,2
607,2015-01-01,1,12,1R,1,10350,2
609,2015-01-01,1,12,1R,1,189920,2
610,2015-01-01,1,12,1R,1,269904,2
613,2015-01-01,1,12,1R,1,740132,2


In [13]:
"""Example 1 of duplicate entries"""

dot_traffic_2015[(dot_traffic_2015['date']=='2015-01-01') & (dot_traffic_2015['direction_of_travel']==1) & (dot_traffic_2015['fips_state_code']==8) & (dot_traffic_2015['functional_classification']=='5R') & (dot_traffic_2015['lane_of_travel']==1) & (dot_traffic_2015['station_id']=='102801')]

Unnamed: 0,date,day_of_data,day_of_week,direction_of_travel,direction_of_travel_name,fips_state_code,functional_classification,functional_classification_name,lane_of_travel,month_of_data,record_type,restrictions,station_id,traffic_volume_counted_after_0000_to_0100,traffic_volume_counted_after_0100_to_0200,traffic_volume_counted_after_0200_to_0300,traffic_volume_counted_after_0300_to_0400,traffic_volume_counted_after_0400_to_0500,traffic_volume_counted_after_0500_to_0600,traffic_volume_counted_after_0600_to_0700,traffic_volume_counted_after_0700_to_0800,traffic_volume_counted_after_0800_to_0900,traffic_volume_counted_after_0900_to_1000,traffic_volume_counted_after_1000_to_1100,traffic_volume_counted_after_1100_to_1200,traffic_volume_counted_after_1200_to_1300,traffic_volume_counted_after_1300_to_1400,traffic_volume_counted_after_1400_to_1500,traffic_volume_counted_after_1500_to_1600,traffic_volume_counted_after_1600_to_1700,traffic_volume_counted_after_1700_to_1800,traffic_volume_counted_after_1800_to_1900,traffic_volume_counted_after_1900_to_2000,traffic_volume_counted_after_2000_to_2100,traffic_volume_counted_after_2100_to_2200,traffic_volume_counted_after_2200_to_2300,traffic_volume_counted_after_2300_to_2400,year_of_data
534908,2015-01-01,1,5,1,North,8,5R,Rural: Major Collector,1,1,3,,102801,0,0,2,0,1,2,10,9,9,10,17,21,18,20,16,21,15,8,10,5,3,3,3,2,15
3184972,2015-01-01,1,5,1,North,8,5R,Rural: Major Collector,1,1,3,,102801,0,0,2,0,1,2,10,9,9,10,17,20,18,20,16,21,15,8,10,5,3,3,2,2,15


In [14]:
"""Example 2 of duplicate entries"""

dot_traffic_2015[(dot_traffic_2015['date']=='2015-01-01') & (dot_traffic_2015['direction_of_travel']==1) & (dot_traffic_2015['fips_state_code']==12) & (dot_traffic_2015['functional_classification']=='1R') & (dot_traffic_2015['lane_of_travel']==1) & (dot_traffic_2015['station_id']=='010350')]

Unnamed: 0,date,day_of_data,day_of_week,direction_of_travel,direction_of_travel_name,fips_state_code,functional_classification,functional_classification_name,lane_of_travel,month_of_data,record_type,restrictions,station_id,traffic_volume_counted_after_0000_to_0100,traffic_volume_counted_after_0100_to_0200,traffic_volume_counted_after_0200_to_0300,traffic_volume_counted_after_0300_to_0400,traffic_volume_counted_after_0400_to_0500,traffic_volume_counted_after_0500_to_0600,traffic_volume_counted_after_0600_to_0700,traffic_volume_counted_after_0700_to_0800,traffic_volume_counted_after_0800_to_0900,traffic_volume_counted_after_0900_to_1000,traffic_volume_counted_after_1000_to_1100,traffic_volume_counted_after_1100_to_1200,traffic_volume_counted_after_1200_to_1300,traffic_volume_counted_after_1300_to_1400,traffic_volume_counted_after_1400_to_1500,traffic_volume_counted_after_1500_to_1600,traffic_volume_counted_after_1600_to_1700,traffic_volume_counted_after_1700_to_1800,traffic_volume_counted_after_1800_to_1900,traffic_volume_counted_after_1900_to_2000,traffic_volume_counted_after_2000_to_2100,traffic_volume_counted_after_2100_to_2200,traffic_volume_counted_after_2200_to_2300,traffic_volume_counted_after_2300_to_2400,year_of_data
5408064,2015-01-01,1,5,1,North,12,1R,Rural: Principal Arterial - Interstate,1,1,3,,10350,176,225,155,139,148,229,324,525,678,770,839,939,934,912,887,859,823,752,687,586,470,413,320,209,15
5823125,2015-01-01,1,5,1,North,12,1R,Rural: Principal Arterial - Interstate,1,1,3,,10350,174,224,154,138,145,224,322,515,661,759,816,922,914,888,866,841,811,743,675,579,461,405,315,208,15


In [15]:
"""Deleting objects which are no longer required, to release RAM space"""

del dot_traffic_2015_row_count_agg

## From manual review of the above cases, it was observed that there's a discrepancy in the traffic volume information. On the same day, station_id, location, direction of traffic flow and type of road, there's different hourly traffic volume information in 2 different entries. 

## It was also observed that the difference in the recorded hourly traffic volume between the two entries were marginally different.

## Under such circumstances with no prior information related to the data collection process, and also where we have to retain one record out of two, for 612306 cases, a random deletion of 1 record from each combination will give a decent approximation of the actual traffic volumes across the 24-hourly bins.

In [16]:
"""The dataset was partitioned with respect to the identified variables, and ordered randomly, and the first entry was chosen 
    from each partition. In sqlite3, there's no provision to pass a random seed with the random function, hence this particular 
    step can't be replicated. However, for all practical purposes the results will lead us to the same conclusion"""

sql_connection = sqlite3.connect(':memory:')
dot_traffic_2015.to_sql('dot_traffic_2015', sql_connection, index=False)

dot_traffic_2015=pd.read_sql_query("""select * from

(select *,

row_number() over(partition by date,direction_of_travel,fips_state_code,functional_classification,lane_of_travel,station_id
                  order by random()) as Dedup_Flag

from
dot_traffic_2015) a

where Dedup_Flag=1

""", sql_connection)

dot_traffic_2015.drop('Dedup_Flag', axis=1, inplace=True)
sql_connection.close()

In [17]:
"""Now we have a clean dot_traffic_2015 dataset with unique entries for each day, station_id, location, direction of traffic 
    flow and type of road"""

print("New Shape of dot_traffic_2015: ")
print(dot_traffic_2015.shape)
print("\n")

print("Number of distinct rows in the new dot_traffic_2015 dataset with respect to the partitioning variables : "+str(dot_traffic_2015[['date','direction_of_travel','fips_state_code','functional_classification','lane_of_travel','station_id']].drop_duplicates().shape[0]))

New Shape of dot_traffic_2015: 
(5784441, 38)


Number of distinct rows in the new dot_traffic_2015 dataset with respect to the partitioning variables : 5784441


## From the brief description in Kaggle and from manual review, the dot_traffic_stations_2015 dataset seemed to be unique with respect to station_id, locational information (geographical place), traffic flow direction and type of road

## Hence the following set of variables were identified which can make this dataset unique
## ['direction_of_travel','fips_state_code','functional_classification','lane_of_travel', 'station_id'] 

In [18]:
"""Printing total number of rows in dot_traffic_stations_2015 along with the number of unique rows with respect to the 
    aforementioned variable comibnation"""

print("Shape of dot_traffic_stations_2015: "+str(dot_traffic_stations_2015.shape[0]))
print("\n")
print("Number of distinct rows in dot_traffic_stations_2015 dataset with respect to the aforementioned variables : "+str(dot_traffic_stations_2015[['direction_of_travel','fips_state_code','functional_classification','lane_of_travel','station_id']].drop_duplicates().shape[0]))

Shape of dot_traffic_stations_2015: 28466


Number of distinct rows in dot_traffic_stations_2015 dataset with respect to the aforementioned variables : 28466


## No discrepancies in data entries were observed in the dot_traffic_stations_2015 dataset as it was found to be unique with respect to the aforementioned identified variables.

## Total number of observations in the dot_traffic_stations_2015 dataset: 28466
## Number of unique rows with the above variable combination: 28466

## In order to proceed with the analysis, we need to merge the two received datasets with the right set of merging keys. 

## dot_traffic_2015 dataset contains daily data with respect to different combinations of station_id, location, traffic flow direction and type of roard. However, dot_traffic_stations_2015 dataset is a snapshot table containing deeper information about the methodology of traffic volume measurement at a station_id, location, direction of travel and road type level. Hence this will be a many-on-one merge.

In [19]:
"""The common variables on both datasets apart from the merge keys were removed from the dot_traffic_stations_2015 dataset
    and subsequently a left join was performed"""

Merge_Keys=['direction_of_travel','fips_state_code','functional_classification','lane_of_travel', 'station_id']
Common_Variables=list(set(dot_traffic_2015.columns.tolist()).intersection(set(dot_traffic_stations_2015.columns.tolist()))-set(Merge_Keys))

US_Traffic_2015=dot_traffic_2015.merge(dot_traffic_stations_2015[list(set(dot_traffic_stations_2015.columns.tolist())-set(Common_Variables))], on=Merge_Keys, how='left')

In [20]:
"""Checking the shape of the merged dataset"""

print("Shape of merged dataset named as US_Traffic_2015: ")
print(US_Traffic_2015.shape)
print("\n")

Shape of merged dataset named as US_Traffic_2015: 
(5784441, 84)




In [21]:
"""Viewing a sample of the Merged Data"""

US_Traffic_2015.head()

Unnamed: 0,date,day_of_data,day_of_week,direction_of_travel,direction_of_travel_name,fips_state_code,functional_classification,functional_classification_name,lane_of_travel,month_of_data,record_type,restrictions,station_id,traffic_volume_counted_after_0000_to_0100,traffic_volume_counted_after_0100_to_0200,traffic_volume_counted_after_0200_to_0300,traffic_volume_counted_after_0300_to_0400,traffic_volume_counted_after_0400_to_0500,traffic_volume_counted_after_0500_to_0600,traffic_volume_counted_after_0600_to_0700,traffic_volume_counted_after_0700_to_0800,traffic_volume_counted_after_0800_to_0900,traffic_volume_counted_after_0900_to_1000,traffic_volume_counted_after_1000_to_1100,traffic_volume_counted_after_1100_to_1200,traffic_volume_counted_after_1200_to_1300,traffic_volume_counted_after_1300_to_1400,traffic_volume_counted_after_1400_to_1500,traffic_volume_counted_after_1500_to_1600,traffic_volume_counted_after_1600_to_1700,traffic_volume_counted_after_1700_to_1800,traffic_volume_counted_after_1800_to_1900,traffic_volume_counted_after_1900_to_2000,traffic_volume_counted_after_2000_to_2100,traffic_volume_counted_after_2100_to_2200,traffic_volume_counted_after_2200_to_2300,traffic_volume_counted_after_2300_to_2400,year_of_data,number_of_lanes_monitored_for_vehicle_class,sample_type_for_truck_weight_name,concurrent_route_signing,calibration_of_weighing_system_name,number_of_lanes_monitored_for_traffic_volume,sample_type_for_traffic_volume,primary_purpose_name,station_location,posted_route_signing,year_station_established,method_of_vehicle_classification_name,type_of_sensor_name,sample_type_for_vehicle_classification_name,method_of_traffic_volume_counting,type_of_sensor,hpms_sample_identifier,lrs_location_point,hpms_sample_type,method_of_vehicle_classification,method_of_traffic_volume_counting_name,second_type_of_sensor,previous_station_id,longitude,lane_of_travel_name,calibration_of_weighing_system,number_of_lanes_monitored_for_truck_weight,year_station_discontinued,shrp_site_identification,posted_signed_route_number,classification_system_for_vehicle_classification,method_of_data_retrieval_name,fips_county_code,number_of_lanes_in_direction_indicated,latitude,concurrent_signed_route_number,sample_type_for_vehicle_classification,national_highway_system,algorithm_of_vehicle_classification_name,sample_type_for_truck_weight,method_of_data_retrieval,sample_type_for_traffic_volume_name,algorithm_of_vehicle_classification,method_of_truck_weighing,primary_purpose,lrs_identification,method_of_truck_weighing_name
0,2015-01-01,1,5,0,East-West or Southeast-Northwest combined (ATR...,19,3U,Urban: Principal Arterial - Other,0,1,3,,810,185,137,57,30,30,37,64,95,191,266,437,415,527,517,474,522,514,448,413,293,231,179,142,58,15,0.0,Station used for TMG sample (but not SHRP/LTPP...,0.0,,1.0,T,Planning or traffic statistics purposes,"IA 92 W OF \N\"" ST INDIANOLA""",3.0,81.0,,Inductance loop,Station not used for Heavy Vehicle Travel Info...,3.0,L,26617,0.0,Y,0.0,Permanent automatic traffic recorder (ATR),,,93.5827,Data with lanes combined,,0.0,0.0,,00000092,0.0,Automated (telemetry),181.0,1.0,41.3563,,N,N,,T,2.0,Station used for Traffic Volume Trends,,0.0,P,000000000000,
1,2015-01-01,1,5,0,East-West or Southeast-Northwest combined (ATR...,19,6R,Rural: Minor Collector,0,1,3,,403,2,1,0,2,0,0,0,2,3,3,1,7,6,2,7,9,8,2,3,2,2,1,0,0,15,0.0,Station used for TMG sample (but not SHRP/LTPP...,7.0,,1.0,T,Planning or traffic statistics purposes,CO RD H47 5.0 MI SE OF US 34 A,5.0,82.0,,Inductance loop,Station not used for Heavy Vehicle Travel Info...,3.0,L,0,0.0,N,0.0,Permanent automatic traffic recorder (ATR),,,92.7147,Data with lanes combined,,0.0,0.0,,000000H4,0.0,Automated (telemetry),135.0,1.0,40.9918,,N,N,,T,2.0,Station used for Traffic Volume Trends,,0.0,P,000000000000,
2,2015-01-01,1,5,0,East-West or Southeast-Northwest combined (ATR...,19,7R,Rural: Local System,0,1,3,,402,1,0,3,0,0,1,1,0,0,5,4,2,3,0,0,2,3,2,4,0,1,1,0,0,15,0.0,Station used for TMG sample (but not SHRP/LTPP...,0.0,,1.0,T,Planning or traffic statistics purposes,"230TH ST 0.1 MI W OF \U\"" AVE C""",0.0,83.0,,Inductance loop,Station not used for Heavy Vehicle Travel Info...,3.0,L,0,0.0,N,0.0,Permanent automatic traffic recorder (ATR),,,92.3775,Data with lanes combined,,0.0,0.0,,00000000,0.0,Automated (telemetry),171.0,1.0,42.1082,,N,N,,T,2.0,Station used for Traffic Volume Trends,,0.0,P,000000000000,
3,2015-01-01,1,5,0,East-West or Southeast-Northwest combined (ATR...,25,3U,Urban: Principal Arterial - Other,0,1,3,,30304,158,175,98,67,50,49,72,88,175,242,248,353,409,443,444,405,408,320,352,259,208,178,139,106,15,0.0,,0.0,,2.0,T,Planning or traffic statistics purposes,WINTHROP STREET,2.0,82.0,,Inductance loop,,3.0,L,76000101540,1595.0,Y,0.0,Permanent automatic traffic recorder (ATR),,3.0,71.229551,Data with lanes combined,,0.0,0.0,0.0,0000US44,13.0,Not automated (manual),5.0,2.0,41.859768,,,Y,,,1.0,Station used for Traffic Volume Trends,,0.0,P,00000000US44,
4,2015-01-01,1,5,0,East-West or Southeast-Northwest combined (ATR...,25,3U,Urban: Principal Arterial - Other,0,1,3,,70334,166,196,99,71,46,72,134,178,205,309,358,456,529,531,648,635,603,429,431,301,269,236,217,160,15,0.0,,0.0,,2.0,T,Planning or traffic statistics purposes,CENTRE STREET,3.0,84.0,,Inductance loop,,3.0,L,1002501500,4644.0,Y,0.0,Permanent automatic traffic recorder (ATR),,703.0,70.975928,Data with lanes combined,,0.0,0.0,0.0,000SR123,13.0,Not automated (manual),23.0,2.0,42.089429,,,Y,,,1.0,Station used for Traffic Volume Trends,,0.0,P,0000000SR123,


## The merged dataset which has been named and hereafter called as US_Traffic_2015 dataset contains 5784441 observations after the many-on-one merge as expected. Now the dataset is ready for univariate and bi-variate analysis

In [22]:
"""Deleting objects which are no longer required to release RAM space"""

del dot_traffic_2015
del dot_traffic_stations_2015

## A variable called 'previous_station_id' was also identified in the dot_traffic_stations_2015 dataset. The two-way frequency counts of 'station_id' and 'previous_station_id' were exported and evaluated to identify possible self-merging opportunities

In [23]:
writer=pd.ExcelWriter('.\Output\Two_Way_Frequency_Table_of_station_id_and_previous_station_id.xlsx', engine='xlsxwriter')
Two_Way_Freq_Table=US_Traffic_2015.groupby(['previous_station_id','station_id'], dropna=False).size().to_frame().reset_index()
Two_Way_Freq_Table.columns=['previous_station_id','station_id','Counts']
Two_Way_Freq_Table.to_excel(writer, sheet_name='Two_Way_Freq_Table', index=False)
for index, var in enumerate(Two_Way_Freq_Table):
    max_length=max(Two_Way_Freq_Table[var].astype(str).map(len).max(), len(var))
    writer.sheets['Two_Way_Freq_Table'].set_column(index, index, max_length)    
writer.save()

## From manual review of the cross-frequency table of station_id and previous_station_id, it was observed that the previous_station_id is missing or 0 for ~ 85% of the cases. Also, for the cases where it is present, from manual inspection it seemed like, the station_ids were renamed with passing time and introduction of newer stations in the vicinity. So self-merging won't create any meaningful features and hence the step was ignored

In [24]:
"""Deleting objects which are no longer required to release RAM space"""
del Two_Way_Freq_Table

## Creating Helper Functions for generating Missing Value Reports, Univariate Reports and Categorical Variable treatment. The usage of each helper function has been explained below

In [25]:
"""Several Numeric Columns are usually encoded as Categorical Data Types. The following function will try to convert 
    such variables into numeric, and the exceptions will be kept as is. Usually, exceptions will be created due to 
    alphanumeric entries"""

def Categorical_to_Numeric_Converter(dataset):
    for var in dataset.columns.to_list():
        print("Now checking data type for: "+var)
        if dataset[var].dtypes=='object':
            print(var+" is of object type")
            try:
                dataset[var]=pd.to_numeric(dataset[var])
                print("Object to Numeric Conversion is successful for: "+var)
                print("\n")
            except:
                print("Ignoring Object to Numeric Conversion for: "+var)
                print("\n")
        else:
            print(var+" is of numeric type. No further action is required")
            print("\n")
    return(dataset)

"""The following function will be used to create a missing value report for any given datasets. Typically, in pandas, missing
    value in a numeric column will be encoded as NaN, while categorical columns might have blanks as missing values along
    with NaN. The following function will take care of the different possibilites, and create a consolidated missing value 
    report"""

def Missing_Value_Report(dataset):
    Missing_Values=[]
    for var in dataset.columns.to_list():
        if np.issubdtype(dataset[var],np.number):
            Missing_Values.append(dataset[var].isnull().sum())
        else:
            Missing_Values.append(dataset[(dataset[var]=='') | (dataset[var].isnull())].shape[0])
    Percentage_Missing_Values=[np.round(100*(item/dataset.shape[0]),2) for item in Missing_Values]
    return(pd.DataFrame({'Variable':dataset.columns.to_list(),'Type':[str(item) for item in dataset.dtypes.to_frame().reset_index().iloc[:,1].tolist()],'Missing_Values':Missing_Values,'Percentage_Missing_Values':Percentage_Missing_Values}))

"""The following function will run Univariates for all the variables of the dataset. For numeric columns, it will return the
    mean, standard deviation, 25th, 50th and 75th percentiles, while for categorical columns it will create a one-way frequency
    table of all the distinct levels. The Missing Value Report will also be attached at the beginning of the report"""

def Descriptive_Statistics(dataset,fileloc,filename):
    Numeric_Vars=[]
    Categorical_Vars=[]
    for var in dataset.columns.to_list():
        if np.issubdtype(dataset[var],np.number):
            Numeric_Vars.append(var)
        elif np.issubdtype(dataset[var],np.object_):
            Categorical_Vars.append(var)
    writer=pd.ExcelWriter(fileloc+'Univariate_Report_{}.xlsx'.format(filename), engine='xlsxwriter')
    Temp_Missing_Value_Report=Missing_Value_Report(dataset)
    Temp_Missing_Value_Report.to_excel(writer, sheet_name='Missing_Value_Report', index=False)
    for index, var in enumerate(Temp_Missing_Value_Report):
        max_length=max(Temp_Missing_Value_Report[var].astype(str).map(len).max(), len(var))
        writer.sheets['Missing_Value_Report'].set_column(index, index, max_length)
    Univariate_Numeric_Variables=dataset[Numeric_Vars].describe().reset_index()
    Univariate_Numeric_Variables.to_excel(writer, sheet_name='Univariate_Numeric_Variables', index=False)
    for index, var in enumerate(Univariate_Numeric_Variables):
        max_length=max(Univariate_Numeric_Variables[var].astype(str).map(len).max(), len(var))
        writer.sheets['Univariate_Numeric_Variables'].set_column(index, index, max_length)    
    for cat_var in Categorical_Vars:
        temp_df=dataset[cat_var].value_counts().to_frame().reset_index()
        temp_df.to_excel(writer, sheet_name=cat_var[:31], index=False)
        for index, var in enumerate(temp_df):
            max_length=max(temp_df[var].astype(str).map(len).max(), len(var))
            writer.sheets[cat_var[:31]].set_column(index, index, max_length)
    writer.save()
    
"""For any statistical analysis, categorical variable treatment is very important. There are multiple methods for such treatment,
    -One Hot Encoding being one of the most popular ones. But One Hot Encoding also increases data dimensions significantly and
    hence it has been avoided here. String Indexing has been used as an alternative, where the level having the highest frequency
    will be assigned an index of 0, the level having second-highest frequency will be assigned with an index of 1, so on and so
    forth. The following function will be used to perform string indexing of categorical variables"""

def String_Indexing(dataset, varlist, fileloc):
    writer=pd.ExcelWriter(fileloc+'String_Indexing_Logic.xlsx', engine='xlsxwriter')
    for cat_var in varlist:
        temp=dataset[cat_var].value_counts().to_frame().reset_index()
        temp['{}_index'.format(cat_var)]=list(range(len(temp)))
        temp.drop([cat_var], axis=1, inplace=True)
        temp.columns=[cat_var,'{}_index'.format(cat_var)]
        temp.to_excel(writer, sheet_name=cat_var[:31], index=False)
        for index, var in enumerate(temp):
            max_length=max(temp[var].astype(str).map(len).max(), len(var))
            writer.sheets[cat_var[:31]].set_column(index, index, max_length)
        dataset=dataset.merge(temp[[cat_var,'{}_index'.format(cat_var)]],on=[cat_var],how='left')
    writer.save()
    return(dataset)

In [26]:
"""Running the helper functions to generate the Univariate Report"""

US_Traffic_2015=Categorical_to_Numeric_Converter(US_Traffic_2015)
Descriptive_Statistics(US_Traffic_2015,'.\\Output\\','US_Traffic_2015')

Now checking data type for: date
date is of object type
Ignoring Object to Numeric Conversion for: date


Now checking data type for: day_of_data
day_of_data is of numeric type. No further action is required


Now checking data type for: day_of_week
day_of_week is of numeric type. No further action is required


Now checking data type for: direction_of_travel
direction_of_travel is of numeric type. No further action is required


Now checking data type for: direction_of_travel_name
direction_of_travel_name is of object type
Ignoring Object to Numeric Conversion for: direction_of_travel_name


Now checking data type for: fips_state_code
fips_state_code is of numeric type. No further action is required


Now checking data type for: functional_classification
functional_classification is of object type
Ignoring Object to Numeric Conversion for: functional_classification


Now checking data type for: functional_classification_name
functional_classification_name is of object type
Ignoring Ob

## Based on the Univariate Report we observed the following:


## The variable 'restrictions' is 100% missing and hence will be dropped
## The variables 'record_type' and 'year_of_data' has zero standard deviation (constant value), and hence will be dropped
## Variables like station_id or previous_station_id which acted as merge_keys are no longer required for analysis and hence will be dropped. 

In [27]:
"""Dropping the aforementioned Variables"""

Variables_to_Drop=['restrictions','record_type','year_of_data','station_id','previous_station_id']
US_Traffic_2015.drop(Variables_to_Drop, axis=1, inplace=True)

## Derived variables will be created based on date information to enhance the pattern identification. After creation of the derived variables the 'date' variable will be dropped.
## Since traffic is heavily dependent upon weather, hence the library meteostat will be used to fetch daily weather data of the closest weather station with respect to the available latitude and longitude 
## All missing values will be imputed with a special value of -99999999
## All categorical variables will be string indexed based on their frequencies and subsequently the original categorical variable will be dropped. However, a record of the string indexing logic will be retained for future reference

In [28]:
"""Creating the following derived variables based on date:
    1. Weekend_Flag (All Saturdays and Sundays will be marked as 1)
    2. Public_Holiday_Flag (All US Holidays of 2015 will be marked as 1)
    3. Long_Weekend_Flag (All the US Holidays preceeded or succeeded by a weekend will be marked as 1 along with the weekends)
    4. Vacation_Flag (This is a subjective call: 1st to 4th January and 19th to 31st December will be marked as 1)
    5. The library meteostat was used to fetch geo-location-based weather data for the entire year of 2015.
    
Eventually the date, latitude and longitude variables will be dropped after creation of the derived variables"""

US_Traffic_2015['date']=[item.replace("-","") for item in US_Traffic_2015['date'].tolist()]

US_Traffic_2015.loc[((US_Traffic_2015['day_of_week']==1) | (US_Traffic_2015['day_of_week']==7)),'Weekend_Flag']=1
US_Traffic_2015['Weekend_Flag']=US_Traffic_2015['Weekend_Flag'].fillna(0)

Public_Holidays=[]
for date in holidays.US(years=2015).items():
    Public_Holidays.append(str(date[0]).replace("-",""))
    
Public_Holidays=pd.DataFrame({'date':Public_Holidays,'Public_Holiday_Flag':[1]*len(Public_Holidays)})
US_Traffic_2015=US_Traffic_2015.merge(Public_Holidays, on=['date'], how='left')
US_Traffic_2015['Public_Holiday_Flag']=US_Traffic_2015['Public_Holiday_Flag'].fillna(0)

US_Traffic_2015.loc[((US_Traffic_2015['Public_Holiday_Flag']==1) & ((US_Traffic_2015['day_of_week']==6) | (US_Traffic_2015['day_of_week']==2))),'Long_Weekend_Flag']=1
US_Traffic_2015['Long_Weekend_Flag']=US_Traffic_2015['Long_Weekend_Flag'].fillna(0)

Additional_Long_Weekend_Dates=[]

for item in US_Traffic_2015[US_Traffic_2015['Long_Weekend_Flag']==1][['date','day_of_week']].drop_duplicates().values.tolist():
    if item[1]==2:
        Additional_Long_Weekend_Dates.append((datetime.datetime.strptime(str(item[0]),"%Y%m%d")-datetime.timedelta(days=1)).strftime("%Y%m%d"))
        Additional_Long_Weekend_Dates.append((datetime.datetime.strptime(str(item[0]),"%Y%m%d")-datetime.timedelta(days=2)).strftime("%Y%m%d"))
    else:
        Additional_Long_Weekend_Dates.append((datetime.datetime.strptime(str(item[0]),"%Y%m%d")+datetime.timedelta(days=1)).strftime("%Y%m%d"))
        Additional_Long_Weekend_Dates.append((datetime.datetime.strptime(str(item[0]),"%Y%m%d")+datetime.timedelta(days=2)).strftime("%Y%m%d"))
        
Additional_Long_Weekend_Dates=pd.DataFrame({'date':Additional_Long_Weekend_Dates,'Additional_Long_Weekend_Flag':[1]*len(Additional_Long_Weekend_Dates)})

US_Traffic_2015=US_Traffic_2015.merge(Additional_Long_Weekend_Dates, on=['date'], how='left')
US_Traffic_2015['Additional_Long_Weekend_Flag']=US_Traffic_2015['Additional_Long_Weekend_Flag'].fillna(0)

US_Traffic_2015.loc[US_Traffic_2015['Additional_Long_Weekend_Flag']==1,'Long_Weekend_Flag']=1

US_Traffic_2015['date']=pd.to_numeric(US_Traffic_2015['date'])
US_Traffic_2015.loc[((US_Traffic_2015['date']<=20150104) | (US_Traffic_2015['date']>=20151219)), 'Vacation_Flag']=1
US_Traffic_2015['Vacation_Flag']=US_Traffic_2015['Vacation_Flag'].fillna(0)


US_Traffic_2015['date']=[datetime.datetime.strptime(str(item),"%Y%m%d").strftime("%Y-%m-%d") for item in US_Traffic_2015['date'].tolist()]

Start_Date=datetime.datetime(2015,1,1)
End_Date=datetime.datetime(2015,12,31)

Weather_Data=pd.DataFrame({})

for item in US_Traffic_2015[['latitude','longitude']].drop_duplicates().values.tolist():
    temp_df=Daily(Stations().nearby(item[0],item[1]).fetch(1).reset_index()['id'].values[0],Start_Date,End_Date).fetch().reset_index()
    temp_df['latitude']=item[0]
    temp_df['longitude']=item[1]
    temp_df.rename({'time':'date'}, axis=1, inplace=True)
    Weather_Data=Weather_Data.append(temp_df)
    
Weather_Data['date']=Weather_Data['date'].astype(str)

US_Traffic_2015=US_Traffic_2015.merge(Weather_Data, on=['date','latitude','longitude'], how='left')

US_Traffic_2015.drop(['Additional_Long_Weekend_Flag','date','latitude','longitude'],axis=1,inplace=True)

In [29]:
"""Generating a quick missing value report for the newly fetched weather data"""

Missing_Value_Report(US_Traffic_2015[list(set(Weather_Data.columns.tolist())-set(['date','latitude','longitude']))])

Unnamed: 0,Variable,Type,Missing_Values,Percentage_Missing_Values
0,tsun,float64,5782748,99.97
1,tmax,float64,3350940,57.93
2,wspd,float64,5342767,92.36
3,tmin,float64,2753389,47.6
4,wpgt,float64,5784441,100.0
5,snow,float64,5670517,98.03
6,tavg,float64,1645469,28.45
7,wdir,float64,5605531,96.91
8,prcp,float64,4689204,81.07
9,pres,float64,5646447,97.61


## Based on the fill rate only 'tavg' (Average Temperature) will be retained.
## Since precipitation has a strong connection with traffic volumes, hence even with very low fill rate (~19%) the variable 'prcp' (Precipitation) will be retained for identification of potential patterns.

In [30]:
"""Dropping the Weather Variables except 'tavg' and 'prcp'"""

Variables_to_Drop=list(set(Weather_Data.columns.tolist())-set(['date','latitude','longitude','tavg','prcp']))
US_Traffic_2015.drop(Variables_to_Drop, axis=1, inplace=True)

del Weather_Data

In [31]:
"""Missing Value Imputation with a special value of -99999999"""

for var in US_Traffic_2015.columns.to_list():
    if np.issubdtype(US_Traffic_2015[var],np.number):
        US_Traffic_2015.loc[US_Traffic_2015[var].isnull(),var]=-99999999
    else:
        US_Traffic_2015.loc[((US_Traffic_2015[var]=='') | (US_Traffic_2015[var].isnull())),var]='-99999999'

In [32]:
"""Identification of Categorical Variables for String Indexing"""

Categorical_Variables=[]

for var in US_Traffic_2015.columns.to_list():
    if not np.issubdtype(US_Traffic_2015[var],np.number):
        Categorical_Variables.append(var)

In [33]:
"""String Indexing of Categorical Variables and adding the indexed variable with a suffix of '_index' and subsequently dropping
    the original categorical variable"""

Indexed_Categorical_Variables=[var+"_index" for var in Categorical_Variables]

US_Traffic_2015=String_Indexing(US_Traffic_2015, Categorical_Variables, '.\\Output\\')
US_Traffic_2015.drop(Categorical_Variables, axis=1, inplace=True)

## During manual inspection of the Univariate Report it was observed that there exist perfectly identical variables within the data. For example, function_classification and functional_classification_name are identical variables, the former containing the abbreviation of the latter. Hence in the following step we are going to create a correlation matrix of all the variables and drop randomly one from the pairs having perfect correlation coefficient.

In [34]:
"""Computing the correlation coefficient of all possible variable combinations and saving them in a pandas dataframe"""

Variable1=[]
Variable2=[]
CC=[]

for index, var1 in enumerate(US_Traffic_2015.columns.tolist()):
    for var2 in (US_Traffic_2015.columns.tolist())[index+1:]:
        print("Now computing correlation coefficient of: "+var1+" and "+var2)
        print("\n")
        Variable1.append(var1)
        Variable2.append(var2)
        CC.append(US_Traffic_2015[var1].corr(US_Traffic_2015[var2]))
        
CC_Table_of_Variables=pd.DataFrame({'Variable1':Variable1,'Variable2':Variable2,'CC':CC})

Now computing correlation coefficient of: day_of_data and day_of_week


Now computing correlation coefficient of: day_of_data and direction_of_travel


Now computing correlation coefficient of: day_of_data and fips_state_code


Now computing correlation coefficient of: day_of_data and lane_of_travel


Now computing correlation coefficient of: day_of_data and month_of_data


Now computing correlation coefficient of: day_of_data and traffic_volume_counted_after_0000_to_0100


Now computing correlation coefficient of: day_of_data and traffic_volume_counted_after_0100_to_0200


Now computing correlation coefficient of: day_of_data and traffic_volume_counted_after_0200_to_0300


Now computing correlation coefficient of: day_of_data and traffic_volume_counted_after_0300_to_0400


Now computing correlation coefficient of: day_of_data and traffic_volume_counted_after_0400_to_0500


Now computing correlation coefficient of: day_of_data and traffic_volume_counted_after_0500_to_0600


Now computi

Now computing correlation coefficient of: day_of_week and traffic_volume_counted_after_0500_to_0600


Now computing correlation coefficient of: day_of_week and traffic_volume_counted_after_0600_to_0700


Now computing correlation coefficient of: day_of_week and traffic_volume_counted_after_0700_to_0800


Now computing correlation coefficient of: day_of_week and traffic_volume_counted_after_0800_to_0900


Now computing correlation coefficient of: day_of_week and traffic_volume_counted_after_0900_to_1000


Now computing correlation coefficient of: day_of_week and traffic_volume_counted_after_1000_to_1100


Now computing correlation coefficient of: day_of_week and traffic_volume_counted_after_1100_to_1200


Now computing correlation coefficient of: day_of_week and traffic_volume_counted_after_1200_to_1300


Now computing correlation coefficient of: day_of_week and traffic_volume_counted_after_1300_to_1400


Now computing correlation coefficient of: day_of_week and traffic_volume_counted_a

Now computing correlation coefficient of: direction_of_travel and traffic_volume_counted_after_1300_to_1400


Now computing correlation coefficient of: direction_of_travel and traffic_volume_counted_after_1400_to_1500


Now computing correlation coefficient of: direction_of_travel and traffic_volume_counted_after_1500_to_1600


Now computing correlation coefficient of: direction_of_travel and traffic_volume_counted_after_1600_to_1700


Now computing correlation coefficient of: direction_of_travel and traffic_volume_counted_after_1700_to_1800


Now computing correlation coefficient of: direction_of_travel and traffic_volume_counted_after_1800_to_1900


Now computing correlation coefficient of: direction_of_travel and traffic_volume_counted_after_1900_to_2000


Now computing correlation coefficient of: direction_of_travel and traffic_volume_counted_after_2000_to_2100


Now computing correlation coefficient of: direction_of_travel and traffic_volume_counted_after_2100_to_2200


Now comput

Now computing correlation coefficient of: fips_state_code and traffic_volume_counted_after_1600_to_1700


Now computing correlation coefficient of: fips_state_code and traffic_volume_counted_after_1700_to_1800


Now computing correlation coefficient of: fips_state_code and traffic_volume_counted_after_1800_to_1900


Now computing correlation coefficient of: fips_state_code and traffic_volume_counted_after_1900_to_2000


Now computing correlation coefficient of: fips_state_code and traffic_volume_counted_after_2000_to_2100


Now computing correlation coefficient of: fips_state_code and traffic_volume_counted_after_2100_to_2200


Now computing correlation coefficient of: fips_state_code and traffic_volume_counted_after_2200_to_2300


Now computing correlation coefficient of: fips_state_code and traffic_volume_counted_after_2300_to_2400


Now computing correlation coefficient of: fips_state_code and number_of_lanes_monitored_for_vehicle_class


Now computing correlation coefficient of: fi

Now computing correlation coefficient of: lane_of_travel and traffic_volume_counted_after_2300_to_2400


Now computing correlation coefficient of: lane_of_travel and number_of_lanes_monitored_for_vehicle_class


Now computing correlation coefficient of: lane_of_travel and concurrent_route_signing


Now computing correlation coefficient of: lane_of_travel and number_of_lanes_monitored_for_traffic_volume


Now computing correlation coefficient of: lane_of_travel and posted_route_signing


Now computing correlation coefficient of: lane_of_travel and year_station_established


Now computing correlation coefficient of: lane_of_travel and method_of_traffic_volume_counting


Now computing correlation coefficient of: lane_of_travel and lrs_location_point


Now computing correlation coefficient of: lane_of_travel and method_of_vehicle_classification


Now computing correlation coefficient of: lane_of_travel and number_of_lanes_monitored_for_truck_weight


Now computing correlation coefficient o

Now computing correlation coefficient of: month_of_data and classification_system_for_vehicle_classification


Now computing correlation coefficient of: month_of_data and fips_county_code


Now computing correlation coefficient of: month_of_data and number_of_lanes_in_direction_indicated


Now computing correlation coefficient of: month_of_data and method_of_data_retrieval


Now computing correlation coefficient of: month_of_data and method_of_truck_weighing


Now computing correlation coefficient of: month_of_data and Weekend_Flag


Now computing correlation coefficient of: month_of_data and Public_Holiday_Flag


Now computing correlation coefficient of: month_of_data and Long_Weekend_Flag


Now computing correlation coefficient of: month_of_data and Vacation_Flag


Now computing correlation coefficient of: month_of_data and tavg


Now computing correlation coefficient of: month_of_data and prcp


Now computing correlation coefficient of: month_of_data and direction_of_travel_name_ind

Now computing correlation coefficient of: traffic_volume_counted_after_0000_to_0100 and method_of_data_retrieval


Now computing correlation coefficient of: traffic_volume_counted_after_0000_to_0100 and method_of_truck_weighing


Now computing correlation coefficient of: traffic_volume_counted_after_0000_to_0100 and Weekend_Flag


Now computing correlation coefficient of: traffic_volume_counted_after_0000_to_0100 and Public_Holiday_Flag


Now computing correlation coefficient of: traffic_volume_counted_after_0000_to_0100 and Long_Weekend_Flag


Now computing correlation coefficient of: traffic_volume_counted_after_0000_to_0100 and Vacation_Flag


Now computing correlation coefficient of: traffic_volume_counted_after_0000_to_0100 and tavg


Now computing correlation coefficient of: traffic_volume_counted_after_0000_to_0100 and prcp


Now computing correlation coefficient of: traffic_volume_counted_after_0000_to_0100 and direction_of_travel_name_index


Now computing correlation coeffici

Now computing correlation coefficient of: traffic_volume_counted_after_0100_to_0200 and method_of_traffic_volume_counting


Now computing correlation coefficient of: traffic_volume_counted_after_0100_to_0200 and lrs_location_point


Now computing correlation coefficient of: traffic_volume_counted_after_0100_to_0200 and method_of_vehicle_classification


Now computing correlation coefficient of: traffic_volume_counted_after_0100_to_0200 and number_of_lanes_monitored_for_truck_weight


Now computing correlation coefficient of: traffic_volume_counted_after_0100_to_0200 and year_station_discontinued


Now computing correlation coefficient of: traffic_volume_counted_after_0100_to_0200 and classification_system_for_vehicle_classification


Now computing correlation coefficient of: traffic_volume_counted_after_0100_to_0200 and fips_county_code


Now computing correlation coefficient of: traffic_volume_counted_after_0100_to_0200 and number_of_lanes_in_direction_indicated


Now computing correl

Now computing correlation coefficient of: traffic_volume_counted_after_0200_to_0300 and concurrent_route_signing


Now computing correlation coefficient of: traffic_volume_counted_after_0200_to_0300 and number_of_lanes_monitored_for_traffic_volume


Now computing correlation coefficient of: traffic_volume_counted_after_0200_to_0300 and posted_route_signing


Now computing correlation coefficient of: traffic_volume_counted_after_0200_to_0300 and year_station_established


Now computing correlation coefficient of: traffic_volume_counted_after_0200_to_0300 and method_of_traffic_volume_counting


Now computing correlation coefficient of: traffic_volume_counted_after_0200_to_0300 and lrs_location_point


Now computing correlation coefficient of: traffic_volume_counted_after_0200_to_0300 and method_of_vehicle_classification


Now computing correlation coefficient of: traffic_volume_counted_after_0200_to_0300 and number_of_lanes_monitored_for_truck_weight


Now computing correlation coefficie

Now computing correlation coefficient of: traffic_volume_counted_after_0300_to_0400 and traffic_volume_counted_after_2200_to_2300


Now computing correlation coefficient of: traffic_volume_counted_after_0300_to_0400 and traffic_volume_counted_after_2300_to_2400


Now computing correlation coefficient of: traffic_volume_counted_after_0300_to_0400 and number_of_lanes_monitored_for_vehicle_class


Now computing correlation coefficient of: traffic_volume_counted_after_0300_to_0400 and concurrent_route_signing


Now computing correlation coefficient of: traffic_volume_counted_after_0300_to_0400 and number_of_lanes_monitored_for_traffic_volume


Now computing correlation coefficient of: traffic_volume_counted_after_0300_to_0400 and posted_route_signing


Now computing correlation coefficient of: traffic_volume_counted_after_0300_to_0400 and year_station_established


Now computing correlation coefficient of: traffic_volume_counted_after_0300_to_0400 and method_of_traffic_volume_counting


No

Now computing correlation coefficient of: traffic_volume_counted_after_0400_to_0500 and traffic_volume_counted_after_2000_to_2100


Now computing correlation coefficient of: traffic_volume_counted_after_0400_to_0500 and traffic_volume_counted_after_2100_to_2200


Now computing correlation coefficient of: traffic_volume_counted_after_0400_to_0500 and traffic_volume_counted_after_2200_to_2300


Now computing correlation coefficient of: traffic_volume_counted_after_0400_to_0500 and traffic_volume_counted_after_2300_to_2400


Now computing correlation coefficient of: traffic_volume_counted_after_0400_to_0500 and number_of_lanes_monitored_for_vehicle_class


Now computing correlation coefficient of: traffic_volume_counted_after_0400_to_0500 and concurrent_route_signing


Now computing correlation coefficient of: traffic_volume_counted_after_0400_to_0500 and number_of_lanes_monitored_for_traffic_volume


Now computing correlation coefficient of: traffic_volume_counted_after_0400_to_0500 and 

Now computing correlation coefficient of: traffic_volume_counted_after_0500_to_0600 and traffic_volume_counted_after_1900_to_2000


Now computing correlation coefficient of: traffic_volume_counted_after_0500_to_0600 and traffic_volume_counted_after_2000_to_2100


Now computing correlation coefficient of: traffic_volume_counted_after_0500_to_0600 and traffic_volume_counted_after_2100_to_2200


Now computing correlation coefficient of: traffic_volume_counted_after_0500_to_0600 and traffic_volume_counted_after_2200_to_2300


Now computing correlation coefficient of: traffic_volume_counted_after_0500_to_0600 and traffic_volume_counted_after_2300_to_2400


Now computing correlation coefficient of: traffic_volume_counted_after_0500_to_0600 and number_of_lanes_monitored_for_vehicle_class


Now computing correlation coefficient of: traffic_volume_counted_after_0500_to_0600 and concurrent_route_signing


Now computing correlation coefficient of: traffic_volume_counted_after_0500_to_0600 and num

Now computing correlation coefficient of: traffic_volume_counted_after_0600_to_0700 and traffic_volume_counted_after_1800_to_1900


Now computing correlation coefficient of: traffic_volume_counted_after_0600_to_0700 and traffic_volume_counted_after_1900_to_2000


Now computing correlation coefficient of: traffic_volume_counted_after_0600_to_0700 and traffic_volume_counted_after_2000_to_2100


Now computing correlation coefficient of: traffic_volume_counted_after_0600_to_0700 and traffic_volume_counted_after_2100_to_2200


Now computing correlation coefficient of: traffic_volume_counted_after_0600_to_0700 and traffic_volume_counted_after_2200_to_2300


Now computing correlation coefficient of: traffic_volume_counted_after_0600_to_0700 and traffic_volume_counted_after_2300_to_2400


Now computing correlation coefficient of: traffic_volume_counted_after_0600_to_0700 and number_of_lanes_monitored_for_vehicle_class


Now computing correlation coefficient of: traffic_volume_counted_after_060

Now computing correlation coefficient of: traffic_volume_counted_after_0700_to_0800 and traffic_volume_counted_after_1900_to_2000


Now computing correlation coefficient of: traffic_volume_counted_after_0700_to_0800 and traffic_volume_counted_after_2000_to_2100


Now computing correlation coefficient of: traffic_volume_counted_after_0700_to_0800 and traffic_volume_counted_after_2100_to_2200


Now computing correlation coefficient of: traffic_volume_counted_after_0700_to_0800 and traffic_volume_counted_after_2200_to_2300


Now computing correlation coefficient of: traffic_volume_counted_after_0700_to_0800 and traffic_volume_counted_after_2300_to_2400


Now computing correlation coefficient of: traffic_volume_counted_after_0700_to_0800 and number_of_lanes_monitored_for_vehicle_class


Now computing correlation coefficient of: traffic_volume_counted_after_0700_to_0800 and concurrent_route_signing


Now computing correlation coefficient of: traffic_volume_counted_after_0700_to_0800 and num

Now computing correlation coefficient of: traffic_volume_counted_after_0800_to_0900 and traffic_volume_counted_after_1900_to_2000


Now computing correlation coefficient of: traffic_volume_counted_after_0800_to_0900 and traffic_volume_counted_after_2000_to_2100


Now computing correlation coefficient of: traffic_volume_counted_after_0800_to_0900 and traffic_volume_counted_after_2100_to_2200


Now computing correlation coefficient of: traffic_volume_counted_after_0800_to_0900 and traffic_volume_counted_after_2200_to_2300


Now computing correlation coefficient of: traffic_volume_counted_after_0800_to_0900 and traffic_volume_counted_after_2300_to_2400


Now computing correlation coefficient of: traffic_volume_counted_after_0800_to_0900 and number_of_lanes_monitored_for_vehicle_class


Now computing correlation coefficient of: traffic_volume_counted_after_0800_to_0900 and concurrent_route_signing


Now computing correlation coefficient of: traffic_volume_counted_after_0800_to_0900 and num

Now computing correlation coefficient of: traffic_volume_counted_after_0900_to_1000 and traffic_volume_counted_after_2000_to_2100


Now computing correlation coefficient of: traffic_volume_counted_after_0900_to_1000 and traffic_volume_counted_after_2100_to_2200


Now computing correlation coefficient of: traffic_volume_counted_after_0900_to_1000 and traffic_volume_counted_after_2200_to_2300


Now computing correlation coefficient of: traffic_volume_counted_after_0900_to_1000 and traffic_volume_counted_after_2300_to_2400


Now computing correlation coefficient of: traffic_volume_counted_after_0900_to_1000 and number_of_lanes_monitored_for_vehicle_class


Now computing correlation coefficient of: traffic_volume_counted_after_0900_to_1000 and concurrent_route_signing


Now computing correlation coefficient of: traffic_volume_counted_after_0900_to_1000 and number_of_lanes_monitored_for_traffic_volume


Now computing correlation coefficient of: traffic_volume_counted_after_0900_to_1000 and 

Now computing correlation coefficient of: traffic_volume_counted_after_1000_to_1100 and traffic_volume_counted_after_2300_to_2400


Now computing correlation coefficient of: traffic_volume_counted_after_1000_to_1100 and number_of_lanes_monitored_for_vehicle_class


Now computing correlation coefficient of: traffic_volume_counted_after_1000_to_1100 and concurrent_route_signing


Now computing correlation coefficient of: traffic_volume_counted_after_1000_to_1100 and number_of_lanes_monitored_for_traffic_volume


Now computing correlation coefficient of: traffic_volume_counted_after_1000_to_1100 and posted_route_signing


Now computing correlation coefficient of: traffic_volume_counted_after_1000_to_1100 and year_station_established


Now computing correlation coefficient of: traffic_volume_counted_after_1000_to_1100 and method_of_traffic_volume_counting


Now computing correlation coefficient of: traffic_volume_counted_after_1000_to_1100 and lrs_location_point


Now computing correlation

Now computing correlation coefficient of: traffic_volume_counted_after_1100_to_1200 and posted_route_signing


Now computing correlation coefficient of: traffic_volume_counted_after_1100_to_1200 and year_station_established


Now computing correlation coefficient of: traffic_volume_counted_after_1100_to_1200 and method_of_traffic_volume_counting


Now computing correlation coefficient of: traffic_volume_counted_after_1100_to_1200 and lrs_location_point


Now computing correlation coefficient of: traffic_volume_counted_after_1100_to_1200 and method_of_vehicle_classification


Now computing correlation coefficient of: traffic_volume_counted_after_1100_to_1200 and number_of_lanes_monitored_for_truck_weight


Now computing correlation coefficient of: traffic_volume_counted_after_1100_to_1200 and year_station_discontinued


Now computing correlation coefficient of: traffic_volume_counted_after_1100_to_1200 and classification_system_for_vehicle_classification


Now computing correlation coef

Now computing correlation coefficient of: traffic_volume_counted_after_1200_to_1300 and method_of_vehicle_classification


Now computing correlation coefficient of: traffic_volume_counted_after_1200_to_1300 and number_of_lanes_monitored_for_truck_weight


Now computing correlation coefficient of: traffic_volume_counted_after_1200_to_1300 and year_station_discontinued


Now computing correlation coefficient of: traffic_volume_counted_after_1200_to_1300 and classification_system_for_vehicle_classification


Now computing correlation coefficient of: traffic_volume_counted_after_1200_to_1300 and fips_county_code


Now computing correlation coefficient of: traffic_volume_counted_after_1200_to_1300 and number_of_lanes_in_direction_indicated


Now computing correlation coefficient of: traffic_volume_counted_after_1200_to_1300 and method_of_data_retrieval


Now computing correlation coefficient of: traffic_volume_counted_after_1200_to_1300 and method_of_truck_weighing


Now computing correlati

Now computing correlation coefficient of: traffic_volume_counted_after_1300_to_1400 and number_of_lanes_in_direction_indicated


Now computing correlation coefficient of: traffic_volume_counted_after_1300_to_1400 and method_of_data_retrieval


Now computing correlation coefficient of: traffic_volume_counted_after_1300_to_1400 and method_of_truck_weighing


Now computing correlation coefficient of: traffic_volume_counted_after_1300_to_1400 and Weekend_Flag


Now computing correlation coefficient of: traffic_volume_counted_after_1300_to_1400 and Public_Holiday_Flag


Now computing correlation coefficient of: traffic_volume_counted_after_1300_to_1400 and Long_Weekend_Flag


Now computing correlation coefficient of: traffic_volume_counted_after_1300_to_1400 and Vacation_Flag


Now computing correlation coefficient of: traffic_volume_counted_after_1300_to_1400 and tavg


Now computing correlation coefficient of: traffic_volume_counted_after_1300_to_1400 and prcp


Now computing correlation 

Now computing correlation coefficient of: traffic_volume_counted_after_1400_to_1500 and tavg


Now computing correlation coefficient of: traffic_volume_counted_after_1400_to_1500 and prcp


Now computing correlation coefficient of: traffic_volume_counted_after_1400_to_1500 and direction_of_travel_name_index


Now computing correlation coefficient of: traffic_volume_counted_after_1400_to_1500 and functional_classification_index


Now computing correlation coefficient of: traffic_volume_counted_after_1400_to_1500 and functional_classification_name_index


Now computing correlation coefficient of: traffic_volume_counted_after_1400_to_1500 and sample_type_for_truck_weight_name_index


Now computing correlation coefficient of: traffic_volume_counted_after_1400_to_1500 and calibration_of_weighing_system_name_index


Now computing correlation coefficient of: traffic_volume_counted_after_1400_to_1500 and sample_type_for_traffic_volume_index


Now computing correlation coefficient of: traffic_v

Now computing correlation coefficient of: traffic_volume_counted_after_1500_to_1600 and station_location_index


Now computing correlation coefficient of: traffic_volume_counted_after_1500_to_1600 and method_of_vehicle_classification_name_index


Now computing correlation coefficient of: traffic_volume_counted_after_1500_to_1600 and type_of_sensor_name_index


Now computing correlation coefficient of: traffic_volume_counted_after_1500_to_1600 and sample_type_for_vehicle_classification_name_index


Now computing correlation coefficient of: traffic_volume_counted_after_1500_to_1600 and type_of_sensor_index


Now computing correlation coefficient of: traffic_volume_counted_after_1500_to_1600 and hpms_sample_identifier_index


Now computing correlation coefficient of: traffic_volume_counted_after_1500_to_1600 and hpms_sample_type_index


Now computing correlation coefficient of: traffic_volume_counted_after_1500_to_1600 and method_of_traffic_volume_counting_name_index


Now computing corre

Now computing correlation coefficient of: traffic_volume_counted_after_1600_to_1700 and calibration_of_weighing_system_index


Now computing correlation coefficient of: traffic_volume_counted_after_1600_to_1700 and shrp_site_identification_index


Now computing correlation coefficient of: traffic_volume_counted_after_1600_to_1700 and posted_signed_route_number_index


Now computing correlation coefficient of: traffic_volume_counted_after_1600_to_1700 and method_of_data_retrieval_name_index


Now computing correlation coefficient of: traffic_volume_counted_after_1600_to_1700 and concurrent_signed_route_number_index


Now computing correlation coefficient of: traffic_volume_counted_after_1600_to_1700 and sample_type_for_vehicle_classification_index


Now computing correlation coefficient of: traffic_volume_counted_after_1600_to_1700 and national_highway_system_index


Now computing correlation coefficient of: traffic_volume_counted_after_1600_to_1700 and algorithm_of_vehicle_classificati

Now computing correlation coefficient of: traffic_volume_counted_after_1700_to_1800 and algorithm_of_vehicle_classification_index


Now computing correlation coefficient of: traffic_volume_counted_after_1700_to_1800 and primary_purpose_index


Now computing correlation coefficient of: traffic_volume_counted_after_1700_to_1800 and lrs_identification_index


Now computing correlation coefficient of: traffic_volume_counted_after_1700_to_1800 and method_of_truck_weighing_name_index


Now computing correlation coefficient of: traffic_volume_counted_after_1800_to_1900 and traffic_volume_counted_after_1900_to_2000


Now computing correlation coefficient of: traffic_volume_counted_after_1800_to_1900 and traffic_volume_counted_after_2000_to_2100


Now computing correlation coefficient of: traffic_volume_counted_after_1800_to_1900 and traffic_volume_counted_after_2100_to_2200


Now computing correlation coefficient of: traffic_volume_counted_after_1800_to_1900 and traffic_volume_counted_after_22

Now computing correlation coefficient of: traffic_volume_counted_after_1900_to_2000 and year_station_established


Now computing correlation coefficient of: traffic_volume_counted_after_1900_to_2000 and method_of_traffic_volume_counting


Now computing correlation coefficient of: traffic_volume_counted_after_1900_to_2000 and lrs_location_point


Now computing correlation coefficient of: traffic_volume_counted_after_1900_to_2000 and method_of_vehicle_classification


Now computing correlation coefficient of: traffic_volume_counted_after_1900_to_2000 and number_of_lanes_monitored_for_truck_weight


Now computing correlation coefficient of: traffic_volume_counted_after_1900_to_2000 and year_station_discontinued


Now computing correlation coefficient of: traffic_volume_counted_after_1900_to_2000 and classification_system_for_vehicle_classification


Now computing correlation coefficient of: traffic_volume_counted_after_1900_to_2000 and fips_county_code


Now computing correlation coeffici

Now computing correlation coefficient of: traffic_volume_counted_after_2000_to_2100 and Vacation_Flag


Now computing correlation coefficient of: traffic_volume_counted_after_2000_to_2100 and tavg


Now computing correlation coefficient of: traffic_volume_counted_after_2000_to_2100 and prcp


Now computing correlation coefficient of: traffic_volume_counted_after_2000_to_2100 and direction_of_travel_name_index


Now computing correlation coefficient of: traffic_volume_counted_after_2000_to_2100 and functional_classification_index


Now computing correlation coefficient of: traffic_volume_counted_after_2000_to_2100 and functional_classification_name_index


Now computing correlation coefficient of: traffic_volume_counted_after_2000_to_2100 and sample_type_for_truck_weight_name_index


Now computing correlation coefficient of: traffic_volume_counted_after_2000_to_2100 and calibration_of_weighing_system_name_index


Now computing correlation coefficient of: traffic_volume_counted_after_200

Now computing correlation coefficient of: traffic_volume_counted_after_2100_to_2200 and type_of_sensor_index


Now computing correlation coefficient of: traffic_volume_counted_after_2100_to_2200 and hpms_sample_identifier_index


Now computing correlation coefficient of: traffic_volume_counted_after_2100_to_2200 and hpms_sample_type_index


Now computing correlation coefficient of: traffic_volume_counted_after_2100_to_2200 and method_of_traffic_volume_counting_name_index


Now computing correlation coefficient of: traffic_volume_counted_after_2100_to_2200 and second_type_of_sensor_index


Now computing correlation coefficient of: traffic_volume_counted_after_2100_to_2200 and lane_of_travel_name_index


Now computing correlation coefficient of: traffic_volume_counted_after_2100_to_2200 and calibration_of_weighing_system_index


Now computing correlation coefficient of: traffic_volume_counted_after_2100_to_2200 and shrp_site_identification_index


Now computing correlation coefficient of

Now computing correlation coefficient of: traffic_volume_counted_after_2200_to_2300 and algorithm_of_vehicle_classification_index


Now computing correlation coefficient of: traffic_volume_counted_after_2200_to_2300 and primary_purpose_index


Now computing correlation coefficient of: traffic_volume_counted_after_2200_to_2300 and lrs_identification_index


Now computing correlation coefficient of: traffic_volume_counted_after_2200_to_2300 and method_of_truck_weighing_name_index


Now computing correlation coefficient of: traffic_volume_counted_after_2300_to_2400 and number_of_lanes_monitored_for_vehicle_class


Now computing correlation coefficient of: traffic_volume_counted_after_2300_to_2400 and concurrent_route_signing


Now computing correlation coefficient of: traffic_volume_counted_after_2300_to_2400 and number_of_lanes_monitored_for_traffic_volume


Now computing correlation coefficient of: traffic_volume_counted_after_2300_to_2400 and posted_route_signing


Now computing correl

Now computing correlation coefficient of: number_of_lanes_monitored_for_vehicle_class and method_of_truck_weighing


Now computing correlation coefficient of: number_of_lanes_monitored_for_vehicle_class and Weekend_Flag


Now computing correlation coefficient of: number_of_lanes_monitored_for_vehicle_class and Public_Holiday_Flag


Now computing correlation coefficient of: number_of_lanes_monitored_for_vehicle_class and Long_Weekend_Flag


Now computing correlation coefficient of: number_of_lanes_monitored_for_vehicle_class and Vacation_Flag


Now computing correlation coefficient of: number_of_lanes_monitored_for_vehicle_class and tavg


Now computing correlation coefficient of: number_of_lanes_monitored_for_vehicle_class and prcp


Now computing correlation coefficient of: number_of_lanes_monitored_for_vehicle_class and direction_of_travel_name_index


Now computing correlation coefficient of: number_of_lanes_monitored_for_vehicle_class and functional_classification_index


Now compu



Now computing correlation coefficient of: concurrent_route_signing and calibration_of_weighing_system_index


Now computing correlation coefficient of: concurrent_route_signing and shrp_site_identification_index


Now computing correlation coefficient of: concurrent_route_signing and posted_signed_route_number_index


Now computing correlation coefficient of: concurrent_route_signing and method_of_data_retrieval_name_index


Now computing correlation coefficient of: concurrent_route_signing and concurrent_signed_route_number_index


Now computing correlation coefficient of: concurrent_route_signing and sample_type_for_vehicle_classification_index


Now computing correlation coefficient of: concurrent_route_signing and national_highway_system_index


Now computing correlation coefficient of: concurrent_route_signing and algorithm_of_vehicle_classification_name_index


Now computing correlation coefficient of: concurrent_route_signing and sample_type_for_truck_weight_index


Now comput

Now computing correlation coefficient of: posted_route_signing and number_of_lanes_in_direction_indicated


Now computing correlation coefficient of: posted_route_signing and method_of_data_retrieval


Now computing correlation coefficient of: posted_route_signing and method_of_truck_weighing


Now computing correlation coefficient of: posted_route_signing and Weekend_Flag


Now computing correlation coefficient of: posted_route_signing and Public_Holiday_Flag


Now computing correlation coefficient of: posted_route_signing and Long_Weekend_Flag


Now computing correlation coefficient of: posted_route_signing and Vacation_Flag


Now computing correlation coefficient of: posted_route_signing and tavg


Now computing correlation coefficient of: posted_route_signing and prcp


Now computing correlation coefficient of: posted_route_signing and direction_of_travel_name_index


Now computing correlation coefficient of: posted_route_signing and functional_classification_index


Now computing 

Now computing correlation coefficient of: year_station_established and algorithm_of_vehicle_classification_index


Now computing correlation coefficient of: year_station_established and primary_purpose_index


Now computing correlation coefficient of: year_station_established and lrs_identification_index


Now computing correlation coefficient of: year_station_established and method_of_truck_weighing_name_index


Now computing correlation coefficient of: method_of_traffic_volume_counting and lrs_location_point


Now computing correlation coefficient of: method_of_traffic_volume_counting and method_of_vehicle_classification


Now computing correlation coefficient of: method_of_traffic_volume_counting and number_of_lanes_monitored_for_truck_weight


Now computing correlation coefficient of: method_of_traffic_volume_counting and year_station_discontinued


Now computing correlation coefficient of: method_of_traffic_volume_counting and classification_system_for_vehicle_classification


Now

Now computing correlation coefficient of: lrs_location_point and lane_of_travel_name_index


Now computing correlation coefficient of: lrs_location_point and calibration_of_weighing_system_index


Now computing correlation coefficient of: lrs_location_point and shrp_site_identification_index


Now computing correlation coefficient of: lrs_location_point and posted_signed_route_number_index


Now computing correlation coefficient of: lrs_location_point and method_of_data_retrieval_name_index


Now computing correlation coefficient of: lrs_location_point and concurrent_signed_route_number_index


Now computing correlation coefficient of: lrs_location_point and sample_type_for_vehicle_classification_index


Now computing correlation coefficient of: lrs_location_point and national_highway_system_index


Now computing correlation coefficient of: lrs_location_point and algorithm_of_vehicle_classification_name_index


Now computing correlation coefficient of: lrs_location_point and sample_typ

Now computing correlation coefficient of: number_of_lanes_monitored_for_truck_weight and calibration_of_weighing_system_name_index


Now computing correlation coefficient of: number_of_lanes_monitored_for_truck_weight and sample_type_for_traffic_volume_index


Now computing correlation coefficient of: number_of_lanes_monitored_for_truck_weight and primary_purpose_name_index


Now computing correlation coefficient of: number_of_lanes_monitored_for_truck_weight and station_location_index


Now computing correlation coefficient of: number_of_lanes_monitored_for_truck_weight and method_of_vehicle_classification_name_index


Now computing correlation coefficient of: number_of_lanes_monitored_for_truck_weight and type_of_sensor_name_index


Now computing correlation coefficient of: number_of_lanes_monitored_for_truck_weight and sample_type_for_vehicle_classification_name_index


Now computing correlation coefficient of: number_of_lanes_monitored_for_truck_weight and type_of_sensor_index


No

Now computing correlation coefficient of: classification_system_for_vehicle_classification and Public_Holiday_Flag


Now computing correlation coefficient of: classification_system_for_vehicle_classification and Long_Weekend_Flag


Now computing correlation coefficient of: classification_system_for_vehicle_classification and Vacation_Flag


Now computing correlation coefficient of: classification_system_for_vehicle_classification and tavg


Now computing correlation coefficient of: classification_system_for_vehicle_classification and prcp


Now computing correlation coefficient of: classification_system_for_vehicle_classification and direction_of_travel_name_index


Now computing correlation coefficient of: classification_system_for_vehicle_classification and functional_classification_index


Now computing correlation coefficient of: classification_system_for_vehicle_classification and functional_classification_name_index


Now computing correlation coefficient of: classification_syste

Now computing correlation coefficient of: number_of_lanes_in_direction_indicated and method_of_data_retrieval


Now computing correlation coefficient of: number_of_lanes_in_direction_indicated and method_of_truck_weighing


Now computing correlation coefficient of: number_of_lanes_in_direction_indicated and Weekend_Flag


Now computing correlation coefficient of: number_of_lanes_in_direction_indicated and Public_Holiday_Flag


Now computing correlation coefficient of: number_of_lanes_in_direction_indicated and Long_Weekend_Flag


Now computing correlation coefficient of: number_of_lanes_in_direction_indicated and Vacation_Flag


Now computing correlation coefficient of: number_of_lanes_in_direction_indicated and tavg


Now computing correlation coefficient of: number_of_lanes_in_direction_indicated and prcp


Now computing correlation coefficient of: number_of_lanes_in_direction_indicated and direction_of_travel_name_index


Now computing correlation coefficient of: number_of_lanes_in_

Now computing correlation coefficient of: method_of_data_retrieval and lrs_identification_index


Now computing correlation coefficient of: method_of_data_retrieval and method_of_truck_weighing_name_index


Now computing correlation coefficient of: method_of_truck_weighing and Weekend_Flag


Now computing correlation coefficient of: method_of_truck_weighing and Public_Holiday_Flag


Now computing correlation coefficient of: method_of_truck_weighing and Long_Weekend_Flag


Now computing correlation coefficient of: method_of_truck_weighing and Vacation_Flag


Now computing correlation coefficient of: method_of_truck_weighing and tavg


Now computing correlation coefficient of: method_of_truck_weighing and prcp


Now computing correlation coefficient of: method_of_truck_weighing and direction_of_travel_name_index


Now computing correlation coefficient of: method_of_truck_weighing and functional_classification_index


Now computing correlation coefficient of: method_of_truck_weighing and 

Now computing correlation coefficient of: Public_Holiday_Flag and sample_type_for_traffic_volume_index


Now computing correlation coefficient of: Public_Holiday_Flag and primary_purpose_name_index


Now computing correlation coefficient of: Public_Holiday_Flag and station_location_index


Now computing correlation coefficient of: Public_Holiday_Flag and method_of_vehicle_classification_name_index


Now computing correlation coefficient of: Public_Holiday_Flag and type_of_sensor_name_index


Now computing correlation coefficient of: Public_Holiday_Flag and sample_type_for_vehicle_classification_name_index


Now computing correlation coefficient of: Public_Holiday_Flag and type_of_sensor_index


Now computing correlation coefficient of: Public_Holiday_Flag and hpms_sample_identifier_index


Now computing correlation coefficient of: Public_Holiday_Flag and hpms_sample_type_index


Now computing correlation coefficient of: Public_Holiday_Flag and method_of_traffic_volume_counting_name_ind

Now computing correlation coefficient of: Vacation_Flag and algorithm_of_vehicle_classification_name_index


Now computing correlation coefficient of: Vacation_Flag and sample_type_for_truck_weight_index


Now computing correlation coefficient of: Vacation_Flag and sample_type_for_traffic_volume_name_index


Now computing correlation coefficient of: Vacation_Flag and algorithm_of_vehicle_classification_index


Now computing correlation coefficient of: Vacation_Flag and primary_purpose_index


Now computing correlation coefficient of: Vacation_Flag and lrs_identification_index


Now computing correlation coefficient of: Vacation_Flag and method_of_truck_weighing_name_index


Now computing correlation coefficient of: tavg and prcp


Now computing correlation coefficient of: tavg and direction_of_travel_name_index


Now computing correlation coefficient of: tavg and functional_classification_index


Now computing correlation coefficient of: tavg and functional_classification_name_index




Now computing correlation coefficient of: direction_of_travel_name_index and concurrent_signed_route_number_index


Now computing correlation coefficient of: direction_of_travel_name_index and sample_type_for_vehicle_classification_index


Now computing correlation coefficient of: direction_of_travel_name_index and national_highway_system_index


Now computing correlation coefficient of: direction_of_travel_name_index and algorithm_of_vehicle_classification_name_index


Now computing correlation coefficient of: direction_of_travel_name_index and sample_type_for_truck_weight_index


Now computing correlation coefficient of: direction_of_travel_name_index and sample_type_for_traffic_volume_name_index


Now computing correlation coefficient of: direction_of_travel_name_index and algorithm_of_vehicle_classification_index


Now computing correlation coefficient of: direction_of_travel_name_index and primary_purpose_index


Now computing correlation coefficient of: direction_of_travel_name_i

Now computing correlation coefficient of: sample_type_for_truck_weight_name_index and type_of_sensor_name_index


Now computing correlation coefficient of: sample_type_for_truck_weight_name_index and sample_type_for_vehicle_classification_name_index


Now computing correlation coefficient of: sample_type_for_truck_weight_name_index and type_of_sensor_index


Now computing correlation coefficient of: sample_type_for_truck_weight_name_index and hpms_sample_identifier_index


Now computing correlation coefficient of: sample_type_for_truck_weight_name_index and hpms_sample_type_index


Now computing correlation coefficient of: sample_type_for_truck_weight_name_index and method_of_traffic_volume_counting_name_index


Now computing correlation coefficient of: sample_type_for_truck_weight_name_index and second_type_of_sensor_index


Now computing correlation coefficient of: sample_type_for_truck_weight_name_index and lane_of_travel_name_index


Now computing correlation coefficient of: sample

Now computing correlation coefficient of: sample_type_for_traffic_volume_index and algorithm_of_vehicle_classification_index


Now computing correlation coefficient of: sample_type_for_traffic_volume_index and primary_purpose_index


Now computing correlation coefficient of: sample_type_for_traffic_volume_index and lrs_identification_index


Now computing correlation coefficient of: sample_type_for_traffic_volume_index and method_of_truck_weighing_name_index


Now computing correlation coefficient of: primary_purpose_name_index and station_location_index


Now computing correlation coefficient of: primary_purpose_name_index and method_of_vehicle_classification_name_index


Now computing correlation coefficient of: primary_purpose_name_index and type_of_sensor_name_index


Now computing correlation coefficient of: primary_purpose_name_index and sample_type_for_vehicle_classification_name_index


Now computing correlation coefficient of: primary_purpose_name_index and type_of_sensor_inde

Now computing correlation coefficient of: type_of_sensor_name_index and sample_type_for_vehicle_classification_name_index


Now computing correlation coefficient of: type_of_sensor_name_index and type_of_sensor_index


Now computing correlation coefficient of: type_of_sensor_name_index and hpms_sample_identifier_index


Now computing correlation coefficient of: type_of_sensor_name_index and hpms_sample_type_index


Now computing correlation coefficient of: type_of_sensor_name_index and method_of_traffic_volume_counting_name_index


Now computing correlation coefficient of: type_of_sensor_name_index and second_type_of_sensor_index


Now computing correlation coefficient of: type_of_sensor_name_index and lane_of_travel_name_index


Now computing correlation coefficient of: type_of_sensor_name_index and calibration_of_weighing_system_index


Now computing correlation coefficient of: type_of_sensor_name_index and shrp_site_identification_index


Now computing correlation coefficient of: ty

Now computing correlation coefficient of: hpms_sample_identifier_index and sample_type_for_traffic_volume_name_index


Now computing correlation coefficient of: hpms_sample_identifier_index and algorithm_of_vehicle_classification_index


Now computing correlation coefficient of: hpms_sample_identifier_index and primary_purpose_index


Now computing correlation coefficient of: hpms_sample_identifier_index and lrs_identification_index


Now computing correlation coefficient of: hpms_sample_identifier_index and method_of_truck_weighing_name_index


Now computing correlation coefficient of: hpms_sample_type_index and method_of_traffic_volume_counting_name_index


Now computing correlation coefficient of: hpms_sample_type_index and second_type_of_sensor_index


Now computing correlation coefficient of: hpms_sample_type_index and lane_of_travel_name_index


Now computing correlation coefficient of: hpms_sample_type_index and calibration_of_weighing_system_index


Now computing correlation co

Now computing correlation coefficient of: calibration_of_weighing_system_index and algorithm_of_vehicle_classification_name_index


Now computing correlation coefficient of: calibration_of_weighing_system_index and sample_type_for_truck_weight_index


Now computing correlation coefficient of: calibration_of_weighing_system_index and sample_type_for_traffic_volume_name_index


Now computing correlation coefficient of: calibration_of_weighing_system_index and algorithm_of_vehicle_classification_index


Now computing correlation coefficient of: calibration_of_weighing_system_index and primary_purpose_index


Now computing correlation coefficient of: calibration_of_weighing_system_index and lrs_identification_index


Now computing correlation coefficient of: calibration_of_weighing_system_index and method_of_truck_weighing_name_index


Now computing correlation coefficient of: shrp_site_identification_index and posted_signed_route_number_index


Now computing correlation coefficient of: sh

Now computing correlation coefficient of: algorithm_of_vehicle_classification_name_index and method_of_truck_weighing_name_index


Now computing correlation coefficient of: sample_type_for_truck_weight_index and sample_type_for_traffic_volume_name_index


Now computing correlation coefficient of: sample_type_for_truck_weight_index and algorithm_of_vehicle_classification_index


Now computing correlation coefficient of: sample_type_for_truck_weight_index and primary_purpose_index


Now computing correlation coefficient of: sample_type_for_truck_weight_index and lrs_identification_index


Now computing correlation coefficient of: sample_type_for_truck_weight_index and method_of_truck_weighing_name_index


Now computing correlation coefficient of: sample_type_for_traffic_volume_name_index and algorithm_of_vehicle_classification_index


Now computing correlation coefficient of: sample_type_for_traffic_volume_name_index and primary_purpose_index


Now computing correlation coefficient of: s

In [35]:
"""Filtering the non-identical variable combinations which are having a correlation coefficient of 1"""

CC_Table_of_Variables[(CC_Table_of_Variables['CC']==1) & (CC_Table_of_Variables['Variable1']!=CC_Table_of_Variables['Variable2'])]

Unnamed: 0,Variable1,Variable2,CC
2007,number_of_lanes_monitored_for_vehicle_class,method_of_data_retrieval,1.0
2057,concurrent_route_signing,method_of_data_retrieval,1.0
2106,number_of_lanes_monitored_for_traffic_volume,method_of_data_retrieval,1.0
2154,posted_route_signing,method_of_data_retrieval,1.0
2247,method_of_traffic_volume_counting,method_of_data_retrieval,1.0
2336,method_of_vehicle_classification,method_of_data_retrieval,1.0
2379,number_of_lanes_monitored_for_truck_weight,method_of_data_retrieval,1.0
2541,number_of_lanes_in_direction_indicated,method_of_data_retrieval,1.0
2886,functional_classification_index,functional_classification_name_index,1.0
3091,type_of_sensor_name_index,type_of_sensor_index,1.0


## The following variables were dropped to eliminate the identical variable issue:

## ['method_of_data_retrieval', 'functional_classification_name_index', 'type_of_sensor_name_index']

In [36]:
"""Dropping the aforementioned Variables"""

Variables_to_Drop=['method_of_data_retrieval', 'functional_classification_name_index', 'type_of_sensor_name_index']
US_Traffic_2015.drop(Variables_to_Drop, axis=1, inplace=True)

## Since the primary objective is to identify patterns in Traffic Volumes, hence the recorded traffic volume variables (example: traffic_volume_counted_after_0100_to_0200), will be henceforth called as Target Variables

## There are 24 target variables of 24 respective hourly bins, recording the traffic volumes of a particular combination of station_id, locational information (geographical place), traffic flow direction and type of road over a period of 1 year

## Typically, traffic volumes vary throughout the day depending on multiple factors of business hours, non-business hours, night time, etc. However, there's opportunity to club some of the hourly bins into a single variable based on the respective average traffic volumes. For example, traffic volumes can be similar between 10am to 11am and 11am to 12 noon, hence this 2 variables can be clubbed into a single variable representing traffic volume between 10am to 12 noon.

## A K-Means clustering of the means of these 24 variables were performed to create 3 such groups, which will be henceforth named as High Traffic Volume, Medium Traffic Volume and Low Traffic Volume.

In [37]:
"""Identifying Target Variables"""

Target_Variables=[var for var in US_Traffic_2015.columns.tolist() if var.startswith('traffic_volume_counted_')]

In [38]:
"""Applying K-Means Clustering algorithm on the means of the 24-hourly bins of traffic volumes to group the variables into 
    3 major classes: High Traffic Volume, Medium Traffic Volume and Low Traffic Volume"""

Hourly_Traffic_Volume_Means=US_Traffic_2015[Target_Variables].mean().to_frame().reset_index()
Hourly_Traffic_Volume_Means.columns=['Hourly_Bins','Means']
KMeans_model=KMeans(n_clusters=3, random_state=12345)
KMeans_model.fit(np.array(Hourly_Traffic_Volume_Means['Means']).reshape(-1,1))
Cluster_Output = KMeans_model.predict(np.array(Hourly_Traffic_Volume_Means['Means']).reshape(-1,1))
Hourly_Traffic_Volume_Means['Cluster_Output']=Cluster_Output.tolist()
Hourly_Traffic_Volume_Means

Unnamed: 0,Hourly_Bins,Means,Cluster_Output
0,traffic_volume_counted_after_0000_to_0100,120.752596,0
1,traffic_volume_counted_after_0100_to_0200,83.187115,0
2,traffic_volume_counted_after_0200_to_0300,70.222543,0
3,traffic_volume_counted_after_0300_to_0400,74.562939,0
4,traffic_volume_counted_after_0400_to_0500,125.186877,0
5,traffic_volume_counted_after_0500_to_0600,261.220392,0
6,traffic_volume_counted_after_0600_to_0700,455.983632,2
7,traffic_volume_counted_after_0700_to_0800,610.374286,1
8,traffic_volume_counted_after_0800_to_0900,604.2636,1
9,traffic_volume_counted_after_0900_to_1000,586.684575,1


## Based on the kmeans clustering output, we can see that there's heavy traffic volume between 7 am to 7 pm (Business Hours), medium traffic volume in between 6am to 7am in the morning and between 7pm to 10pm at night (Non-Business Hours) and low traffic volume between 10pm to 6am (Night-Time)

## As per the above definitions the 24 Target Variables will be converted to 3 and the original ones will be dropped

In [39]:
"""Clubbing the 24 Target Variables into 3, based on average traffic volumes and classified into Non-Business Hours, Night-Time
    and Business Hours. The original 24 Target variables were dropped after this exercise"""

US_Traffic_2015['Traffic_Volume_Non_Business_Hours']=US_Traffic_2015[Hourly_Traffic_Volume_Means[Hourly_Traffic_Volume_Means['Cluster_Output']==2]['Hourly_Bins'].tolist()].sum(axis=1)
US_Traffic_2015['Traffic_Volume_Night_Time']=US_Traffic_2015[Hourly_Traffic_Volume_Means[Hourly_Traffic_Volume_Means['Cluster_Output']==0]['Hourly_Bins'].tolist()].sum(axis=1)
US_Traffic_2015['Traffic_Volume_Business_Hours']=US_Traffic_2015[Hourly_Traffic_Volume_Means[Hourly_Traffic_Volume_Means['Cluster_Output']==1]['Hourly_Bins'].tolist()].sum(axis=1)

US_Traffic_2015.drop(Target_Variables, axis=1, inplace=True)
Target_Variables=['Traffic_Volume_Non_Business_Hours','Traffic_Volume_Night_Time','Traffic_Volume_Business_Hours']

## Data Visualization:


## All other variables apart from the Target Variables will be hereafter called as the Independent Variables, and the objective of this analysis is to evaluate Bi-Variate relationship between the Target Variables and the Independent Variables.

## The Target variables are traffic volume information and it's continuous in nature. Since all the categorical variables are already indexed, so all the independent variables which we are going to deal with will be numeric.

## However, in order to make the visualizations intuitive, all independent variables which are indexed (originally categorical) or variables having less than 50 unique levels or variables which portray nominal behaviour, will be used to create variable average bar plots. 

## Variable Average Bar Plot: For every unique level of the independent variable the average of the target variable will be computed, and bars will be stacked for each unique level.

## For the rest of the independent variables other than #5.4 (typically with more than 50 unique levels), scatter plots will be created with respect to the target variable. 

In [40]:
"""Segregating Independent Variables into 2 lists, 
    -one containing less than 50 unique levels or indexed variables or variables having nominal pattern 
    -the other one containing more than 50 unique levels"""

Independent_Variables_for_Variable_Average_Plots=[item for item in list(set(US_Traffic_2015.columns.tolist())-set(Target_Variables)) if ((len(US_Traffic_2015[item].unique())<50) or (item.endswith('_index')))]
Independent_Variables_for_Scatter_Plots=[item for item in list(set(US_Traffic_2015.columns.tolist())-set(Target_Variables)-set(Independent_Variables_for_Variable_Average_Plots))]

In [41]:
"""Checking the list of variables, we selected for Scatter Plots"""

print(Independent_Variables_for_Scatter_Plots)
print("\n")

['year_station_established', 'fips_county_code', 'fips_state_code', 'lrs_location_point', 'tavg', 'prcp']




## 'fips_county_code' and 'fips_state_code' are nominal in nature and hence will be removed from the Scatter Plots and list and added in the Variable_Average_Plots_List

In [42]:
"""Transferring the aforementioned variables from Scatter Plots list to Variable Average Bar Plots list"""

Independent_Variables_for_Variable_Average_Plots=Independent_Variables_for_Variable_Average_Plots+['fips_county_code','fips_state_code']
Independent_Variables_for_Scatter_Plots=list(set(Independent_Variables_for_Scatter_Plots)-set(['fips_county_code','fips_state_code']))

print(Independent_Variables_for_Scatter_Plots)

['prcp', 'year_station_established', 'lrs_location_point', 'tavg']


In [43]:
"""Generating Variable Average Bar Plots"""

index=0
for var in Independent_Variables_for_Variable_Average_Plots:
    temp_df=US_Traffic_2015.groupby(var)[Target_Variables].mean().reset_index()
    temp_df=temp_df[temp_df[var]!=-99999999]
    temp_df.index=temp_df[var].tolist()
    temp_df.drop(var, axis=1, inplace=True)
    temp_df.plot(kind='bar', stacked=False, figsize=(18,10), fontsize=20)
    plt.xlabel(var,fontdict={'fontsize':24})
    plt.ylabel('Average_Traffic_Volume',fontdict={'fontsize':24})
    plt.legend(loc=1,fontsize=20);
    plt.savefig('.\Output\Bi_Variate_Plots\{0}_{1}_vs_Average_Traffic_Volume.jpg'.format(str(index),var),dpi=300,bbox_inches='tight')
    plt.close()
    index=index+1

In [44]:
"""Generating Scatter Plots"""

for var in Independent_Variables_for_Scatter_Plots:
    plt.figure(figsize=(18, 10), dpi=300)
    plt.scatter(x=var, y=Target_Variables[0], data=US_Traffic_2015[US_Traffic_2015[var]!=-99999999], marker=".", label=Target_Variables[0])
    plt.scatter(x=var, y=Target_Variables[1], data=US_Traffic_2015[US_Traffic_2015[var]!=-99999999], marker="v", label=Target_Variables[1])
    plt.scatter(x=var, y=Target_Variables[2], data=US_Traffic_2015[US_Traffic_2015[var]!=-99999999], marker="^", label=Target_Variables[2])
    plt.legend(loc=1,fontsize=20)
    plt.xlabel(var)
    plt.ylabel('Average_Traffic_Volume')
    plt.savefig('.\Output\Bi_Variate_Plots\{0}_{1}_vs_Average_Traffic_Volume.jpg'.format(str(index),var),dpi=300,bbox_inches='tight')
    plt.close()
    index=index+1

In [45]:
"""Saving a copy of the cleaned US_Traffic_2015 dataset for predictive model development"""

US_Traffic_2015.to_parquet('.\\Data\\US_Traffic_2015.pqt')