Install packages. In Anaconda Prompt, activate the environment. Use pip install to install packages. 

In [1]:
# pip install pandas
# pip install cufflinks

Import libraries.

In [2]:
import pandas as pd
import cufflinks
import plotly.figure_factory as ff

First, I load the data from the file given.

In [3]:
# Load the hourly traffic and weather observations.
data = pd.read_csv("traffic_data.csv")
# pandas >= 2.0 treats the literal text "None" as a missing value when parsing,
# which would make every later `holiday != 'None'` filter match all rows.
# Restore the sentinel string so the column looks the same on any pandas version.
data["holiday"] = data["holiday"].fillna("None")

In [4]:
# Preview the first five rows of the raw table.
data.head(n=5)

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
0,New Years Day,263.49,0.0,0,58,Clouds,broken clouds,2013-01-01 00:00:00,1439
1,,263.78,0.0,0,40,Clouds,scattered clouds,2013-01-01 01:00:00,1502
2,,264.16,0.0,0,75,Snow,heavy snow,2013-01-01 02:00:00,933
3,,263.95,0.0,0,90,Clouds,overcast clouds,2013-01-01 03:00:00,576
4,,263.65,0.0,0,90,Clouds,overcast clouds,2013-01-01 04:00:00,372


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,temp,rain_1h,snow_1h,clouds_all,traffic_volume
count,8573.0,8573.0,8573.0,8573.0,8573.0
mean,278.976352,0.161284,0.0,52.560947,3286.76216
std,13.322459,1.439626,0.0,37.470062,2028.322769
min,244.82,0.0,0.0,0.0,164.0
25%,269.46,0.0,0.0,1.0,1193.0
50%,277.33,0.0,0.0,64.0,3344.0
75%,290.4,0.0,0.0,90.0,5001.0
max,308.24,55.63,0.0,100.0,7217.0


Split date_time column into 2 columns, one containing date and the other containing time.

In [6]:
# Cut each "date time" string at its first space: part 0 is the date, part 1 the time.
date_time_parts = data["date_time"].str.split(" ", n=1, expand=True)
# Append the two new columns, then discard the combined column they came from.
data = (
    data
    .assign(date=date_time_parts[0], time=date_time_parts[1])
    .drop(columns=["date_time"])
)
data.head()

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,traffic_volume,date,time
0,New Years Day,263.49,0.0,0,58,Clouds,broken clouds,1439,2013-01-01,00:00:00
1,,263.78,0.0,0,40,Clouds,scattered clouds,1502,2013-01-01,01:00:00
2,,264.16,0.0,0,75,Snow,heavy snow,933,2013-01-01,02:00:00
3,,263.95,0.0,0,90,Clouds,overcast clouds,576,2013-01-01,03:00:00
4,,263.65,0.0,0,90,Clouds,overcast clouds,372,2013-01-01,04:00:00


There are 10 holidays in the year 2013. The dates are given in the 'date' column.

In [7]:
# Rows whose holiday label is anything other than the "None" placeholder.
is_holiday_row = data["holiday"] != "None"
data.loc[is_holiday_row]

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,traffic_volume,date,time
0,New Years Day,263.49,0.0,0,58,Clouds,broken clouds,1439,2013-01-01,00:00:00
1138,Washingtons Birthday,258.96,0.0,0,20,Clouds,few clouds,556,2013-02-18,00:00:00
3871,Memorial Day,286.37,0.0,0,90,Clouds,overcast clouds,863,2013-05-27,00:00:00
4855,Independence Day,290.08,0.0,0,1,Clear,sky is clear,1060,2013-07-04,00:00:00
6016,State Fair,297.42,0.0,0,12,Clouds,few clouds,661,2013-08-22,00:00:00
6183,Labor Day,288.78,0.0,0,0,Clear,Sky is Clear,1041,2013-09-02,00:00:00
6896,Columbus Day,277.72,0.0,0,0,Clear,Sky is Clear,615,2013-10-14,00:00:00
7209,Veterans Day,275.44,0.0,0,64,Clouds,broken clouds,514,2013-11-11,00:00:00
7622,Thanksgiving Day,268.24,0.0,0,64,Clouds,broken clouds,929,2013-11-28,00:00:00
8380,Christmas Day,260.17,0.25,0,64,Rain,light rain,712,2013-12-25,00:00:00


Take 4 July 2013 as an example: only the 00:00:00 row is labelled with the holiday. The other hours of that day should also be marked as a holiday.

In [8]:
# First few hourly rows recorded on 4 July 2013.
on_july_4th = data["date"] == "2013-07-04"
data.loc[on_july_4th].head()

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,traffic_volume,date,time
4855,Independence Day,290.08,0.0,0,1,Clear,sky is clear,1060,2013-07-04,00:00:00
4856,,288.61,0.0,0,1,Clear,sky is clear,611,2013-07-04,01:00:00
4857,,288.1,0.0,0,1,Clear,sky is clear,463,2013-07-04,02:00:00
4858,,287.37,0.0,0,1,Clear,sky is clear,333,2013-07-04,03:00:00
4859,,286.86,0.0,0,1,Clear,sky is clear,369,2013-07-04,04:00:00


I need to input the holiday into the other rows.

In [9]:
# Only one row per holiday (the 00:00:00 one) carries the holiday name.
# Build a date -> holiday lookup from the rows that are already labelled and
# apply it to every row sharing that date, instead of hard-coding each date.
labelled = data["holiday"].notna() & (data["holiday"] != "None")
holiday_by_date = (
    data.loc[labelled, ["date", "holiday"]]
    .drop_duplicates(subset="date")  # one label per date keeps the lookup index unique
    .set_index("date")["holiday"]
)
# Dates absent from the lookup are ordinary days and keep the "None" placeholder.
data["holiday"] = data["date"].map(holiday_by_date).fillna("None")

Earlier, the descriptive statistics showed that the snow_1h column is uniformly 0 (indicating no snow), even though the textual weather description does mention snow. I remove the snow_1h column, since it is not needed in the analysis.

Also, I want to create a column called 'hol' to indicate if the data point occurs on a holiday or not.

In [10]:
# snow_1h is constant at 0, so it carries no information for the analysis.
data = data.drop(columns="snow_1h")
# Flag each row: 'yes' when it carries a holiday name, 'no' for the "None" placeholder.
data["hol"] = data["holiday"].ne("None").map({True: "yes", False: "no"})
data

Unnamed: 0,holiday,temp,rain_1h,clouds_all,weather_main,weather_description,traffic_volume,date,time,hol
0,New Years Day,263.49,0.0,58,Clouds,broken clouds,1439,2013-01-01,00:00:00,yes
1,New Years Day,263.78,0.0,40,Clouds,scattered clouds,1502,2013-01-01,01:00:00,yes
2,New Years Day,264.16,0.0,75,Snow,heavy snow,933,2013-01-01,02:00:00,yes
3,New Years Day,263.95,0.0,90,Clouds,overcast clouds,576,2013-01-01,03:00:00,yes
4,New Years Day,263.65,0.0,90,Clouds,overcast clouds,372,2013-01-01,04:00:00,yes
...,...,...,...,...,...,...,...,...,...,...
8568,,248.13,0.0,1,Clear,sky is clear,4504,2013-12-31,19:00:00,no
8569,,247.66,0.0,1,Clear,sky is clear,3478,2013-12-31,20:00:00,no
8570,,248.63,0.0,1,Clear,sky is clear,2711,2013-12-31,21:00:00,no
8571,,248.39,0.0,40,Clouds,scattered clouds,2189,2013-12-31,22:00:00,no


Check out the data types of the attributes.

In [11]:
# Show the dtype pandas assigned to every column.
column_dtypes = data.dtypes
print(column_dtypes)

holiday                 object
temp                   float64
rain_1h                float64
clouds_all               int64
weather_main            object
weather_description     object
traffic_volume           int64
date                    object
time                    object
hol                     object
dtype: object


Convert the date attribute to datetime. From the date attribute, I can determine the day of the week and subsequently whether it is a weekday or a weekend.

In [12]:
# Parse the date strings so the .dt accessor is available.
data["date"] = pd.to_datetime(data["date"])
# pandas numbers the days of the week from Monday = 0 to Sunday = 6.
data["dow"] = data["date"].dt.dayofweek
# Monday-Friday are weekdays, Saturday and Sunday the weekend.
day_type_by_dow = {
    0: "weekday",
    1: "weekday",
    2: "weekday",
    3: "weekday",
    4: "weekday",
    5: "weekend",
    6: "weekend",
}
data["typeofday"] = data["dow"].map(day_type_by_dow)

Drop the weather_description and date columns. I am not doing analysis on the text in this project, and the date has already been used to derive the day of the week.

In [13]:
# Analysis frame: everything except the free-text weather description and the raw date.
df8 = data.drop(columns=["weather_description", "date"])

Make the various object data into categorical data.

In [14]:
# Columns holding labels rather than measurements; store them as pandas categoricals.
categorical_columns = ["holiday", "weather_main", "time", "hol", "dow", "typeofday"]
for column in categorical_columns:
    df8[column] = df8[column].astype("category")

This is the data type.

In [15]:
# Confirm the dtypes after the categorical conversion.
analysis_dtypes = df8.dtypes
print(analysis_dtypes)

holiday           category
temp               float64
rain_1h            float64
clouds_all           int64
weather_main      category
traffic_volume       int64
time              category
hol               category
dow               category
typeofday         category
dtype: object


This is the top few rows of the data.

In [16]:
# Preview the first five rows of the analysis frame.
df8.head(n=5)

Unnamed: 0,holiday,temp,rain_1h,clouds_all,weather_main,traffic_volume,time,hol,dow,typeofday
0,New Years Day,263.49,0.0,58,Clouds,1439,00:00:00,yes,1,weekday
1,New Years Day,263.78,0.0,40,Clouds,1502,01:00:00,yes,1,weekday
2,New Years Day,264.16,0.0,75,Snow,933,02:00:00,yes,1,weekday
3,New Years Day,263.95,0.0,90,Clouds,576,03:00:00,yes,1,weekday
4,New Years Day,263.65,0.0,90,Clouds,372,04:00:00,yes,1,weekday


Initialise for data visualisation.

In [17]:
# Put cufflinks in offline mode so the .iplot() calls below do not need a Plotly account.
cufflinks.go_offline()
# Store the plotting defaults used by the charts below (white theme).
# NOTE(review): world_readable=True presumably only matters for plots published online - confirm it is intended.
cufflinks.set_config_file(world_readable=True, theme='white')

The most common temperature fell between 270K and 280K.

In [18]:
# Histogram of the hourly temperature readings (Kelvin), 10 bins.
df8["temp"].iplot(
    kind="hist",
    bins=10,
    linecolor="black",
    xTitle="Temperature in Kelvin",
    yTitle="Count",
    title="Temperature Distribution",
)

There was little rain received in the area.

In [19]:
# Histogram of rainfall per hour (mm), 10 bins.
df8["rain_1h"].iplot(
    kind="hist",
    bins=10,
    linecolor="black",
    xTitle="Millimetres of rain in an hour",
    yTitle="Count",
    title="Rain Distribution",
)

Cloud cover was usually more than 90% or less than 10%.

In [20]:
# Histogram of cloud-cover percentage, 20 bins.
df8["clouds_all"].iplot(
    kind="hist",
    bins=20,
    linecolor="black",
    xTitle="Percentage of cloud cover",
    yTitle="Count",
    title="Cloud Cover Distribution",
)

The most common weather situation was Clouds. There were few 1-hour periods where a squall was happening. It did snow in the year 2013. I have sorted the weather distribution in descending order.

In [21]:
# Count of hourly rows per weather condition, most frequent condition first.
df8["weather_main"].iplot(
    kind="hist",
    bins=10,
    linecolor="black",
    xTitle="Weather condition",
    yTitle="Count",
    title="Weather Distribution",
    categoryorder="total descending",
)

Traffic volume was usually below 1000 vehicles per hour. There were few 1-hour periods where traffic volume exceeded 7000 vehicles per hour.

In [22]:
# Histogram of hourly traffic volume, 20 bins.
df8["traffic_volume"].iplot(
    kind="hist",
    bins=20,
    linecolor="black",
    xTitle="Vehicles per hour",
    yTitle="Count",
    title="Traffic Volume Distribution",
)

Correlation heatmap

In [23]:
# Correlate the numeric columns only. df8 also holds string-valued categorical
# columns; on pandas >= 2.0 a bare df8.corr() no longer drops those silently and
# raises instead, so select the numeric columns explicitly (works on any version).
corrs = df8.select_dtypes(include="number").corr()
# Annotated heatmap: each cell shows the pairwise correlation rounded to 2 decimals.
ff.create_annotated_heatmap(
    z=corrs.values,
    x=list(corrs.columns),
    y=list(corrs.index),
    annotation_text=corrs.round(2).values,
    showscale=True,
)

Let us look at the traffic volume on the 10 holidays. The highest traffic volume occurring on a holiday was on the State Fair holiday.

In [24]:
# One marker per hourly observation, grouped along the x-axis by holiday name.
holiday_observations = df8[df8["hol"] == "yes"]
holiday_observations.set_index("holiday")["traffic_volume"].iplot(
    mode="markers",
    opacity=0.8,
    size=8,
    symbol=1,
    xTitle="Holiday",
    yTitle="Vehicles per hour",
    title="Traffic Volume on holidays",
)

On holidays, traffic peaked at 1400 hours.

In [25]:
# Bar chart of traffic volume against hour of day, holiday rows only.
# NOTE(review): rows are plotted as-is (several per hour), not aggregated -
# confirm whether a per-hour sum was intended given the 'Total Traffic' label.
holiday_hourly = df8[df8["hol"] == "yes"]
holiday_hourly.set_index("time")["traffic_volume"].iplot(
    kind="bar",
    xTitle="Hour of Day",
    yTitle="Total Traffic",
    title="Traffic Peak on Holidays",
)

Subset data into weekend and weekday.

In [26]:
# Day-of-week codes: 0-4 are Monday-Friday, 5-6 are Saturday-Sunday.
weekday = list(range(5))
weekend = [5, 6]
# df7 holds the weekend rows, df6 the weekday rows.
is_weekend_row = df8["dow"].isin(weekend)
is_weekday_row = df8["dow"].isin(weekday)
df7 = df8.loc[is_weekend_row]
df6 = df8.loc[is_weekday_row]

On non-holiday weekends, traffic peaked at 1600 hours.

In [27]:
# Bar chart of traffic volume against hour of day for weekend rows that are not holidays.
weekend_non_holiday = df7[df7["hol"] == "no"]
weekend_non_holiday.set_index("time")["traffic_volume"].iplot(
    kind="bar",
    xTitle="Hour of Day",
    yTitle="Total Traffic",
    title="Traffic Peak on Non-holiday Weekends",
)

On non-holiday weekdays, traffic peaked at 0700 hours and at 1600 hours.

In [28]:
# Bar chart of traffic volume against hour of day for weekday rows that are not holidays.
weekday_non_holiday = df6[df6["hol"] == "no"]
weekday_non_holiday.set_index("time")["traffic_volume"].iplot(
    kind="bar",
    xTitle="Hour of Day",
    yTitle="Total Traffic",
    title="Traffic Peaks on Non-holiday Weekdays",
)

Are there certain weather conditions that affect traffic volume? Traffic volume was particularly low in a squall.

In [29]:
# Spread traffic_volume into one column per weather condition so that each
# condition gets its own box in the plot.
volume_by_weather = df8.pivot(columns="weather_main", values="traffic_volume")
volume_by_weather.iplot(
    kind="box",
    yTitle="Traffic Volume",
    title="Traffic Volume by Weather",
)