In [7]:
#Import the required packages
#Import package pandas for data analysis
import pandas as pd

# Import package numpy for numeric computing
import numpy as np

# Import package matplotlib for visualisation/plotting
import matplotlib.pyplot as plt

#For showing plots directly in the notebook run the command below
%matplotlib inline

# For saving multiple plots into a single pdf file
from matplotlib.backends.backend_pdf import PdfPages

from sklearn.model_selection import train_test_split

In [32]:
# Reading from a csv file, into a data frame
# Can set parameters to remove white space from the beginning and end of column names
# Read more about .read_csv() here: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
df = pd.read_csv('/Users/danobeirne/Documents/UCD/dublin-bus-app/DataAnalytics/historical-weather-data.csv', keep_default_na=True, delimiter=',')
# Show data frame first few rows
df.head(30)

Unnamed: 0,date,ind,rain,ind.1,temp,ind.2,wetb,dewpt,vappr,rhum,...,ind.3,wdsp,ind.4,wddir,ww,w,sun,vis,clht,clamt
0,01-jan-1990 00:00,2,0.1,0,9.0,0,8.1,7.1,10.1,88,...,2,10,2,140,50,52,0.0,7000,7,8
1,01-jan-1990 01:00,2,0.0,0,8.9,0,8.0,7.0,10.0,88,...,2,12,2,140,50,52,0.0,7000,7,8
2,01-jan-1990 02:00,2,0.0,0,8.8,0,7.9,6.9,9.9,88,...,2,14,2,150,20,52,0.0,5000,7,8
3,01-jan-1990 03:00,2,0.0,0,9.0,0,8.1,7.1,10.1,88,...,2,17,2,160,50,52,0.0,6000,7,8
4,01-jan-1990 04:00,0,0.6,0,9.0,0,8.4,7.8,10.5,92,...,2,15,2,160,60,65,0.0,9000,5,8
5,01-jan-1990 05:00,0,0.6,0,8.5,0,7.8,7.1,10.0,90,...,2,3,2,190,21,62,0.0,20000,48,7
6,01-jan-1990 06:00,0,0.0,0,7.9,0,7.2,6.4,9.6,90,...,2,1,2,150,1,65,0.0,25000,60,7
7,01-jan-1990 07:00,0,0.0,0,7.7,0,7.0,6.2,9.5,90,...,2,2,2,210,2,11,0.0,25000,50,7
8,01-jan-1990 08:00,0,0.0,0,8.2,0,6.9,5.3,8.9,82,...,2,5,2,220,2,11,0.0,25000,50,7
9,01-jan-1990 09:00,0,0.0,0,8.2,0,7.1,5.8,9.2,85,...,2,3,2,210,2,11,0.0,30000,60,7


In [33]:
df.dtypes

date      object
ind        int64
rain     float64
ind.1      int64
temp     float64
ind.2      int64
wetb     float64
dewpt    float64
vappr     object
rhum      object
msl      float64
ind.3      int64
wdsp       int64
ind.4      int64
wddir     object
ww         int64
w          int64
sun      float64
vis        int64
clht       int64
clamt      int64
dtype: object

In [34]:
#start by fixing date datatype and drop non-2018 years can be dropped
df['date'] = pd.to_datetime(df.date)
df_n = df[df['date'].dt.year == 2018]

In [35]:
df_n.head()

Unnamed: 0,date,ind,rain,ind.1,temp,ind.2,wetb,dewpt,vappr,rhum,...,ind.3,wdsp,ind.4,wddir,ww,w,sun,vis,clht,clamt
245448,2018-01-01 00:00:00,2,0.0,0,4.5,0,3.3,1.5,6.8,81,...,2,23,2,240,80,81,0.0,16000,168,7
245449,2018-01-01 01:00:00,2,0.0,0,4.4,0,3.4,1.8,7.0,84,...,2,25,2,240,80,82,0.0,20000,250,5
245450,2018-01-01 02:00:00,2,0.0,0,4.6,0,3.5,1.8,7.0,82,...,2,23,2,240,25,81,0.0,20000,999,3
245451,2018-01-01 03:00:00,2,0.0,0,4.6,0,3.6,2.1,7.1,84,...,2,24,2,240,25,81,0.0,30000,999,4
245452,2018-01-01 04:00:00,0,0.0,0,5.1,0,4.0,2.3,7.2,83,...,2,21,2,240,2,11,0.0,25000,999,4


In [36]:
#Next see what columns are unnecessary and drop them 
df_n.columns

Index(['date', 'ind', 'rain', 'ind.1', 'temp', 'ind.2', 'wetb', 'dewpt',
       'vappr', 'rhum', 'msl', 'ind.3', 'wdsp', 'ind.4', 'wddir', 'ww', 'w',
       'sun', 'vis', 'clht', 'clamt'],
      dtype='object')

In [37]:
#drop the columns that won't be of any use
df_n = df_n.drop(['ind', 'ind.1', 'ind.2', 'wetb', 'dewpt', 'msl', 'ww', 'w', 'ind.4'], axis=1)

In [38]:
df_n.columns

Index(['date', 'rain', 'temp', 'vappr', 'rhum', 'ind.3', 'wdsp', 'wddir',
       'sun', 'vis', 'clht', 'clamt'],
      dtype='object')

In [44]:
df_n = df_n.drop(['ind.3'], axis=1)

In [46]:
#name the columns more udnertandable things
df_n.rename(columns = {'rhum':'humidity', 'vappr':'pressure','clamt':'cloud_cover', 'clht':'cloud_height', 'date':'datetime', 'vis':'visibility', 'wddir':'wind_dir', 'wdsp':'wind_speed'}, inplace = True) 

In [47]:
df_n.columns

Index(['datetime', 'rain', 'temp', 'pressure', 'humidity', 'wind_speed',
       'wind_dir', 'sun', 'visibility', 'cloud_height', 'cloud_cover'],
      dtype='object')

In [50]:
df_n.head()

Unnamed: 0,datetime,rain,temp,pressure,humidity,wind_speed,wind_dir,sun,visibility,cloud_height,cloud_cover
245448,2018-01-01 00:00:00,0.0,4.5,6.8,81,23,240,0.0,16000,168,7
245449,2018-01-01 01:00:00,0.0,4.4,7.0,84,25,240,0.0,20000,250,5
245450,2018-01-01 02:00:00,0.0,4.6,7.0,82,23,240,0.0,20000,999,3
245451,2018-01-01 03:00:00,0.0,4.6,7.1,84,24,240,0.0,30000,999,4
245452,2018-01-01 04:00:00,0.0,5.1,7.2,83,21,240,0.0,25000,999,4


In [49]:
df_n.dtypes

datetime        datetime64[ns]
rain                   float64
temp                   float64
pressure                object
humidity                object
wind_speed               int64
wind_dir                object
sun                    float64
visibility               int64
cloud_height             int64
cloud_cover              int64
dtype: object

In [53]:
#fix orther dtypes
df_n['pressure'] = df_n['pressure'].astype('float64')
df_n['humidity'] = df_n['pressure'].astype('int64')

In [59]:
numeric_columns = df_n.select_dtypes(['int64', 'float64']).columns
columns = list(df_n)

In [68]:
#have a look at max and min values
for col in columns:
    print("Max ", col, ":", df_n[col].max())
    print("Min ", col, ":", df_n[col].min())
    print("Range: ", col, ":", df_n[col].max() - df_n[col].min())

Max  datetime : 2018-12-31 23:00:00
Min  datetime : 2018-01-01 00:00:00
Range:  datetime : 364 days 23:00:00
Max  rain : 7.7
Min  rain : 0.0
Range:  rain : 7.7
Max  temp : 26.3
Min  temp : -4.8
Range:  temp : 31.1
Max  pressure : 21.7
Min  pressure : 3.1
Range:  pressure : 18.599999999999998
Max  humidity : 21
Min  humidity : 3
Range:  humidity : 18
Max  wind_speed : 34
Min  wind_speed : 1
Range:  wind_speed : 33
Max  wind_dir : 360
Min  wind_dir : 10
Range:  wind_dir : 350
Max  sun : 1.0
Min  sun : 0.0
Range:  sun : 1.0
Max  visibility : 70000
Min  visibility : 100
Range:  visibility : 69900
Max  cloud_height : 999
Min  cloud_height : 1
Range:  cloud_height : 998
Max  cloud_cover : 8
Min  cloud_cover : 0
Range:  cloud_cover : 8


In [67]:
# have a look at the shape of the df
df_n.describe()

Unnamed: 0,rain,temp,pressure,humidity,wind_speed,sun,visibility,cloud_height,cloud_cover
count,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0
mean,0.079429,9.953356,10.149498,9.700457,9.988128,0.176027,28390.468037,307.813014,5.517352
std,0.361811,5.504088,3.238347,3.246926,5.256138,0.333543,14046.468858,412.192618,2.333908
min,0.0,-4.8,3.1,3.0,1.0,0.0,100.0,1.0,0.0
25%,0.0,6.0,7.7,7.0,6.0,0.0,20000.0,24.0,4.0
50%,0.0,9.6,9.7,9.0,9.0,0.0,30000.0,50.0,7.0
75%,0.0,13.9,12.2,12.0,13.0,0.1,40000.0,999.0,7.0
max,7.7,26.3,21.7,21.0,34.0,1.0,70000.0,999.0,8.0


# Data Quality Report
Overall the data is of excelklent quality - there are almost no anomolies, the ranges eem appropriate, the max and min values seem appropriate.
Going forward the only thing to watch for is the variance in the range of the different features - perhaps and standardisation of the values would be appropriate.
Deleted columns are columns that are irrelevant information

In [73]:
df_n.to_csv('/Users/danobeirne/Documents/UCD/dublin-bus-app/DataAnalytics/weather-data-cleaned.csv')