This notebook reads and cleans the hourly climate data into a format that can be merged with the bus delay information. 

In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
directory = os.getcwd()

climate_data_path = os.path.join(directory, 'climate')

# List all files in the 'climate' folder
climate_files = os.listdir(climate_data_path)

# Initialize an empty DataFrame to concatenate the data
df_climate_raw = pd.DataFrame()

# Loop through each file in the folder
for file in climate_files:
    # Check if the file is a CSV file
    if file.endswith('.csv'):
        # Construct the full path to the CSV file
        file_path = os.path.join(climate_data_path, file)
        
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)
        
        # Concatenate the DataFrame to the combined DataFrame
        df_climate_raw = pd.concat([df_climate_raw, df], ignore_index=True)

In [7]:
df_climate_raw.head()

Unnamed: 0,x,y,LOCAL_HOUR,WEATHER_ENG_DESC,WIND_DIRECTION,WINDCHILL,PRECIP_AMOUNT,DEW_POINT_TEMP_FLAG,HUMIDEX,ID,...,UTC_YEAR,HUMIDEX_FLAG,WIND_SPEED,LOCAL_DAY,WIND_DIRECTION_FLAG,DEW_POINT_TEMP,LOCAL_MONTH,UTC_DATE,WINDCHILL_FLAG,VISIBILITY
0,-79.396111,43.6275,17,,25.0,-15.0,,,,6158359.2009.12.10.17,...,2009,,55.0,10,,-9.5,12,2009-12-10T22:00:00,,16.1
1,-79.396111,43.6275,18,,24.0,-17.0,,,,6158359.2009.12.10.18,...,2009,,59.0,10,,-10.8,12,2009-12-10T23:00:00,,16.1
2,-79.396111,43.6275,19,,25.0,-18.0,,,,6158359.2009.12.10.19,...,2009,,55.0,10,,-12.0,12,2009-12-11T00:00:00,,16.1
3,-79.396111,43.6275,20,,25.0,-18.0,,,,6158359.2009.12.10.20,...,2009,,55.0,10,,-12.0,12,2009-12-11T01:00:00,,16.1
4,-79.396111,43.6275,21,,25.0,-18.0,,,,6158359.2009.12.10.21,...,2009,,57.0,10,,-11.2,12,2009-12-11T02:00:00,,16.1


In [9]:
df_climate_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121135 entries, 0 to 121134
Data columns (total 37 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   x                       121135 non-null  float64
 1   y                       121135 non-null  float64
 2   LOCAL_HOUR              121135 non-null  int64  
 3   WEATHER_ENG_DESC        14919 non-null   object 
 4   WIND_DIRECTION          117371 non-null  float64
 5   WINDCHILL               22114 non-null   float64
 6   PRECIP_AMOUNT           82143 non-null   float64
 7   DEW_POINT_TEMP_FLAG     798 non-null     object 
 8   HUMIDEX                 18858 non-null   float64
 9   ID                      121135 non-null  object 
 10  VISIBILITY_FLAG         327 non-null     object 
 11  CLIMATE_IDENTIFIER      121135 non-null  int64  
 12  WIND_SPEED_FLAG         145 non-null     object 
 13  RELATIVE_HUMIDITY       114035 non-null  float64
 14  STATION_PRESSURE_FLA

In [3]:
columns_to_drop = ['x', 'y', 'WEATHER_ENG_DESC','DEW_POINT_TEMP_FLAG','ID','WIND_SPEED_FLAG','STATION_PRESSURE_FLAG','STATION_NAME','PRECIP_AMOUNT_FLAG','HUMIDEX_FLAG','WIND_DIRECTION_FLAG','WINDCHILL_FLAG','RELATIVE_HUMIDITY_FLAG','PROVINCE_CODE','WEATHER_FRE_DESC','CLIMATE_IDENTIFIER','VISIBILITY_FLAG']

In [4]:
df_climate_raw = df_climate_raw.drop(columns=columns_to_drop)

In [31]:
df_climate_raw.head()

Unnamed: 0,LOCAL_HOUR,WIND_DIRECTION,WINDCHILL,PRECIP_AMOUNT,HUMIDEX,RELATIVE_HUMIDITY,LOCAL_YEAR,TEMP_FLAG,UTC_MONTH,UTC_DAY,LOCAL_DATE,STATION_PRESSURE,TEMP,UTC_YEAR,WIND_SPEED,LOCAL_DAY,DEW_POINT_TEMP,LOCAL_MONTH,UTC_DATE,VISIBILITY
0,17,25.0,-15.0,,,71.0,2009,,12,10,2009-12-10 17:00:00,99.57,-5.0,2009,55.0,10,-9.5,12,2009-12-10T22:00:00,16.1
1,18,24.0,-17.0,,,69.0,2009,,12,10,2009-12-10 18:00:00,99.69,-6.0,2009,59.0,10,-10.8,12,2009-12-10T23:00:00,16.1
2,19,25.0,-18.0,,,68.0,2009,,12,11,2009-12-10 19:00:00,99.79,-7.0,2009,55.0,10,-12.0,12,2009-12-11T00:00:00,16.1
3,20,25.0,-18.0,,,68.0,2009,,12,11,2009-12-10 20:00:00,99.88,-7.0,2009,55.0,10,-12.0,12,2009-12-11T01:00:00,16.1
4,21,25.0,-18.0,,,72.0,2009,,12,11,2009-12-10 21:00:00,99.89,-7.0,2009,57.0,10,-11.2,12,2009-12-11T02:00:00,16.1


In [32]:
df_climate_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121135 entries, 0 to 121134
Data columns (total 20 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   LOCAL_HOUR         121135 non-null  int64  
 1   WIND_DIRECTION     117371 non-null  float64
 2   WINDCHILL          22114 non-null   float64
 3   PRECIP_AMOUNT      82143 non-null   float64
 4   HUMIDEX            18858 non-null   float64
 5   RELATIVE_HUMIDITY  114035 non-null  float64
 6   LOCAL_YEAR         121135 non-null  int64  
 7   TEMP_FLAG          422 non-null     object 
 8   UTC_MONTH          121135 non-null  int64  
 9   UTC_DAY            121135 non-null  int64  
 10  LOCAL_DATE         121135 non-null  object 
 11  STATION_PRESSURE   120692 non-null  float64
 12  TEMP               120712 non-null  float64
 13  UTC_YEAR           121135 non-null  int64  
 14  WIND_SPEED         120990 non-null  float64
 15  LOCAL_DAY          121135 non-null  int64  
 16  DE

In [25]:
df_climate_raw['LOCAL_DATE']

0         2009-12-10 17:00:00
1         2009-12-10 18:00:00
2         2009-12-10 19:00:00
3         2009-12-10 20:00:00
4         2009-12-10 21:00:00
                 ...         
121130    2020-03-25 02:00:00
121131    2020-03-25 03:00:00
121132    2020-03-25 04:00:00
121133    2020-03-25 05:00:00
121134    2020-03-25 06:00:00
Name: LOCAL_DATE, Length: 121135, dtype: object

In [7]:
def classify_precipitation(row):
    conditions = [
        (pd.isna(row['PRECIP_AMOUNT']) and pd.isna(row['TEMP'])),
        (pd.isna(row['PRECIP_AMOUNT'])),
        (row['TEMP'] > 0),
        (row['TEMP'] <= 0)
    ]
    values = [None, 'dry', 'rain', 'snow']
    return np.select(conditions, values, default=None)

In [8]:
df_climate_raw['weather'] = df_climate_raw.apply(classify_precipitation, axis=1)

In [10]:
print(len(df_climate_raw[df_climate_raw['weather']== None]))
print(len(df_climate_raw[df_climate_raw['weather']== 'dry']))
print(len(df_climate_raw[df_climate_raw['weather']== 'rain']))
print(len(df_climate_raw[df_climate_raw['weather']== 'snow']))

0
38906
67786
14020


In [11]:
df_climate_raw.to_csv('climate_data_raw.csv')