In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the processed weather observations from the zipped CSV; pandas infers
# the zip compression from the file extension.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# previously saved index -- confirm whether it is meant to be carried along.
df = pd.read_csv(
    filepath_or_buffer="../../csv/v2/010_weather_data_processed.csv.zip",
)
df.head()

Unnamed: 0,weather,visibility,collectedAt,temperature,FeelsLike,humidity,windSpeed,windDegree,cloudsAll,rain1h,snow1h
0,800,10000,2022-07-14 12:45:00,18.69,18.68,79,2.57,340,0,0.0,0.0
1,800,10000,2022-07-14 12:51:44,19.16,19.17,78,2.57,340,0,0.0,0.0
2,800,10000,2022-07-14 12:55:27,19.16,19.17,78,2.57,340,0,0.0,0.0
3,800,10000,2022-07-14 13:00:00,19.38,19.36,76,2.57,340,0,0.0,0.0
4,800,10000,2022-07-14 13:05:00,19.42,19.4,76,2.57,340,0,0.0,0.0


# Process date and time

In [3]:
# Turn the "collectedAt" column into timezone-aware UTC timestamps so the
# .dt accessor can be used on it in later cells.
df["collectedAt"] = pd.to_datetime(
    df.collectedAt,
    utc=True,
)
df.head()

Unnamed: 0,weather,visibility,collectedAt,temperature,FeelsLike,humidity,windSpeed,windDegree,cloudsAll,rain1h,snow1h
0,800,10000,2022-07-14 12:45:00+00:00,18.69,18.68,79,2.57,340,0,0.0,0.0
1,800,10000,2022-07-14 12:51:44+00:00,19.16,19.17,78,2.57,340,0,0.0,0.0
2,800,10000,2022-07-14 12:55:27+00:00,19.16,19.17,78,2.57,340,0,0.0,0.0
3,800,10000,2022-07-14 13:00:00+00:00,19.38,19.36,76,2.57,340,0,0.0,0.0
4,800,10000,2022-07-14 13:05:00+00:00,19.42,19.4,76,2.57,340,0,0.0,0.0


## Extract month, day, day of week, hour and minute from "collectedAt"

In [4]:
# Derive calendar features from the parsed "collectedAt" timestamps.
collected_at = df["collectedAt"].dt

df["month"] = collected_at.month_name()           # month as a name, e.g. "July"
df["day"] = collected_at.day                       # day of the month
df["day_of_week"] = collected_at.day_of_week       # Monday=0 ... Sunday=6
df["day_of_week_name"] = collected_at.day_name()   # weekday as a name, e.g. "Thursday"
df["hour"] = collected_at.hour
df["minute"] = collected_at.minute
df.head()

Unnamed: 0,weather,visibility,collectedAt,temperature,FeelsLike,humidity,windSpeed,windDegree,cloudsAll,rain1h,snow1h,month,day,day_of_week,day_of_week_name,hour,minute
0,800,10000,2022-07-14 12:45:00+00:00,18.69,18.68,79,2.57,340,0,0.0,0.0,July,14,3,Thursday,12,45
1,800,10000,2022-07-14 12:51:44+00:00,19.16,19.17,78,2.57,340,0,0.0,0.0,July,14,3,Thursday,12,51
2,800,10000,2022-07-14 12:55:27+00:00,19.16,19.17,78,2.57,340,0,0.0,0.0,July,14,3,Thursday,12,55
3,800,10000,2022-07-14 13:00:00+00:00,19.38,19.36,76,2.57,340,0,0.0,0.0,July,14,3,Thursday,13,0
4,800,10000,2022-07-14 13:05:00+00:00,19.42,19.4,76,2.57,340,0,0.0,0.0,July,14,3,Thursday,13,5


# Group weather categories and one-hot encode them
- Map 800 to `clear`
- Map 801-804 to `cloudy`
- Map 7xx to `atmosphere` (conditions that may reduce visibility)
- Map 6xx to `snowy`
- Map 5xx to `rainy`
- Map 3xx (drizzle) to `rainy` as well, since it is also a wet condition
- Map 2xx (thunderstorm) to `rainy` as well, since it is also a wet condition

In [5]:
# Weather codes treated as "wet" conditions: 2xx (thunderstorm),
# 3xx (drizzle) and 5xx (rain).
rainy_condition_codes = np.concatenate([np.arange(200, 300), np.arange(300, 400), np.arange(500, 600)])

# Build the 0/1 indicator columns with vectorized comparisons rather than a
# row-wise DataFrame.apply: the resulting values are the same, but each column
# is computed in a single pass instead of one Python call per row.
# Series.between is inclusive on both ends, matching `low <= code <= high`.
weather_code = df["weather"]
df["cloudy"] = weather_code.between(801, 804).astype("int64")
df["clear"] = (weather_code == 800).astype("int64")
df["atmosphere"] = weather_code.between(700, 799).astype("int64")
df["snowy"] = weather_code.between(600, 699).astype("int64")
df["rainy"] = weather_code.isin(rainy_condition_codes).astype("int64")

# Drop the raw code column now that the indicator columns have been derived.
# Reassigning (instead of inplace=True) keeps the step explicit.
df = df.drop(columns="weather")

df.head()

Unnamed: 0,visibility,collectedAt,temperature,FeelsLike,humidity,windSpeed,windDegree,cloudsAll,rain1h,snow1h,...,day,day_of_week,day_of_week_name,hour,minute,cloudy,clear,atmosphere,snowy,rainy
0,10000,2022-07-14 12:45:00+00:00,18.69,18.68,79,2.57,340,0,0.0,0.0,...,14,3,Thursday,12,45,0,1,0,0,0
1,10000,2022-07-14 12:51:44+00:00,19.16,19.17,78,2.57,340,0,0.0,0.0,...,14,3,Thursday,12,51,0,1,0,0,0
2,10000,2022-07-14 12:55:27+00:00,19.16,19.17,78,2.57,340,0,0.0,0.0,...,14,3,Thursday,12,55,0,1,0,0,0
3,10000,2022-07-14 13:00:00+00:00,19.38,19.36,76,2.57,340,0,0.0,0.0,...,14,3,Thursday,13,0,0,1,0,0,0
4,10000,2022-07-14 13:05:00+00:00,19.42,19.4,76,2.57,340,0,0.0,0.0,...,14,3,Thursday,13,5,0,1,0,0,0


# Process Visibility
- Current scale is in metres. Changing to kilometres

In [6]:
# Convert visibility from metres to kilometres using vectorized column
# arithmetic; a row-wise apply is unnecessary for a scalar division.
# NOTE: the column is overwritten, so re-running this cell divides again.
df["visibility"] = df["visibility"] / 1000
df.head()

Unnamed: 0,visibility,collectedAt,temperature,FeelsLike,humidity,windSpeed,windDegree,cloudsAll,rain1h,snow1h,...,day,day_of_week,day_of_week_name,hour,minute,cloudy,clear,atmosphere,snowy,rainy
0,10.0,2022-07-14 12:45:00+00:00,18.69,18.68,79,2.57,340,0,0.0,0.0,...,14,3,Thursday,12,45,0,1,0,0,0
1,10.0,2022-07-14 12:51:44+00:00,19.16,19.17,78,2.57,340,0,0.0,0.0,...,14,3,Thursday,12,51,0,1,0,0,0
2,10.0,2022-07-14 12:55:27+00:00,19.16,19.17,78,2.57,340,0,0.0,0.0,...,14,3,Thursday,12,55,0,1,0,0,0
3,10.0,2022-07-14 13:00:00+00:00,19.38,19.36,76,2.57,340,0,0.0,0.0,...,14,3,Thursday,13,0,0,1,0,0,0
4,10.0,2022-07-14 13:05:00+00:00,19.42,19.4,76,2.57,340,0,0.0,0.0,...,14,3,Thursday,13,5,0,1,0,0,0


In [7]:
# Count how often each distinct visibility value occurs, most frequent first.
df["visibility"].value_counts()

10.000    66723
9.656      1164
8.047       954
4.828       874
6.437       492
          ...  
6.794         1
9.305         1
8.540         1
3.508         1
3.476         1
Name: visibility, Length: 196, dtype: int64

# Process Rain:
- Current scale is mm/h (millimetres per hour). Changing to cm/h (centimetres per hour)

In [8]:
# Convert hourly rainfall from mm/h to cm/h. Since 1 cm = 10 mm, the value
# must be divided by 10 (multiplying by 10 would scale it the wrong way).
# NOTE: the column is overwritten, so re-running this cell divides again.
df["rain1h"] = df["rain1h"] / 10
df.sample(5)

Unnamed: 0,visibility,collectedAt,temperature,FeelsLike,humidity,windSpeed,windDegree,cloudsAll,rain1h,snow1h,...,day,day_of_week,day_of_week_name,hour,minute,cloudy,clear,atmosphere,snowy,rainy
8802,10.0,2022-08-14 02:25:00+00:00,19.87,19.53,62,3.6,70,20,0.0,0.0,...,14,6,Sunday,2,25,1,0,0,0,0
31812,4.828,2022-11-02 00:05:00+00:00,13.55,13.13,83,3.09,270,0,0.0,0.0,...,2,2,Wednesday,0,5,0,0,1,0,0
28691,10.0,2022-10-22 04:00:00+00:00,12.89,11.78,59,2.57,0,0,0.0,0.0,...,22,5,Saturday,4,0,0,1,0,0,0
59510,10.0,2023-02-06 04:45:00+00:00,2.72,-1.02,80,4.12,290,100,0.0,0.0,...,6,0,Monday,4,45,1,0,0,0,0
72166,10.0,2023-03-22 03:50:00+00:00,3.77,-1.33,68,7.72,50,75,0.0,0.0,...,22,2,Wednesday,3,50,1,0,0,0,0


# Process Snow
- Current scale is mm/h (millimetres per hour). Changing to cm/h (centimetres per hour)

In [9]:
# Convert hourly snowfall from mm/h to cm/h. Since 1 cm = 10 mm, the value
# must be divided by 10 (multiplying by 10 would scale it the wrong way).
# NOTE: the column is overwritten, so re-running this cell divides again.
df["snow1h"] = df["snow1h"] / 10
df.sample(5)

Unnamed: 0,visibility,collectedAt,temperature,FeelsLike,humidity,windSpeed,windDegree,cloudsAll,rain1h,snow1h,...,day,day_of_week,day_of_week_name,hour,minute,cloudy,clear,atmosphere,snowy,rainy
364,10.0,2022-07-15 19:10:00+00:00,26.36,26.36,46,2.24,181,89,0.0,0.0,...,15,4,Friday,19,10,1,0,0,0,0
54056,10.0,2023-01-18 06:05:00+00:00,3.66,-0.41,94,5.14,240,100,0.0,0.0,...,18,2,Wednesday,6,5,1,0,0,0,0
57861,10.0,2023-01-31 11:20:00+00:00,-12.09,-18.66,89,3.54,301,66,0.0,0.0,...,31,1,Tuesday,11,20,1,0,0,0,0
56867,10.0,2023-01-28 00:30:00+00:00,-1.12,-7.87,74,8.23,190,75,0.0,0.0,...,28,5,Saturday,0,30,1,0,0,0,0
32983,10.0,2022-11-06 01:40:00+00:00,20.25,20.11,68,6.17,140,75,0.0,0.0,...,6,6,Sunday,1,40,1,0,0,0,0


In [11]:
# Write the engineered frame to a zip-compressed CSV, leaving out the
# DataFrame index.
df.to_csv(
    path_or_buf="./data/weather_data.csv.zip",
    compression="zip",
    index=False,
)