This notebook cleans Meteostat data for use in machine learning models.
The data is cleaned in accordance with the Python Meteostat Data Quality Plan (which can be found in the data quality reports folder).
A pipeline will be created for adding predicted weather from Meteostat to the database.


In [19]:
from meteostat import Point, Hourly
import pandas as pd
import matplotlib.pyplot as plt

In [20]:
# Central Park station (USW00094728), given as latitude / longitude.
point = Point(40.7789, -73.9692)

# Roughly 3.5 years of hourly data: 2021-01-01 through 2024-06-13.
# (The row count printed below, 30217, matches one row per hour over that span.)
start = pd.to_datetime('2021-01-01')
end = pd.to_datetime('2024-06-13')

# Build the hourly query for this location and period, then fetch it
# as a DataFrame indexed by observation time.
data = Hourly(point, start, end)
df = data.fetch()
print(df.shape)

(30217, 11)




In [21]:
# Preview the first rows of the freshly fetched frame to see which columns are populated.
df.head()

Unnamed: 0_level_0,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,tsun,coco
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-01-01 00:00:00,4.0,-4.0,56.0,,,330.0,20.5,,1026.3,,4.0
2021-01-01 01:00:00,3.0,-4.0,60.0,0.0,,0.0,0.0,,1027.7,,4.0
2021-01-01 02:00:00,3.0,-4.0,60.0,0.0,,340.0,5.4,,1028.1,,4.0
2021-01-01 03:00:00,2.0,-3.9,65.0,0.0,,270.0,9.4,,1029.2,,3.0
2021-01-01 04:00:00,2.0,-5.0,60.0,0.0,,340.0,5.4,,1029.9,,3.0


In [22]:
# Discard the snow depth, peak gust and sunshine columns
# (all three are empty in the preview rows shown earlier).
sparse_columns = ['snow', 'wpgt', 'tsun']
df = df.drop(columns=sparse_columns)


In [23]:
# Treat a missing precipitation reading as "no precipitation" (0).
df['prcp'] = df['prcp'].fillna(0)

In [24]:
# Replace missing weather condition codes with 2.
# NOTE(review): 2 is listed as "Fair" in Meteostat's condition-code table;
# confirm this default against the data quality plan.
df['coco'] = df['coco'].fillna(2)

In [25]:
# Wind speed and wind direction are not kept in the cleaned dataset.
wind_columns = ['wspd', 'wdir']
df = df.drop(columns=wind_columns)

In [26]:
# Move the `time` index into an ordinary column; rows get a fresh 0..n-1 integer index.
# Caution: this cell is not idempotent. Re-running it without re-running the cells
# above would reset the integer index too, adding a spurious `index` column.
df = df.reset_index()
# Display the cleaned frame (pandas truncates the middle rows in the rendered output).
df

Unnamed: 0,time,temp,dwpt,rhum,prcp,pres,coco
0,2021-01-01 00:00:00,4.0,-4.0,56.0,0.0,1026.3,4.0
1,2021-01-01 01:00:00,3.0,-4.0,60.0,0.0,1027.7,4.0
2,2021-01-01 02:00:00,3.0,-4.0,60.0,0.0,1028.1,4.0
3,2021-01-01 03:00:00,2.0,-3.9,65.0,0.0,1029.2,3.0
4,2021-01-01 04:00:00,2.0,-5.0,60.0,0.0,1029.9,3.0
...,...,...,...,...,...,...,...
30212,2024-06-12 20:00:00,25.6,10.6,39.0,0.0,1015.0,3.0
30213,2024-06-12 21:00:00,24.6,11.2,43.0,0.0,1014.9,3.0
30214,2024-06-12 22:00:00,23.1,14.4,58.0,0.0,1015.0,3.0
30215,2024-06-12 23:00:00,21.5,15.6,69.0,0.0,1015.1,3.0


In [27]:
# Write the cleaned frame to meteostat_weather.csv (path is relative to the working
# directory); index=False keeps the integer row index out of the file.
df.to_csv("meteostat_weather.csv", index=False)