In [4]:
# Package usage

#!pip install scikit-learn
#!pip install pandas

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn import preprocessing
from sklearn.multioutput import MultiOutputRegressor
import pandas as pd 

In [1]:
# Import datasets
mythenquai = pd.read_csv('./input/messwerte_mythenquai_2007-2021.csv')
tiefenbrunnen = pd.read_csv('./input/messwerte_tiefenbrunnen_2007-2021.csv')

# Setting name of each dataset to keep track of the source
mythenquai['station'] = 'mythenquai'
tiefenbrunnen['station'] = 'tiefenbrunnen'

# Concatenating datasets from both stations
data = pd.concat([mythenquai, tiefenbrunnen])

# Reference metrics from datapoint 10 minutes before
data['wind_speed_avg_10min_before'] = data['wind_speed_avg_10min'].shift(1)
data['wind_direction_10min_before'] = data['wind_direction'].shift(1)
data['air_temperature_10min_before'] = data['wind_direction'].shift(1)

# Put Day, Month and Year into separate columns
data['timestamp_utc']= pd.to_datetime(data['timestamp_utc'])
data['day'] = data.timestamp_utc.dt.day
data['month'] = data.timestamp_utc.dt.month
data['year'] = data.timestamp_utc.dt.year

# Drop timestamp_utc, global_radiation, water_level and na values
data = data.filter(['station', 'air_temperature_10min_before', 'wind_speed_avg_10min_before', 'wind_direction_10min_before', 'wind_speed_avg_10min', 'wind_direction', 'day', 'month', 'year'], axis=1)
data = data.dropna(how='any')

# Display current dataset
data

Unnamed: 0,station,air_temperature_10min_before,wind_speed_avg_10min_before,wind_direction_10min_before,wind_speed_avg_10min,wind_direction,day,month,year
1,mythenquai,321.0,0.7,321.0,0.8,346,22,4,2007
2,mythenquai,346.0,0.8,346.0,0.2,4,22,4,2007
3,mythenquai,4.0,0.2,4.0,0.1,235,22,4,2007
4,mythenquai,235.0,0.1,235.0,0.3,178,22,4,2007
5,mythenquai,178.0,0.3,178.0,0.5,153,22,4,2007
...,...,...,...,...,...,...,...,...,...
757532,tiefenbrunnen,345.0,0.4,345.0,0.0,0,31,12,2021
757533,tiefenbrunnen,0.0,0.0,0.0,0.0,0,31,12,2021
757534,tiefenbrunnen,0.0,0.0,0.0,0.0,0,31,12,2021
757535,tiefenbrunnen,0.0,0.0,0.0,0.0,0,31,12,2021


In [2]:
# Set LabelEncoder to process station name inside the model
le = preprocessing.LabelEncoder()
data['station'] = le.fit_transform(data['station'])

# Set features and goal variable(s)
Y = data[['wind_speed_avg_10min','wind_direction']]
X = data.drop(columns = ['wind_speed_avg_10min','wind_direction'])

# Set train/test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=9)

# Defining the KNN regressor
model = MultiOutputRegressor(KNeighborsRegressor(n_neighbors=35,n_jobs=-1))

# Training the model
model.fit(X_train, Y_train)

In [3]:
# Accuracy of trained model (n_correct_predictions/n_all_predictions)
test_acc_model = round(model.fit(X_train,Y_train).score(X_test, Y_test)* 100, 2)
train_acc_model = round(model.fit(X_train, Y_train).score(X_train, Y_train)* 100, 2)

model_accuracy = pd.DataFrame({
    'Model': ['KNN MultiOutputRegressor'],
    'Train Score': [train_acc_model],
    'Test Score': [test_acc_model]
})

model_accuracy.sort_values(by='Test Score', ascending=False)

Unnamed: 0,Model,Train Score,Test Score
0,KNN,68.3,66.29


In [20]:
sample_input = pd.DataFrame({
    'station': [1],
    'air_temperature_10min_before': [2],
    'wind_speed_avg_10min_before': [12],
    'wind_direction_10min_before': [200],
    'day': [10],
    'month': [12],
    'year': [2010]
})

Y_prediction = model.predict(sample_input)

Y_prediction

array([[  4.96571429, 111.97142857]])

In [None]:
import pickle

# Persist model
filename = 'knn_weather_model.pkl'
pickle.dump(model, open(filename, 'wb'))