In [8]:
"""Builds a model for predicting the air temperature based on our cleaned and engineered NDBC data in our DB

"""

import sqlite3
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from statsmodels.tsa.deterministic import CalendarFourier
from sklearn.linear_model import LinearRegression



In [None]:
#Format a dataframe after importing from SQL
def format_df(df_in):
    """Return a formatted copy of a frame loaded from the NDBC training database.

    The input frame is left untouched: a deep copy is made, its ``datetime``
    column is parsed with ``pd.to_datetime`` and promoted to the index, and
    its ``date`` column is parsed with ``pd.to_datetime`` as well.

    Args:
            df_in (dataframe): Dataframe that has been uploaded from the NDBC SQL
                database; it must contain ``datetime`` and ``date`` columns.

    Returns:
            dataframe: new dataframe indexed by ``datetime`` with a parsed ``date`` column
    """
    return (
        df_in.copy(deep=True)
        # Overwriting an existing column via assign keeps its position in the frame
        .assign(datetime=lambda frame: pd.to_datetime(frame["datetime"]))
        .set_index("datetime")
        .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    )

In [10]:
#Query a list of tables from our SQL model building DB
# NOTE: the database location below is an absolute path on the author's machine;
# adjust it when running this notebook elsewhere.
con = sqlite3.connect(r"C:\Users\dakot\Desktop\DataScience\projects\weather_prediction\NDBC_model_building_database.db")
try:
    cursor = con.cursor()
    # sqlite_master is SQLite's built-in catalogue; filter it down to table names
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    print(cursor.fetchall())
finally:
    # Release the database handle even if the query fails; it was previously left open.
    con.close()

[('NDBC_historical_raw_data',), ('NDBC_historical_raw_data_St#{STATIONNUMBER}',), ('NDBC_historical_raw_data_St{STATIONNUMBER}',), ('NDBC_historical_raw_data_St46054',), ('NDBC_historical_cleaned_data',), ('NDBC_historical_data_for_training',)]


In [11]:
#Query engineered data from DB
# NOTE: the database location below is an absolute path on the author's machine;
# adjust it when running this notebook elsewhere.
conn = sqlite3.connect(r"C:\Users\dakot\Desktop\DataScience\projects\weather_prediction\NDBC_model_building_database.db")
try:
    # Read the whole training table into memory; index_col=None keeps the default integer index
    df_training = pd.read_sql_query("SELECT * FROM NDBC_historical_data_for_training", conn, index_col=None)
finally:
    # The frame is fully loaded at this point, so the connection can be closed
    # (it was previously left open for the lifetime of the kernel).
    conn.close()

In [12]:
#use our function defined above to format our data
# format_df moves the "datetime" column into the index, so running it again on the
# already-formatted frame would raise a KeyError. Guarding on the index name keeps
# this cell safe to re-run without reloading the data first.
if df_training.index.name != "datetime":
    df_training = format_df(df_training)

In [14]:
#import our custom BoostedHybrid model class from a separate file
#note: saving our custom model class as a separate file makes it easier to pickle and unpickle the model for productionalization
from BoostedHybridModel import BoostedHybridModel

In [15]:
# Display the column labels of the formatted training frame so the feature names used below can be checked
df_training.columns

Index(['WDIR', 'WSPD', 'GST', 'PRES', 'ATMP', 'WTMP', 'date', 'const', 'trend',
       'sin(1,freq=A-DEC)', 'cos(1,freq=A-DEC)', 'sin(2,freq=A-DEC)',
       'cos(2,freq=A-DEC)', 'sin(3,freq=A-DEC)', 'cos(3,freq=A-DEC)',
       'ATMP_lag_1', 'ATMP_lag_2', 'ATMP_lag_3', 'ATMP_lag_4', 'ATMP_lag_5',
       'ATMP_lag_6', 'ATMP_lag_7', 'ATMP_lag_8', 'WTMP_lag_1', 'WTMP_lag_2',
       'WTMP_lag_3', 'WTMP_lag_4', 'WTMP_lag_5', 'WTMP_lag_6', 'WTMP_lag_7',
       'WTMP_lag_8', 'WSPD_lag_1', 'WSPD_lag_2'],
      dtype='object')

In [16]:

# Our boosted hybrid model has two models: a linear model that is just our Fourier fit
# and an XGB model that predicts the residuals.
# The features for the first model are just Fourier features, which are generated automatically by our class.

# For the XGB part of the model we use all of our lag features to predict the target value
# in 1 day (one timestep): ATMP and WTMP lags 1-8 followed by WSPD lags 1-2.
X_model_2 = df_training[
    [f"ATMP_lag_{lag}" for lag in range(1, 9)]
    + [f"WTMP_lag_{lag}" for lag in range(1, 9)]
    + [f"WSPD_lag_{lag}" for lag in range(1, 3)]
]
# Air temperature is our target
y = df_training["ATMP"]

In [17]:
# Split target and features into training and test sets.
# With shuffle disabled the row order is preserved: the final 20% of rows become the test set.
X_model_2_train, X_model_2_test, y_train, y_test = train_test_split(
    X_model_2,
    y,
    test_size=0.2,
    shuffle=False,
)

In [18]:
# Create the first (linear) and second (XGB) component models, wrap them in our
# boosted hybrid model, and fit the hybrid on the training split
model_1 = LinearRegression()
model_2 = XGBRegressor()
model = BoostedHybridModel(model_1, model_2)
model.fit(X_model_2_train, y_train)

In [19]:
# Apply the fitted model to the test features; the predictions are scored against y_test in the next cell
y_pred = model.predict(X_model_2_test)

In [20]:
# Mean absolute error (MAE) of our predictions on the test split
from sklearn.metrics import mean_absolute_error

mean_absolute_error(y_true=y_test, y_pred=y_pred)

0.9662063072859188

In [21]:
#Pickle our model for use in productionalization
import pickle

# The context manager closes the file even if pickling raises, so the handle
# can no longer leak or leave a half-written file open.
with open('TemperatureModel_OneDay.p', 'wb') as modelFile:
    pickle.dump(model, modelFile)

In [22]:
#Testing our pickled model to make sure everything works well before productionalization
# NOTE: pickle.load can execute arbitrary code; only load files written by a trusted source
# (here, the file produced by the cell above).
with open('TemperatureModel_OneDay.p', 'rb') as modelFile:
    # The file is closed as soon as the model has been deserialized, even on error
    pmodel = pickle.load(modelFile)
y_predp = pmodel.predict(X_model_2_test)
# Should match the MAE of the in-memory model if the round trip preserved it
mean_absolute_error(y_test, y_predp)

0.9662063072859188

In [23]:
# We now make a second model to predict the value of our target in two time steps.
# Every lag-1 feature is left out: ATMP and WTMP lags 2-8 are used, plus WSPD lag 2.
X_model_2 = df_training[
    [f"ATMP_lag_{lag}" for lag in range(2, 9)]
    + [f"WTMP_lag_{lag}" for lag in range(2, 9)]
    + ["WSPD_lag_2"]
]

y = df_training["ATMP"]

In [24]:
# Create new training and testing data, since this model is independent of our 1 day model.
# With shuffle disabled the row order is preserved: the final 20% of rows become the test set.
X_model_2_train, X_model_2_test, y_train, y_test = train_test_split(
    X_model_2,
    y,
    test_size=0.2,
    shuffle=False,
)

In [25]:
# Follow the same procedure to build the 2-day model with our new train/test data
from sklearn.metrics import mean_absolute_error

model_1 = LinearRegression()
model_2 = XGBRegressor()
model = BoostedHybridModel(model_1, model_2)
model.fit(X_model_2_train, y_train)

# Predict on the test split and report the mean absolute error
y_pred = model.predict(X_model_2_test)
mean_absolute_error(y_true=y_test, y_pred=y_pred)

In [28]:
#Serialize our two day model
import pickle

# The context manager closes the file even if pickling raises, so the handle
# can no longer leak or leave a half-written file open.
with open('TemperatureModel_TwoDay.p', 'wb') as modelFile:
    pickle.dump(model, modelFile)

In [29]:
#Test our serialized model before deployment
# NOTE: pickle.load can execute arbitrary code; only load files written by a trusted source
# (here, the file produced by the cell above).
with open('TemperatureModel_TwoDay.p', 'rb') as modelFile:
    # The file is closed as soon as the model has been deserialized, even on error
    pmodel = pickle.load(modelFile)
y_predp = pmodel.predict(X_model_2_test)
# Should match the MAE of the in-memory two-day model if the round trip preserved it
mean_absolute_error(y_test, y_predp)

1.2087390774034952