In [9]:
import sqlite3

import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

from xgboost import XGBRegressor



In [10]:
#Format a dataframe after importing from SQL
def format_df(df_in):
    """Return a formatted copy of a frame loaded from SQL.

    The 'datetime' and 'date' columns are parsed with ``pd.to_datetime`` and
    'datetime' is then moved into the index. All work happens on a deep copy,
    so ``df_in`` itself is left unmodified.

    Raises ``KeyError`` if either column is missing.
    """
    formatted = df_in.copy(deep=True)
    for column in ("datetime", "date"):
        formatted[column] = pd.to_datetime(formatted[column])
    return formatted.set_index("datetime")

In [11]:
# List the tables present in the model-building database.
con = sqlite3.connect(r"C:\Users\dakot\Desktop\DataScience\projects\weather_prediction\NDBC_model_building_database.db")
try:
    cursor = con.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    print(cursor.fetchall())
finally:
    # This connection is only used for the listing above, so release the
    # database file handle instead of leaving it open for the whole session.
    con.close()

[('NDBC_historical_raw_data',), ('NDBC_historical_raw_data_St#{STATIONNUMBER}',), ('NDBC_historical_raw_data_St{STATIONNUMBER}',), ('NDBC_historical_raw_data_St46054',), ('NDBC_historical_cleaned_data',), ('NDBC_historical_data_for_training',)]


In [13]:
#Query the training table from the DB
conn = sqlite3.connect(r"C:\Users\dakot\Desktop\DataScience\projects\weather_prediction\NDBC_model_building_database.db")
try:
    df_training = pd.read_sql_query("SELECT * FROM NDBC_historical_data_for_training", conn, index_col=None)
finally:
    # The whole table is now in memory; close the connection so the file
    # handle is not leaked.
    conn.close()

In [14]:
# Parse the timestamp columns and index the training frame by 'datetime'.
# NOTE: this rebinds df_training, and format_df moves 'datetime' into the
# index, so re-running this cell without re-running the query cell raises
# KeyError.
df_training = df_training.pipe(format_df)

In [36]:
class BoostedHybridModel:
    """Two-stage regressor: ``model_1`` fits the target, ``model_2`` fits the residuals.

    ``model_1`` is trained on ``y`` directly and ``model_2`` is trained on
    ``y - model_1.predict(X_model_1)``. A prediction is the sum of the two
    models' predictions.

    Both models only need ``fit(X, y)`` and ``predict(X)`` methods.

    Attributes set by ``fit`` (all ``None`` until then):
        y_column: name of the target Series, reused to name predictions.
        y_fit: in-sample predictions of ``model_1`` as a Series.
        y_residuals: ``y - y_fit``, the target ``model_2`` was trained on.
    """

    def __init__(self, model_1, model_2):
        self.model_1 = model_1
        self.model_2 = model_2
        self.y_column = None
        self.y_fit = None
        self.y_residuals = None

    def fit(self, X_model_1, X_model_2, y):
        """Fit ``model_1`` on ``y``, then ``model_2`` on ``model_1``'s residuals.

        Parameters:
            X_model_1: features for ``model_1``; must expose ``.index``, which
                is used to index the fitted values.
            X_model_2: features for ``model_2``.
            y: target; must expose ``.name`` (e.g. a pandas Series).
        """
        self.model_1.fit(X_model_1, y)
        y_fit = pd.Series(self.model_1.predict(X_model_1), index=X_model_1.index, name=y.name)
        y_residuals = y - y_fit
        self.model_2.fit(X_model_2, y_residuals)

        # Store under the same attribute that __init__ declares and predict()
        # reads; the previous 'y_columns' spelling left predictions unnamed.
        self.y_column = y.name
        self.y_fit = y_fit
        self.y_residuals = y_residuals

    def predict(self, X_model_1, X_model_2):
        """Return ``model_1`` + ``model_2`` predictions as a Series.

        The result is indexed like ``X_model_1`` and named after the target
        passed to ``fit`` (``None`` if ``fit`` has not been called).
        """
        y_predict = pd.Series(self.model_1.predict(X_model_1), index=X_model_1.index, name=self.y_column)
        y_predict += self.model_2.predict(X_model_2)
        return y_predict

In [16]:
# Display the column names of the training frame; the feature subsets
# selected further down are picked from these by name.
df_training.columns

Index(['WDIR', 'WSPD', 'GST', 'PRES', 'ATMP', 'WTMP', 'date', 'const', 'trend',
       'sin(1,freq=A-DEC)', 'cos(1,freq=A-DEC)', 'sin(2,freq=A-DEC)',
       'cos(2,freq=A-DEC)', 'sin(3,freq=A-DEC)', 'cos(3,freq=A-DEC)',
       'ATMP_lag_1', 'ATMP_lag_2', 'ATMP_lag_3', 'ATMP_lag_4', 'ATMP_lag_5',
       'ATMP_lag_6', 'ATMP_lag_7', 'ATMP_lag_8', 'WTMP_lag_1', 'WTMP_lag_2',
       'WTMP_lag_3', 'WTMP_lag_4', 'WTMP_lag_5', 'WTMP_lag_6', 'WTMP_lag_7',
       'WTMP_lag_8', 'WSPD_lag_1', 'WSPD_lag_2'],
      dtype='object')

In [40]:
# Columns fed to the first-stage model: constant, trend and the sin/cos
# (freq=A-DEC) columns already present in df_training.
MODEL_1_FEATURES = [
    'const', 'trend',
    'sin(1,freq=A-DEC)', 'cos(1,freq=A-DEC)',
    'sin(2,freq=A-DEC)', 'cos(2,freq=A-DEC)',
    'sin(3,freq=A-DEC)', 'cos(3,freq=A-DEC)',
]

# Columns fed to the second-stage model: lagged ATMP / WTMP / WSPD values.
# The *_lag_1 columns are left out. An earlier selection that included them
# was immediately overwritten by this one, so it never took effect and has
# been removed.
# NOTE(review): presumably lag 1 is excluded on purpose - confirm.
MODEL_2_FEATURES = [
    'ATMP_lag_2', 'ATMP_lag_3', 'ATMP_lag_4', 'ATMP_lag_5',
    'ATMP_lag_6', 'ATMP_lag_7', 'ATMP_lag_8',
    'WTMP_lag_2', 'WTMP_lag_3', 'WTMP_lag_4', 'WTMP_lag_5',
    'WTMP_lag_6', 'WTMP_lag_7', 'WTMP_lag_8',
    'WSPD_lag_2',
]

X_model_1 = df_training[MODEL_1_FEATURES]
X_model_2 = df_training[MODEL_2_FEATURES]

# Regression target.
y = df_training['ATMP']

In [41]:
# Hold out 20% of the rows for testing. shuffle=False keeps the rows in their
# existing order, so both feature sets and the target are cut at the same
# position (assuming df_training is time-ordered, the test set is the most
# recent part).
(
    X_model_1_train, X_model_1_test,
    X_model_2_train, X_model_2_test,
    y_train, y_test,
) = train_test_split(
    X_model_1, X_model_2, y,
    test_size=.2,
    shuffle=False,
)

In [42]:
# First stage: linear regression; second stage: gradient-boosted trees
# trained on the first stage's residuals (see BoostedHybridModel.fit).
model_1, model_2 = LinearRegression(), XGBRegressor()
model = BoostedHybridModel(model_1=model_1, model_2=model_2)

model.fit(X_model_1=X_model_1_train, X_model_2=X_model_2_train, y=y_train)

In [43]:
# Predict the held-out rows with the fitted hybrid model.
# (An earlier experiment clipped the predictions at 0.0; it remains disabled.)
y_pred = model.predict(X_model_1=X_model_1_test, X_model_2=X_model_2_test)

In [44]:
# Mean absolute error of the hybrid model on the held-out rows.
# (mean_absolute_error is imported in the notebook's import cell.)
mean_absolute_error(y_true=y_test, y_pred=y_pred)

1.2723344316093381