In [1]:
import os
import pandas as pd
import numpy as np

cwd = os.getcwd()
data_path = os.path.join(cwd, "appml-assignment1-dataset.pkl")
data = pd.read_pickle(data_path)
X = data["X"]
y = data["y"]

<font size = "5">**Model 1: Linear Regression**</font>

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin

class myTransform(BaseEstimator, TransformerMixin):
    def __init__(self, strategy='linear'):
        self.strategy = strategy
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_sorted = X.sort_values(by=['date'])
        X_interpolated = X_sorted.interpolate(method=self.strategy, limit_direction='both')
        return X_interpolated.drop('date', axis=1)

In [19]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
pipeline_1 = Pipeline([
        ('my_transform', myTransform()),
        ('std_scaler', StandardScaler()),
    ])

In [5]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
X_train_prepared_1 = pipeline_1.fit_transform(X_train)
X_test_prepared_1 = pipeline_1.transform(X_test)

In [6]:
model_1 = lin_reg.fit(X_train_prepared_1, y_train)

In [7]:
y_predict = model_1.predict(X_test_prepared_1)

In [8]:
from sklearn.metrics import mean_squared_error

mse_1 = mean_squared_error(y_predict, y_test)
print(mse_1)

0.017578429015134883


In [20]:
class myTransform_2(BaseEstimator, TransformerMixin):
    def __init__(self, strategy='linear'):
        self.strategy = strategy
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_sorted = X.sort_values(by=['date'])
        X_interpolated = X_sorted.interpolate(method=self.strategy, limit_direction='both')
        return X_interpolated["CAD-high"].to_numpy().reshape(-1,1)
    
pipeline_test = Pipeline([
        ('my_transform', myTransform_2()),
        ('std_scaler', StandardScaler()),
    ])

# X_sorted = X_train.sort_values(by=['date'])
# X_interpolated = X_sorted.interpolate(method='linear', limit_direction='both')
# X_test = X_interpolated["CAD-high"]
# type(X_test)

X_train_prepared_test = pipeline_test.fit_transform(X_train)
X_test_prepared_test = pipeline_test.transform(X_test)



<font size = "5">**Model 2: Stochastic Gradient Descent**</font>

In [9]:
pipeline_2 = Pipeline([
        ('my_transform', myTransform()),
        ('std_scaler', StandardScaler()),
    ])

In [10]:
X_train_prepared_2 = pipeline_2.fit_transform(X_train)
X_test_prepared_2 = pipeline_2.transform(X_test)

In [11]:
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
from copy import deepcopy

sgd_reg = SGDRegressor(max_iter=1, tol=-np.infty, warm_start=True,
    penalty=None, learning_rate="constant", eta0=0.001)

minimum_val_error = float("inf")
best_epoch = None
model_2 = None
for epoch in range(1000):
    sgd_reg.fit(X_train_prepared_2, y_train.ravel())
    y_test_predict = sgd_reg.predict(X_test_prepared_2)
    val_error = mean_squared_error(y_test, y_test_predict)
    if val_error < minimum_val_error:
        minimum_val_error = val_error
        best_epoch = epoch
        model_2 = deepcopy(sgd_reg)
        

In [12]:
mse_2 = mean_squared_error(model_2.predict(X_test_prepared_2), y_test)
print(mse_2)

0.017623795185392567


<font size = "5">**Save models and pipelines into pickle format**</font>

In [13]:
import joblib

file_pipeline_1 = open('pipeline1.pkl', 'wb') 
joblib.dump(pipeline_1, file_pipeline_1)
file_pipeline_1.close()

file_model_1 = open('model1.pkl', 'wb')
joblib.dump(model_1, file_model_1)
file_model_1.close()

file_pipeline_2 = open('pipeline2.pkl', 'wb')
joblib.dump(pipeline_2, file_pipeline_2)
file_pipeline_2.close()

file_model_2 = open('model2.pkl', 'wb')
joblib.dump(model_2, file_model_2)
file_model_2.close()

<font size = "5">**Load models and pipelines from pickled files**</font>

In [14]:
# clear all variables
%reset -f

In [15]:
import os
import pandas as pd
import numpy as np
import joblib
from code1 import *
from code2 import *
from sklearn.metrics import mean_squared_error

file_model = open('model1.pkl', 'rb')
model_1 = joblib.load(file_model)
file_model.close()

file_pipeline = open('pipeline1.pkl', 'rb')
pipeline_1 = joblib.load(file_pipeline)
file_pipeline.close

file_model = open('model2.pkl', 'rb')
model_2 = joblib.load(file_model)
file_model.close()

file_pipeline = open('pipeline2.pkl', 'rb')
pipeline_2 = joblib.load(file_pipeline)
file_pipeline.close

<function BufferedReader.close>

In [16]:
# Load data for testing
cwd = os.getcwd()
data_path = os.path.join(cwd, "appml-assignment1-dataset.pkl")
data = pd.read_pickle(data_path)
X = data["X"]
y = data["y"]

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [17]:
X_test_1 = pipeline_1.transform(X_test)
X_test_2 = pipeline_2.transform(X_test)

from sklearn.metrics import mean_squared_error
mse_1 = mean_squared_error(model_1.predict(X_test_1), y_test)
mse_2 = mean_squared_error(model_2.predict(X_test_2), y_test)
print(mse_1)
print(mse_2)

0.017578429015134883
0.017623795185392567
