In [30]:
import pandas as pd
import numpy as np
import numpy.ma as npma
import scipy as sc
from scipy import stats
from scipy.stats import norm, skew #for some statistics
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import sklearn as skl
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.kernel_ridge import KernelRidge
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing.data import QuantileTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from scipy.special import boxcox1p

from sklearn.linear_model import ElasticNet, Lasso, LassoLars,  BayesianRidge, LassoLarsIC, Ridge
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
os.sys.path.append(os.path.pardir)
from src.models.ConvexRegressor import ConvexRegressor
from src.models.ConvexRegressor import convregparam
from src.models.ConvexRegressor import convregconv

#############################################################################################################
project_dir =os.path.pardir
raw_path = os.path.join(project_dir,'data','processed')
train_path = os.path.join(raw_path, 'train.csv')
test_path = os.path.join(raw_path, 'test.csv')
sns.set()
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
train = train.drop(['Id','Unnamed: 0'],axis = 1)
test = test.drop(['Unnamed: 0'],axis = 1)
y = train['SalePrice'].values
Ids = test['Id'].values
train_x = train.drop('SalePrice',axis = 1).values
test_x = test.drop('Id',axis = 1).values

#################################################################################################################
def Score(x, y, scaler, solver, q = 0.4, fldmask = None):
    scaler = scaler.fit(x)
    x = scaler.transform(x)
    yscaler = QuantileTransformer(output_distribution='normal').fit(y.reshape([-1,1]))
    y = yscaler.transform(y.reshape([-1,1])).ravel()
    if fldmask is not None:
        x = x[:,fldmask]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=q, random_state=0)   
    yscaler = QuantileTransformer(output_distribution='normal').fit(y_train.reshape([-1,1]))
    y_train = yscaler.transform(y_train.reshape([-1,1])).ravel()        
    solver = solver.fit(x_train, y_train)
    score = solver.score(x_test, y_test)
    print(score)
    return score

def LearnAndPredict(x_test,x, y, scaler, solver, ids, plot = False, fldmask = None, filename = 'temp.csv'):
    yscaler = QuantileTransformer(output_distribution='normal').fit(y.reshape([-1,1]))
    y = yscaler.transform(y.reshape([-1,1])).ravel()
    if fldmask is not None:
        x = x[:,fldmask]
        x_test = x_test[:,fldmask]
    scaler = scaler.fit(x)
    x = scaler.transform(x)
    solver = solver.fit(x, y)
    x_test = scaler.transform(x_test)
    y = solver.predict(x_test)
    y = yscaler.inverse_transform(y.reshape([-1,1])).ravel()
    df_submission = pd.DataFrame({'Id': ids, 'SalePrice' : y} )
    submission_data_path = os.path.join(os.path.pardir,'data','external')
    submission_file_path = os.path.join(submission_data_path, filename)
    df_submission.to_csv(submission_file_path, index=False)
    if hasattr(solver, 'best_score_'):
        print(solver.best_score_)
    if hasattr(solver, 'best_params_'):
        print(solver.best_params_)
    if plot == True:
        sns.distplot(y,fit = norm)
    return solver

def linear_ml_solve(x, y, scaler, solver, q = 0.4):
    yscaler = QuantileTransformer(output_distribution='normal').fit(y.reshape([-1,1]))
    y = yscaler.transform(y.reshape([-1,1])).ravel()
    x = scaler.fit_transform(x)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=q, random_state=0)
    solver = solver.fit(x_train, y_train)
    score = solver.score(x_test,y_test)
    print(score)
    y = solver.predict(x_test)
    y = yscaler.inverse_transform(y.reshape([-1,1])).ravel()
    if hasattr(solver, 'best_score_'):
        print(solver.best_score_)
    if hasattr(solver, 'best_params_'):
        print(solver.best_params_)
    return y

# Some Global Parameters

In [27]:
solver = LearnAndPredict(test_x,train_x, y, scaler = RobustScaler(), solver = Lasso(alpha = 0.001,max_iter = 10000,tol = 0.0001), ids = Ids, filename = 'LassoRidge.csv')
fldmask = solver.coef_ == 0.0
scaler = RobustScaler()

# 1. Lasso

In [31]:
solver = Lasso(alpha = 0.001,max_iter = 10000,tol = 0.0001)
Score(train_x, y, scaler, solver, q = 0.4, fldmask = fldmask)
Score(train_x, y, scaler, solver, q = 0.4, fldmask = None)

0.6838057972470732
0.9182196591388715


0.9182196591388715

# 2.Ridge

In [None]:
scaler = RobustScaler()
est1 = Ridge()
est2 = Lasso()
estimators = [est1,est2]
cest = ConvexRegressor(ests=estimators)
param_grid = convregparam(est2,{'alpha': [ 7.0, 6.0, 3.0, 0.02,0.001, 0.00099, 0.009, 0.0008]})
param_grid.update(convregparam(est1,{'alpha' : [7.0, 6.0, 4.0, 2.0, 0.6,0.5,0.55,0.45]}))
param_grid.update({'convparams' : [convregconv(estimators,[0.11, 0.89]), convregconv(estimators,[0.1,0.9]),
                                   convregconv(estimators,[0.09, 0.91]), convregconv(estimators,[0.05,0.95]),
                                   convregconv(estimators,[0.01,0.99]), convregconv(estimators,[0.001,0.999])]})

gsolver = GridSearchCV(cest, cv=5,param_grid=param_grid)
house_price = LearnAndPredict(test_x,train_x, y, scaler, solver = gsolver, ids = Ids, q = 0.4, filename = 'LassoRidge.csv')
#0.919300261382389
#{'Lasso$$&&$$alpha': 0.001, 'Ridge$$&&$$alpha': 7.0, 'convparams': {'Ridge': 0.001, 'Lasso': 0.999}}

In [None]:
scaler = RobustScaler()
est1 = Ridge()
est2 = Lasso()
estimators = [est1,est2]
cest = ConvexRegressor(ests=estimators)
param_grid = convregparam(est2,{'alpha': [0.0005, 0.0009, 0.001, 0.0015, 0.0019, 0.002]})
param_grid.update(convregparam(est1,{'alpha' : [7.0, 7.5, 8.0, 8.5, 9.0]}))
param_grid.update({'convparams' : [convregconv(estimators,[0.0001,0.9999]), convregconv(estimators,[0.001,0.999])]})

gsolver = GridSearchCV(cest, cv=5,param_grid=param_grid)
house_price = LearnAndPredict(test_x,train_x, y, scaler, solver = gsolver, ids = Ids, q = 0.4, filename = 'LassoRidge.csv')

In [21]:
scaler = RobustScaler()
est = Ridge()
param_grid = {'alpha': [8.0, 7.8, 7.5, 7.3, 7.0]}
gsolver = GridSearchCV(Ridge(max_iter = 10000,tol = 0.00001), cv=5,param_grid=param_grid)
house_price = LearnAndPredict(test_x,train_x, y, scaler, solver = gsolver, ids = Ids, filename = 'Ridge.csv')

In [None]:
scaler = RobustScaler()
est = Lasso()
param_grid = {'alpha': [5.0,3.0,2.0,1.5,1.3,1.2,1.1,1e0,0.8,0.5,0.2,0.1, 1e-2,0.05,1e-3,0.005, 1e-4,0.0005, 1e-5,0.000051e-6,1e-7]}
param_grid = {'alpha': [ 0.004,0.003, 0.002,0.001, 0.000888889, 0.0008, 0.0005, 0.0009, 0.0015, 0.0019]}
param_grid = {'alpha': [ 0.001]}
gsolver = GridSearchCV(Lasso(max_iter = 10000,tol = 0.0001), cv=5,param_grid=param_grid)
gsolver = Lasso(max_iter = 10000,tol = 0.0001,alpha = 0.001)
house_price = LearnAndPredict(test_x,train_x, y, scaler, solver = gsolver, ids = Ids, q = 0.4, filename = 'Lasso.csv')
#linear_ml_solve(train_x, y, scaler, gsolver, q = 0.4)

In [None]:
(house_price == 0.0).sum()