# Regression-House-Pricing 

Let's study regression using Light Gradient Boosting (LightGBM) on different variations of a dataset:

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import sys
# Extend the module search path with '../main/' (relative to the working
# directory). Presumably this is where the local `dsbase` package imported
# in the following cells lives — TODO confirm.
sys.path.append('../main/')

In [None]:
from sklearn.model_selection import train_test_split
from dsbase.ModelDSBase import ModelDSBaseWrapper

In [None]:
from dsbase.models.regression.LightGradientBoostingRegressionDSBase import LightGradientBoostingRegressionDSBaseModel
from dsbase.models.regression.LightGradientBoostingRegressionDSBase import LightGradientBoostingRegressionDSBaseModelParamsToMap

In [None]:
# Base directory of the house-pricing dataset files, relative to the working
# directory.
dataset_path = '../../datasets/house-pricing'

In [None]:
# Load the prepared table. Judging by the file name it is already
# null-imputed and encoded — TODO confirm against the preprocessing step.
csv_path = dataset_path + '/tt_null_impute_encode.csv'
df = pd.read_csv(csv_path)

In [None]:
# Names of the columns to leave out of the feature matrix. Starts empty so
# that no extra column is dropped.
columuns_without_value = []

In [None]:
# Keep only the rows flagged TT == 1; these form the working (training) frame.
df_train = df.loc[df['TT'] == 1]

# Features: every column except 'Unnamed: 0', the TT flag and the target,
# minus any columns listed in columuns_without_value.
non_feature_columns = ['Unnamed: 0', 'TT', 'SalePrice']
features = df_train.drop(non_feature_columns, axis=1)
features = features.drop(columuns_without_value, axis=1)
X = features.values

# Target: the SalePrice column.
y = df_train['SalePrice'].values

In [None]:
# Display the (rows, columns) shape of the feature matrix as a sanity check.
X.shape

## Simple case

In [None]:
# Hold out 33% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Baseline hyper-parameters for the first, un-regularised fit.
params = LightGradientBoostingRegressionDSBaseModelParamsToMap(
    max_depth=31,
    n_estimators=100,
    learning_rate=0.1,
    num_leaves=31,
    subsample_for_bin=200000,
    reg_alpha=0,
    reg_lambda=0,
)
# Alternative, strongly regularised configuration kept for reference:
#params = LightGradientBoostingRegressionDSBaseModelParamsToMap(max_depth=20, n_estimators=70, learning_rate=0.1, num_leaves=31, subsample_for_bin=1000000, reg_alpha=100000, reg_lambda=100)

# The sixth argument is the sequence 20, 25, ..., 100. Presumably these are
# the training-set percentages used for the learning curves — TODO confirm
# against ModelDSBaseWrapper.
lgbr = ModelDSBaseWrapper(
    'LGBR',
    X_train,
    y_train,
    X_test,
    y_test,
    list(range(20, 101, 5)),
    LightGradientBoostingRegressionDSBaseModel,
    params,
)

In [None]:
# Run the wrapper's training step on the data it was constructed with.
lgbr.train()

Check the importance of the variables:

In [None]:
from dsbase.SearchOptimumParams import getColumnsWithLessValue

In [None]:
# Show the feature names ordered by ascending importance (argsort order).
# NOTE(review): the column list below does not exclude columuns_without_value,
# whereas the feature matrix X does. The two only line up while that list is
# empty — confirm before re-running after columns have been removed.
feature_names = df_train.drop(['Unnamed: 0','TT','SalePrice'], axis=1).columns
importance_order = lgbr.model.model.feature_importances_.argsort()
feature_names[importance_order]

In [None]:
# Ask the helper which columns contribute least to the trained model.
# The literal 9 is passed through unchanged; presumably it is an importance
# threshold or a column count — TODO confirm in getColumnsWithLessValue.
candidate_columns = df_train.drop(['Unnamed: 0','TT','SalePrice'], axis=1).columns
importances = lgbr.model.model.feature_importances_
ser, columuns_without_value = getColumnsWithLessValue(candidate_columns, importances, 9)

In [None]:
# We could retrain the model after first removing these variables, which
# carry very little importance in the model.
columuns_without_value

Let's continue with the evaluation:

In [None]:
# Retrieve the learning-curve data returned by the wrapper.
lclgdbr=lgbr.getLearningCurves()

In [None]:
# Plot the first two rows of the learning-curve data: row 0 in blue and
# row 1 in red.
blue_curve = lclgdbr[0, :]
red_curve = lclgdbr[1, :]
plt.plot(blue_curve, 'b', red_curve, 'r')

In [None]:
# Display the score reported by the wrapper for the trained model.
lgbr.getScore()

# Parameter optimization. Random Search over K-Fold Validation

In [None]:
from dsbase.SearchOptimumParams import evaluateParams, randomElement, showSearchOptimumHyperParametersReport

In [None]:
# Candidate values for each hyper-parameter considered by the random search.
max_depth = list(range(10, 101, 10))
n_estimators = list(range(70, 151, 10))
learning_rate = [0.01, 0.03, 0.1, 0.3, 1]
subsample_for_bin = [
    75000, 100000, 150000, 200000, 500000,
    1000000, 1300000, 2000000, 2300000,
]
num_leaves = [7, 15, 31, 63, 127]
reg_alpha = [0, 50000, 75000, 90000, 100000, 120000]
reg_lambda = [0, 20, 50, 100, 150, 200]

In [None]:
# Number of random hyper-parameter combinations to generate and evaluate.
num_tries = 100

In [None]:
# Build num_tries random hyper-parameter combinations, drawing one value per
# parameter from its candidate list.
# num_leaves is now sampled too: its candidate list was defined alongside the
# others but never passed here, so the search never varied it.
params = [
    LightGradientBoostingRegressionDSBaseModelParamsToMap(
        max_depth=randomElement(max_depth),
        n_estimators=randomElement(n_estimators),
        learning_rate=randomElement(learning_rate),
        num_leaves=randomElement(num_leaves),
        subsample_for_bin=randomElement(subsample_for_bin),
        reg_alpha=randomElement(reg_alpha),
        reg_lambda=randomElement(reg_lambda),
    )
    for _ in range(num_tries)
]

In [None]:
# Evaluate every candidate parameter set on (X, y).
# The literal 5 is presumably the number of K-Fold splits — TODO confirm
# against evaluateParams.
tries = evaluateParams(
    X,
    y,
    5,
    LightGradientBoostingRegressionDSBaseModel,
    'LGBR',
    params,
    num_tries,
)

In [None]:
# Display the report for all evaluated parameter sets.
showSearchOptimumHyperParametersReport(tries)

In [None]:
# Index (into `params` and `tries`) of the configuration selected as best.
# NOTE(review): this value is hand-picked, presumably from the report above.
# The candidates are drawn at random, so it has to be re-chosen on each run.
optimum_index = 52

In [None]:
# Display the hyper-parameters of the selected configuration.
params[optimum_index]

In [None]:
# Element 2 of the selected try is used as the model object; save it
# through its own save() method.
selected_try = tries[optimum_index]
model = selected_try[2]
model.save()

In [None]:
# Some of the best parameter sets found so far, kept here for reference.
# The cell simply evaluates to a tuple of three dicts.
# NOTE(review): the key 'objetive' looks like a misspelling of 'objective'.
# Confirm whether it mirrors the key produced by the ParamsToMap helper
# before relying on it.
{'n_estimators': 70,
 'max_depth': 20,
 'learning_rate': 0.1,
 'objetive': 'regression',
 'n_jobs': 1,
 'num_leaves': 31,
 'reg_alpha': 100000,
 'reg_lambda': 100,
 'subsample_for_bin': 1000000
},
{'n_estimators': 130,
 'max_depth': 10,
 'learning_rate': 0.1,
 'objetive': 'regression',
 'n_jobs': 1,
 'num_leaves': 31,
 'reg_alpha': 75000,
 'reg_lambda': 200,
 'subsample_for_bin': 1000000
},
{'n_estimators': 130,
 'max_depth': 70,
 'learning_rate': 0.1,
 'objetive': 'regression',
 'n_jobs': 1,
 'num_leaves': 31,
 'reg_alpha': 100000,
 'reg_lambda': 200,
 'subsample_for_bin': 1300000}

# Evaluating with test dataset 

In [1]:
# TODO: evaluate the selected model on the test dataset.

# End of Modeling! 