In [1]:
%run utils.ipynb

In [2]:
warnings.filterwarnings('ignore')

In [3]:
pd.set_option("display.max_rows",None)
pd.set_option("display.max_columns",None)

In [4]:
df=pd.read_csv("../data/train.csv")
df_copy=df.copy()

Let's retrieve the regressor made previously, together with some of the columns to impute

In [5]:
cols_to_impute_with_none=["Pool QC","Misc Feature","Alley","Fence","Fireplace Qu",
                          "Garage Finish","Garage Qual","Garage Cond","Garage Type",
                         "Bsmt Exposure","BsmtFin Type 2","Bsmt Cond","Bsmt Qual","BsmtFin Type 1",
                         "Mas Vnr Type"]

cols_to_impute_with_zero=["Garage Yr Blt","Mas Vnr Area","Bsmt Full Bath","Bsmt Half Bath","Garage Area",
                         "Garage Cars","Total Bsmt SF","Bsmt Unf SF","BsmtFin SF 2","BsmtFin SF 1"]

cols_to_impute_with_mode=["Electrical"]

In [6]:
#get a full list of one-hot-encoded feature names, as this will become misaligned once KFold splits up the training set.
full_ohe_list=list(pd.get_dummies(df.select_dtypes(exclude='number'),drop_first=True).columns)

In [7]:
#Create a regressor to dynamically creates a pipeline
def create_regressor(model,**kwargs):
    '''
    model: estimator instance
    return: TransformedTargetRegressor
    '''
    pipeline=Pipeline(steps=[
#         ('selector',FeatureSelector(**kwargs)),
        ('standard_impute',StandardImpute(none=cols_to_impute_with_none,
                                          zero=cols_to_impute_with_zero,
                                          mode=cols_to_impute_with_mode)),
        ('lot_frotage_impute',LotFrontageImpute()),
        ('ordinal_to_numerical',OrdinalToNumeric()),
        ('selector',FeatureSelector(**kwargs)),
#         ('onehotencode',OneHotEncode()),
#         ('align_train_and_predict',AlignTrainPredict(full_feature_list=full_ohe_list)), #This is to align the train and predict DF in case they are different
        ('passthrough',Passthrough()), #Passthrough step, does nothing. Only exists to allow external code to retrieve feature names.
        ('model',model)
    ])
    
    return TransformedTargetRegressor(regressor=pipeline,
                                    func=np.log1p,
                                    inverse_func=np.expm1
                                    )

In [8]:
#set up X and Y
X=df.drop(columns="SalePrice")
y=df["SalePrice"]

#make a backup copy
X_copy=X.copy()
y_copy=y.copy()

In [9]:
RMSE=[]
coefficients=[]
feature_names=None

#create regressor and instruct to keep only numerical features
lr=create_regressor(LinearRegression(),by="numerical")

#run 10 K Folds and store the coefficients and RMSE to a list.
kf=KFold(n_splits=5,random_state=30,shuffle=True)
for train_index, test_index in tqdm(kf.split(X_copy.copy())):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    lr.fit(X_train,y_train)
    ypred=lr.predict(X_test)
    RMSE.append(mean_squared_error(y_test,ypred,squared=False))
    coefficients.append(lr.regressor_['model'].coef_)
    feature_names=list(lr.regressor_['passthrough'].get_feature_names())

0it [00:00, ?it/s]

In [10]:
np.mean(RMSE)

59998.51682492746

That's not very good. Let's look at the coefficients.

In [35]:
#retrieve list of features whose |coefficients| are top X
coeff_top_x=10
dfcoeff=np.abs(pd.DataFrame(coefficients,columns=feature_names).mean())

# features_to_keep=list(dfcoeff.mean().loc[lambda x:np.abs(x)>coeff_cutoff].index)
# features_to_keep
list_=list(dfcoeff.sort_values(ascending=False).head(coeff_top_x).index)

Let's have a new list of features from above, and those from part 2

In [36]:
features_to_keep=list(set([
*['Overall Qual',
 'Overall Cond',
 'Bsmt Full Bath',
 'Full Bath',
 'Kitchen AbvGr',
 'TotRms AbvGrd',
 'Fireplaces',
 'Garage Cars',
 'Yr Sold'],
    *list_]
))
features_to_keep

['Overall Cond',
 'Kitchen AbvGr',
 'Fireplaces',
 'Bsmt Full Bath',
 'Full Bath',
 'TotRms AbvGrd',
 'Garage Cars',
 'Land Slope',
 'Overall Qual',
 'Central Air',
 'Yr Sold',
 'Kitchen Qual']

In [37]:
RMSE=[]
coefficients=[]
feature_names=None

#create regressor and instruct to keep only numerical features
lr=create_regressor(LinearRegression(),feature_names=features_to_keep)

#run 10 K Folds and store the coefficients and RMSE to a list.
kf=KFold(n_splits=5,random_state=30,shuffle=True)
for train_index, test_index in tqdm(kf.split(X_copy.copy())):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    lr.fit(X_train,y_train)
    ypred=lr.predict(X_test)
    RMSE.append(mean_squared_error(y_test,ypred,squared=False))
    coefficients.append(lr.regressor_['model'].coef_)
    feature_names=list(lr.regressor_['passthrough'].get_feature_names())

0it [00:00, ?it/s]

In [38]:
np.mean(RMSE)

33355.352814555205

### Try a submission

In [None]:
#Instantiate a copy of Tester helper class
tester=Tester("../data/test.csv")

In [None]:
lr.fit(X, y)#obtain a copy of the test set
df_test=tester.get_test_df()

In [None]:
#reset the df
X=X_copy.copy()

#instantiate and train the model
lr=create_regressor(LinearRegression(),feature_names=features_to_keep)
lr.fit(X,y)

In [None]:
#make prediction
ypred=lr.predict(df_test)

In [None]:
tester.write_submission(ypred,"../data/03_ordinals_to_numerical.csv")