In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler

from utils import RMSE_vs_feat_count
from transformers import DropCorrelated, MSSubClassConvert, StandardImpute, LotFrontageImpute, OrdinalToNumeric, \
    FeatureSelector, Passthrough
from tester import Tester

from tqdm.notebook import tqdm
import warnings

In [None]:
warnings.filterwarnings('ignore')

In [None]:
#drop some useless columns
cols_to_drop=["Id","PID"]

#read the training CSV and remove those columns in a single chain
df=pd.read_csv("../data/train.csv").drop(columns=cols_to_drop)

#drop outliers identified in part 1:
#keep only rows with 'Gr Liv Area' <= 4500 and 'SalePrice' > expm1(10) (roughly 22,025)
df=df.loc[lambda d: (d['Gr Liv Area']<=4500) & (d['SalePrice']>np.expm1(10))]

#independent copy of the filtered frame
df_copy=df.copy()

Let's look at the columns which contain nulls, and the number of nulls in each of them.

In [None]:
#null count per column, largest first; columns without any null are left out
null_counts=df.isna().sum().sort_values(ascending=False)
df_null=null_counts[null_counts>0]

In [None]:
#wide, short canvas so the column names fit along the x axis
plt.figure(figsize=(16,4))
null_barplot=sns.barplot(x=df_null.index, y=df_null)
null_barplot.set_title("Feature names and their null row count")
#rotate the tick labels so they stay readable; assign to _ to suppress the cell output
_=plt.xticks(rotation=90)

We will impute them according to their description, as outlined [here](http://jse.amstat.org/v19n3/decock/DataDocumentation.txt).

In [None]:
#columns whose missing values are filled with "none" (passed as none= to StandardImpute)
cols_to_impute_with_none=[
    "Pool QC","Misc Feature","Alley","Fence","Fireplace Qu",
    #garage descriptors
    "Garage Finish","Garage Qual","Garage Cond","Garage Type",
    #basement descriptors
    "Bsmt Exposure","BsmtFin Type 2","Bsmt Cond","Bsmt Qual","BsmtFin Type 1",
    #masonry veneer
    "Mas Vnr Type",
]

#columns whose missing values are filled with zero (passed as zero= to StandardImpute)
cols_to_impute_with_zero=[
    "Garage Yr Blt","Mas Vnr Area","Bsmt Full Bath","Bsmt Half Bath","Garage Area",
    "Garage Cars","Total Bsmt SF","Bsmt Unf SF","BsmtFin SF 2","BsmtFin SF 1",
]

#columns whose missing values are filled with the mode (passed as mode= to StandardImpute)
cols_to_impute_with_mode=["Electrical"]

In [None]:
#columns handed to the DropCorrelated pipeline step
correlated_to_drop=[
    "1st Flr SF",
    "Garage Yr Blt",
    "TotRms AbvGrd",
    "Garage Area",
]

In [None]:
#Factory that builds a fresh preprocessing + model pipeline on every call
def create_regressor(model,**kwargs):
    '''
    Wrap `model` in the shared preprocessing pipeline and in a log-target transformer.

    The target is passed through np.log1p before fitting and predictions are
    mapped back with np.expm1, so callers work in the original target units.

    model: estimator instance, placed as the final 'model' step of the pipeline
    **kwargs: forwarded unchanged to FeatureSelector
    return: TransformedTargetRegressor (not yet fitted)
    '''
    imputer=StandardImpute(none=cols_to_impute_with_none,
                           zero=cols_to_impute_with_zero,
                           mode=cols_to_impute_with_mode)

    #step names are looked up by external code (e.g. regressor_['passthrough']), so they are kept verbatim
    steps=[
        ('drop_correlated',DropCorrelated(correlated_to_drop)),
        ('ms_subclass_convert',MSSubClassConvert()),
        ('standard_impute',imputer),
        ('lot_frotage_impute',LotFrontageImpute()),
        ('ordinal_to_numerical',OrdinalToNumeric()),
        ('selector',FeatureSelector(**kwargs)),
        #Passthrough step, does nothing. Only exists to allow external code to retrieve feature names.
        ('passthrough',Passthrough()),
        ('robustscalar',RobustScaler()),
        ('model',model),
    ]

    return TransformedTargetRegressor(regressor=Pipeline(steps=steps),
                                      func=np.log1p,
                                      inverse_func=np.expm1)

In [None]:
#set up X and Y
X=df.drop(columns="SalePrice")
y=df["SalePrice"]

#make a backup copy
X_copy=X.copy()
y_copy=y.copy()

In [None]:
RMSE=[]
coefficients=[]
feature_names=None

#create regressor and instruct to keep only numerical features
lr=create_regressor(LinearRegression(),by="numerical")

#run 5-fold cross-validation and store each fold's coefficients and RMSE in a list.
kf=KFold(n_splits=5,random_state=30,shuffle=True)
#kf.split() is a generator without a length, so give tqdm the fold count explicitly
for train_index, test_index in tqdm(kf.split(X),total=kf.get_n_splits()):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    lr.fit(X_train,y_train)
    ypred=lr.predict(X_test)
    #RMSE computed as sqrt(MSE): the squared=False flag was deprecated in scikit-learn 1.4 and removed in 1.6
    RMSE.append(np.sqrt(mean_squared_error(y_test,ypred)))
    #coefficients of the fitted linear model for this fold
    coefficients.append(lr.regressor_['model'].coef_)
    #feature names as exposed by the 'passthrough' step of the fitted pipeline
    feature_names=list(lr.regressor_['passthrough'].get_feature_names())

In [None]:
#average of the per-fold RMSE values collected in the RMSE list
np.mean(RMSE)

That's slightly better than part 2 (23350).  
Let's iterate through all the features to find the top X again.

In [None]:
#one row per fold, one column per feature
fold_coefficients=pd.DataFrame(coefficients,columns=feature_names)
#average each coefficient across folds, take its magnitude, and rank features from largest to smallest
dfcoeff=fold_coefficients.mean().abs().sort_values(ascending=False)

In [None]:
results=[]

#the splitter does not depend on the feature count, so build it once;
#with an integer random_state every split() call yields the same folds
kf=KFold(n_splits=10,random_state=30,shuffle=True)

#evaluate the top-3, top-4, ... up to all ranked features
for coeff_top_x in tqdm(range(3,len(feature_names)+1)):
    # features_to_keep: the coeff_top_x highest-ranked feature names
    features_to_keep=list(dfcoeff.head(coeff_top_x).index)

    RMSE=[]

    lr=create_regressor(LinearRegression(),feature_names=features_to_keep)

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        lr.fit(X_train,y_train)
        ypred=lr.predict(X_test)
        #RMSE computed as sqrt(MSE): the squared=False flag was deprecated in scikit-learn 1.4 and removed in 1.6
        RMSE.append(np.sqrt(mean_squared_error(y_test,ypred)))

    #one record per feature count: mean RMSE over the 10 folds
    results.append({'Feature Count':coeff_top_x,'RMSE':np.mean(RMSE)})

In [None]:
#hand the per-feature-count mean RMSE table to the utils helper
#NOTE(review): presumably plots RMSE against feature count with a marker line at 30 - confirm in utils
RMSE_vs_feat_count(pd.DataFrame(results),drawline=30)

The sweet spot seems to be at 30 features, at about 22.0 RMSE.
Let's pick the top 30 features here.

In [None]:
#names of the 30 highest-ranked features, in ranking order
features_to_keep=dfcoeff.index[:30].tolist()
features_to_keep

### Try a submission

In [None]:
#Create an instance of the Tester helper class, constructed with the path to the test CSV
tester=Tester("../data/test.csv")

In [None]:
#obtain a copy of the test set
df_test=tester.get_test_df()

In [None]:
#start again from the backup copy of the predictors
X=X_copy.copy()

#build a fresh regressor restricted to the selected features, then train it on the full training set
lr=create_regressor(model=LinearRegression(),feature_names=features_to_keep)
lr.fit(X,y)

In [None]:
#make prediction on the test set with the fitted regressor
#(create_regressor wraps the pipeline with inverse_func=np.expm1, which maps the log-scale output back to price units)
ypred=lr.predict(df_test)

In [None]:
#pass the predictions and the output CSV path to the Tester helper
#NOTE(review): presumably writes the submission file in the expected format - confirm in tester.py
tester.write_submission(ypred,"../data/03_ordinals_to_numerical.csv")

**Results**  
Score: 27856  
Private score: 22765