In [1]:
%run utils.ipynb

In [2]:
warnings.filterwarnings('ignore')

In [3]:
pd.set_option("display.max_rows",None)
pd.set_option("display.max_columns",None)

In [4]:
df=pd.read_csv("../data/train.csv")
df_copy=df.copy()

Let's look at the perecentage of null in each feature.

In [5]:
df_null=df.isnull().mean().sort_values(ascending=False)*100
df_null

Pool QC            99.561190
Misc Feature       96.830814
Alley              93.174061
Fence              80.497318
Fireplace Qu       48.756704
Lot Frontage       16.089712
Garage Finish       5.558264
Garage Qual         5.558264
Garage Yr Blt       5.558264
Garage Cond         5.558264
Garage Type         5.509508
Bsmt Exposure       2.827889
BsmtFin Type 2      2.730375
BsmtFin Type 1      2.681619
Bsmt Cond           2.681619
Bsmt Qual           2.681619
Mas Vnr Area        1.072647
Mas Vnr Type        1.072647
Bsmt Half Bath      0.097513
Bsmt Full Bath      0.097513
Garage Area         0.048757
Total Bsmt SF       0.048757
Bsmt Unf SF         0.048757
BsmtFin SF 2        0.048757
BsmtFin SF 1        0.048757
Garage Cars         0.048757
Mo Sold             0.000000
Sale Type           0.000000
Full Bath           0.000000
Half Bath           0.000000
Bedroom AbvGr       0.000000
Kitchen AbvGr       0.000000
Kitchen Qual        0.000000
Yr Sold             0.000000
Misc Val      

### Missing values

In [6]:
cols_to_impute_with_none=["Pool QC","Misc Feature","Alley","Fence","Fireplace Qu",
                          "Garage Finish","Garage Qual","Garage Cond","Garage Type",
                         "Bsmt Exposure","BsmtFin Type 2","Bsmt Cond","Bsmt Qual","BsmtFin Type 1",
                         "Mas Vnr Type"]

cols_to_impute_with_zero=["Garage Yr Blt","Mas Vnr Area","Bsmt Full Bath","Bsmt Half Bath","Garage Area",
                         "Garage Cars","Total Bsmt SF","Bsmt Unf SF","BsmtFin SF 2","BsmtFin SF 1"]

cols_to_impute_with_mode=["Electrical"]

Ok! everything looks good now.

### We will change ordinal features to numerical

### We will one hot encode categorical features

Let's take a look at the correlation heatmap

Not very useful, so let's just ignore for now.

From the 7 primary variables we had from part 1,  
`Overall Qual`, `Gr Liv Area`, `Garage Cars`, `Year Built`, `Total Bsmt SF`, `Year Remod/Add`, `Full Bath`,   
we now have a few more:  
`Exter Qual`, `Kitchen Qual`, `Bsmt Qual`, `Garage Finish`, `Foundation_PConc`,`Fireplace Qu`, `Heating QC`.

### Let's run a simple Linear Regression using all the numerical features

### Set up model

In [7]:
#retrieve df_train from backup
df=df_copy.copy()

#set up X and Y
X=df.drop(columns="SalePrice")
y=df["SalePrice"]

#make a backup copy
X_copy=X.copy()
y_copy=y.copy()

#### Set up a `TransformedTargetRegressor` to handle the log / exponential transformation of the label  
Uses Pipeline to perform preprocessing and prediction

In [8]:
#Create a regressor to dynamically creates a pipeline
def create_regressor(model,**kwargs):
    '''
    model: estimator instance
    return: TransformedTargetRegressor
    '''
    pipeline=Pipeline(steps=[
        ('selector',FeatureSelector(**kwargs)),
        ('simple_imputer',SimpleImputer(strategy="constant",fill_value=0)),
#         ('standard_impute',StandardImpute(none=cols_to_impute_with_none,
#                                           zero=cols_to_impute_with_zero,
#                                           mode=cols_to_impute_with_mode)),
#         ('lot_frontage_impute',LotFrontageImpute()),
#         ('ordinal_to_numerical',OrdinalToNumeric()),
#         ('numerical_scaling',Scaling()) reserved for scaling
#         ('onehotencode',OneHotEncode()),
#         ('selector',FeatureSelector(features_to_keep)),
        
        
#         ('categorical_numerical_split',Categorical_Numerical_Split),
        ('align_train_and_predict',AlignTrainPredict()),
        ('display',DispStep()),
        ('model',model)
    ])
    
#     display(pipeline)
    
    return TransformedTargetRegressor(regressor=pipeline,
                                    func=np.log1p,
                                    inverse_func=np.expm1)

In [10]:
X=X_copy.copy()

lr=create_regressor(LinearRegression(),by="numerical")

# np.sqrt(-cross_val_score(lr,X,y,cv=5,scoring='neg_mean_squared_error')).mean()

In [12]:
lr.fit(X,y)

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [None]:
RMSE=[]
coefficients=[]
feature_names=None

In [None]:
kf=KFold(n_splits=10,random_state=30,shuffle=True)
for train_index, test_index in kf.split(X_copy.copy()):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    lr.fit(X_train,y_train)
    ypred=lr.predict(X_test)
    RMSE.append(mean_squared_error(y_test,ypred,squared=False))
#     display(lr.regressor_['model'].coef_)
    coefficients.append(lr.regressor_['model'].coef_)
    feature_names=list(lr.regressor_['display'].get_feature_names())

In [None]:
np.mean(RMSE)

Ok, this is slightly better!

In [None]:
dfcoeff=pd.DataFrame(coefficients,columns=feature_names)

In [None]:
features_to_keep=list(dfcoeff.mean().loc[lambda x:np.abs(x)>0.01].index)
features_to_keep

Let's look at coefficient.

### Try a submission

In [None]:
#Instantiate a copy of Tester helper class
tester=Tester("../data/test.csv")

In [None]:
#obtain a copy of the test set
df_test=tester.get_test_df()

In [None]:
#reset the df
X=X_copy.copy()

#instantiate and train the model
lr=create_regressor(LinearRegression())
lr.fit(X,y)

In [None]:
#make prediction
ypred=lr.predict(df_test)

In [None]:
tester.write_submission(ypred,"../data/02_all_features_submission.csv")