In [1]:
%run utils.ipynb

In [2]:
#Silence every warning for the rest of the session
#NOTE(review): a blanket 'ignore' also hides deprecation notices - confirm that is intended
warnings.filterwarnings('ignore')

In [3]:
#Lift the pandas display limits: None removes the cap on rows and on columns shown
pd.set_option("display.max_rows",None)
pd.set_option("display.max_columns",None)

In [4]:
#Load the training set and keep a separate copy of the freshly loaded frame
df=pd.read_csv("../data/train.csv")
df_copy=df.copy()

### Let's run a simple Linear Regression using all the numerical features

### Set up model

In [6]:
#split the frame into the label (SalePrice) and everything else
y=df["SalePrice"]
X=df.drop(columns=["SalePrice"])

#keep backup copies of both
X_copy,y_copy=X.copy(),y.copy()

#### Set up a `TransformedTargetRegressor` to handle the log / exponential transformation of the label  
Uses Pipeline to perform preprocessing and prediction

In [7]:
#Create a regressor whose preprocessing pipeline is built dynamically on each call
def create_regressor(model,**kwargs):
    '''
    Wrap `model` in a preprocessing Pipeline and a log-transformed target.

    model: estimator instance, placed as the final 'model' step of the pipeline
    **kwargs: forwarded unchanged to FeatureSelector
              (this notebook passes by="numerical" or feature_names=[...])
    return: unfitted TransformedTargetRegressor using np.log1p as the target
            transform and np.expm1 as its inverse
    '''
    steps=[
        ('selector',FeatureSelector(**kwargs)),
        ('impute_zero',ImputeZero()),
        #This is to align the train and predict DF in case they are different
        ('align_train_and_predict',AlignTrainPredict()),
        #Passthrough step, does nothing. Only exists to allow external code to retrieve feature names.
        ('passthrough',Passthrough()),
        ('model',model),
    ]
    preprocessing_and_model=Pipeline(steps=steps)

    return TransformedTargetRegressor(
        regressor=preprocessing_and_model,
        func=np.log1p,
        inverse_func=np.expm1,
    )

In [8]:
#Per-fold results: RMSE and fitted coefficients are appended once per fold,
#feature_names is overwritten each fold and ends up holding the last fold's names
RMSE=[]
coefficients=[]
feature_names=None

#create regressor and instruct to keep only numerical features
lr=create_regressor(LinearRegression(),by="numerical")

#run 10 K Folds and store the coefficients and RMSE to a list.
kf=KFold(n_splits=10,random_state=30,shuffle=True)
#split the same frame that is indexed below; no throwaway copy is needed just to generate indices
for train_index, test_index in kf.split(X):
    #KFold yields positional indices, so both X and y must be selected with .iloc
    #(plain y[train_index] is a label lookup and is only right for a default RangeIndex)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    lr.fit(X_train,y_train)
    ypred=lr.predict(X_test)
    #RMSE as sqrt(MSE): avoids the squared=False argument, which newer scikit-learn deprecates and then removes
    RMSE.append(np.sqrt(mean_squared_error(y_test,ypred)))
    coefficients.append(lr.regressor_['model'].coef_)
    feature_names=list(lr.regressor_['passthrough'].get_feature_names())

In [9]:
#average of the per-fold RMSE values collected in the loop above
np.mean(RMSE)

50489.44097087149

Ok, this is worse than what we had in part 1 (49648).  
However, this is because we used **all** numerical features.  
Let's look at their coefficients to pick out only the useful ones.

In [11]:
#keep the features whose fold-averaged coefficient has an absolute value strictly above the cutoff
coeff_cutoff=0.01
dfcoeff=pd.DataFrame(coefficients,columns=feature_names)
features_to_keep=(
    dfcoeff
    .mean()
    .pipe(lambda avg:avg[avg.abs()>coeff_cutoff])
    .index
    .tolist()
)
features_to_keep

['Overall Qual',
 'Overall Cond',
 'Bsmt Full Bath',
 'Full Bath',
 'Kitchen AbvGr',
 'TotRms AbvGrd',
 'Fireplaces',
 'Garage Cars',
 'Yr Sold']

Let's try again with a reduced number of features.

In [12]:
#Reset the per-fold result holders before the second cross-validation run
RMSE=[]
coefficients=[]
feature_names=None

#create regressor restricted to the features listed in features_to_keep
lr=create_regressor(LinearRegression(),feature_names=features_to_keep)

#10 shuffled folds with a fixed random_state so the run is repeatable
kf=KFold(n_splits=10,random_state=30,shuffle=True)
#split the same frame that is indexed below; no throwaway copy is needed just to generate indices
for train_index, test_index in kf.split(X):
    #KFold yields positional indices, so both X and y must be selected with .iloc
    #(plain y[train_index] is a label lookup and is only right for a default RangeIndex)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    lr.fit(X_train,y_train)
    ypred=lr.predict(X_test)
    #RMSE as sqrt(MSE): avoids the squared=False argument, which newer scikit-learn deprecates and then removes
    RMSE.append(np.sqrt(mean_squared_error(y_test,ypred)))
    coefficients.append(lr.regressor_['model'].coef_)
    feature_names=list(lr.regressor_['passthrough'].get_feature_names())

In [13]:
#average of the per-fold RMSE values from the reduced-feature run
np.mean(RMSE)

34644.771941119994

Ahh. Now it performs better.

### Try a submission

In [14]:
#Instantiate the Tester helper class with the path to the test CSV
tester=Tester("../data/test.csv")

In [15]:
#obtain a copy of the test set from the Tester helper
#NOTE(review): whether get_test_df() returns a copy or the helper's own frame is not visible here - confirm
df_test=tester.get_test_df()

In [16]:
#instantiate a fresh regressor limited to features_to_keep and train it on all training rows
lr=create_regressor(LinearRegression(),feature_names=features_to_keep)
#a new copy of X_copy is passed, so the backup frame itself is never handed to the pipeline
lr.fit(X_copy.copy(),y)

In [17]:
#make prediction on the test set; the regressor was built with inverse_func=np.expm1,
#so its output is mapped back from the log1p scale used during fitting
ypred=lr.predict(df_test)

In [18]:
#hand the predictions to the Tester helper to write the submission file
#NOTE(review): the file name says "all_features" although this model uses only features_to_keep - confirm the name is intended
tester.write_submission(ypred,"../data/02_all_features_submission.csv")

Output saved to ../data/02_all_features_submission.csv
