In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import time
from scipy import stats
from scipy.special import boxcox1p
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the House Prices training CSV and preview its first rows.
train_data = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
train_data.head()

In [None]:
# Show the (rows, columns) dimensions of the training frame.
train_data.shape

In [None]:
# Print the column dtypes and non-null counts of the training frame.
train_data.info()

In [None]:
# Load the House Prices test CSV and preview its first rows.
test_data = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')
test_data.head()

In [None]:
# Show the (rows, columns) dimensions of the test frame.
test_data.shape

In [None]:
# Print the column dtypes and non-null counts of the test frame.
test_data.info()

In [None]:
# Count the missing values in each column of the training frame.
train_data.isna().sum()

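Since the `Id` column carries no information that is useful for the analysis, we remove it from both the training and the test data. The test IDs are kept aside in `test_id` because they are needed later for the submission file.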

In [None]:
# Remove the identifier column from both frames; the test IDs are kept in
# `test_id` (pop returns the removed column).
train_data = train_data.drop(columns=['Id'])
test_id = test_data.pop('Id')


In [None]:
# Report the shapes of both frames after the Id column was removed.
print(f'After dropping ID feature , train data : {train_data.shape} and test data = {test_data.shape}')

# OUTLIERS CHECK 

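We check the numerical features for outliers and remove them. Outliers can be dropped either inline (immediately after each feature is examined) or in a single pass once all features have been examined.
Only numerical features are used for this process.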

In [None]:
# Quantile band outside which a value is treated as an outlier.
min_percentile = 0.001
max_percentile = 0.999

# Target column that every feature is plotted against.
target = 'SalePrice'

# Numeric features screened for outliers.
features = [
    'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'MasVnrArea',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
    'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
    'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF',
    'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
]

# Plot grid: two panels per row, with enough rows to fit every feature.
ncols = 2
nrows = int(np.ceil(len(features) / ncols))

def detect_and_remove_outliers(inline_delete= True):
    """Plot each feature against the target, flag percentile outliers and drop them.

    For every name in the module-level ``features`` list, rows of the global
    ``train_data`` whose value lies below the ``min_percentile`` quantile or
    above the ``max_percentile`` quantile of that feature are flagged,
    highlighted with a red edge on a feature-vs-``target`` scatter plot, and
    removed from ``train_data``.

    Parameters
    ----------
    inline_delete : bool, default True
        If True, the flagged rows are dropped (and the index reset) right
        after each feature is processed, so later features are evaluated on
        the already-trimmed frame. If False, the flagged row labels are
        collected across all features and dropped once at the end.

    Returns
    -------
    None
        The global ``train_data`` is rebound as a side effect; the collected
        row labels are only printed.

    Notes
    -----
    With ``inline_delete=True`` the index is reset after every feature, so the
    labels printed at the end refer to successive intermediate frames rather
    than to one common frame.
    """
    global train_data
    # squeeze=False keeps `ax` two-dimensional even when the grid has a single
    # row, so flattening works for any number of features.
    fig, ax = plt.subplots(nrows = nrows, ncols = ncols, figsize = (24, nrows * 6), squeeze = False)
    panels = ax.flatten()  # row-major: same panel order as a row/column double loop
    outliers = []

    for panel, feature in zip(panels, features):
        # Outlier detection using percentiles of the current train_data
        min_thresold, max_thresold = train_data[feature].quantile([min_percentile, max_percentile])
        is_outlier = (train_data[feature] < min_thresold) | (train_data[feature] > max_thresold)
        df_outliers = train_data[is_outlier]

        # Update the running list of flagged row labels
        outliers.extend(df_outliers.index.tolist())

        # Plot feature vs target, then mark the flagged records on the same axes
        panel.scatter(x = train_data[feature], y = train_data[target])
        panel.scatter(x = df_outliers[feature], y = df_outliers[target], marker ="o", edgecolor ="red", s = 100)
        panel.set_xlabel(feature)
        panel.set_ylabel(target)
        panel.set_title('Outlier detection for feature ' + feature)

        if inline_delete:
            # Drop this feature's outliers before the next feature is examined
            train_data = train_data.drop(df_outliers.index).reset_index(drop = True)

    # Hide grid cells that have no feature to show (e.g. an odd feature count)
    for panel in panels[len(features):]:
        panel.set_visible(False)
    plt.show()

    print(f'outliers: {outliers}')
    unique_outliers = sorted(set(outliers))
    print(f'unique_outliers: {unique_outliers}')

    if not inline_delete:
        # Drop the unique outliers collected over all features in one pass
        print(f'Shape of train data= {train_data.shape}')
        train_data = train_data.drop(unique_outliers).reset_index(drop = True)
        print(f'Shape of train data= {train_data.shape}')
        
        
# Run with inline_delete=False: flagged rows are collected across all features
# and dropped from train_data in a single pass at the end.
# NOTE(review): the function rebinds the global train_data, so re-running this
# cell trims the already-trimmed data again.
detect_and_remove_outliers(inline_delete= False)

# Data Preprocessing 

Now that we have some insight into the data, we can remove features that are not useful, identify the null values, fix the data types of some features, and group the features by the kind of data they contain.


In [None]:
# Separate the target from the predictors: pop removes 'SalePrice' from
# train_data and returns it.
y_train = train_data.pop('SalePrice')
print('Shape of train data = {} and test data = {}'.format(train_data.shape,test_data.shape))
train_data.head()

In [None]:
# Frequency of each 'Utilities' category in the training data, missing values included.
train_data['Utilities'].value_counts(dropna = False)

In [None]:
# Frequency of each 'Street' category in the training data, missing values included.
train_data['Street'].value_counts(dropna = False)

In [None]:
# Frequency of each 'PoolQC' category in the training data, missing values included.
train_data['PoolQC'].value_counts(dropna = False)

# Deleting Unhelpful Features
* Before we deal with missing values, we first remove the features that are not useful.
* 'Utilities' contains almost exclusively one type of utility. Since it won't be useful in modeling, we can drop this feature.
* 'Street' is similarly unbalanced: nearly every property has the same type of road access. We can drop it.
* 'PoolQC' is missing for most of the records, so we can drop it.

In [None]:
# Drop the near-constant / mostly-missing columns from both frames.
unuseful_cols = ['Utilities', 'Street', 'PoolQC']
train_data, test_data = (frame.drop(columns=unuseful_cols) for frame in (train_data, test_data))
print('shape of train_data = {} and test_data = {}'.format(train_data.shape,test_data.shape))

In [None]:
# Cast these integer-coded columns of the training data to strings.
train_data = train_data.astype({name: str for name in ('MSSubClass', 'YrSold', 'MoSold')})

In [None]:
# Boolean mask of missing cells, computed once and reused below
train_null_mask = train_data.isnull()

# Columns of the training data that contain at least one null
col_na = train_data.columns[train_null_mask.any()]

# Missing value count per such column
train_data_na_cnt = train_null_mask[col_na].sum()

# Summary table (count and percentage of rows), most affected columns first
train_data_na = pd.DataFrame({
    'Total Null Val': train_data_na_cnt,
    'Null Value %': (train_data_na_cnt / len(train_data)) * 100,
}).sort_values(by='Null Value %', ascending=False)
train_data_na

In [None]:
# Cast these integer-coded columns of the test data to strings.
test_data = test_data.astype({name: str for name in ('MSSubClass', 'YrSold', 'MoSold')})

In [None]:
# Boolean mask of missing cells, computed once and reused below
test_null_mask = test_data.isnull()

# Columns of the test data that contain at least one null
col_na = test_data.columns[test_null_mask.any()]

# Missing value count per such column
test_data_na_cnt = test_null_mask[col_na].sum()

# Summary table (count and percentage of rows), most affected columns first
test_data_na = pd.DataFrame({
    'Total Null Val': test_data_na_cnt,
    'Null Value %': (test_data_na_cnt / len(test_data)) * 100,
}).sort_values(by='Null Value %', ascending=False)
test_data_na

### Filling NULL VALUES
#### Replace with None
We will replace the missing values of both ordinal and nominal categorical features with 'None'.
For a nominal feature, 'None' simply becomes one more category; for an ordinal feature, it will be treated as the lowest-ranked value.

In [None]:
# Categorical features whose missing values are replaced by the 'None' category.
# Each column is listed once ('GarageQual' used to appear twice, which filled
# and printed it twice).
# NOTE(review): 'MSSubClass' was cast to str in an earlier cell, so this fill
# is presumably a no-op for it; it is kept so the column is still reported.
none_fill_cols = ('MiscFeature', 'Alley', 'FireplaceQu', 'GarageFinish', 'GarageQual', 'Fence',
                  'GarageType', 'GarageCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
                  'BsmtFinType1', 'BsmtFinType2', 'MasVnrType', 'MSSubClass')

# Same treatment for both frames: train first, then test.
for frame in (train_data, test_data):
    for col in none_fill_cols:
        frame[col] = frame[col].fillna('None')
        print(f'Feature: {col}, Null Count: {frame[col].isnull().sum()}, Unique Values: {frame[col].unique()}')

#### Replace with Median
LotFrontage: linear feet of street connected to the property. Since the street frontage of a house is most likely similar to that of other houses in its neighborhood, we can fill in missing values with the median LotFrontage of the neighborhood.
Since 'LotFrontage' contains continuous data, we use the median value.

In [None]:
# Per-row median LotFrontage of the row's own neighborhood (computed within each frame)
train_neigh_median = train_data.groupby('Neighborhood')['LotFrontage'].transform('median')
test_neigh_median = test_data.groupby('Neighborhood')['LotFrontage'].transform('median')

# Fill only the missing entries with that neighborhood median
train_data['LotFrontage'] = train_data['LotFrontage'].fillna(train_neigh_median)
test_data['LotFrontage'] = test_data['LotFrontage'].fillna(test_neigh_median)

#### Replace numerical missing values with 0

In [None]:
# Numeric garage / basement / veneer columns whose missing values become 0
zero_fill = dict.fromkeys(
    ('GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
     'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea'),
    0,
)
# A column -> value mapping fills only the listed columns
train_data = train_data.fillna(zero_fill)
test_data = test_data.fillna(zero_fill)

In [None]:
# Per the data description, home functionality is assumed to be 'Typ' (typical)
# unless deductions are warranted, so 'Typ' fills the gaps in both frames.
for frame in (train_data, test_data):
    frame['Functional'] = frame['Functional'].fillna('Typ')

#### Replace with Most Frequent Value
For low percentage of null values, we will use most frequent value to replace the categorical missing value.

In [None]:
# Categorical columns filled with their most frequent training value
mode_fill_cols = ('MSZoning','Electrical','KitchenQual','Exterior1st','Exterior2nd', 'SaleType')
for col in mode_fill_cols:
    most_frequent = train_data[col].mode()[0]
    train_data[col] = train_data[col].fillna(most_frequent)
    print(f'Feature: {col}, Null Count: {train_data[col].isnull().sum()}, Unique Values: {train_data[col].unique()}')

In [None]:
# Categorical columns filled with their most frequent value.
# NOTE(review): the mode is taken from the test data itself, not from the training data.
mode_fill_cols = ('MSZoning','Electrical','KitchenQual','Exterior1st','Exterior2nd', 'SaleType')
for col in mode_fill_cols:
    most_frequent = test_data[col].mode()[0]
    test_data[col] = test_data[col].fillna(most_frequent)
    print(f'Feature: {col}, Null Count: {test_data[col].isnull().sum()}, Unique Values: {test_data[col].unique()}')

In [None]:
# Sanity check after imputation: shape and total number of remaining nulls in the training data.
print(f'Shape of data: {train_data.shape}')
print(f'Count of null values: {train_data.isnull().sum().sum()}')

In [None]:
# Sanity check after imputation: shape and total number of remaining nulls in the test data.
print(f'Shape of data: {test_data.shape}')
print(f'Count of null values: {test_data.isnull().sum().sum()}')

In [None]:
# List the columns that remain in the training frame.
train_data.columns

In [None]:
# Remove the 'LowQualFinSF' column from both frames and renumber the rows from 0.
# `columns=` is explicit because DataFrame.drop removes rows (axis 0) by default.
train_data = train_data.drop(columns=['LowQualFinSF']).reset_index(drop=True)
test_data = test_data.drop(columns=['LowQualFinSF']).reset_index(drop=True)

### Numeric Feature Scaling
In order to give every feature the same importance, we perform feature scaling. There are many techniques for this, such as the Min-Max Scaler and the Robust Scaler.
Before we settle on a scaling technique, let's check the skewness of our numeric features. Skewness measures the degree of asymmetry of a distribution.
* skewness = 0 : the distribution is symmetric (a normal distribution, for example, has zero skewness).
* skewness > 0 : more weight in the right tail of the distribution (a long tail towards large values).
* skewness < 0 : more weight in the left tail of the distribution (a long tail towards small values).

In [None]:
# Combine train and test so that feature removal and feature creation are applied to both at once.
# reset_index(drop=True) replaces the concatenated index with 0..n-1.
all_data = pd.concat((train_data,test_data)).reset_index(drop = True)

# Count numerical and categorical features.
# select_dtypes picks the numeric columns explicitly; comparing dtypes with
# "object" would count any other non-numeric dtype (string, category, bool)
# as numeric and pass it to stats.skew below.
numeric_feats = all_data.select_dtypes(include='number').columns
cat_feats = all_data.select_dtypes(exclude='number').columns
print(f"Number of categorical features: {len(cat_feats)}, Numerical features: {len(numeric_feats)}")

# Skewness of every numeric feature, most positively skewed first
skew_features = all_data[numeric_feats].apply(lambda x: stats.skew(x)).sort_values(ascending=False)
skewness = pd.DataFrame({'Skew': skew_features})

print(f'Skew in numerical features. Shape of skewness: {skewness.shape}')
skewness.head(10)

In [None]:
# Features with skewness above 0.5 are Box-Cox transformed.
high_skew = skew_features.loc[skew_features.gt(0.5)]
skew_index = high_skew.index

# boxcox1p computes the Box-Cox transform of 1 + x (log 0 is undefined), so the
# lambda for each feature is estimated with boxcox_normmax on the same shifted
# values, using that function's default method.
for feat in skew_index:
    shifted = all_data[feat] + 1
    lmbda = stats.boxcox_normmax(shifted)
    all_data[feat] = boxcox1p(all_data[feat], lmbda)

### Adding New Features
* Since area-related features are very important in determining the house price, we create a new feature named 'TotalSF' by adding 'TotalBsmtSF', '1stFlrSF' and '2ndFlrSF'.
* Similarly, we create one more feature named 'TotalSF1' by adding 'BsmtFinSF1', 'BsmtFinSF2', '1stFlrSF' and '2ndFlrSF'. Here 'BsmtFinSF1' and 'BsmtFinSF2' represent the finished basement area, so this feature counts only the finished basement rather than the total basement area; that is why we create it as a separate feature.
* Create new feature 'YrBltAndRemod' by adding 'YearBuilt' and 'YearRemodAdd'.
* Create new feature 'TotalBathrooms' by adding all the bathrooms in the house (half baths count as 0.5).
* Create new feature 'TotalPorchSF' by adding all porch and deck areas.

In [None]:
# Aggregate features built from existing columns.
# NOTE(review): some of these inputs may already have been Box-Cox transformed
# by the skewness step earlier in the notebook, in which case the sums combine
# transformed values rather than raw areas -- confirm this is intended.
all_data = all_data.assign(
    # Total area: basement + first floor + second floor
    TotalSF=all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF'],
    # Finished basement (both types) + first floor + second floor
    TotalSF1=(all_data['BsmtFinSF1'] + all_data['BsmtFinSF2']
              + all_data['1stFlrSF'] + all_data['2ndFlrSF']),
    YrBltAndRemod=all_data['YearBuilt'] + all_data['YearRemodAdd'],
    # Half baths count as 0.5
    TotalBathrooms=(all_data['FullBath'] + (0.5 * all_data['HalfBath'])
                    + all_data['BsmtFullBath'] + (0.5 * all_data['BsmtHalfBath'])),
    TotalPorchSF=(all_data['OpenPorchSF'] + all_data['3SsnPorch']
                  + all_data['EnclosedPorch'] + all_data['ScreenPorch']
                  + all_data['WoodDeckSF']),
)

print(f'Shape all_data: {all_data.shape}')

Now let's add new indicator features based on the presence of a swimming pool, second floor, garage, basement and fireplace.

In [None]:
# Indicator column -> source column; the indicator is 1 when the source value is > 0, else 0.
presence_flags = {
    'haspool': 'PoolArea',
    'has2ndfloor': '2ndFlrSF',
    'hasgarage': 'GarageArea',
    'hasbsmt': 'TotalBsmtSF',
    'hasfireplace': 'Fireplaces',
}
for flag, source in presence_flags.items():
    all_data[flag] = (all_data[source] > 0).astype('int64')

print(f'Shape all_data: {all_data.shape}')

In [None]:
#Lets check the count of numerical and categorical features
cat_feats = all_data.dtypes[all_data.dtypes == "object"].index
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
print(f"Number of categorical features: {len(cat_feats)}, Numerical features: {len(numeric_feats)}")

print(f"\nList of cetagorical features: { cat_feats.to_list() }\n\nList of numerical features: { numeric_feats.to_list() }")

In [None]:
# Categorical features that are one-hot encoded later on.
cat_feats_nominal = [
    'MSSubClass', 'MSZoning', 'Neighborhood', 'Condition1', 'Condition2', 'HouseStyle',
    'CentralAir', 'MiscFeature', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition',
    'Electrical', 'MasVnrType', 'Exterior1st', 'Exterior2nd', 'Heating', 'Foundation',
]
# Categorical features that are manually label-encoded later on.
cat_feats_ordinal = [
    'Alley', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'BldgType', 'RoofStyle',
    'RoofMatl', 'ExterQual', 'ExterCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType2', 'BsmtFinType1',
    'HeatingQC', 'KitchenQual', 'Functional', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'Fence',
]

# Numeric features grouped as continuous, ordinal and discrete.
numeric_feats_cont = [
    'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
    'GrLivArea', 'GarageYrBlt', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
    'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
    'TotalSF', 'TotalSF1', 'YrBltAndRemod', 'TotalBathrooms', 'TotalPorchSF',
]
numeric_feats_ordinal = ['OverallQual', 'OverallCond']
numeric_feats_descrete = [
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
    'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars',
    'haspool', 'has2ndfloor', 'hasgarage', 'hasbsmt', 'hasfireplace',
]

print(f"Number of cat_feats_nominal: {len(cat_feats_nominal)}, cat_feats_ordinal: {len(cat_feats_ordinal)}, numeric_feats_cont: {len(numeric_feats_cont)}, numeric_feats_ordinal: {len(numeric_feats_ordinal)}, numeric_feats_descrete: {len(numeric_feats_descrete)} ")

We cannot simply delete the categorical features, because they play an important role in estimating the sale price. Some of them, such as Alley, LotShape and LandContour, do not seem to influence the price directly; nonetheless they matter when valuing a house, so they are kept.

### Encoding Categorical Variables
We will use *Manual Label Encoding* for ordinal categorical variables and *One Hot Encoding* for nominal categorical variables.
The reason for encoding the known orderings manually is that sklearn's label encoder assigns codes according to the sorted (alphabetical) order of the category names rather than their quality order, so the ordinality would be lost.
Remember that for missing values we have added 'None' category, which we will encode with '0'.

In [None]:
# List of categorical ordinal feature
print(f'List of categorical ordinal features: {cat_feats_ordinal}')

# Manual label encoding: for each feature, category -> integer code.
# 'None' (the placeholder used for missing values) is encoded as 0 wherever it applies.
ordinal_encodings = {
    'Alley': {'None': 0, 'Grvl': 1, 'Pave': 2},
    'LotShape': {'Reg': 3, 'IR1': 2, 'IR2': 1, 'IR3': 0},
    'LandContour': {'Lvl': 3, 'Bnk': 2, 'Low': 1, 'HLS': 0},
    'LotConfig': {'Inside': 0, 'FR2': 3, 'Corner': 1, 'CulDSac': 2, 'FR3': 4},
    'LandSlope': {'Gtl': 2, 'Mod': 1, 'Sev': 0},
    'BldgType': {'1Fam': 4, '2fmCon': 3, 'Duplex': 2, 'TwnhsE': 1, 'Twnhs': 0},
    'RoofStyle': {'Gable': 4, 'Hip': 2, 'Gambrel': 3, 'Mansard': 1, 'Flat': 5, 'Shed': 0},
    'RoofMatl': {'ClyTile': 7, 'CompShg': 6, 'Membran': 5, 'Metal': 4, 'Roll': 3, 'Tar&Grv': 2, 'WdShake': 1, 'WdShngl': 0},
    'ExterQual': {'Ex': 3, 'Gd': 2, 'TA': 1, 'Fa': 0},
    'ExterCond': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0},
    'BsmtQual': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'None': 0},
    'BsmtCond': {'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0},
    'BsmtExposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'None': 0},
    'BsmtFinType1': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'None': 0},
    'BsmtFinType2': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'None': 0},
    'HeatingQC': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0},
    'KitchenQual': {'Ex': 3, 'Gd': 2, 'TA': 1, 'Fa': 0},
    'Functional': {'Typ': 6, 'Min1': 5, 'Min2': 4, 'Mod': 3, 'Maj1': 2, 'Maj2': 1, 'Sev': 0},
    'FireplaceQu': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0},
    'GarageType': {'2Types': 6, 'Attchd': 5, 'Basment': 4, 'BuiltIn': 3, 'CarPort': 2, 'Detchd': 1, 'None': 0},
    'GarageFinish': {'Fin': 3, 'RFn': 2, 'Unf': 1, 'None': 0},
    'GarageQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0},
    'GarageCond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0},
    'PavedDrive': {'Y': 2, 'P': 1, 'N': 0},
    'Fence': {'GdPrv': 4, 'MnPrv': 3, 'GdWo': 2, 'MnWw': 1, 'None': 0},
}

for col, mapping in ordinal_encodings.items():
    column = all_data[col]
    if pd.api.types.is_numeric_dtype(column):
        # Already encoded (e.g. the cell was re-run); nothing left to map.
        continue
    encoded = column.map(mapping)
    # A category without a code would otherwise survive as text and only fail
    # later, at model fitting, with a much less helpful message.
    unmapped = column[encoded.isna()].unique().tolist()
    if unmapped:
        raise ValueError(f'{col}: no ordinal code defined for categories {unmapped}')
    # Assign the result back to the frame: calling replace(..., inplace=True)
    # on the selection all_data[col] is a chained in-place update, which is
    # deprecated and does not modify all_data under pandas Copy-on-Write.
    all_data[col] = encoded.astype('int64')

print(f'\nShape of all_data: {all_data.shape}')
all_data.head()

In [None]:
# Show the nominal categorical features that are one-hot encoded next.
print(f'List of categorical nominal features: {cat_feats_nominal}')

In [None]:
# One-hot encode the nominal features; drop_first=True keeps k-1 dummy columns
# per feature to avoid multicollinearity.
nominal_block = all_data[cat_feats_nominal]
cat_feats_nominal_one_hot = pd.get_dummies(nominal_block, drop_first=True).reset_index(drop=True)

print(f'Shape of cat_feats_nominal_one_hot: {cat_feats_nominal_one_hot.shape}')
cat_feats_nominal_one_hot.head()

In [None]:
# Swap the original nominal columns for their one-hot encoded counterparts:
# drop them from all_data, then append the dummy columns side by side.
all_data = pd.concat(
    [all_data.drop(columns=cat_feats_nominal), cat_feats_nominal_one_hot],
    axis=1,
)
print(f'Shape of all_data: {all_data.shape}')
all_data.head()

In [None]:
# Split the combined frame back by position: the first len(y_train) rows are
# the training rows, the remainder the test rows.
n_train = len(y_train)
train_data = all_data.iloc[:n_train]
test_data = all_data.iloc[n_train:]
print(f'Shape of train: {train_data.shape}, test:{test_data.shape}')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the training rows for a quick validation score.
# NOTE(review): test_size=0.01 keeps only 1% of the rows for validation, so the
# resulting score rests on very few samples -- confirm this is intended.
X_train, X_test, y_tr, y_te = train_test_split(
    train_data,
    y_train,
    test_size=0.01,
    random_state=80,
)

In [None]:
from sklearn.linear_model import LinearRegression
# Fit an ordinary least-squares model on the training split (fit returns the
# estimator itself) and report its score on the held-out rows.
lr = LinearRegression().fit(X_train, y_tr)
lr.score(X_test, y_te)

In [None]:
# Predict sale prices for the test rows, rounded to two decimals.
raw_pred = lr.predict(test_data)
pred = np.round(raw_pred, 2)

In [None]:
# Preview the sample submission file to see the expected output layout.
pd.read_csv('../input/house-prices-advanced-regression-techniques/sample_submission.csv').head()

In [None]:
# Assemble the submission table: the saved test IDs next to the predicted prices.
Submission = pd.DataFrame({'Id': test_id, 'SalePrice': pred})
Submission.head()

In [None]:
# Use 'Id' as the index so it is written as the first column of the CSV.
Submission = Submission.set_index('Id')

In [None]:
# Write the submission table to 'Submission.csv' in the current working directory.
Submission.to_csv('Submission.csv')