In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import matplotlib.pyplot as plt 
from sklearn.impute import SimpleImputer


import seaborn as sns
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the input directory,
# in the order os.walk visits them.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sample_submission.csv
/kaggle/input/sample_submission.csv.gz
/kaggle/input/train.csv.gz
/kaggle/input/data_description.txt
/kaggle/input/test.csv.gz
/kaggle/input/train.csv
/kaggle/input/test.csv


In [2]:
# Load the three competition CSVs; the reads are independent of one another.
# NOTE(review): test_y_df is read from sample_submission.csv, which is
# presumably the competition's example submission rather than true test
# labels — confirm before treating its SalePrice column as ground truth.
train_df, test_df, test_y_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/train.csv',
        '/kaggle/input/test.csv',
        '/kaggle/input/sample_submission.csv',
    )
)

In [3]:
def PrintMissingforEachColumns(data, n=5):
    """Print the columns of ``data`` that have the most missing values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.
    n : int, default 5
        How many rows of the summary to print, most-missing first. The
        default matches the previous hard-coded ``head()``.

    Returns
    -------
    None
        The summary is printed, not returned. It has one row per column of
        ``data`` with ``Total`` (number of nulls) and ``%`` (nulls as a
        percentage of the row count).
    """
    # Build the null mask once instead of recomputing it for every statistic.
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    total = null_counts.sort_values(ascending=False)
    # The mask itself contains no nulls, so count() is the row count per column.
    percent = null_counts / null_mask.count() * 100
    # concat aligns the two Series on their index labels (the column names).
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', '%'])
    print(missing_data.head(n))

# Find columns with missing values

In [4]:
# Show the training columns with the most missing values (count and percent).
PrintMissingforEachColumns(train_df)

             Total          %
PoolQC        1453  99.520548
MiscFeature   1406  96.301370
Alley         1369  93.767123
Fence         1179  80.753425
FireplaceQu    690  47.260274


In [5]:
# List every training column with its non-null count and dtype.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

# Columns

### Columns that have no values in the training set

In [6]:
# Drop two groups of outlier rows from the training data, one after the other:
#   1. OverallQual above 9 that sold for less than 220000
#   2. GrLivArea above 4000 that sold for less than 300000
# NOTE: this cell cannot be re-run on its own; after the first run SalePrice
# and Id are no longer present in the frames.
high_quality_low_price = (train_df['OverallQual'] > 9) & (train_df['SalePrice'] < 220000)
train_df = train_df.loc[~high_quality_low_price]

large_area_low_price = (train_df['GrLivArea'] > 4000) & (train_df['SalePrice'] < 300000)
train_df = train_df.loc[~large_area_low_price]

# Separate the target, then remove the target and the Id column from the features.
Y_train = train_df['SalePrice']
train_df = train_df.drop(columns=['SalePrice', 'Id'])
test_df = test_df.drop(columns='Id')


## Separate low- and high-cardinality columns

In [7]:
def LowcardiCol(Train, object_cols, threshold=10):
    """Split categorical columns into low- and high-cardinality groups.

    Parameters
    ----------
    Train : pandas.DataFrame
        Frame holding the columns named in ``object_cols``. It is only read,
        never modified, so no defensive copy is taken.
    object_cols : iterable of str
        Column names to classify.
    threshold : int, default 10
        A column is "low cardinality" when it has fewer than ``threshold``
        distinct non-null values.

    Returns
    -------
    (list of str, list of str)
        ``(low_cardinality_cols, high_cardinality_cols)``. Both lists keep the
        order of ``object_cols``, so the result is the same on every run. The
        earlier set difference produced an order that could change between
        interpreter sessions.
    """
    low_cardinality_cols = []
    high_cardinality_cols = []
    for col in object_cols:
        # nunique() ignores missing values by default.
        if Train[col].nunique() < threshold:
            low_cardinality_cols.append(col)
        else:
            high_cardinality_cols.append(col)
    return low_cardinality_cols, high_cardinality_cols

In [8]:
# Numeric feature names: every training column whose dtype is not ``object``.
is_non_object = train_df.dtypes != 'object'
num_cols = is_non_object.index[is_non_object].tolist()

In [9]:
# Categorical columns in the training data
object_cols = [col for col in train_df.columns if train_df[col].dtype == "object"]

# Columns that can be safely ordinal encoded: every value found in the test
# column also occurs in the matching training column.
# NOTE(review): missing values are part of these sets, so a column may be
# rejected only because test_df has NaN where train_df has none — confirm
# whether that is intended before relying on the dropped list.
good_label_cols = [col for col in object_cols if
                   set(test_df[col]).issubset(set(train_df[col]))]

# Problematic columns that will be dropped from the dataset.
# Built with a comprehension instead of a set difference so the order follows
# object_cols and is identical on every run (set iteration order of strings
# can change between interpreter sessions).
good_label_lookup = set(good_label_cols)
bad_label_cols = [col for col in object_cols if col not in good_label_lookup]

print('Categorical columns that will be ordinal encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)

Categorical columns that will be ordinal encoded: ['Street', 'Alley', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleCondition']

Categorical columns that will be dropped from the dataset: ['Functional', 'Exterior2nd', 'Utilities', 'SaleType', 'Exterior1st', 'KitchenQual', 'MSZoning']


### `bad_label_cols` are the features whose test data contains values that do not appear in the training data for the same feature

In [10]:
# From here on only the retained categorical columns count as object columns.
object_cols = good_label_cols

# Remove the rejected categorical columns from both feature frames.
# NOTE: re-running this cell alone fails, because the columns are already gone.
test_df = test_df.drop(columns=bad_label_cols)
train_df = train_df.drop(columns=bad_label_cols)

In [11]:
# Partition the retained categorical columns by cardinality and show both
# groups: low-cardinality first, high-cardinality second.
low_cardinality_cols, high_cardinality_cols = LowcardiCol(Train=train_df, object_cols=object_cols)
print(low_cardinality_cols, high_cardinality_cols, sep='\n')

['Street', 'Alley', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleCondition']
['Neighborhood']


# Train/validation split

In [12]:
from sklearn.model_selection import train_test_split
# Hold out a quarter of the training rows for validation. 0.25 is the
# train_test_split default, spelled out here; the seed keeps the split repeatable.
train_X, val_X, train_y, val_y = train_test_split(
    train_df,
    Y_train,
    test_size=0.25,
    random_state=1,
)

# Preprocessing

In [13]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
def preprocess(low_cardinality_cols,num_cols,high_cardinality_cols):
    """Build the column-wise preprocessing step used by every model pipeline.

    Parameters
    ----------
    low_cardinality_cols : list of str
        Categorical columns to impute (most frequent value) and one-hot encode.
    num_cols : list of str
        Numeric columns to impute with the column mean.
    high_cardinality_cols : list of str
        Categorical columns to impute (most frequent value) and ordinal encode.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        Unfitted transformer with three branches named ``num``, ``cat`` and
        ``ord``. Columns not listed in any branch are dropped (the
        ColumnTransformer default).

    Notes
    -----
    The ordinal branch maps categories that were not seen during ``fit`` to
    ``-1`` instead of raising, mirroring ``handle_unknown='ignore'`` on the
    one-hot branch. This needs scikit-learn >= 0.24.
    """
    numerical_transformer = SimpleImputer(strategy='mean')

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    categorical_High_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # Without handle_unknown, a category that only shows up in the
        # validation or test rows would make transform() raise ValueError.
        ('ordinal', OrdinalEncoder(categories="auto",
                                   handle_unknown='use_encoded_value',
                                   unknown_value=-1)),
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numerical_transformer, num_cols),
        ('cat', categorical_transformer, low_cardinality_cols),
        ('ord', categorical_High_transformer, high_cardinality_cols),
    ])
    return preprocessor
def Pipe(preprocessor,model):
    """Chain preprocessing, max-abs scaling and an estimator into one Pipeline.

    Parameters
    ----------
    preprocessor : transformer
        First step, registered under the name ``'preprocessor'``.
    model : estimator
        Final step, registered under the name ``'model'``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline: ``preprocessor`` -> ``MaxAbsScaler`` (step name
        ``'scaler_2'``) -> ``model``.
    """
    steps = [
        ('preprocessor', preprocessor),
        ('scaler_2', preprocessing.MaxAbsScaler()),
        ('model', model),
    ]
    return Pipeline(steps=steps)

In [14]:
# Build the preprocessing transformer from the numeric, low-cardinality and
# high-cardinality column lists; the same object is passed to every Pipe() call below.
preprocessor = preprocess(low_cardinality_cols,num_cols,high_cardinality_cols)

# Test

In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
# Fit a random forest on the training split and score it on the held-out
# validation split.
rf_model = RandomForestRegressor(random_state=1)
pipe = Pipe(preprocessor, rf_model)
rf_val_predictions = pipe.fit(train_X, train_y).predict(val_X)
# Arguments follow the (y_true, y_pred) convention; MAE is symmetric, so the
# value is the same either way round.
rf_val_mae = mean_absolute_error(val_y, rf_val_predictions)

print("Validation MAE for Random Forest Model: {:,.0f}".format(rf_val_mae))

Validation MAE for Random Forest Model: 16,548


## Random Forest

In [16]:
# Refit the random forest on the full training set and predict the test set.
rf_model = RandomForestRegressor(random_state=1)
pipe = Pipe(preprocessor,rf_model)
pipe.fit(train_df, Y_train)
rf_val_predictions = pipe.predict(test_df)

# test_y_df was read from sample_submission.csv. That file holds a placeholder
# submission, not the true test prices, so the figure below only measures how
# far these predictions are from that placeholder. It is NOT a validation
# score; use the hold-out MAE from the train/validation split for that.
rf_val_mae = mean_absolute_error(test_y_df.SalePrice, rf_val_predictions)

print("MAE of Random Forest predictions vs. sample_submission.csv (not a validation score): {:,.0f}".format(rf_val_mae))

Validation MAE for Random Forest Model: 54,156


## XGB

In [17]:
from xgboost import XGBRegressor
# Fit gradient-boosted trees on the full training set and predict the test set.
model = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=4, random_state=42)
pipe = Pipe(preprocessor,model)
pipe.fit(train_df, Y_train)
XG_val_predictions = pipe.predict(test_df)

# test_y_df was read from sample_submission.csv. That file holds a placeholder
# submission, not the true test prices, so this is a distance from the
# placeholder rather than a validation score.
XG_val_mae = mean_absolute_error(test_y_df.SalePrice, XG_val_predictions)

# The label previously said "Random Forest Model" (copy-paste from the cell above).
print("MAE of XGBoost predictions vs. sample_submission.csv (not a validation score): {:,.0f}".format(XG_val_mae))

Validation MAE for Random Forest Model: 55,077


# Generate a submission

Run the code cell below to generate a CSV file with your predictions that you can use to submit to the competition.

In [18]:
# Save the XGBoost predictions in the competition's submission format
# (columns Id, SalePrice), then echo the table.
# Ids are taken from test_y_df because test_df no longer has its Id column.
output = test_y_df[['Id']].assign(SalePrice=XG_val_predictions)
output.to_csv('submission.csv', index=False)
print(output)

        Id      SalePrice
0     1461  132300.296875
1     1462  155203.625000
2     1463  184575.218750
3     1464  187259.984375
4     1465  196484.625000
...    ...            ...
1454  2915   82267.523438
1455  2916   78725.773438
1456  2917  168850.703125
1457  2918  116010.953125
1458  2919  214290.062500

[1459 rows x 2 columns]
