# Housing Prices Competition

#### What we used:
* "Decision Tree Regression" as model
* "MAE" as metric
* "train_test_split" to trunk data to train/validate/test parts
* "LabelEncoder" to decode object features to digit

In [1]:
pip install sklearn

Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  python setup.py egg_info did not run successfully.
  exit code: 1
  
  [15 lines of output]
  The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
  rather than 'sklearn' for pip commands.
  
  Here is how to fix this error in the main use cases:
  - use 'pip install scikit-learn' rather than 'pip install sklearn'
  - replace 'sklearn' by 'scikit-learn' in your pip requirements files
    (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
  - if the 'sklearn' package is used by one of your dependencies,
    it would be great if you take some time to track which package uses
    'sklearn' instead of 'scikit-learn' and report it to their issue tracker
  - as a last resort, set the environment variable
    SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True to avoid this error
  
  More information is available at
  https://github.com/scikit-learn/sklearn-pypi-package
  [end of output]
  
  note: This error originates from a subpr

In [2]:
# The main module for manipulating with the data:
import pandas as pd

In [3]:
# The main familiar using model:
from sklearn.tree import DecisionTreeRegressor
# To encode categories parameters:
from sklearn.preprocessing import LabelEncoder
# The main metric for model:
from sklearn.metrics import mean_absolute_error
# The method for trunking data to train and validate parts:
from sklearn.model_selection import train_test_split

In [4]:
# The paths of data:
test_path = "../test_data.csv"
train_path = "../train_data.csv"

# Clone data to dataframe:
test_dataframe = pd.read_csv(test_path)
train_dataframe = pd.read_csv(train_path)

# Check columns:
train_dataframe.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [5]:
# Take important features (a first view)
# we remove all subjective parameters by type qualities:
features = [
            "Heating", "LotArea", "1stFlrSF", "PoolArea", "2ndFlrSF", "FullBath", 
            "Utilities", "YearBuilt", "CentralAir", "GarageArea", "Fireplaces", 
            "Foundation", "Exterior1st", "OpenPorchSF", "BedroomAbvGr", "KitchenAbvGr", 
            "TotRmsAbvGrd", "EnclosedPorch"
           ]

# Take target feature:
Y = train_dataframe.SalePrice

# Take samples with necessary data columns:
train_data = train_dataframe[features]

# Check that we don't have null rows:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Heating        1460 non-null   object
 1   LotArea        1460 non-null   int64 
 2   1stFlrSF       1460 non-null   int64 
 3   PoolArea       1460 non-null   int64 
 4   2ndFlrSF       1460 non-null   int64 
 5   FullBath       1460 non-null   int64 
 6   Utilities      1460 non-null   object
 7   YearBuilt      1460 non-null   int64 
 8   CentralAir     1460 non-null   object
 9   GarageArea     1460 non-null   int64 
 10  Fireplaces     1460 non-null   int64 
 11  Foundation     1460 non-null   object
 12  Exterior1st    1460 non-null   object
 13  OpenPorchSF    1460 non-null   int64 
 14  BedroomAbvGr   1460 non-null   int64 
 15  KitchenAbvGr   1460 non-null   int64 
 16  TotRmsAbvGrd   1460 non-null   int64 
 17  EnclosedPorch  1460 non-null   int64 
dtypes: int64(13), object(5)
memo

In [6]:
# Check all values of categorical columns:
train_data.Utilities.unique()

array(['AllPub', 'NoSeWa'], dtype=object)

In [7]:
train_data.CentralAir.unique()

array(['Y', 'N'], dtype=object)

In [8]:
train_data.Exterior1st.unique()

array(['VinylSd', 'MetalSd', 'Wd Sdng', 'HdBoard', 'BrkFace', 'WdShing',
       'CemntBd', 'Plywood', 'AsbShng', 'Stucco', 'BrkComm', 'AsphShn',
       'Stone', 'ImStucc', 'CBlock'], dtype=object)

In [9]:
train_data.Foundation.unique()

array(['PConc', 'CBlock', 'BrkTil', 'Wood', 'Slab', 'Stone'], dtype=object)

In [10]:
train_data.Heating.unique()

array(['GasA', 'GasW', 'Grav', 'Wall', 'OthW', 'Floor'], dtype=object)

In [11]:
# Encoder other parameters (don't use dummies because for model will be better use label encoder):
label_encoder = LabelEncoder()
train_data[[
            "Heating", "Utilities", 
            "Foundation", "CentralAir",
            "Exterior1st"
          ]] =\
train_data[[                            
            "Heating", "Utilities",
            "Foundation", "CentralAir",
            "Exterior1st"
          ]].apply(lambda x: label_encoder.fit_transform(x))

# Check final data:
train_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data[[


Unnamed: 0,Heating,LotArea,1stFlrSF,PoolArea,2ndFlrSF,FullBath,Utilities,YearBuilt,CentralAir,GarageArea,Fireplaces,Foundation,Exterior1st,OpenPorchSF,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,EnclosedPorch
0,1,8450,856,0,854,2,0,2003,1,548,0,2,12,61,3,1,8,0
1,1,9600,1262,0,0,2,0,1976,1,460,1,1,8,0,3,1,6,0
2,1,11250,920,0,866,2,0,2001,1,608,1,2,12,42,3,1,6,0
3,1,9550,961,0,756,1,0,1915,1,642,1,0,13,35,3,1,7,272
4,1,14260,1145,0,1053,2,0,2000,1,836,1,2,12,84,4,1,9,0


In [12]:
# Split data for fitting and validating:
train_X, validate_X, train_Y, validate_Y = train_test_split(train_data, Y, test_size=0.2, random_state=1)

In [13]:
def get_depth(depth: int):
    dt_model = DecisionTreeRegressor(
                                        random_state=1, 
                                        max_depth=depth, 
                                        criterion="absolute_error"
                                    )
    
    dt_model.fit(train_X, train_Y)
    
    predictions = dt_model.predict(validate_X)
    
    return mean_absolute_error(predictions, validate_Y)

def get_max_leaf_nodes(depth: int, max_leaf_nodes: int):
    dt_model = DecisionTreeRegressor(
                                        random_state=1, 
                                        max_depth=depth, 
                                        criterion="absolute_error", 
                                        max_leaf_nodes=max_leaf_nodes
                                    )
    
    dt_model.fit(train_X, train_Y)
    
    predictions = dt_model.predict(validate_X)
    
    return mean_absolute_error(predictions, validate_Y)

# To get the best max depth and max leaf counter:
def get_the_best_tree_parameters():
    depth_dict = {depth: get_depth(depth) for depth in range(10, 300)}
    best_depth = min(depth_dict, key=depth_dict.get)
    leaf_dict = {leaf: get_max_leaf_nodes(leaf, best_depth) for leaf in range(3, 50)}
    
    return (best_depth, min(leaf_dict, key=leaf_dict.get))

In [14]:
# The best params from training data:
best_parameters = get_the_best_tree_parameters()

# Check them:
print(best_parameters)

(22, 5)


In [15]:
# We found the best values of parameters for decision tree, then we can use for training all samples:
final_training_samples = pd.concat([train_X, validate_X], axis=0)
final_training_targets = pd.concat([train_Y, validate_Y], axis=0)

# Check dataframe of samples for example:
final_training_samples

Unnamed: 0,Heating,LotArea,1stFlrSF,PoolArea,2ndFlrSF,FullBath,Utilities,YearBuilt,CentralAir,GarageArea,Fireplaces,Foundation,Exterior1st,OpenPorchSF,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,EnclosedPorch
921,1,8777,1272,0,928,2,0,1900,1,0,0,1,8,70,4,2,9,0
520,1,10800,694,0,600,2,0,1900,0,0,0,0,8,114,3,2,7,210
401,1,8767,1310,0,0,2,0,2005,1,400,1,2,12,0,3,1,6,0
280,1,11287,1175,0,807,2,0,1989,1,575,1,1,9,84,3,1,7,0
1401,1,7415,864,0,729,2,0,2004,1,398,1,2,12,75,3,1,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,1,5500,882,0,0,1,0,1956,1,0,0,1,8,0,1,1,4,0
47,1,11096,1656,0,0,2,0,2006,1,826,0,2,12,146,3,1,7,0
1432,1,10800,968,0,0,2,0,1927,1,216,0,0,13,0,4,1,5,0
98,1,10625,835,0,0,1,0,1920,0,366,0,0,13,0,2,1,5,77


In [16]:
# Create the best model for our data:
dt_model = DecisionTreeRegressor(
                                    random_state=1, 
                                    criterion="absolute_error", 
                                    max_depth=best_parameters[0], 
                                    max_leaf_nodes=best_parameters[1]
                                )

# Make the final fitting:
dt_model.fit(final_training_samples, final_training_targets)

# Add Id for future submission:
features.append("Id")

# Prepare test data:
test_samples = test_dataframe[features]

# Check that we don't have null values:
test_samples.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Heating        1459 non-null   object 
 1   LotArea        1459 non-null   int64  
 2   1stFlrSF       1459 non-null   int64  
 3   PoolArea       1459 non-null   int64  
 4   2ndFlrSF       1459 non-null   int64  
 5   FullBath       1459 non-null   int64  
 6   Utilities      1457 non-null   object 
 7   YearBuilt      1459 non-null   int64  
 8   CentralAir     1459 non-null   object 
 9   GarageArea     1458 non-null   float64
 10  Fireplaces     1459 non-null   int64  
 11  Foundation     1459 non-null   object 
 12  Exterior1st    1458 non-null   object 
 13  OpenPorchSF    1459 non-null   int64  
 14  BedroomAbvGr   1459 non-null   int64  
 15  KitchenAbvGr   1459 non-null   int64  
 16  TotRmsAbvGrd   1459 non-null   int64  
 17  EnclosedPorch  1459 non-null   int64  
 18  Id      

In [17]:
# Calculate mean value for null cells:
mean_price = int((test_samples["GarageArea"].mean()))

# Check the getting value:
print(mean_price)

472


In [18]:
# Fill null cell by mean value of parameter (we have only 1 string):
test_samples.GarageArea.fillna(mean_price, inplace=True)

# Prepare data:
test_samples[[
                "Heating", 
                "Utilities", 
                "CentralAir", 
                "Foundation", 
                "Exterior1st" 
            ]] = \
    test_samples[[
                    "Heating", 
                    "Utilities", 
                    "CentralAir", 
                    "Foundation", 
                    "Exterior1st"
                ]].apply(lambda x: label_encoder.fit_transform(x))

# Check result test data:
test_samples

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_samples.GarageArea.fillna(mean_price, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_samples[[


Unnamed: 0,Heating,LotArea,1stFlrSF,PoolArea,2ndFlrSF,FullBath,Utilities,YearBuilt,CentralAir,GarageArea,Fireplaces,Foundation,Exterior1st,OpenPorchSF,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,EnclosedPorch,Id
0,0,11622,896,0,0,1,0,1961,1,730.0,0,1,10,0,2,1,5,0,1461
1,0,14267,1329,0,0,1,0,1958,1,312.0,0,1,11,36,3,1,6,0,1462
2,0,13830,928,0,701,2,0,1997,1,482.0,1,2,10,34,3,1,6,0,1463
3,0,9978,926,0,678,2,0,1998,1,470.0,1,2,10,36,3,1,7,0,1464
4,0,5005,1280,0,0,2,0,1992,1,506.0,0,2,6,82,2,1,5,0,1465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,0,1936,546,0,546,1,0,1970,1,0.0,0,1,5,0,3,1,5,0,2915
1455,0,1894,546,0,546,1,0,1970,1,286.0,0,1,5,24,3,1,6,0,2916
1456,0,20000,1224,0,0,1,0,1960,1,576.0,1,1,10,0,4,1,7,0,2917
1457,0,10441,970,0,0,1,0,1992,1,0.0,0,2,6,32,3,1,6,0,2918


In [19]:
# Check filling of prepare data:
test_samples.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Heating        1459 non-null   int32  
 1   LotArea        1459 non-null   int64  
 2   1stFlrSF       1459 non-null   int64  
 3   PoolArea       1459 non-null   int64  
 4   2ndFlrSF       1459 non-null   int64  
 5   FullBath       1459 non-null   int64  
 6   Utilities      1459 non-null   int32  
 7   YearBuilt      1459 non-null   int64  
 8   CentralAir     1459 non-null   int32  
 9   GarageArea     1459 non-null   float64
 10  Fireplaces     1459 non-null   int64  
 11  Foundation     1459 non-null   int32  
 12  Exterior1st    1459 non-null   int32  
 13  OpenPorchSF    1459 non-null   int64  
 14  BedroomAbvGr   1459 non-null   int64  
 15  KitchenAbvGr   1459 non-null   int64  
 16  TotRmsAbvGrd   1459 non-null   int64  
 17  EnclosedPorch  1459 non-null   int64  
 18  Id      

In [20]:
# The submissions Ids:
submission_ids = test_samples.Id

# Drop Id column for fitting:
test_samples.drop(axis=1, columns=["Id"], inplace=True)

# Make the final prediction:
test_prediction = dt_model.predict(test_samples)

# Check the final price predictions:
test_prediction

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_samples.drop(axis=1, columns=["Id"], inplace=True)


array([125000., 125000., 194201., ..., 174500., 194201., 194201.])

In [21]:
# Create submission dataframe:
submisson_file = pd.DataFrame(
                                {
                                    "Id": submission_ids,
                                    "SalePrice": test_prediction
                                }
                             )

# Check the result:
submisson_file

Unnamed: 0,Id,SalePrice
0,1461,125000.0
1,1462,125000.0
2,1463,194201.0
3,1464,194201.0
4,1465,194201.0
...,...,...
1454,2915,125000.0
1455,2916,125000.0
1456,2917,174500.0
1457,2918,194201.0


In [22]:
# Create a submission file:
submisson_file.to_csv("submission.csv", index=False,)

SyntaxError: positional argument follows keyword argument (3541215040.py, line 2)