In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
%matplotlib inline
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [21]:
# Read the train and test CSV files.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Keep the test ids in `Id`, then remove that column from the test frame.
Id = test['Id']
test = test.drop(columns='Id')

# SalePrice becomes y_train; the remaining columns (minus Id) become X_train.
y_train, X_train = train['SalePrice'], train.drop(columns=['Id', 'SalePrice'])

# Number of rows in the training file.
train_num = train.shape[0]
train.shape

(1460, 81)

In [22]:
# Shape of the test frame. Only the last expression of a cell is rendered,
# so this is the single value the cell displays.
test.shape

(1459, 79)

In [23]:
# For each frame, flag the columns that contain at least one missing value and
# print the describe() summary (count/unique/top/freq) of those flags.
for label, frame in (("Train: ", train), ("Test: ", test)):
    print(label, frame.isnull().any().describe())

# Stack the training features on top of the test features, renumbering the
# index from 0, so later cells can process both at once.
df = pd.concat([X_train, test], ignore_index=True, sort=False)
df.shape

Train:  count        81
unique        2
top       False
freq         62
dtype: object
Test:  count        79
unique        2
top       False
freq         46
dtype: object


(2919, 79)

In [24]:
# Split the columns of the combined frame into numeric and non-numeric
# features, preserving column order.
#
# The test is "is this dtype numeric?" rather than "is this dtype exactly
# object?". With the latter, any other non-numeric dtype (for example pandas'
# dedicated string dtype or `category`) would be put in `num_features` and
# zero-filled later as if it were a number.
obj_features = []
num_features = []
for feature, dtype in df.dtypes.items():
    bucket = num_features if pd.api.types.is_numeric_dtype(dtype) else obj_features
    bucket.append(feature)
print(f'''Object Features: {obj_features}
Numeric Features: {num_features}''')

Object Features: ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
Numeric Features: ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', '

In [37]:
# Find incomplete features: every column with at least one missing value,
# in column order.
# `.any()` also catches a column that is missing in every row; counting the
# distinct null flags and requiring more than one would skip such a column.
has_missing = df.isnull().any()
na_features = has_missing[has_missing].index.tolist()
print(na_features)

['MSZoning', 'LotFrontage', 'Alley', 'Utilities', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Electrical', 'BsmtFullBath', 'BsmtHalfBath', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType']


In [None]:
# TODO: unfinished draft -- a loop over `na_features` was started here but never
# given a body, which made this cell a SyntaxError. The split of `na_features`
# into object and numeric columns is done in the next cell.

In [48]:
# Partition the incomplete features: names that also appear in `obj_features`
# go to `obj_na_features`, all remaining names go to `num_na_features`.
obj_na_features = [name for name in na_features if name in obj_features]
num_na_features = [name for name in na_features if name not in obj_features]

In [49]:
# Display the incomplete features that are also listed in `obj_features`.
obj_na_features

['MSZoning',
 'Alley',
 'Utilities',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType']

In [51]:
# Numeric features: fill missing values with 0, all columns in one assignment.
# NOTE(review): this also writes 0 into columns such as GarageYrBlt and
# LotFrontage, where 0 is not an obviously meaningful value -- confirm that
# zero-filling is intended for every column in `num_na_features`.
df[num_na_features] = df[num_na_features].fillna(0)

In [52]:
# Summary statistics of the numeric columns listed in `num_na_features`.
df[num_na_features].describe()

Unnamed: 0,LotFrontage,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,BsmtFullBath,BsmtHalfBath,GarageYrBlt,GarageCars,GarageArea
count,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0
mean,57.766701,101.396026,441.272011,49.565262,560.579993,1051.417266,0.429599,0.061322,1870.364166,1.766016,472.712573
std,33.481636,178.854579,455.606014,169.179104,439.590889,441.120498,0.524676,0.245608,449.686321,0.762196,215.535686
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,43.0,0.0,0.0,0.0,220.0,793.0,0.0,0.0,1957.0,1.0,320.0
50%,63.0,0.0,368.0,0.0,467.0,989.0,0.0,0.0,1977.0,2.0,480.0
75%,78.0,163.5,733.0,0.0,805.0,1302.0,1.0,0.0,2001.0,2.0,576.0
max,313.0,1600.0,5644.0,1526.0,2336.0,6110.0,3.0,2.0,2207.0,5.0,1488.0


In [46]:
# Wrap the incomplete feature names in a Series to use the vectorised string
# methods, then display a boolean mask that is True where a name starts with
# 'Garage'.
na_f = pd.Series(na_features)
na_f.str.startswith('Garage')

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23     True
24     True
25     True
26     True
27     True
28     True
29     True
30    False
31    False
32    False
33    False
dtype: bool

In [44]:
# Check whether the incomplete feature at position 25 starts with 'Garage'.
# re.match takes (pattern, string) in that order; with the arguments reversed
# the feature name is used as the pattern and the call never matches.
# re.match only matches at the start of the string, so no '^' is needed.
a = 'Garage'
b = a + 'Finish'
re.match(a, na_features[25])

In [7]:
# Summary (count / unique / top / freq) of the PoolQC column.
df['PoolQC'].describe()

count     10
unique     3
top       Gd
freq       4
Name: PoolQC, dtype: object

In [8]:
# desc = pd.read_csv('./data_description.txt', sep='\t', header =None)

In [9]:
# Distinct PoolQC values, including the missing-value marker.
# `unique()` collapses all missing entries into a single one before the set is
# built; set() applied directly to the Series hashes every row and only merges
# NaN entries that happen to be the same object.
set(df['PoolQC'].unique())

{'Ex', 'Fa', 'Gd', nan}