In [213]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error as MAE
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_selection import SelectKBest

In [225]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the data
X = pd.read_csv('train.csv', index_col='Id') 
X_test = pd.read_csv('test.csv', index_col='Id')

# Remove rows with missing target, separate target from predictors
X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.SalePrice
X.drop(['SalePrice'], axis=1, inplace=True)


X.columns


Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

In [215]:
X_test.shape

(1459, 79)

### 1. Take Care of Missing Values

In [235]:
## In this case, I will replace missing values with Median of corresponding values
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler


scalar =StandardScaler()

in_median = SimpleImputer(strategy = 'median')

numerical_cols = [col for col in X.columns if X[col].dtype != "object"]

X_numerical = X[numerical_cols]
X_test_numerical = X_test[numerical_cols]


imputed_X = pd.DataFrame(in_median.fit_transform(X_numerical))
imputed_X_test = pd.DataFrame(in_median.transform(X_test_numerical))



imputed_X.columns = X[numerical_cols].columns
imputed_X_test.columns = X_test[numerical_cols].columns

imputed_X.columns
#imputed_X = pd.DataFrame(in_median.fit_transform()



Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')

In [250]:
#Applying one-hot encoding
object_cols = [col for col in X.columns if X[col].dtype == "object"]

# Columns that can be safely label encoded
good_label_cols = [col for col in object_cols if 
                   set(X[col]) == set(X_test[col])]


low_cardinality_cols = [col for col in good_label_cols if X[col].nunique() < 10]

X_low_cardinality = X[low_cardinality_cols]
X_test_low_cardinality= X_test[low_cardinality_cols]

nan_columns = [col for col in low_cardinality_cols if X_low_cardinality[col].isna().any().sum() > 0]


X_low_cardinality.drop(columns=nan_columns, axis = 1, inplace=True)
X_test_low_cardinality.drop(columns=nan_columns, axis = 1, inplace=True)

oh_encoder = OneHotEncoder(handle_unknown='ignore', sparse = False)

OH_cols_X = pd.DataFrame(oh_encoder.fit_transform(X_low_cardinality))
OH_cols_Xtest = pd.DataFrame(oh_encoder.transform(X_test_low_cardinality))

OH_cols_X.index = X_low_cardinality.index
OH_cols_Xtest.index = X_test_low_cardinality.index
                                                  
                                                  
cate_dropped_columns_X = X.drop(object_cols, axis = 1)
cate_dropped_columns_X_test = X_test.drop(object_cols, axis = 1)

cate_dropped_columns_X.columns = X[numerical_cols].columns
cate_dropped_columns_X_test.columns = X[numerical_cols].columns

OH_X = pd.concat([cate_dropped_columns_X, OH_cols_X], axis = 1)
OH_X_test = pd.concat([cate_dropped_columns_X_test, OH_cols_Xtest], axis = 1)



listt = list(OH_X.columns)
for i in listt:
    print(i)
#Concat numerical and categorical columns

final_X = pd.concat([imputed_X, OH_X], axis=1)
final_X_test = pd.concat([imputed_X_test, OH_X_test], axis=1)




MSSubClass
LotFrontage
LotArea
OverallQual
OverallCond
YearBuilt
YearRemodAdd
MasVnrArea
BsmtFinSF1
BsmtFinSF2
BsmtUnfSF
TotalBsmtSF
1stFlrSF
2ndFlrSF
LowQualFinSF
GrLivArea
BsmtFullBath
BsmtHalfBath
FullBath
HalfBath
BedroomAbvGr
KitchenAbvGr
TotRmsAbvGrd
Fireplaces
GarageYrBlt
GarageCars
GarageArea
WoodDeckSF
OpenPorchSF
EnclosedPorch
3SsnPorch
ScreenPorch
PoolArea
MiscVal
MoSold
YrSold
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [264]:
final_X = final_X.iloc[:, 0:1460]
# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(final_X, y,
                                                      train_size=0.8, test_size=0.2,
                                                      random_state=0)

ValueError: Found input variables with inconsistent numbers of samples: [1461, 1460]

In [258]:
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(X_train, X_valid)

ValueError: could not convert string to float: 'Normal'

### Getting the columns with categorical variable

In [84]:
s = (home_data.dtypes == 'object')
categorical_var = list(s[s].index)

In [62]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

## Dropping the columns with categorical data

In [85]:
drop_X_train = X_train.select_dtypes(exclude= ['object'])
drop_X_valid = X_valid.select_dtypes(exclude= ['object'])

### Checking for the NaN values

In [80]:
missing_values_byColumn = drop_X_train.isnull().sum()
missing_values_byColumn[missing_values_byColumn>0]

##Getting the name of the columns that includes a missing value

col_name = [col for col in drop_X_train.columns if drop_X_train[col].isnull().any()]
col_name

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

### Checking the effectiveness of the result

In [86]:
print("MAE from Approach 1 (Drop categorical variables):")
print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))

MAE from Approach 1 (Drop categorical variables):
17837.82570776256


### Normalization

In [65]:
from sklearn.preprocessing import StandardScaler

#features = ['LotArea','MiscVal','2ndFlrSF','PoolArea','BsmtFinSF2','GrLivArea','TotalBsmtSF', 'GarageCars']

X = home_data[features]
X.isna().sum()|

SyntaxError: invalid syntax (<ipython-input-65-e3bc7a0cd50b>, line 5)

In [35]:
sc = StandardScaler()
sc.fit(X)
X = sc.transform(X)

In [37]:
#Getting the best max_leaf_nodes
def get_accuracy(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = RandomForestRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    acc = MAE(val_y, preds_val)
    
    return acc

    
max_leaf = [x for x in range(50, 250)]

 
best_accuracy = {}  
for i in range(len(max_leaf)):
    best_accuracy[i] = get_accuracy(max_leaf[i], train_X,val_X, train_y, val_y)

In [39]:
#Getting best n_estimators
def get_accuracy(n_estimators, train_X, val_X, train_y, val_y):
    model = RandomForestRegressor(n_estimators=n_estimators, max_depth=4, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    acc = MAE(val_y, preds_val)
    return acc

best_estimator = [x for x in range(50, 250)]
 
best_accuracy_n_estimator = {}  
for i in range(len(max_leaf)):
    best_accuracy_n_estimator[i] = get_accuracy(best_estimator[i], train_X,val_X, train_y, val_y)

In [38]:
#Best max_leaf
min_key = min(best_accuracy, key=best_accuracy.get)
print(min_key)

4


In [40]:
#Best n_estimator
min_key = min(best_accuracy_n_estimator, key=best_accuracy_n_estimator.get)
print(min_key)

68


In [41]:
rf_model_on_full_data = RandomForestRegressor(n_estimators=68, max_leaf_nodes=4)
rf_model_on_full_data.fit(train_X, train_y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=4,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=68, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [50]:
test_data = pd.read_csv("test.csv")
test_X = test_data[features]
print(test_X.isna().sum())
test_X.fillna(0, inplace=True)


sc = StandardScaler()
sc.fit(test_X)
test_X = sc.transform(test_X)

#dummies = pd.get_dummies(test_X.Neighborhood)
#df_dummies = pd.concat([test_X, dummies], axis = 'columns')
#df_dummies.drop(['Neighborhood'], axis='columns', inplace=True)
#test_X = df_dummies


test_preds = rf_model_on_full_data.predict(test_X)
output = pd.DataFrame({'Id': test_data.Id,
                       'SalePrice': test_preds})
output.to_csv('submission3.csv', index=False)




LotArea        0
MiscVal        0
2ndFlrSF       0
PoolArea       0
BsmtFinSF2     1
GrLivArea      0
TotalBsmtSF    1
GarageCars     1
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)
