In [18]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold, cross_validate, RandomizedSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer

PATH = '../in/train.csv'

In [19]:
df = pd.read_csv(PATH)
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [20]:
columns_object = [column for column in df.columns if df[column].dtypes == 'object']
columns_numbers = [column for column in df.columns if df[column].dtypes in ['float64', 'int64']]
columns_numbers.pop(-1)

'SalePrice'

In [21]:
imputer = SimpleImputer(strategy="mean")

X = df[columns_numbers]
y = df['SalePrice']

X_ = pd.DataFrame(imputer.fit_transform(X))

X_.columns = X.columns

X_train, X_test, y_train, y_test = train_test_split(X_, y, test_size=0.25, random_state=7)

### Modelo on-hot

In [22]:
model = RandomForestRegressor(n_estimators=10, criterion="absolute_error", random_state=0)
model.fit(X_train, y_train)
pred = model.predict(X_test)
print(mean_absolute_error(pred, y_test))

19398.425479452057


#### Cross-Validation

In [23]:
model = RandomForestRegressor(n_estimators=10, criterion="absolute_error", random_state=0)
cross_val = cross_validate(model, X_, y, cv=5)

In [24]:
cross_val

{'fit_time': array([1.76173735, 1.7309134 , 1.72945952, 1.73175836, 1.75110173]),
 'score_time': array([0.0065403 , 0.00499725, 0.00460315, 0.00699806, 0.00599885]),
 'test_score': array([0.84471842, 0.85611228, 0.85583717, 0.87102596, 0.80122391])}

#### Cross validation score

In [25]:
model = RandomForestRegressor(n_estimators=10, max_depth=10, criterion='squared_error')
cross_score = cross_val_score(model, X_, y, cv=5, scoring="neg_mean_absolute_error")
cross_score

array([-19815.60994582, -18722.85993727, -19884.29204076, -19298.15251038,
       -21324.93815156])

In [26]:
baseline = RandomForestRegressor()

parameters = {
    'n_estimators': range(5,11),
    'criterion': ['squared_error','absolute_error'],
    'max_depth': range(5,11)
}

rank_estimator = RandomizedSearchCV(baseline, parameters, n_iter=10, cv=5, scoring='neg_mean_absolute_error')
rank_estimator.fit(X_, y)


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(),
                   param_distributions={'criterion': ['squared_error',
                                                      'absolute_error'],
                                        'max_depth': range(5, 11),
                                        'n_estimators': range(5, 11)},
                   scoring='neg_mean_absolute_error')

In [27]:
rank_estimator.best_params_

{'n_estimators': 9, 'max_depth': 10, 'criterion': 'absolute_error'}

In [28]:
pred_example = pd.read_csv('../in/sample_submission.csv')

In [29]:
test = pd.read_csv('../in/test.csv')

test = test[columns_numbers]
test_imp = pd.DataFrame(imputer.transform(test), columns=columns_numbers)

In [30]:
predict = pd.DataFrame(rank_estimator.predict(test_imp), columns=['SalePrice'])
predict['Id'] = index=pred_example.Id
predict = predict.set_index('Id')
predict.to_csv('../out/baseline.csv')
predict.head()

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,125066.666667
1462,147472.222222
1463,170472.222222
1464,177116.666667
1465,184833.333333
