In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Feature columns the model is trained on
features = [
    "LotArea",
    "YearBuilt",
    "1stFlrSF",
    "2ndFlrSF",
    "FullBath",
    "BedroomAbvGr",
    "TotRmsAbvGrd",
]

# Read the training CSV: SalePrice is the target, the listed columns are the inputs
home_data = pd.read_csv('data/train.csv')
y = home_data["SalePrice"]
X = home_data[features]

# Preview the first rows of the feature matrix
print(X.head())

# Hold out part of the rows for validation (seeded so the split is repeatable)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=1)

# Fit a seeded random forest on the training split, then score it on the held-out rows
rf_model = RandomForestRegressor(random_state=1).fit(X_train, y_train)
rf_val_predictions = rf_model.predict(X_valid)
rf_val_mae = mean_absolute_error(y_true=y_valid, y_pred=rf_val_predictions)
print("Validation MAE for Random Forest Model: {:,.0f}".format(rf_val_mae))


   LotArea  YearBuilt  1stFlrSF  2ndFlrSF  FullBath  BedroomAbvGr  \
0     8450       2003       856       854         2             3   
1     9600       1976      1262         0         2             3   
2    11250       2001       920       866         2             3   
3     9550       1915       961       756         1             3   
4    14260       2000      1145      1053         2             4   

   TotRmsAbvGrd  
0             8  
1             6  
2             6  
3             7  
4             9  
Validation MAE for Random Forest Model: 21,857


## Train a model for the competition

In [2]:
# Build a second seeded forest and fit it on all of X and y,
# not just the training split used for validation.
rf_model_full = RandomForestRegressor(random_state=1)
# Left as the cell's last expression so the fitted estimator is displayed.
rf_model_full.fit(X=X, y=y)


RandomForestRegressor(random_state=1)

In [3]:
# Load the test CSV and keep the same feature columns the model was fitted on
test_data = pd.read_csv('data/test.csv')
test_X = test_data[features]

# Predict on the test features with the model fitted on the full training data
test_predictions = rf_model_full.predict(X=test_X)
