# 1 - Starting your ML project

In [1]:
import pandas as pd

# Path of the training CSV, relative to the notebook's working directory
main_file_path = '../input/train.csv'
# Load the whole training table into a DataFrame
data = pd.read_csv(filepath_or_buffer=main_file_path)

# First look at the data: print the summary statistics of the loaded table
summary_stats = data.describe()
print(summary_stats)

# 2 - Selecting and filtering in Pandas

In [2]:
# Print the column labels of the loaded table
column_labels = data.columns
print(column_labels)

# Select a single column with dot-notation, then print its first rows
# (the empty print() separates the two outputs with a blank line)
data_price = data.SalePrice
price_preview = data_price.head()
print()
print(price_preview)

In [3]:
# Select several columns at once by passing a list of column names
columns_of_interests = ['SalePrice', 'TotRmsAbvGrd']
selected_data_columns = data.loc[:, columns_of_interests]

# Summary statistics restricted to the selected columns
selected_summary = selected_data_columns.describe()
print(selected_summary)

# 3 - Your first Scikit-Learn Model

In [4]:
# Prediction target: the sale price column
y = data.SalePrice
# Feature columns the model is allowed to use
data_predictors = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
x = data[data_predictors]

# Define model
from sklearn.tree import DecisionTreeRegressor

# Pin random_state so the fitted tree is the same on every fresh run of the
# notebook: scikit-learn permutes the candidate features at each split, so an
# unseeded tree may break ties differently from one run to the next.
data_model = DecisionTreeRegressor(random_state=0)
# Fit the model on the full training table
data_model.fit(x, y)

# Show the first feature rows, followed by the model's predictions for them
print(x.head())
print(data_model.predict(x.head()))


# 4 - Model Validation

In [5]:
# Calculate the Mean Absolute Error (MAE)
from sklearn.metrics import mean_absolute_error

# The model is scored on x / y, the same data it was fitted on earlier, so
# this is an in-sample error (typically optimistic compared with unseen data).
predicted_home_prices = data_model.predict(x)
mean_absolute_error(y_true=y, y_pred=predicted_home_prices)

In [6]:
# Split data into training and validation data
from sklearn.model_selection import train_test_split

# random_state=0 fixes which rows end up in the training and validation parts
train_x, val_x, train_y, val_y = train_test_split(x, y, random_state=0)

# Define a model. The tree is seeded as well: with only the split seeded, the
# tree itself could still vary between runs and the printed MAE would not be
# reproducible. The seed matches the one used by get_mae and the random forest.
data_model = DecisionTreeRegressor(random_state=0)
# Fit the model on the training part only
data_model.fit(train_x, train_y)
# Get predicted prices on the held-out validation data and report their MAE
val_predictions = data_model.predict(val_x)
print(mean_absolute_error(val_y, val_predictions))

# 5 - Underfitting, Overfitting and Model Optimization


In [18]:
# Helper for comparing MAE scores across different values of max_leaf_nodes
def get_mae(max_leaf_nodes, predictors_train, predictors_val, targ_train, targ_val):
    """Return the validation MAE of a decision tree limited to ``max_leaf_nodes`` leaves.

    A new DecisionTreeRegressor (random_state=0) is fitted on
    ``predictors_train`` / ``targ_train`` and used to predict
    ``predictors_val``; the mean absolute error of those predictions against
    ``targ_val`` is returned.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(predictors_train, targ_train)
    return mean_absolute_error(targ_val, tree.predict(predictors_val))

# Use get_mae to check for underfitting and overfitting: sweep the tree size
# from very small to very large and compare the validation MAE at each size.
candidate_leaf_counts = [5, 50, 500, 5000]
for max_leaf in candidate_leaf_counts:
    mae = get_mae(
        max_leaf_nodes=max_leaf,
        predictors_train=train_x,
        predictors_val=val_x,
        targ_train=train_y,
        targ_val=val_y,
    )
    print("Max leaf nodes: ", max_leaf, "\t\t Mean Absolute Error: ", mae)

# 6 - Random forests

In [26]:
# Try a random forest on the same train/validation split
from sklearn.ensemble import RandomForestRegressor

# Depth-limited forest with a fixed seed, fitted on the training part only
forest_model = RandomForestRegressor(random_state=0, max_depth=8)
forest_model.fit(X=train_x, y=train_y)

# Validation MAE of the forest's predictions on the held-out rows
model_prediction = forest_model.predict(val_x)
forest_mae = mean_absolute_error(val_y, model_prediction)
print(forest_mae)

# 7 - Submitting from a Kernel

In [33]:
# Read the test data and keep the same feature columns the model was fitted on
test = pd.read_csv('../input/test.csv')
test_x = test.loc[:, data_predictors]
# Predict a sale price for every test row with the random forest
predicted_prices = forest_model.predict(test_x)

print(predicted_prices)

# Write the submission file: one row per test Id with its predicted SalePrice
submission_columns = {'Id': test['Id'], 'SalePrice': predicted_prices}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)