# Introduction to Machine Learning


In [7]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor


# Paths of the files to read
iowa_file_path = 'train.csv'
X_test_full = pd.read_csv('test.csv', index_col='Id')

home_data = pd.read_csv(iowa_file_path)
# Create target object and call it y
y = home_data.SalePrice
# Create X from a fixed subset of columns
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = home_data[features]

# Split into validation and training data (seeded so the split is reproducible)
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Specify Model
iowa_model = DecisionTreeRegressor(random_state=1)
# Fit Model
iowa_model.fit(train_X, train_y)

# Make validation predictions and calculate mean absolute error.
# scikit-learn metrics expect (y_true, y_pred) in that order; keeping the order
# consistent with the other cells avoids surprises with non-symmetric metrics.
val_predictions = iowa_model.predict(val_X)
val_mae = mean_absolute_error(val_y, val_predictions)
print("Validation MAE: {:,.0f}".format(val_mae))

Validation MAE: 29,653


In [8]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a decision tree limited to ``max_leaf_nodes`` leaves.

    A new ``DecisionTreeRegressor`` (``random_state=0``) is fitted on
    ``train_X``/``train_y``; its predictions for ``val_X`` are then scored
    against ``val_y`` with ``mean_absolute_error``.
    """
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

# Tree sizes (maximum number of leaf nodes) to compare
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]

# Validation MAE of each candidate, keyed by its max_leaf_nodes value
data = {}
for max_leaf_nodes in candidate_max_leaf_nodes:
    mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    data.update({max_leaf_nodes: mae})

# (lowest MAE, tree size that achieved it)
smallest_val = min((score, size) for size, score in data.items())

# Best value of max_leaf_nodes (one of 5, 25, 50, 100, 250 or 500)
best_tree_size = smallest_val[-1]

In [9]:
# Build the final tree with the best size found by the search above
final_model = DecisionTreeRegressor(random_state=0, max_leaf_nodes=best_tree_size)

# Train on all of X / y (not just the training split) now that the size is fixed
final_model.fit(X, y)

# Keep only the columns the model was trained on
X_test = X_test_full.loc[:, features].copy()

tree_pred = final_model.predict(X_test)

# Assemble the submission table: one predicted SalePrice per test Id
submission = pd.DataFrame(data={'Id': X_test.index, 'SalePrice': tree_pred})
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,125362.573333
1,1462,157249.181818
2,1463,182392.319444
3,1464,182392.319444
4,1465,190290.7125


In [10]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyperparameters; random_state fixed at 1
rf_model = RandomForestRegressor(random_state=1)

# Train on the training split only
rf_model.fit(X=train_X, y=train_y)

# Score the forest on the held-out validation split
val_pred = rf_model.predict(val_X)
rf_val_mae = mean_absolute_error(y_true=val_y, y_pred=val_pred)

print("Validation MAE for Random Forest Model: {}".format(rf_val_mae))

Validation MAE for Random Forest Model: 21857.15912981083


# Intermediate Machine Learning

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the training and test sets, using the Id column as the index
X_full = pd.read_csv('train.csv', index_col='Id')
X_test_full = pd.read_csv('test.csv', index_col='Id')

# Target column
y = X_full['SalePrice']

# Predictor columns, selected identically for train and test
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = X_full.loc[:, features].copy()
X_test = X_test_full.loc[:, features].copy()

# Hold out 20% of the rows for validation (seeded for reproducibility)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

X_train.head()

Unnamed: 0_level_0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
619,11694,2007,1828,0,2,3,9
871,6600,1962,894,0,1,2,5
93,13360,1921,964,0,1,2,5
818,13265,2002,1689,0,2,3,7
303,13704,2001,1541,0,2,3,6


###  Determine the best model to use

In [24]:
from sklearn.ensemble import RandomForestRegressor

# Candidate random forests; all share random_state=0 and differ only in the
# hyperparameters spelled out below.
model_1 = RandomForestRegressor(random_state=0, n_estimators=50)
model_2 = RandomForestRegressor(random_state=0, n_estimators=100)
# NOTE(review): newer scikit-learn releases renamed criterion='mae' to
# 'absolute_error' — confirm against the installed version before re-running.
model_3 = RandomForestRegressor(random_state=0, n_estimators=100, criterion='mae')
model_4 = RandomForestRegressor(random_state=0, n_estimators=200, min_samples_split=20)
model_5 = RandomForestRegressor(random_state=0, n_estimators=100, max_depth=7)

# Evaluation order; also the order in which scores are printed
models = [model_1, model_2, model_3, model_4, model_5]

from sklearn.metrics import mean_absolute_error

# Helper used to compare candidate models on the same split
def score_model(model, X_t=X_train, X_v=X_valid, y_t=y_train, y_v=y_valid):
    """Fit ``model`` on ``(X_t, y_t)`` and return its MAE on ``(X_v, y_v)``.

    The default arguments are bound when this function is defined, so they
    refer to the X_train / X_valid / y_train / y_valid objects that existed at
    that moment. ``model`` is fitted in place.
    """
    model.fit(X_t, y_t)
    y_hat = model.predict(X_v)
    return mean_absolute_error(y_true=y_v, y_pred=y_hat)

# Validation MAE of every candidate, keyed by the estimator object itself
model_mae = {}
for model in models:
    mae = score_model(model)
    model_mae[model] = mae
    print(f'{model}: {mae}')

# Get the model with the lowest MAE. Selecting on the score alone avoids the
# TypeError that min() over (mae, model) tuples raises when two scores tie and
# the estimator objects themselves would have to be compared; on a tie the
# model that was scored first wins.
best_model = min(model_mae, key=model_mae.get)
print(f'The best model is: {best_model}')

RandomForestRegressor(n_estimators=50, random_state=0): 24015.492818003917
RandomForestRegressor(random_state=0): 23740.979228636657
RandomForestRegressor(criterion='mae', random_state=0): 23528.78421232877
RandomForestRegressor(min_samples_split=20, n_estimators=200, random_state=0): 23996.676789668687
RandomForestRegressor(max_depth=7, random_state=0): 23706.672864217904
The best model is: RandomForestRegressor(criterion='mae', random_state=0)


In [23]:
# Refit the model chosen by the comparison cell on all of X / y before
# predicting on the test set.
best_model.fit(X, y)

pred = best_model.predict(X_test)

# Assemble the submission table: one predicted SalePrice per test Id
submission = pd.DataFrame(data={'Id': X_test.index, 'SalePrice': pred})
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,119433.08
1,1462,158367.5
2,1463,185351.21
3,1464,178343.12
4,1465,192898.29


## Handling Missing Values

Most machine learning models will raise an error if you try to build them using a dataset that contains missing values.
It is therefore important to take care of missing values before building a model.

There are various approaches that can be used to take care of missing values, including:
* Drop columns with missing values
* Impute missing values using one of the available strategies (for example, filling with the column mean, median, or most frequent value).

### Determine columns with missing values

In [32]:
# Names of the columns that contain at least one missing value
cols_with_missing = X_full.columns[X_full.isna().any()].tolist()

# Number of missing entries in each of those columns
X_full.loc[:, cols_with_missing].isna().sum()

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64