In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the labelled training data, the unlabelled test data, and the sample
# submission. Both feature files use their 'Id' column as the row index.
# NOTE(review): these are absolute paths on one Windows machine; the notebook
# will not run elsewhere until they point at a local copy of the data.
train_data = pd.read_csv(
    r"C:\Users\David gathara marigi\Downloads\train (2).csv",
    index_col='Id',
)
test_data = pd.read_csv(
    r"C:\Users\David gathara marigi\Downloads\test (2).csv",
    index_col='Id',
)
# NOTE(review): submission_csv is not referenced by any later cell shown here.
submission_csv = pd.read_csv(
    r"C:\Users\David gathara marigi\Downloads\sample_submission.csv"
)

In [3]:
# Discard rows whose target is missing, then split the target (y) away from
# the predictors. Reassignment is used instead of in-place mutation; the
# resulting train_data (predictors only) and y are the same as before.
train_data = train_data.dropna(axis=0, subset=['SalePrice'])
y = train_data['SalePrice']
train_data = train_data.drop(columns=['SalePrice'])

In [4]:
# Keep things simple: model on the non-object (numerical) columns only,
# applying the same dtype filter to the training predictors and the test set.
excluded_dtypes = ['object']
X = train_data.select_dtypes(exclude=excluded_dtypes)
X_test = test_data.select_dtypes(exclude=excluded_dtypes)

In [5]:
# Hold out 33% of the rows as a validation set; the fixed random_state keeps
# the split reproducible between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [6]:
# Dimensions of the training split as (num_rows, num_columns)
print(X_train.shape)

# Per-column count of missing values, showing only the columns that have any
missing_val_count_by_column = X_train.isnull().sum()
has_missing = missing_val_count_by_column > 0
print(missing_val_count_by_column[has_missing])

(978, 36)
LotFrontage    183
MasVnrArea       2
GarageYrBlt     53
dtype: int64


In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [8]:
def score_dataset(X_train, X_valid, y_train, y_valid, n_estimators=100, random_state=0):
    """Fit a random forest on the training split and return its validation MAE.

    Used to compare different ways of handling missing values: each approach
    produces its own (X_train, X_valid) pair and is scored with the same model
    settings.

    Parameters
    ----------
    X_train, y_train
        Features and target the model is fitted on.
    X_valid, y_valid
        Held-out features and target used for scoring.
    n_estimators : int, default 100
        Number of trees in the forest. The default matches the value that was
        previously hard-coded, so existing calls behave the same.
    random_state : int, default 0
        Seed passed to the forest so repeated calls give the same score.

    Returns
    -------
    The mean absolute error between ``y_valid`` and the model's predictions
    for ``X_valid``.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [9]:
# Names of the columns that contain at least one missing value in the
# training split
has_missing_value = X_train.isnull().any()
cols_with_missing = X_train.columns[has_missing_value].tolist()

# Remove those columns from both the training and the validation features
reduced_X_train = X_train.drop(columns=cols_with_missing)
reduced_X_valid = X_valid.drop(columns=cols_with_missing)

In [10]:
# Score the "drop columns with missing values" approach
print("MAE (Drop columns with missing values):")
mae_drop_columns = score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid)
print(mae_drop_columns)

MAE (Drop columns with missing values):
17364.16206085754


In [11]:
from sklearn.impute import SimpleImputer

In [12]:
# Imputation with SimpleImputer's default strategy (column mean).
# The imputer is fitted on the training split only and then applied to the
# validation split, so no validation statistics leak into the fill values.
my_imputer = SimpleImputer()

# The imputer returns a bare array, which loses both the column names and the
# 'Id' row index. Restore both in the constructor so the imputed frames stay
# aligned with y_train / y_valid.
imputed_X_train = pd.DataFrame(
    my_imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
imputed_X_valid = pd.DataFrame(
    my_imputer.transform(X_valid),
    columns=X_valid.columns,
    index=X_valid.index,
)

In [13]:
# Score the imputation approach
print("MAE (Imputation):")
mae_imputation = score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid)
print(mae_imputation)

MAE (Imputation):
17463.143395574


In [14]:
# Preprocessed training and validation features: fill missing values with the
# per-column median, fitted on the training split only.
final_imputer = SimpleImputer(strategy='median')

# The imputer returns a bare array, which loses both the column names and the
# 'Id' row index. Restore both in the constructor so the frames stay aligned
# with y_train / y_valid.
final_X_train = pd.DataFrame(
    final_imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
final_X_valid = pd.DataFrame(
    final_imputer.transform(X_valid),
    columns=X_valid.columns,
    index=X_valid.index,
)

In [15]:
# Train the final random forest on the preprocessed training features
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(final_X_train, y_train)

# Predict on the held-out validation features and report the MAE
preds_valid = model.predict(final_X_valid)
print("MAE (Your approach):")
mae_valid = mean_absolute_error(y_valid, preds_valid)
print(mae_valid)

MAE (Your approach):
17574.97359612725


In [16]:
# Preprocess test data with the imputer already fitted on the training split.
# The imputer returns a bare array, so restore the column names and the 'Id'
# row index; this keeps each prediction traceable to its test row.
final_X_test = pd.DataFrame(
    final_imputer.transform(X_test),
    columns=X_train.columns,
    index=X_test.index,
)

# Get test predictions
preds_test = model.predict(final_X_test)
print(preds_test)

[128546.5  155599.5  191534.39 ... 162603.   112061.   229971.86]


In [17]:
# Pair each test row's Id with its predicted SalePrice and write the result
# to submission.csv without the DataFrame's own index.
submission_columns = {
    'Id': X_test.index,
    'SalePrice': preds_test,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)