# Introduction
**This will be your workspace for Kaggle's Machine Learning education track.**

You will build and continually improve a model to predict housing prices as you work through each tutorial.  Fork this notebook and write your code in it.

The data from the tutorial, the Melbourne data, is not available in this workspace.  You will need to translate the concepts to work with the data in this notebook, the Iowa data.

Visit the [Learn Discussion](https://www.kaggle.com/learn-forum) forum with any questions or comments.

# Write Your Code Below



In [None]:
import pandas as pd

# Load the house-price training data.
main_file_path = '../input/train.csv'
data = pd.read_csv(main_file_path)

# Summary statistics of train.csv.
print(data.describe())

# Select a single column and summarize it.  The summary is printed
# explicitly: as a bare expression in the middle of the script its result
# was computed and then silently discarded.
house_style_data = data.HouseStyle
print(house_style_data.describe())

# Show the first few values of the HouseStyle column.
print(house_style_data.head())

# Select several columns at once and summarize them (printed for the same
# reason as above).
columns_of_interest = ['RoofStyle', 'Street']
two_columns_of_data = data[columns_of_interest]
print(two_columns_of_data.describe())

# Prediction target (sale price) and the predictor columns fed to the models.
y = data.SalePrice
house_predictors = [
    'LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF',
    'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd',
]
X = data[house_predictors]
from sklearn.tree import DecisionTreeRegressor

# Build a decision tree and fit it on every available row.
data_model = DecisionTreeRegressor()
data_model.fit(X, y)

# Sanity check: predict the first 5 rows, which the model was also fitted on.
print("Making predictions for the first 5 houses", X.head(), sep="\n")
print("The Predictions are:", data_model.predict(X.head()), sep="\n")

from sklearn.metrics import mean_absolute_error

# In-sample mean absolute error: the tree is scored on the same rows it was
# fitted on.  The value is printed because a bare expression here would be
# evaluated and then thrown away.
predicted_home_prices = data_model.predict(X)
print(mean_absolute_error(y, predicted_home_prices))

from sklearn.model_selection import train_test_split
# Hold out part of the data for validation; the seed is fixed at 0.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Fit a fresh tree on the training portion only, then score it on the
# held-out rows.
h_model = DecisionTreeRegressor()
h_model.fit(train_X, train_y)
val_predictions = h_model.predict(val_X)
print(mean_absolute_error(y_true=val_y, y_pred=val_predictions))

#overfitting and underfitting of data
def get_mae(max_leaf_nodes, predictors_train, predictors_val, targ_train, targ_val):
    """Return the validation MAE of a tree limited to *max_leaf_nodes* leaves.

    A ``DecisionTreeRegressor`` (``random_state=0``) is fitted on
    ``predictors_train`` / ``targ_train``; its predictions for
    ``predictors_val`` are scored against ``targ_val`` with
    ``mean_absolute_error``.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(predictors_train, targ_train)
    return mean_absolute_error(targ_val, tree.predict(predictors_val))

# Sweep the leaf-count limit and report the validation error for each setting.
for max_leaf_nodes in (5, 50, 500, 5000):
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max leaf nodes:%d \t\t Mean absolute Error:%d" % (max_leaf_nodes, my_mae))

#calculate the mean absolute error using a RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
# Fit a random forest on the training split and score it on the validation
# split.
forest_model = RandomForestRegressor()
forest_model.fit(train_X, train_y)
h_preds = forest_model.predict(val_X)
print(mean_absolute_error(y_true=val_y, y_pred=h_preds))

# Predict sale prices for the rows of test.csv with the same fitted forest,
# using the same predictor columns as in training.
test = pd.read_csv('../input/test.csv')
test_X = test[house_predictors]
predict_prices = forest_model.predict(test_X)
print(predict_prices)

#create file for submission
#submission = pd.DataFrame({'Id':test.Id,'SalePrice': predict_prices})
#submission.to_csv('submission.csv',index=False)

# Count the missing (null) values in every column.
print(data.isnull().sum())

# Separate the target from the remaining candidate predictor columns.
d_target = data['SalePrice']
d_predictors = data.drop(['SalePrice'], axis=1)

# Keep only the non-object columns, then make a seeded 70/30 split.
d_numeric_predictors = d_predictors.select_dtypes(exclude=['object'])
X_train, X_test, y_train, y_test = train_test_split(
    d_numeric_predictors,
    d_target,
    train_size=0.7,
    test_size=0.3,
    random_state=0,
)
def score_dataset(X_train, X_test, y_train, y_test):
    """Return the MAE of a random forest fitted on one split, scored on another.

    A ``RandomForestRegressor`` with default arguments is fitted on
    ``X_train`` / ``y_train``; its predictions for ``X_test`` are compared
    with ``y_test`` using ``mean_absolute_error``.
    """
    forest = RandomForestRegressor()
    forest.fit(X_train, y_train)
    return mean_absolute_error(y_test, forest.predict(X_test))
# Approach 1: drop every column that has a missing value in the training
# split, from both splits, and score what is left.
cols_with_missing = [
    name for name in X_train.columns
    if X_train[name].isnull().any()
]
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_test = X_test.drop(cols_with_missing, axis=1)
print("Mean Absolute Error from dropping columns with Missing Values:")
print(score_dataset(reduced_X_train, reduced_X_test, y_train, y_test))


# ``sklearn.preprocessing.Imputer`` was removed in scikit-learn 0.22 in favour
# of ``sklearn.impute.SimpleImputer``.  Prefer the new class and fall back to
# the old one on older installs; the name ``Imputer`` stays bound either way
# so later code that calls ``Imputer()`` keeps working.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

# Approach 2: fill in missing values with the imputer's default strategy.
my_imputer = Imputer()
imputed_X_train = my_imputer.fit_transform(X_train)
# Only transform the test split: the fill values must come from the training
# split.  Refitting on the test split (fit_transform) would impute it with its
# own statistics, which is inconsistent with how the training data was filled.
imputed_X_test = my_imputer.transform(X_test)
print("Mean absolute error from Imputation:")
print(score_dataset(imputed_X_train, imputed_X_test, y_train, y_test))
      
# Approach 3: impute, but first record which values were originally missing.
# Work on copies so the original splits are left untouched.
imputed_X_train_plus = X_train.copy()
imputed_X_test_plus = X_test.copy()

# Built as a list rather than a one-shot generator: a generator would be
# exhausted by the loop below, so any later reuse of the name would silently
# iterate over nothing.
cols_with_missing = [col for col in X_train.columns
                     if X_train[col].isnull().any()]
for col in cols_with_missing:
    # Add a boolean indicator column next to each column that had gaps.
    imputed_X_train_plus[col + '_was_missing'] = imputed_X_train_plus[col].isnull()
    imputed_X_test_plus[col + '_was_missing'] = imputed_X_test_plus[col].isnull()

# Imputation to handle missing data: fit on the training split only, then
# apply the same fitted imputer to the test split.
my_imputer = Imputer()
imputed_X_train_plus = my_imputer.fit_transform(imputed_X_train_plus)
imputed_X_test_plus = my_imputer.transform(imputed_X_test_plus)

print("Mean Absolute Error from Imputation while Track What Was Imputed:")
print(score_dataset(imputed_X_train_plus, imputed_X_test_plus, y_train, y_test))

# Categorical-data experiment: reload the raw train and test files.
train_data = pd.read_csv('../input/train.csv')
test_data = pd.read_csv('../input/test.csv')

# Rows with no SalePrice are dropped before the target is extracted.
train_data.dropna(axis=0, subset=['SalePrice'], inplace=True)
target = train_data.SalePrice

# Drop the identifier, the target and every column that has a missing value
# in the training data; the same column list is removed from the test data.
cols_with_missing = [
    name for name in train_data.columns
    if train_data[name].isnull().any()
]
candidate_train_predictors = train_data.drop(
    ['Id', 'SalePrice'] + cols_with_missing, axis=1
)
candidate_test_predictors = test_data.drop(['Id'] + cols_with_missing, axis=1)

# Object-typed columns with fewer than 10 distinct values.
low_cardinality_cols = [
    name for name in candidate_train_predictors.columns
    if candidate_train_predictors[name].dtype == "object"
    and candidate_train_predictors[name].nunique() < 10
]
# Columns whose dtype is int64 or float64.
numeric_cols = [
    name for name in candidate_train_predictors.columns
    if candidate_train_predictors[name].dtype in ['int64', 'float64']
]

# Restrict both frames to the selected columns.
my_cols = low_cardinality_cols + numeric_cols
train_predictors = candidate_train_predictors[my_cols]
test_predictors = candidate_test_predictors[my_cols]

# Random sample of 10 column dtypes; the result is neither stored nor printed.
train_predictors.dtypes.sample(10)

# One-hot encode the training predictors.
one_hot_encoded_training_predictors = pd.get_dummies(train_predictors)

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

def get_mae(X, y):
    """Return the mean cross-validated MAE of a random forest on ``(X, y)``.

    ``cross_val_score`` is run with ``scoring='neg_mean_absolute_error'``,
    which yields negative values, so the mean is negated to give a positive
    error.

    NOTE: this definition rebinds the module-level name ``get_mae``, which
    is also used earlier in the file for a five-argument decision-tree
    helper.
    """
    forest = RandomForestRegressor(n_estimators=50)
    fold_scores = cross_val_score(
        forest, X, y, scoring='neg_mean_absolute_error'
    )
    return -fold_scores.mean()

# Compare cross-validated error with the object-typed columns dropped against
# the one-hot encoded version of the same predictors.
predictors_without_categoricals = train_predictors.select_dtypes(exclude=['object'])

mae_without_categoricals = get_mae(predictors_without_categoricals, target)

mae_one_hot_encoded = get_mae(one_hot_encoded_training_predictors, target)

# Both values are truncated to whole numbers for display.  The second label
# previously misspelled "Absolute" as "Abslute".
print('Mean Absolute Error when Dropping Categoricals: ' + str(int(mae_without_categoricals)))
print('Mean Absolute Error with One-Hot Encoding: ' + str(int(mae_one_hot_encoded)))

# One-hot encode the train and test predictors separately, then align the two
# frames column-wise (axis=1) with a left join from the training frame.
one_hot_encoded_training_predictors = pd.get_dummies(train_predictors)
one_hot_encoded_test_predictors = pd.get_dummies(test_predictors)
final_train, final_test = one_hot_encoded_training_predictors.align(
    one_hot_encoded_test_predictors,
    join='left',
    axis=1,
)

