# Introduction
**This will be your workspace for Kaggle's Machine Learning education track.**

You will build and continually improve a model to predict housing prices as you work through each tutorial.  Fork this notebook and write your code in it.

The data from the tutorial, the Melbourne data, is not available in this workspace.  You will need to translate the concepts to work with the data in this notebook, the Iowa data.

Come to the [Learn Discussion](https://www.kaggle.com/learn-forum) forum for any questions or comments. 

# Write Your Code Below



In [None]:
import pandas as pd

main_file_path = '../input/train.csv'
data = pd.read_csv(main_file_path)
print('hello world')

In [None]:
print(data.describe())

In [None]:
data.info()

In [None]:
data.head(5)

In [None]:
print(data.columns)

In [None]:
data.PoolArea.head(5)

In [None]:
mycolumns = ['Id','SalePrice']
data[mycolumns].describe()

In [None]:
data[mycolumns].head()

In [None]:
y = data['SalePrice']

In [None]:
# Predictor columns for the first models. Each feature is listed exactly once:
# a repeated name would make data[pred] select the same column twice.
pred = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']

In [None]:
X = data[pred]

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Define model
my_model = DecisionTreeRegressor()

# Fit model
my_model.fit(X,y)


In [None]:
print("Making predictions for the following 5 houses:")
print(X.head())
print("The predictions are")
print(my_model.predict(X.head()))

In [None]:
from sklearn.metrics import mean_absolute_error
# Predict on the same rows (X) that my_model was fitted on, so the value below
# is an in-sample error, not an estimate of accuracy on unseen houses.
my_pred = my_model.predict(X)
#print(my_pred)
# Last expression of the cell: the notebook displays the MAE as the cell output.
mean_absolute_error(y,my_pred)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out a validation split; the fixed random_state keeps the split repeatable.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Build the tree and train it on the training portion only.
my_model = DecisionTreeRegressor().fit(train_X, train_y)

# Score the predicted prices for the held-out validation rows.
val_predictions = my_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))


In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mea(max_leaf_nodes, predictors_train, predictors_val, targ_train, targ_val):
    """Return the validation mean absolute error of a size-limited decision tree.

    A DecisionTreeRegressor capped at ``max_leaf_nodes`` leaves (random_state=0)
    is fitted on ``predictors_train``/``targ_train``; its predictions for
    ``predictors_val`` are then compared against ``targ_val``.
    """
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    tree.fit(predictors_train, targ_train)
    return mean_absolute_error(targ_val, tree.predict(predictors_val))
    
    



In [None]:
# compare MAE with differing values of max_leaf_nodes
# NOTE(review): a later cell in this notebook defines another get_mea(X, y); once that
# cell has run, re-running this loop would call the two-argument version and fail.
for max_leaf_nodes in [10,100,1000,5000]:
    my_mea = get_mea(max_leaf_nodes,train_X,val_X,train_y,val_y)
    # %d truncates the float MAE to a whole number for display.
    print("Max Leaf Nodes: %d \t\t  Mean Absolute Error: %d " %(max_leaf_nodes,my_mea))


In [None]:
##Random Forests

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

forest_model = RandomForestRegressor()
forest_model.fit(train_X,train_y)
iowa_pred = forest_model.predict(val_X)
print(mean_absolute_error(val_y, iowa_pred))

# improvement found in 'Random Forest' over 'Decision Tree' 

In [None]:
##Submitting From A Kernel

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Read the Data
train = pd.read_csv('../input/train.csv')

# pull data into target (y) and predictors (X)
train_y = train.SalePrice
predictor_columns = ['LotArea', 'OverallQual', 'YearBuilt', 'TotRmsAbvGrd']

# Create training predictors data
train_X = train[predictor_columns]

my_model = RandomForestRegressor()
my_model.fit(train_X,train_y)

In [None]:
# Read the test data
test = pd.read_csv('../input/test.csv')
# Treat the test data in the same way as training data. In this case, pull same columns.
test_X = test[predictor_columns]
# Use the model to make predictions
predicted_price = my_model.predict(test_X)
# We will look at the predicted prices to ensure we have something sensible.
print(predicted_price)


In [None]:
#Prepare Submission File
my_submission = pd.DataFrame({'Id': test.Id, 'SalePrice': predicted_price})
# File Name: MD1stSubmission.csv
my_submission.to_csv('MD1stSubmission.csv', index=False)




In [None]:
##Handling Missing Values
print(data.isnull().sum())


In [None]:
#Handling Missing Values


In [None]:
main_file_path = '../input/train.csv'
iowa_data = pd.read_csv(main_file_path)


In [None]:
iowa_data.info()

In [None]:
iowa_data.describe()

In [None]:
iowa_data.head(5)

In [None]:
print(iowa_data.isnull().sum())

In [None]:
iowa_target = iowa_data.SalePrice
iowa_predictors = iowa_data.drop(['SalePrice'],axis=1)
# For the sake of keeping the example simple, we'll use only numeric predictors.
iowa_numeric_predictors = iowa_predictors.select_dtypes(exclude=['object'])

In [None]:
#from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer 
my_imputer = SimpleImputer()
data_with_imputed_values  = my_imputer.fit_transform(iowa_numeric_predictors)

In [None]:
# fit_transform returns a plain array, so wrap the result in a DataFrame again
# and restore the column labels of the original numeric predictors.
data_with_imputed_values = pd.DataFrame(
    my_imputer.fit_transform(iowa_numeric_predictors),
    columns=iowa_numeric_predictors.columns,
)
data_with_imputed_values

In [None]:
# pull data into target (y) and predictors (X)
y_n = iowa_target
new_predictor_columns = ['LotArea', 'OverallQual', 'YearBuilt', 'TotRmsAbvGrd','LotFrontage']

# Create training predictors data
X_n = data_with_imputed_values[new_predictor_columns]

train_X_n, test_X_n, train_y_n, test_y_n = train_test_split(X_n,y_n,train_size=0.7,test_size=0.3,random_state = 0)

# fit, predict, mea in same def
def score_dataset(train_X_n, test_X_n, train_y_n, test_y_n):
    """Fit a default RandomForestRegressor and return its mean absolute error.

    The forest is trained on ``train_X_n``/``train_y_n``; its predictions for
    ``test_X_n`` are scored against ``test_y_n``.
    """
    forest = RandomForestRegressor()
    forest.fit(train_X_n, train_y_n)
    return mean_absolute_error(test_y_n, forest.predict(test_X_n))

#my_iowa_model = RandomForestRegressor()
#my_iowa_model.fit(train_X_n,train_y_n)
# Use the model to make predictions
#predicted_price_n = my_iowa_model.predict(test_X_n)
# We will look at the predicted prices to ensure we have something sensible.
#print(predicted_price_n)


In [None]:
# This cell scores the plainly imputed predictors. The "track what was imputed"
# variant is scored in its own cell further down, so the label must not claim it.
print("Mean Absolute Error from Imputation:")
print(score_dataset(train_X_n, test_X_n, train_y_n, test_y_n))

In [None]:
#Get Model Score from Dropping Columns with Missing Values

# Columns of the training split that contain at least one missing value.
# NOTE(review): train_X_n is split from data_with_imputed_values, which has already
# been run through the imputer, so this list is presumably empty here - confirm.
cols_with_missing = [col for col in train_X_n.columns 
                                 if train_X_n[col].isnull().any()]
# Drop the same columns from both splits so their feature sets stay aligned.
reduced_train_X_n = train_X_n.drop(cols_with_missing,axis=1) 
reduced_test_X_n = test_X_n.drop(cols_with_missing,axis=1) 
print("Mean Absolute Error from dropping columns with Missing Values:")
# Last expression of the cell: the notebook displays the score as the cell output.
score_dataset(reduced_train_X_n,reduced_test_X_n,train_y_n,test_y_n)


In [None]:
#Get Score from Imputation with Extra Columns Showing What Was Imputed

# Work on copies so the indicator columns do not leak into train_X_n / test_X_n.
imputed_train_X_plus = train_X_n.copy()
imputed_test_X_plus = test_X_n.copy()

# Columns of the training split that contain at least one missing value.
# NOTE(review): train_X_n comes from data_with_imputed_values, which is already
# imputed, so this list is presumably empty and the score below should match
# the plain-imputation score - confirm.
cols_with_missing = [col for col in train_X_n.columns
                     if train_X_n[col].isnull().any()]

# Add a boolean "<col>_was_missing" flag per affected column. The flags are
# written to the copies created above (the post-imputation arrays
# imputed_X_*_plus do not exist yet at this point).
for col in cols_with_missing:
    imputed_train_X_plus[col + '_was_missing'] = imputed_train_X_plus[col].isnull()
    imputed_test_X_plus[col + '_was_missing'] = imputed_test_X_plus[col].isnull()

# Imputation: fit on the training split only, then apply the same fill values
# to the test split.
my_imputer = SimpleImputer()
imputed_X_train_plus = my_imputer.fit_transform(imputed_train_X_plus)
imputed_X_test_plus = my_imputer.transform(imputed_test_X_plus)

print("Mean Absolute Error from Imputation while Track What Was Imputed:")
print(score_dataset(imputed_X_train_plus,imputed_X_test_plus,train_y_n,test_y_n ))

In [None]:
##Using Categorical Data with One Hot Encoding
import pandas as pd
train_data = pd.read_csv('../input/train.csv')
test_data = pd.read_csv('../input/test.csv')

# Drop houses where the target is missing
train_data.dropna(subset=['SalePrice'],axis=0,inplace= True)

target = train_data.SalePrice

In [None]:
train_data.info()

In [None]:
test_data.info()

In [None]:
# Since missing values isn't the focus of this tutorial, we use the simplest
# possible approach, which drops these columns. 
# For more detail (and a better approach) to missing values, see
# https://www.kaggle.com/dansbecker/handling-missing-values
# Columns with at least one missing value, determined from the training data only.
cols_with_missing = [col for col in train_data.columns 
                                 if train_data[col].isnull().any()] 
# Remove the identifier, the target and the incomplete columns from the training predictors.
candidate_train_predictors = train_data.drop(['Id', 'SalePrice'] + cols_with_missing, axis=1)
# The test set drops the same columns; test-only missing values are not checked here.
candidate_test_predictors = test_data.drop(['Id'] + cols_with_missing, axis=1)

In [None]:
# "cardinality" means the number of unique values in a column.
# We use it as our only way to select categorical columns here. This is convenient, though
# a little arbitrary.
low_cardinality_cols = [cname for cname in candidate_train_predictors.columns if 
                                candidate_train_predictors[cname].nunique() < 10 and
                                candidate_train_predictors[cname].dtype == "object"]

numeric_cols = [cname for cname in candidate_train_predictors.columns if 
                                candidate_train_predictors[cname].dtype in ['int64', 'float64']]

my_cols = low_cardinality_cols + numeric_cols

train_predictors = candidate_train_predictors[my_cols]
test_predictors = candidate_test_predictors[my_cols]

In [None]:
train_predictors.dtypes.sample(10)

In [None]:
test_predictors.dtypes.sample(10)

In [None]:
one_hot_encoded_training_predictors = pd.get_dummies(train_predictors)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

def get_mea(X,y):
    """Return the cross-validated mean absolute error of a 50-tree random forest.

    cross_val_score is asked for 'neg_mean_absolute_error', which is the MAE
    with its sign flipped (sklearn scoring convention), so the mean fold score
    is multiplied by -1 to report a positive error.
    """
    fold_scores = cross_val_score(
        RandomForestRegressor(n_estimators=50),
        X,
        y,
        scoring='neg_mean_absolute_error',
    )
    return -1 * fold_scores.mean()

predictors_without_categoricals = train_predictors.select_dtypes(exclude=['object'])
mae_without_categoricals = get_mea(predictors_without_categoricals, target)

mae_one_hot_encoded = get_mea(one_hot_encoded_training_predictors, target)

In [None]:
# Report both cross-validated scores, truncated to whole numbers for display.
print('Mean Absolute Error when Dropping Categoricals: ' + str(int(mae_without_categoricals)))
print('Mean Absolute Error with One-Hot Encoding: ' + str(int(mae_one_hot_encoded)))

In [None]:
#Applying to Multiple Files

#The align command makes sure the columns show up in the same order in both datasets (it uses column names to identify which columns line up in each dataset.) 
#The argument join='left' specifies that we will do the equivalent of SQL's left join. That means, 
#if there are ever columns that show up in one dataset and not the other, we will keep exactly the columns from our training data. 
#The argument join='inner' would do what SQL databases call an inner join, keeping only the columns showing up in both datasets. That's also a sensible choice.

one_hot_encoded_training_predictors = pd.get_dummies(train_predictors)
one_hot_encoded_testing_predictors = pd.get_dummies(test_predictors)

final_train, final_test = one_hot_encoded_training_predictors.align(one_hot_encoded_testing_predictors, join = 'left', axis = 1)

In [None]:
#Learning to Use XGBoost

# Load the training data and keep only rows that have a target value.
xg_data = pd.read_csv('../input/train.csv')
xg_data.dropna(subset=['SalePrice'],axis=0,inplace=True)
y = xg_data.SalePrice
# Numeric predictors only (object-typed columns are excluded).
X = xg_data.drop(['SalePrice'] ,axis=1).select_dtypes(exclude=['object'])
#train_X, test_X, train_y, test_y = train_test_split(X.as_matrix(), y.as_matrix(), test_size=0.25)
#NOTE: .as_matrix is removed from future versions
xg_train_X, xg_test_X, xg_train_y, xg_test_y = train_test_split(X.values, y.values, test_size=0.25)

# Impute the split created above. Fitting on train_X / test_X would pick up
# unrelated frames left over from earlier cells, whose rows do not line up with
# xg_train_y / xg_test_y.
xg_imputer = SimpleImputer()
xg_train_X = xg_imputer.fit_transform(xg_train_X)
xg_test_X = xg_imputer.transform(xg_test_X)

In [None]:
#We build and fit a model just as we would in scikit-learn.
#from xgboost import XGBRegressor
#xg_model = XGBRegressor()
# Add silent=True to avoid printing out updates with each cycle
#xg_model.fit(xg_train_X, xg_train_y)


In [None]:
#We similarly evaluate a model and make predictions as we would do in scikit-learn.
# make predictions

#xg_predictions = xg_model.predict(xg_test_X)
#print('Mean Absolute Error :' + str(mean_absolute_error(xg_predictions, xg_test_y)))

In [None]:
#Model Tuning
#xg_model = XGBRegressor(n_estimators=1000)
#xg_model.fit(xg_train_X, xg_train_y,early_stopping_rounds = 5, eval_set=[(xg_test_X, xg_test_y)], verbose = False)

#xg_predictions = xg_model.predict(xg_test_X)
#print('Mean Absolute Error :' + str(mean_absolute_error(xg_predictions, xg_test_y)))

In [None]:
#learning_rate
#xg_model = XGBRegressor(n_estimators=1000,learning_rate=0.05)
#xg_model.fit(xg_train_X, xg_train_y,early_stopping_rounds = 5, eval_set=[(xg_test_X, xg_test_y)], verbose = False)

#xg_predictions = xg_model.predict(xg_test_X)
#print('Mean Absolute Error :' + str(mean_absolute_error(xg_predictions, xg_test_y)))

In [None]:
##Partial Dependence Plots
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.ensemble.partial_dependence import partial_dependence, plot_partial_dependence
from sklearn.impute import SimpleImputer 

cols_to_use = ['LotArea', 'OverallQual', 'YearBuilt']

def get_some_data():
    """Load the training CSV and return (imputed predictors, SalePrice target).

    Only the columns named in the module-level ``cols_to_use`` are kept; they
    are passed through a default SimpleImputer, and the fit_transform result is
    returned together with the SalePrice column.
    """
    frame = pd.read_csv('../input/train.csv')
    features = SimpleImputer().fit_transform(frame[cols_to_use])
    return features, frame.SalePrice

# Imputed predictors (columns in cols_to_use order) and the SalePrice target.
X, y = get_some_data() 
my_model = GradientBoostingRegressor()
my_model.fit(X,y)

# features=[0,2] are positions in cols_to_use, i.e. 'LotArea' and 'YearBuilt'.
# NOTE(review): plot_partial_dependence is imported from sklearn.ensemble.partial_dependence
# in this cell; presumably that module is not available in newer scikit-learn
# releases - confirm against the installed version.
my_plots = plot_partial_dependence(my_model,features=[0,2],X=X,feature_names = cols_to_use, grid_resolution=10)


In [None]:
#Pipelines

import pandas as pd
from sklearn.model_selection import train_test_split

# Read Data
data = pd.read_csv('../input/train.csv')
cols_to_use = ['LotArea', 'OverallQual', 'YearBuilt']
X = data[cols_to_use]
y = data.SalePrice

# train_test_split returns (X_train, X_test, y_train, y_test) in that order;
# unpacking in any other order silently swaps predictors and targets.
train_X, test_X, train_y, test_y = train_test_split(X, y)
    


In [None]:
#You have a modeling process that uses an Imputer to fill in missing values, followed by a RandomForestRegressor to make predictions. 
#These can be bundled together with the make_pipeline function as shown below.
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

my_pipeline = make_pipeline(SimpleImputer(), RandomForestRegressor())

#You can now fit and predict using this pipeline as a fused whole.
#my_pipeline.fit(train_X, train_y)
#predictions = my_pipeline.predict(test_X)




In [None]:
#For comparison, here is the code to do the same thing without pipelines

my_imputer = SimpleImputer()
my_model = RandomForestRegressor()

#imputed_train_X = my_imputer.fit_transform(train_X)
#imputed_test_X = my_imputer.transform(test_X)
#my_model.fit(imputed_train_X, train_y)
#predictions = my_model.predict(imputed_test_X)


In [None]:
#Cross-Validation

import pandas as pd
data = pd.read_csv('../input/train.csv')
cols_to_use = ['LotArea', 'OverallQual', 'YearBuilt']
X = data[cols_to_use]
y = data.SalePrice


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

my_pipeline = make_pipeline(SimpleImputer(),RandomForestRegressor())

from sklearn.model_selection import cross_val_score
scores = cross_val_score(my_pipeline,X,y,scoring='neg_mean_absolute_error')
print(scores)
print('Mean Absolute Error %2f' %(-1 * scores.mean()))


In [None]:
# Data Leakage: TODO - do it later on the Iowa dataset
# Good luck with your DS / ML learning
 