*  *PERSONAL PROJECT - MACHINE LEARNING*
* **My work to predict housing prices in the state of Iowa**.
Each section is dedicated to practicing a certain aspect of machine learning and is named after what is practiced in it. All sections are divided into two chapters. CHAPTER ONE deals with everything essential and basic for machine learning. CHAPTER TWO tries to get better predictions by using more complex approaches and tools.
In the end, the final section of each chapter summarizes all the work completed in that chapter and predicts the final prices.


In [None]:
#CHAPTER 1: MACHINE LEARNING BASICS
#section 1: import pandas and data
import pandas as pd

main_file_path = '../input/train.csv' # training data for Iowa state information
data = pd.read_csv(main_file_path)

# Summary statistics of the loaded training data.
print(data.describe())


#section 2: Choosing a column
#one column
# NOTE(review): the label printed below says 'section 3' although this is
# section 2 of the chapter -- confirm which numbering is intended.
print('section 3')
print(data.columns)
dataNeighborhood=data.Neighborhood
print(dataNeighborhood.head())
#two columns
chosencolumns=['SalePrice','MoSold']
datacolumnschosen=data[chosencolumns]
# A bare expression in the middle of a script/cell is evaluated and then
# discarded, so the summary has to be printed explicitly to be seen.
print(datacolumnschosen.describe())

#section 3
# Prediction target: the sale price column.
y = data['SalePrice']
# Predictors: the feature columns handed to the model.
datapredictors = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
X = data[datapredictors]

# Define a decision tree and fit it on the complete training data.
from sklearn.tree import DecisionTreeRegressor
iowa_model = DecisionTreeRegressor()
iowa_model.fit(X, y)

# Compare the model's output with the real prices for the first rows.
# These rows were part of the data the model was fitted on.
print("Making predictions for the following 5 houses:")
print(X.head())
print("The predictions are")
print(iowa_model.predict(X.head()))
print('actual price')
print(y.head())

#section 4
#Model Validation
# Hold out part of the rows as validation data, for both predictors and
# target. train_test_split shuffles with a random number generator; passing
# a fixed random_state makes the split identical on every run.
print ('Section 5')
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Fit a fresh tree on the training part only.
iowa_model = DecisionTreeRegressor()
iowa_model.fit(train_X, train_y)

# Score the predictions on rows the model has not seen.
val_predictions = iowa_model.predict(val_X)
print (mean_absolute_error(val_y, val_predictions))


#Section 5
#Underfitting, Overfitting and Model Optimization

from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, predictors_train, predictors_val, targ_train, targ_val):
    """Return the validation MAE of a decision tree with a leaf-count cap.

    A DecisionTreeRegressor limited to ``max_leaf_nodes`` leaves (with
    ``random_state=0``) is fitted on ``predictors_train``/``targ_train`` and
    scored with mean absolute error on ``predictors_val``/``targ_val``.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(predictors_train, targ_train)
    return mean_absolute_error(targ_val, tree.predict(predictors_val))

# Try several tree sizes and report the validation error of each one.
# The %d conversion prints the error truncated to a whole number.
leaf_count_candidates = (5, 50, 70, 100, 500, 5000)
for max_leaf_nodes in leaf_count_candidates:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    report_values = (max_leaf_nodes, my_mae)
    print("Max leaf nodes: %d \t\t Mean Absolute Error: %d" % report_values)
    
    
#section 6
# Random forest with default parameters, fitted on the training split and
# scored on the validation split from section 4.
print('Section 7: New Model, Random Forest')
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

iowaforest_model = RandomForestRegressor()
iowaforest_model.fit(train_X, train_y)

iowa_predictions = iowaforest_model.predict(val_X)
forest_mae = mean_absolute_error(val_y, iowa_predictions)
print(forest_mae)


#section 7
# Chapter 1 wrap-up: fit a random forest on ALL training rows, predict the
# prices for the test file and write the submission CSV.
# Chapter 2 below goes on to more advanced techniques.
print('Section 8: Submission and final file with test')

# Target and predictor columns (this rebinds train_X / train_y to the full data).
train_y = data['SalePrice']
datapredictors = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
train_X = data[datapredictors]

# Fit the final model.
iowafinalmodel = RandomForestRegressor()
iowafinalmodel.fit(train_X, train_y)

# Load the test data and take the same columns the model was fitted on.
test = pd.read_csv('../input/test.csv')
print(test.describe())
test_X = test[datapredictors]

# Predict and show the predicted prices.
finalpredicted_prices = iowafinalmodel.predict(test_X)
print(finalpredicted_prices)

# Build the submission table (one row per test Id) and save it without the index.
my_submission = pd.DataFrame({'Id': test['Id'], 'SalePrice': finalpredicted_prices})
my_submission.to_csv('submission.csv', index=False)


#CHAPTER 2: Attempt to Make Better Predictions
# From here on, other tools are used to try to improve the predictions.
print('Chapter 2: Attempt to Make Better Predictions')
#------------------------------------------------------------------------------#
#Section 1: Dealing with missing values in data
print ('Section 1: Dealing With Missing Values in Data')
# Set up the problem once, then compare the options for handling missing values.
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Reload the training data.
data = pd.read_csv('../input/train.csv')
# To count the missing values per column: data.isnull().sum()

train_target = data['SalePrice']
train_predictors = data.drop('SalePrice', axis=1)
# Keep every column whose dtype is not 'object'.
numeric_predictors = train_predictors.select_dtypes(exclude=['object'])
# Hold out part of the rows so each option can be scored.
train_X, test_X, train_y, test_y = train_test_split(
    numeric_predictors, train_target, random_state=0
)

#function that prints and returns MAE (error)
def get_mae(train_X, test_X, train_y, test_y):
    """Fit a default random forest and report its mean absolute error.

    NOTE: this rebinds the name ``get_mae`` defined earlier in the file for
    the decision-tree comparison; the two versions take different arguments.

    Args:
        train_X: predictors used to fit the model.
        test_X: predictors used to score the model.
        train_y: target values matching ``train_X``.
        test_y: target values matching ``test_X``.

    Returns:
        The mean absolute error of the predictions for ``test_X`` against
        ``test_y``. The value is also printed, as callers in this file rely
        on the printed output.
    """
    iowaforest_model=RandomForestRegressor()
    iowaforest_model.fit(train_X, train_y)
    iowa_predictions=iowaforest_model.predict(test_X)
    mae = mean_absolute_error(test_y, iowa_predictions)
    print(mae)
    # Previously the function returned None despite its name; returning the
    # value lets callers compare the options programmatically.
    return mae
    
    
#THREE OPTIONS TO DEAL WITH MISSING VALUES
#Option 1: Dropping columns with missing values
print('Option 1')
# (For a single frame, original_data.dropna(axis=1) does the same thing.)
# The columns are chosen from the training split only and the same ones are
# removed from both splits.
has_missing = train_X.isnull().any()
cols_with_missing = [name for name, flagged in has_missing.items() if flagged]
reduced_train_X = train_X.drop(cols_with_missing, axis=1)
reduced_test_X = test_X.drop(cols_with_missing, axis=1)
print("Mean Absolute Error from dropping columns with Missing Values:")
get_mae(reduced_train_X, reduced_test_X, train_y, test_y) # score this option

# The training data has no missing values now, but the dropped columns may
# have carried useful information.

#Option 2: IMPUTATION
print('Option 2')
# Imputation fills each missing spot with an estimated (not exact) value.
# sklearn.preprocessing.Imputer was removed from scikit-learn (0.22); its
# replacement, sklearn.impute.SimpleImputer, uses the same default "mean"
# strategy. Bind it to the old name so the rest of the file keeps working,
# and fall back to the old class on releases that predate SimpleImputer.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer
my_imputer = Imputer()
# Fit the imputer on the training split only, then apply it to both splits.
imputed_train_X=pd.DataFrame(my_imputer.fit_transform(train_X))
imputed_test_X=pd.DataFrame(my_imputer.transform(test_X))
# The imputer returns plain arrays, so restore the column names.
imputed_train_X.columns=train_X.columns
imputed_test_X.columns=test_X.columns
print("Mean Absolute Error from Imputation:")
get_mae(imputed_train_X, imputed_test_X, train_y, test_y)

#Section 2: Dealing with categorical data (words instead of numbers)
# This time keep ALL predictor columns, including the non-numeric ones.
train_target = data.SalePrice
train_predictors = data.drop(['SalePrice'], axis=1)

train_X, test_X, train_y, test_y = train_test_split(train_predictors, train_target, random_state=0) #split data to see error

# One-hot encode each split, then impute and compare with the earlier results.
newtrain_X=pd.get_dummies(train_X)
newtest_X=pd.get_dummies(test_X)
# Align the test columns to the training columns. A dummy column that exists
# only in the training split means "this category never occurs in the test
# split", so it must be 0 there. Without fill_value it would be NaN and the
# imputer below would replace it with the training mean of that column.
hotencodedtrain_X, hotencodedtest_X = newtrain_X.align(newtest_X, join='left', axis=1, fill_value=0)

# Impute the remaining (genuinely missing) values; fit on the training split only.
my_imputer = Imputer()
imputed_train_X=pd.DataFrame(my_imputer.fit_transform(hotencodedtrain_X))
imputed_test_X=pd.DataFrame(my_imputer.transform(hotencodedtest_X))
# The imputer returns plain arrays, so restore the column names.
imputed_train_X.columns=hotencodedtrain_X.columns
imputed_test_X.columns=hotencodedtest_X.columns

print("Mean Absolute Error from Imputation and Categorical Data:")
get_mae(imputed_train_X, imputed_test_X, train_y, test_y)

print('Section 3: New Model/Calssifier - XGBoost')
#Section 3: New Better Model - XGBoost
# Set up the problem again: numeric predictors only, same split as before.
train_target = data['SalePrice']
train_predictors = data.drop('SalePrice', axis=1)
numeric_predictors = train_predictors.select_dtypes(exclude=['object'])
train_X, test_X, train_y, test_y = train_test_split(
    numeric_predictors, train_target, random_state=0
)

# Fill in missing values; from here on train_X / test_X are the imputer's output.
my_imputer = Imputer()
train_X = my_imputer.fit_transform(train_X)
test_X = my_imputer.transform(test_X)

# First run: many rounds with early stopping.
# NOTE(review): the held-out split is used both for early stopping and for
# the reported error below -- confirm this is acceptable for the comparison.
from xgboost import XGBRegressor
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=2)
my_model.fit(
    train_X,
    train_y,
    early_stopping_rounds=5,
    eval_set=[(test_X, test_y)],
    verbose=False,
)
predictions = my_model.predict(test_X)
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, test_y)))

# Second run: a fixed 200 rounds without early stopping, for comparison.
# NOTE(review): 200 is hard-coded from an earlier observation of the best
# iteration -- verify it still matches the first run.
my_model = XGBRegressor(n_estimators=200, learning_rate=0.05, n_jobs=2)
my_model.fit(train_X, train_y)
predictions = my_model.predict(test_X)
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, test_y)))


#section 4: Partial Dependence Plot
# A partial dependence plot is a way to extract insights from a fitted model.
# A GradientBoostingRegressor is used as the model for the plot.
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
# The partial dependence helpers have moved between scikit-learn releases:
# from sklearn.ensemble.partial_dependence to sklearn.inspection, and the
# plotting function was later replaced by PartialDependenceDisplay.from_estimator,
# which accepts the arguments used below. Try each location in turn so the
# script runs on old and new releases alike.
try:
    from sklearn.ensemble.partial_dependence import partial_dependence, plot_partial_dependence
except ImportError:
    from sklearn.inspection import partial_dependence
    try:
        from sklearn.inspection import plot_partial_dependence
    except ImportError:
        from sklearn.inspection import PartialDependenceDisplay
        plot_partial_dependence = PartialDependenceDisplay.from_estimator

y=data.SalePrice
#Choosing Predictors
# NOTE(review): 'Id' is included as a predictor and is one of the two plotted
# features -- confirm that is intended.
datapredictors=['YearBuilt', 'LotArea', 'BedroomAbvGr', 'Id']
X=data[datapredictors]


# Fill missing values; X becomes the imputer's array output, so columns are
# addressed by position from here on.
my_imputer=Imputer()
X=my_imputer.fit_transform(X)
my_model = GradientBoostingRegressor()

my_model.fit(X,y)
#make a plot
my_plots = plot_partial_dependence(my_model,
                                   features=[0,3], # column numbers of plots we want to show
                                   X=X,            # raw predictors data.
                                   feature_names=datapredictors, # labels on graphs; reusing the list keeps them in column order
                                   grid_resolution=20) # number of values to plot on x axis



#section 5: Introduction of pipelines
# A pipeline bundles the preprocessing step and the model into one object,
# so there is a single fit and a single predict call.
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingRegressor

# train_X / test_X / train_y / test_y are reused as left by the earlier sections.
my_pipeline = make_pipeline(Imputer(), GradientBoostingRegressor())
my_pipeline.fit(train_X, train_y)

predictions = my_pipeline.predict(test_X)
pipeline_mae = mean_absolute_error(predictions, test_y)
print("Mean Absolute Error : " + str(pipeline_mae))


#section 6:



