In [124]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Normalizer
from sklearn.cross_validation import cross_val_score
from sklearn.preprocessing import Imputer

from scipy.stats import skew

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

plt.style.use('ggplot')

In [125]:
pd.set_option('display.max_columns', 100)

train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

In [126]:
# Save the 'Id' column
train_ID = train_df['Id']
test_ID = test_df['Id']

# Now drop the 'Id' colum since we can not use it as a feature to train our model.
train_df.drop("Id", axis = 1, inplace = True)
test_df.drop("Id", axis = 1, inplace = True)

In [127]:
def is_outlier(points, thresh = 3.5):
    if len(points.shape) == 1:
        points = points[:,None]
    median = np.median(points, axis=0)
    diff = np.sum((points - median)**2, axis=-1)
    diff = np.sqrt(diff)
    med_abs_deviation = np.median(diff)

    modified_z_score = 0.6745 * diff / med_abs_deviation

    return modified_z_score > thresh

In [128]:
Y_TRAIN= train_df['SalePrice']
X_TRAIN = train_df.drop('SalePrice', axis=1)
X_TEST = test_df.copy()

In [129]:
#del train_df, test_df
print(X_TRAIN.shape)
print(X_TEST.shape)

(1460, 79)
(1459, 79)


In [130]:
Y_TRAIN=np.log(Y_TRAIN)
#print(Y_TRAIN)

In [131]:
all_data = pd.concat([X_TRAIN, X_TEST])
all_data.shape

(2919, 79)

#### Find all Categorical Data

In [132]:
cats = []
for col in all_data.columns.values:
    if all_data[col].dtype == 'object':
        cats.append(col)

#### Create Separate Continuous Data

In [133]:
df_cont = all_data.drop(cats, axis=1)
df_cat = all_data[cats]

### Handle Missing Data for continuous data
- If any column contains more than 50 entries of missing data, drop the column
- If any column contains fewer that 50 entries of missing data, replace those missing values with the median for that column
- Remove outliers using Median Absolute Deviation
- Calculate skewness for each variable and if greater than 0.75 transform it
- Apply the sklearn.Normalizer to each column

In [134]:
for col in df_cont.columns.values:
    if np.sum(df_cont[col].isnull()) > 50:
        df_cont = df_cont.drop(col, axis = 1)
    elif np.sum(df_cont[col].isnull()) > 0:
        median = df_cont[col].median()
        idx = np.where(df_cont[col].isnull())[0]
        df_cont[col].iloc[idx] = median

        outliers = np.where(is_outlier(df_cont[col]))
        df_cont[col].iloc[outliers] = median
        
        if skew(df_cont[col]) > 0.75:
            df_cont[col] = np.log(df_cont[col])
            df_cont[col] = df_cont[col].apply(lambda x: 0 if x == -np.inf else x)
        
        df_cont[col] = Normalizer().fit_transform(df_cont[col].reshape(1,-1))[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
  if __name__ == '__main__':
  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()
  app.launch_new_instance()
  del sys.path[0]



#### Handle Missing Data for Categorical Data

- If any column contains more than 50 entries of missing data, drop the column
- If any column contains fewer that 50 entries of missing data, replace those values with the 'MIA'
- Apply the sklearn.LabelEncoder
- For each categorical variable determine the number of unique values and for each, create a new column that is binary



In [135]:
for col in df_cat.columns.values:
    if np.sum(df_cat[col].isnull()) > 50:
        df_cat = df_cat.drop(col, axis = 1)
        continue
    elif np.sum(df_cat[col].isnull()) > 0:
        df_cat[col] = df_cat[col].fillna('MIA')
        
    df_cat[col] = LabelEncoder().fit_transform(df_cat[col])
    
    num_cols = df_cat[col].max()
    for i in range(num_cols):
        col_name = col + '_' + str(i)
        df_cat[col_name] = df_cat[col].apply(lambda x: 1 if x == i else 0)
        
    df_cat = df_cat.drop(col, axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


In [136]:
df_cont.shape

(2919, 34)

In [137]:
df_cat.shape


(2919, 171)

In [138]:
all_data=pd.concat([df_cont, df_cat], axis = 1)
X_TRAIN = all_data.iloc[0:1460,]
X_TEST = all_data.iloc[1460:,]

In [141]:
print(X_TRAIN.shape)
print(Y_TRAIN.shape)
print(X_TEST.shape)

(1460, 205)
(1460,)
(1459, 205)


## Create Test/Train Split

In [243]:
x_train, x_test, y_train, y_test = train_test_split(X_TRAIN, Y_TRAIN, test_size = .25, random_state=5)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(1095, 205)
(1095,)
(365, 205)
(365,)


In [244]:
from sklearn import tree
tree_model = tree.DecisionTreeRegressor(min_samples_leaf=8,max_depth=37)
tree_model.fit(x_train,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=37, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=8, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')

In [245]:
import math
tree_model.predict(x_test)
msle=np.mean(( y_test-tree_model.predict(x_test))** 2)
math.sqrt(msle)

0.18498299086370976

In [150]:
#pd.DataFrame({'Actual':y_test, 'Predict':tree_model.predict(x_test)})

In [171]:
price_test=np.exp(tree_model.predict(X_TEST))

In [172]:
pred_df=pd.DataFrame({'Id':test_ID,'SalePrice':price_test})
pred_df.head()

Unnamed: 0,Id,SalePrice
0,1461,120500.0
1,1462,157900.0
2,1463,192000.0
3,1464,155000.0
4,1465,213500.0


In [174]:
pred_df.to_csv('tree.csv',index=None)

In [255]:
rf_model = RandomForestRegressor(n_estimators=500, n_jobs=-1)

In [256]:
rf_model.fit(x_train, y_train)
msle=np.mean(( y_test-rf.predict(x_test))** 2)
math.sqrt(msle)

0.13542019178263243

In [257]:
rf_pred=np.exp(rf_model.predict(X_TEST))

In [259]:
pred_df=pd.DataFrame({'Id':test_ID,'SalePrice':rf_pred})
pred_df.head()

Unnamed: 0,Id,SalePrice
0,1461,129109.215122
1,1462,157527.509656
2,1463,178190.070649
3,1464,183175.459267
4,1465,197283.183225


In [260]:
pred_df.to_csv('rf_pred.csv',index=None)