In [2]:
import numpy as np
import pandas as pd
import seaborn as sn
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor


In [3]:
df = pd.read_csv(r"..\input\train.csv")
test_df = pd.read_csv(r"..\input\test.csv")

test_df.head()



Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [4]:
#Get the dummy variables for all features.
# 1 : Numerical : Fill with the mean values
# 2:  Categorical : 1) Replace NA with None, as NA has some meaning. 
#                      Like NA for parking will mean No parking and may effect SaleCost
#Rest all delete the rows

def clean_data(df):
    df['MasVnrType'].fillna("NoNe",inplace=True)
    df['MasVnrArea'].fillna(df['MasVnrArea'].mean(),inplace=True)
    df['BsmtFinSF2'].fillna(df['BsmtFinSF2'].mean(),inplace=True)
    df['BsmtUnfSF'].fillna(df['BsmtUnfSF'].mean(),inplace=True)
    df['TotalBsmtSF'].fillna(df['TotalBsmtSF'].mean(),inplace=True)
    df['BsmtFullBath'].fillna(df['BsmtFullBath'].mean(),inplace=True)
    df['BsmtHalfBath'].fillna(df['BsmtHalfBath'].mean(),inplace=True)
    df['LotFrontage'].fillna(df['LotFrontage'].mean(),inplace=True)
    df['Alley'].fillna("NoNe",inplace=True)
    df['FireplaceQu'].fillna("NoNe",inplace=True)
    df['GarageType'].fillna("NoNe",inplace=True)
    df['GarageYrBlt'].fillna("NoNe",inplace=True)
    df['GarageFinish'].fillna("NoNe",inplace=True)
    df['GarageQual'].fillna("NoNe",inplace=True)
    df['GarageCond'].fillna("NoNe",inplace=True)
    df['PoolQC'].fillna("NoNe",inplace=True)
    df['Fence'].fillna("NoNe",inplace=True)
    df['MiscFeature'].fillna("NoNe",inplace=True)
    df['GarageCars'].fillna(df['GarageCars'].mean(),inplace=True)
    df['GarageArea'].fillna(df['GarageArea'].mean(),inplace=True)
    #For rest of he variables we will drop the data
    #df.dropna(axis=0,inplace=True)
    return df
def clean_data_test(df):
    df['MasVnrType'].fillna("NoNe",inplace=True)
    df['MasVnrArea'].fillna(df['MasVnrArea'].mean(),inplace=True)
    df['BsmtFinSF2'].fillna(df['BsmtFinSF2'].mean(),inplace=True)
    df['BsmtUnfSF'].fillna(df['BsmtUnfSF'].mean(),inplace=True)
    df['TotalBsmtSF'].fillna(df['TotalBsmtSF'].mean(),inplace=True)
    df['BsmtFullBath'].fillna(df['BsmtFullBath'].mean(),inplace=True)
    df['BsmtHalfBath'].fillna(df['BsmtHalfBath'].mean(),inplace=True)
    df['LotFrontage'].fillna(df['LotFrontage'].mean(),inplace=True)
    df['Alley'].fillna("NoNe",inplace=True)
    df['FireplaceQu'].fillna("NoNe",inplace=True)
    df['GarageType'].fillna("NoNe",inplace=True)
    df['GarageYrBlt'].fillna("NoNe",inplace=True)
    df['GarageFinish'].fillna("NoNe",inplace=True)
    df['GarageQual'].fillna("NoNe",inplace=True)
    df['GarageCond'].fillna("NoNe",inplace=True)
    df['PoolQC'].fillna("NoNe",inplace=True)
    df['Fence'].fillna("NoNe",inplace=True)
    df['MiscFeature'].fillna("NoNe",inplace=True)
    df['GarageCars'].fillna(df['GarageCars'].mean(),inplace=True)
    df['GarageArea'].fillna(df['GarageArea'].mean(),inplace=True)
    df['BsmtFinSF1'].fillna(df['BsmtFinSF1'].mean(),inplace=True)
    
    return df
    
def get_dummies(df):
    df = pd.get_dummies(df)
    return df

In [5]:
df = clean_data(df)
test_df = clean_data_test(test_df)
print(df.shape)
print(test_df.shape)

(1460, 81)
(1459, 80)


In [6]:
#convert the categorical variable by adding dummies
df = get_dummies(df)
test_df = get_dummies(test_df)

#Test to see if both the test & train data and same columns
print(test_df.shape)

df['SalePrice'] = np.log(df['SalePrice'])

Y_train = df['SalePrice']
df.drop('SalePrice',axis=1,inplace=True)

print(df.shape)


(1459, 378)
(1460, 396)


In [7]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
def doMinMaxScaling(df):
    scalar = MinMaxScaler(feature_range = [0,1])
    data = scalar.fit_transform(df)
    return data

def doPCA(df,ncomponents=0):
    if ncomponents:
        pca = PCA(n_components=ncomponents)
        pcaData = pca.fit_transform(df)
        return pca,pcaData
    else:
        pca = PCA().fit(df)
        return pca
    
def getDataFromDf(df):
    return df.iloc[:,:]

In [8]:
X_train = df

In [9]:
#Now Run the Ridge Regularization
def reg_alpha(X,Y,alpha,normalize=True):
    lreg = Ridge(alpha)
    model = lreg.fit(X,Y)
    Y_pred = model.predict(X)
    return model,Y_pred,lreg

def get_rmse(true_val,pred_val):
    error = true_val-pred_val
    rmse = np.sqrt(np.mean(error**2))
    return rmse

def runDescisionTree(X,Y):
    regressor = DecisionTreeRegressor(random_state=0)
    model = regressor.fit(X,Y)
    return model

In [10]:
X = X_train
Y = Y_train

In [11]:
#Run The DecisionTree
model = runDescisionTree(X,Y)
Y_pred = model.predict(X)

In [12]:
score = model.score(X,Y)
print(score)

0.9999999948118853


In [13]:
Y_pred_df = pd.Series(Y_pred)
Y_pred_df.head()

0    12.247694
1    12.109011
2    12.317167
3    11.849398
4    12.429216
dtype: float64

In [14]:
rmse = get_rmse(Y,Y_pred_df)
print(rmse)

2.876209113325231e-05


In [15]:
X_test = test_df

In [16]:
Y_pred_test = model.predict(X_test)

ValueError: Number of features of the model must match the input. Model n_features is 396 and input n_features is 378 

In [17]:
Y_pred_test = np.exp(Y_pred_test)

NameError: name 'Y_pred_test' is not defined

In [32]:
Predict = pd.DataFrame({'Id': test_df.Id, 'SalePrice': (Y_pred_test)})
Predict.reset_index(drop=True,inplace=True)
Predict.to_csv('submission_PCA_DecisionTree.csv')


In [33]:
XTestApprox = pca.inverse_transform(testPcaData)

In [34]:
XTestApprox_df = pd.DataFrame(XTestApprox)

In [35]:
XTestApprox_df.shape

(1459, 378)

In [37]:
XTestApprox_df.columns = test_df.columns

In [38]:
XTestApprox_df.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,-9.6e-05,-0.015971,0.298219,0.136898,0.392493,0.544377,0.626796,0.198904,-0.021086,0.122626,...,-0.008917,0.000557,0.011969,0.994633,0.026781,-0.076612,-0.021804,0.037197,1.0222,0.012238
1,-0.004695,0.010792,0.347836,0.247679,0.484895,0.678245,0.631668,0.112317,0.262618,0.161212,...,-0.00145,0.001312,0.004685,0.978418,0.001252,-0.027355,0.00721,0.025691,1.007982,-0.01478
2,-0.03274,0.236135,0.243104,0.154069,0.508647,0.477349,0.911929,0.804106,0.036674,0.155939,...,-0.000735,-0.005418,0.008094,1.009709,-0.006563,-0.033144,0.042319,-0.003603,0.99873,0.002261
3,-0.035361,0.243522,0.274974,0.15339,0.621035,0.620272,0.852043,0.807463,0.061647,0.101646,...,0.006267,0.00036,0.011736,0.995161,0.005625,-0.038164,-0.003101,0.025156,1.012874,-0.002389
4,-0.03657,0.637476,0.124125,0.096316,0.718864,0.509932,0.850911,0.697174,0.01374,0.064043,...,0.011002,-0.024216,0.010706,0.988537,-0.013796,-0.010732,0.034879,0.018091,0.990278,-0.01872


(1459, 378)