# Cleaning

#### Columns where NA (NaN) is a valid value
1. **Alley:** Type of alley access to property
2. **BsmtQual:** Evaluates the height of the basement
3. **BsmtCond:** Evaluates the general condition of the basement
4. **BsmtExposure:** Refers to walkout or garden level walls
5. **BsmtFinType1:** Rating of basement finished area
6. **BsmtFinType2:** Rating of basement finished area (if multiple types)
7. **FireplaceQu:** Fireplace quality
8. **GarageType:** Garage location
9. **GarageFinish:** Interior finish of the garage
10. **GarageQual:** Garage quality
11. **GarageCond:** Garage condition
12. **PoolQC:** Pool quality (has the most missing values of the columns above)
13. **Fence:** Fence quality
14. **MiscFeature:** Miscellaneous feature not covered in other categories

##### The two below are listed as None in description, but NA in dataset
* MasVnrArea: Masonry veneer area in square feet
* MasVnrType: Masonry veneer type

##### Others
* LotFrontage: Maybe set to NA if there is no street connected to property?
* GarageYrBlt: Set to NA if the above Garage attributes are set as NA

* Electrical: Electrical system. Only one property has a missing value in Electrical, at row 1381.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import copy
%matplotlib inline

In [2]:
# Read both splits and report their shapes plus the combined row count.
train, test = pd.read_csv('train.csv'), pd.read_csv('test.csv')
print(train.shape, test.shape, "total[0] =", train.shape[0] + test.shape[0])

(1460, 81) (1459, 80) total[0] = 2919


In [3]:
border = train.shape[0]
border

1460

In [4]:
DFc = pd.concat([train, test],ignore_index=True, sort=False)
DFc.shape

(2919, 81)

In [5]:
DFc.iloc[border-2:border+2, :]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125.0
1459,1460,20,RL,75.0,9937,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2008,WD,Normal,147500.0
1460,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,6,2010,WD,Normal,
1461,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,,,Gar2,12500,6,2010,WD,Normal,


In [6]:
DFc = DFc.drop(columns=['LotShape', 'GarageYrBlt'])

In [7]:
def summary_missing_data(data, n=None):
    """Summarise missing values per column of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.
    n : int, optional
        If given, only the first ``n`` rows of the summary (the columns with
        the most missing values) are returned. Defaults to all columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, sorted by descending number of missing
        values, with two columns: ``Total`` (count of nulls) and ``Percent``
        (nulls as a percentage of the number of rows).
    """
    # Compute the null mask once and derive both statistics from it.
    null_counts = data.isnull().sum()
    total = null_counts.sort_values(ascending=False)
    percent = (null_counts / len(data) * 100).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    if n is None:
        return missing_data
    return missing_data.head(n)

In [8]:
missing_data = summary_missing_data(DFc)
print(missing_data)

               Total    Percent
PoolQC          2909  99.657417
MiscFeature     2814  96.402878
Alley           2721  93.216855
Fence           2348  80.438506
SalePrice       1459  49.982871
FireplaceQu     1420  48.646797
LotFrontage      486  16.649538
GarageQual       159   5.447071
GarageFinish     159   5.447071
GarageCond       159   5.447071
GarageType       157   5.378554
BsmtCond          82   2.809181
BsmtExposure      82   2.809181
BsmtQual          81   2.774923
BsmtFinType2      80   2.740665
BsmtFinType1      79   2.706406
MasVnrType        24   0.822199
MasVnrArea        23   0.787941
MSZoning           4   0.137033
Utilities          2   0.068517
Functional         2   0.068517
BsmtHalfBath       2   0.068517
BsmtFullBath       2   0.068517
GarageArea         1   0.034258
GarageCars         1   0.034258
SaleType           1   0.034258
BsmtFinSF1         1   0.034258
BsmtFinSF2         1   0.034258
BsmtUnfSF          1   0.034258
TotalBsmtSF        1   0.034258
...     

In [9]:
atts = pd.read_csv('attributes.csv')

In [10]:
# One-hot encode every attribute that attributes.csv marks as 'Categorical'.
# Attributes that are no longer present in DFc are skipped instead of raising.
categorical_cols = [
    col for col in atts.loc[atts['Type'] == 'Categorical', 'Attribute']
    if col in DFc.columns
]
if categorical_cols:
    # Missing entries become an explicit 'None' level before encoding.
    # Assigning the result back avoids chained `inplace=True` on a column
    # selection, and a single get_dummies call replaces one call per column.
    DFc[categorical_cols] = DFc[categorical_cols].fillna('None')
    DFc = pd.get_dummies(DFc, columns=categorical_cols)

# Rows with remaining gaps are intentionally NOT dropped here:
#  * `missing_data` was computed before encoding, so it still lists the
#    categorical column names, which no longer exist in DFc (KeyError);
#  * removing rows would shift positions, and the train/test split further
#    down relies on `DFc.iloc[:border]` / `DFc.iloc[border:]`;
#  * the remaining numeric gaps are imputed in the cells below.
summary_missing_data(DFc).head(17)

In [11]:
summary_missing_data(DFc.drop(columns=['SalePrice'])).head(11)

Unnamed: 0,Total,Percent
LotFrontage,486,16.649538
MasVnrArea,23,0.787941
BsmtHalfBath,2,0.068517
BsmtFullBath,2,0.068517
BsmtFinSF2,1,0.034258
GarageCars,1,0.034258
GarageArea,1,0.034258
TotalBsmtSF,1,0.034258
BsmtUnfSF,1,0.034258
BsmtFinSF1,1,0.034258


In [None]:
DFc.LotFrontage

#### Manually filled in some missing values with modes

In [None]:
# Numeric columns whose gaps are filled with the column's most frequent value.
# NOTE(review): LotFrontage is also mean-imputed further down; once this cell
# has run, that later step has nothing left to fill. Confirm which one is wanted.
mode_fill_cols = [
    'LotFrontage', 'MasVnrArea', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF',
    'GarageArea', 'TotalBsmtSF', 'BsmtFinSF1', 'BsmtFinSF2', 'GarageCars',
]
for col in mode_fill_cols:
    # Series.mode() returns a Series (several values when there is a tie), so
    # take its first entry explicitly instead of calling float() on the Series.
    DFc[col] = DFc[col].fillna(DFc[col].mode().iloc[0])

In [None]:
summary_missing_data(DFc.drop(columns=['SalePrice'])).head(11)

In [15]:
# Names of the columns that are still stored with object dtype.
categorical = [name for name, dtype in DFc.dtypes.items() if dtype == 'O']
print('Categorical: ', categorical)

Categorical:  []


In [14]:
# Names of every column that is not stored with object dtype.
numerical = [name for name, dtype in DFc.dtypes.items() if dtype != 'O']
print('numerical columns: ', len(numerical))

numerical columns:  376


In [19]:
# Numerical columns taking fewer than 25 distinct values (NaN counts as a value).
discrete = [col for col in numerical if DFc[col].nunique(dropna=False) < 25]
print('Discrete values less than 25: ', len(discrete))

Discrete values less than 25:  355


In [20]:
continuous = [i for i in numerical if i not in discrete and i not in ['Id', 'SalePrice']]
print('Continuous vars: ', len(continuous))

Continuous vars:  19


In [40]:
pd.set_option('display.max_columns', 500)
# Columns that still contain at least one missing value.
na_cols = DFc.columns[DFc.isna().any()].tolist()
na_cols

['LotFrontage',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'BsmtFullBath',
 'BsmtHalfBath',
 'GarageCars',
 'GarageArea',
 'SalePrice']

In [45]:
for i in na_cols:
    print(i, ':', DFc[i].dtype)

LotFrontage : float64
MasVnrArea : float64
BsmtFinSF1 : float64
BsmtFinSF2 : float64
BsmtUnfSF : float64
TotalBsmtSF : float64
BsmtFullBath : float64
BsmtHalfBath : float64
GarageCars : float64
GarageArea : float64
SalePrice : float64


In [50]:
lot_mean = DFc.LotFrontage.mean()

In [52]:
DFc.LotFrontage = DFc.LotFrontage.fillna(value=lot_mean)

In [53]:
DFc.LotFrontage

0        65.000000
1        80.000000
2        68.000000
3        60.000000
4        84.000000
5        85.000000
6        75.000000
7        69.305795
8        51.000000
9        50.000000
10       70.000000
11       85.000000
12       69.305795
13       91.000000
14       69.305795
15       51.000000
16       69.305795
17       72.000000
18       66.000000
19       70.000000
20      101.000000
21       57.000000
22       75.000000
23       44.000000
24       69.305795
25      110.000000
26       60.000000
27       98.000000
28       47.000000
29       60.000000
           ...    
2889     50.000000
2890     75.000000
2891     69.000000
2892     50.000000
2893     60.000000
2894     41.000000
2895     44.000000
2896     69.000000
2897     65.000000
2898     70.000000
2899    140.000000
2900     69.305795
2901     69.305795
2902     95.000000
2903     88.000000
2904    125.000000
2905     78.000000
2906     41.000000
2907     58.000000
2908     69.305795
2909     21.000000
2910     21.

In [None]:
# Undo the earlier concat: the first `border` rows are the training data, the
# rest are the test data (which carries no SalePrice).
DFtrain = DFc.iloc[:border]
DFtest = DFc.iloc[border:].drop(columns=['SalePrice'])
print(DFtrain.shape)
print(DFtest.shape)

In [None]:
DFtrain.to_csv('trainer.csv', index=False)
DFtest.to_csv('tester.csv', index=False)

# Modeling

The cells below prepare the data for PCA: the features are standardised and then projected onto the principal components.

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# Target and feature matrices; Id is an identifier, not a feature.
Y = DFtrain['SalePrice']
X = DFtrain.drop(columns=['SalePrice', 'Id'])
Xt = DFtest.drop(columns='Id')

##### Scaling, otherwise we will just see one component!

In [None]:
# Fit the scaler on the training features only, then apply it to both sets.
scaler = StandardScaler().fit(X)
X, Xt = scaler.transform(X), scaler.transform(Xt)

In [None]:
Xcopy  = X
Ycopy  = Y
Xtcopy = Xt

In [None]:
pca = PCA(5)

In [None]:
pca.fit(X)

In [None]:
pca.n_components_

In [None]:
X = pca.transform(X)
Xt = pca.transform(Xt)

In [None]:
X.shape, Xt.shape, Y.shape

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Single hold-out evaluation of a linear model on the PCA features.
XTRAIN, XTEST, YTRAIN, YTEST = train_test_split(X, Y)
r = LinearRegression()
r.fit(XTRAIN, YTRAIN)
P = r.predict(XTEST)
R2, MSE = r2_score(YTEST, P), mean_squared_error(YTEST, P)
print(R2, MSE)

In [None]:
# Repeat the hold-out evaluation over 100 different random splits.
# Each split is seeded with its loop index so the cell gives the same numbers
# on every run while still using 100 distinct partitions.
errs = []
for i in range(100):
    XTRAIN, XTEST, YTRAIN, YTEST = train_test_split(X, Y, random_state=i)
    r = LinearRegression().fit(XTRAIN, YTRAIN)
    P = r.predict(XTEST)
    R2 = r2_score(YTEST, P)
    # Store 1 - R^2 for this split.
    errs.append(1 - R2)

In [None]:
# `errs` holds 1 - R^2 per split, so 1 - mean(errs) is the mean R^2.
# R^2 is a goodness-of-fit score for regression (it can even be negative),
# not a classification accuracy, so it is reported under its own name.
mean_r2 = 1 - np.mean(errs)
print('Result: mean R^2 over %d splits = %.3f' % (len(errs), mean_r2))

Maybe we should try a different number of PCA components.

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,confusion_matrix

In [None]:
# GaussianNB is a classifier. Fitting it on the raw SalePrice makes every
# distinct price its own class, so accuracy and the confusion matrix carry no
# information. The target is therefore binned into equally populated price
# bands (0 = cheapest band) and the model predicts the band.
N_PRICE_BANDS = 4
Yband = pd.qcut(Y, q=N_PRICE_BANDS, labels=False)

gnb = GaussianNB()
errs = []
nsplits = 100
for split in range(nsplits):
    # Seed each split with its index so the reported error is reproducible.
    XTRAIN, XTEST, YTRAIN, YTEST = train_test_split(
        X, Yband, test_size=.25, random_state=split)
    gnb.fit(XTRAIN, YTRAIN)
    YP = gnb.predict(XTEST)
    errs.append(1 - accuracy_score(YTEST, YP))
print("%d Splits: Mean Error=%7.6f +/- %7.6f (95%%)"\
      %(nsplits, np.mean(errs),1.96*np.std(errs)))
# Confusion matrix of the last split only (rows: true band, columns: predicted).
print(confusion_matrix(YTEST, YP))

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# SalePrice is a continuous target, so the regression forest is used instead
# of RandomForestClassifier (which would treat every distinct price as a class
# and score predictions by exact match only).
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

RF = RandomForestRegressor(n_estimators=500, random_state=0)
# Xcopy / Ycopy are the scaled features and target saved before PCA was applied.
XTRAIN, XTEST, YTRAIN, YTEST = train_test_split(
    Xcopy, Ycopy, test_size=.25, random_state=0)
RF.fit(XTRAIN, YTRAIN)
YP = RF.predict(XTEST)
rmse = np.sqrt(mean_squared_error(YTEST, YP))
error = 1 - r2_score(YTEST, YP)  # share of variance left unexplained
print('R^2: %.3f  RMSE: %.0f' % (1 - error, rmse))

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# LogisticRegression is a classifier. Fitted on the raw SalePrice it would
# treat every distinct price as a separate class, so the target is binned into
# equally populated price bands (0 = cheapest band). Predictions from this
# model are band indices, not prices.
N_PRICE_BANDS = 4
Yband = pd.qcut(Y, q=N_PRICE_BANDS, labels=False)

logisticRegr = LogisticRegression(solver = 'lbfgs')
logisticRegr.fit(X, Yband)

In [None]:
logisticRegr.predict(Xt[0].reshape(1,-1))

In [None]:
logisticRegr.predict(Xt[0:10])