In [4]:
import numpy as np 
import pandas as pd 

from sklearn import model_selection

In [5]:
# Load the training data (train.csv) into a DataFrame.
train_csv_path = '../input/house-prices-advanced-regression-techniques/train.csv'
dftrain = pd.read_csv(train_csv_path)

In [6]:
# Preview the first rows of the training data.
dftrain.head()

In [7]:
# (rows, columns) of the training data.
dftrain.shape

In [8]:
# Histogram of the target (SalePrice) over the whole training set.
dftrain["SalePrice"].hist()

In [9]:
# Placeholder fold id: every row starts at -1 until a fold number is assigned.
dftrain["kfold"] = -1

In [10]:
# Assign every row to exactly one of 5 validation folds (shuffled, fixed seed).
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=0)
# KFold.split yields *positional* row indices, so write them with .iloc.
# .loc is label-based and is only equivalent while the index is the default 0..n-1.
kfold_col = dftrain.columns.get_loc("kfold")
for fold, (train_ind, valid_ind) in enumerate(kf.split(X=dftrain)):
    dftrain.iloc[valid_ind, kfold_col] = fold

In [11]:
# Preview the data again, now including the kfold column.
dftrain.head()

In [12]:
# Number of rows assigned to each fold.
dftrain["kfold"].value_counts()

In [13]:
# Checking the distribution of the target variable - SalePrice.
# The distribution should be similar in all the folds in order to get good results with KFold.

dftrain.loc[dftrain["kfold"] == 0, "SalePrice"].hist()

In [14]:
# SalePrice distribution of the rows in fold 1.
dftrain.loc[dftrain["kfold"] == 1, "SalePrice"].hist()

In [15]:
# SalePrice distribution of the rows in fold 2.
dftrain.loc[dftrain["kfold"] == 2, "SalePrice"].hist()

In [16]:
# SalePrice distribution of the rows in fold 3.
dftrain.loc[dftrain["kfold"] == 3, "SalePrice"].hist()

In [17]:
# SalePrice distribution of the rows in fold 4.
dftrain.loc[dftrain["kfold"] == 4, "SalePrice"].hist()

#### The target distribution isn't perfectly similar across the folds, but I'll use KFold anyway to see what happens :)

In [18]:
# Uncomment the next line to export the training data together with its fold assignments.
#dftrain.to_csv('train_folded.csv', index=False)

### Some EDA and dealing with missing values

In [19]:
# Load the test data (test.csv) into a DataFrame.
test_csv_path = '../input/house-prices-advanced-regression-techniques/test.csv'
dftest = pd.read_csv(test_csv_path)


In [20]:
# Target variable: SalePrice, the house sale price we are asked to predict.
y = dftrain["SalePrice"]

In [21]:
# Remove the target column; the remaining columns form the feature frame.
# NOTE: re-running this cell raises KeyError, because SalePrice is already gone.
dftrain = dftrain.drop(columns=['SalePrice'])
# X is another name for the same DataFrame object (no copy is made here).
X = dftrain

In [22]:
# Count missing values per column and show only the columns that have any.
miss_values = X.isna().sum()
print(miss_values.loc[miss_values.gt(0)])

In [23]:
# (rows, columns) of the feature frame.
X.shape

#### It seems that we could drop some columns that are almost entirely missing, like
* PoolQC - 1453/1460 values missing
* MiscFeature - 1406/1460 values missing




In [24]:
# Labels of the columns to drop (both are almost entirely missing).
# Keep only the names: X[[...]] would build a DataFrame copy, while
# DataFrame.drop expects column labels.
cols_to_drop = ['PoolQC', 'MiscFeature']

In [25]:
# Drop the selected columns from the feature frame.
X = X.drop(columns=cols_to_drop)

#### Let's see the numerical columns first

In [26]:
# Select the numerical columns by dtype family instead of matching the exact
# names 'int64'/'float64', so other numeric widths (int32, float32, ...) are
# not silently skipped.
num_cols = X.select_dtypes(include='number').columns.tolist()
num_cols_null = X[num_cols].isnull().sum()
# Show only the numerical columns that contain missing values.
print(num_cols_null[num_cols_null > 0])

So we have only 3 numerical columns with missing values. But the last one - GarageYrBlt - looks like it might need to be converted to a string datatype.

I checked it, and in this case we don't have to: it only holds the year the garage was built, so we can keep it as a numerical/quantitative variable.

So our next step is to deal with missing values. What scenario will be the best? 

Let's try SimpleImputer() with the most_frequent strategy.

In [27]:
from sklearn.impute import SimpleImputer

In [28]:
# Imputer intended for the numerical columns: replaces missing values with
# each column's most frequent value.
numerical_transfomer = SimpleImputer(strategy = 'most_frequent')

Storing the numerical transformer for later, so we can build a Pipeline from it and easily change the imputation strategy and method.

Let's now see what is going on with our categorical variables and how many of their values are missing.

In [30]:
# Categorical (object-dtype) columns, and how many values each one is missing.
cat_cols = [name for name, col_dtype in X.dtypes.items() if col_dtype == 'object']
cat_cols_null = X[cat_cols].isna().sum()
print(cat_cols_null.loc[cat_cols_null.gt(0)])

In [36]:
#Unique values in ordinal cols
#TODO(review): dead draft below - fix or remove before sharing.
# - `if X[col].unique()` would raise ValueError (truth value of an array is ambiguous)
# - `airlines` is not defined in the visible part of this notebook (looks copied from another one)
#ordinal_cols_unique = [col for col in X.columns if X[col].unique() and X[col].dtype == 'object']
#print('Cleanliness: ', airlines['cleanliness'].unique(), "\n")

In [38]:
# Distinct values (NaN included) found in the Fence column.
fence_values = X['Fence'].unique()
print('Fence: ', fence_values, "\n")