In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm, skew
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Keep a handle on seaborn's current colour palette and switch all subsequent
# plots to the "darkgrid" theme.
color= sns.color_palette()
sns.set_style('darkgrid')

In [None]:
# Read the raw train/test CSVs from the sibling datasets directory.
df_train = pd.read_csv("../datasets/train.csv")
df_test = pd.read_csv("../datasets/test.csv")

# Keep the test-set Id column as a standalone Series before it becomes the index.
Id = df_test["Id"]

# Index both frames by their "Id" column.
df_train = df_train.set_index("Id")
df_test = df_test.set_index("Id")

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Preview the first rows of the test frame.
df_test.head()

In [None]:
# (rows, columns) of the training frame, then of the test frame.
print(df_train.shape)
print(df_test.shape)

In [None]:
# List every column available in the training frame.
df_train.columns

In [None]:
# Summary statistics of the target column (still on its raw scale here; the
# log1p transform is applied in a later cell).
df_train["SalePrice"].describe()

In [None]:
# Distribution of the raw SalePrice values.
# NOTE(review): `sns.distplot` is deprecated in newer seaborn releases; confirm
# the pinned seaborn version before migrating to `histplot`/`displot`.
sns.distplot(df_train['SalePrice'])
# Trailing semicolon keeps the (locs, labels) repr out of the cell output.
plt.xticks(rotation=90);

In [None]:
# Shape statistics of the target: asymmetry (skew) and tail weight (kurtosis).
saleprice_skew = df_train['SalePrice'].skew()
saleprice_kurt = df_train['SalePrice'].kurt()
print(f"Skewness: {saleprice_skew:f}")
print(f"Kurtosis: {saleprice_kurt:f}")

In [None]:
# Missing-value summary for the training frame, restricted to columns that
# contain at least one null, sorted from most to fewest nulls.
null_values = df_train.isnull().sum().loc[lambda counts: counts > 0]
# Despite its name, `null_count` is the total number of rows (the denominator).
null_count = len(df_train)
# Fraction (0-1) of rows that are null in each listed column.
null_percent = null_values / null_count
(
    pd.DataFrame({"null_count": null_values, "null_percent": null_percent})
    .sort_values(by=["null_count"], ascending=False)
)

In [None]:
# Columns removed from both frames before further analysis.
# NOTE(review): presumably the columns with the most nulls in the summary
# above; confirm against that table.
missing_columns = [
    'PoolQC',
    'MiscFeature',
    'Alley',
    'Fence',
    'FireplaceQu',
    'LotFrontage',
]
df_train.drop(columns=missing_columns, inplace=True)
df_test.drop(columns=missing_columns, inplace=True)

In [None]:
# Recompute the missing-value summary now that `missing_columns` has been
# dropped: only columns with at least one null, most nulls first.
null_values = df_train.isnull().sum().loc[lambda counts: counts > 0]
# Despite its name, `null_count` is the total number of rows (the denominator).
null_count = len(df_train)
# Fraction (0-1) of rows that are null in each listed column.
null_percent = null_values / null_count
(
    pd.DataFrame({"null_count": null_values, "null_percent": null_percent})
    .sort_values(by=["null_count"], ascending=False)
)

In [None]:
# Columns remaining after `missing_columns` were dropped.
df_train.columns

In [None]:
# Hand-assigned kind for every retained feature column:
#   'c' -> categorical (explored with box plots via `categorical_comp`)
#   'n' -> numerical   (explored with scatter plots via `numerical_comp`)
# Insertion order is the order in which later cells iterate and plot.
# MSSubClass, OverallQual, OverallCond and MoSold are tagged 'c' even though
# they start out as numeric codes; later cells recode them to string labels.
feature_categories = {
    'MSSubClass': 'c',
    'MSZoning': 'c',
    'LotArea': 'n',
    'Street': 'c',
    'LotShape': 'c',
    'LandContour': 'c',
    'Utilities': 'c',
    'LotConfig': 'c',
    'LandSlope': 'c',
    'Neighborhood': 'c',
    'Condition1': 'c',
    'Condition2': 'c',
    'BldgType': 'c',
    'HouseStyle': 'c',
    'OverallQual': 'c',
    'OverallCond': 'c',
    'YearBuilt': 'n',
    'YearRemodAdd': 'n',
    'RoofStyle': 'c',
    'RoofMatl': 'c',
    'Exterior1st': 'c',
    'Exterior2nd': 'c',
    'MasVnrType': 'c',
    'MasVnrArea': 'n',
    'ExterQual': 'c',
    'ExterCond': 'c',
    'Foundation': 'c',
    'BsmtQual': 'c',
    'BsmtCond': 'c',
    'BsmtExposure': 'c',
    'BsmtFinType1': 'c',
    'BsmtFinSF1': 'n',
    'BsmtFinType2': 'c',
    'BsmtFinSF2': 'n',
    'BsmtUnfSF': 'n',
    'TotalBsmtSF': 'n',
    'Heating': 'c',
    'HeatingQC': 'c',
    'CentralAir': 'c',
    'Electrical': 'c',
    '1stFlrSF': 'n',
    '2ndFlrSF': 'n',
    'LowQualFinSF': 'n',
    'GrLivArea': 'n',
    'BsmtFullBath': 'n',
    'BsmtHalfBath': 'n',
    'FullBath': 'n',
    'HalfBath': 'n',
    'KitchenAbvGr': 'n',
    'KitchenQual': 'c',
    'TotRmsAbvGrd': 'n',
    'Functional': 'c',
    'Fireplaces': 'n',
    'GarageType': 'c',
    'GarageYrBlt': 'n',
    'GarageFinish': 'c',
    'GarageCars': 'n',
    'GarageArea': 'n',
    'GarageQual': 'c',
    'GarageCond': 'c',
    'PavedDrive': 'c',
    'WoodDeckSF': 'n',
    'OpenPorchSF': 'n',
    'EnclosedPorch': 'n',
    '3SsnPorch': 'n',
    'ScreenPorch': 'n',
    'PoolArea': 'n',
    'MiscVal': 'n',
    'MoSold': 'c',
    'YrSold': 'n',
    'SaleType': 'c',
    'SaleCondition': 'c'
}

In [None]:
def categorical_comp(df, var):
    """Draw a box plot of ``SalePrice`` grouped by the column ``var``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``SalePrice`` column and the column named by ``var``.
    var : str
        Name of the column whose values form the x-axis groups.

    Returns
    -------
    None
        The plot is drawn on a newly created 8x6 inch figure.
    """
    # Rows are sorted by ``var`` in descending order before plotting
    # (presumably to control the left-to-right order of the boxes).
    data = df[['SalePrice', var]].sort_values(var, ascending=False)

    # Draw on an explicit Axes instead of relying on pyplot's "current axes".
    _, ax = plt.subplots(figsize=(8, 6))
    sns.boxplot(x=var, y='SalePrice', data=data, ax=ax)
    # Rotate the x tick labels to vertical.
    plt.xticks(rotation=90)

In [None]:
# Month number (1-12) -> lowercase three-letter month label.
months_replace = dict(
    enumerate(
        ('jan', 'feb', 'mar', 'apr', 'may', 'jun',
         'jul', 'aug', 'sep', 'oct', 'nov', 'dec'),
        start=1,
    )
)

In [None]:
# Recode the numeric MoSold values to month labels in both frames.
# The result is assigned back to the column: calling
# `df[col].replace(..., inplace=True)` mutates a temporary Series (chained
# assignment), which pandas warns about and which leaves the DataFrame
# untouched once Copy-on-Write is enabled.
df_train['MoSold'] = df_train['MoSold'].replace(months_replace)
df_test['MoSold'] = df_test['MoSold'].replace(months_replace)

In [None]:
# Rating 10..1 -> letter 'a'..'j' (10 maps to 'a', 1 maps to 'j').
# Two independent dicts with identical content, one per rating column.
overall_qual_replace = dict(zip(range(10, 0, -1), 'abcdefghij'))

overall_cond_replace = dict(zip(range(10, 0, -1), 'abcdefghij'))

In [None]:
# Sanity-check the recoding: distinct MoSold labels and how often each occurs.
print(df_train['MoSold'].unique())
print(df_train['MoSold'].value_counts())

In [None]:
# Box plot of SalePrice per month sold.
categorical_comp(df_train, 'MoSold')

In [None]:
# Recode the numeric OverallQual / OverallCond ratings to letter labels in
# both frames. The result is assigned back to the column: calling
# `df[col].replace(..., inplace=True)` mutates a temporary Series (chained
# assignment), which pandas warns about and which leaves the DataFrame
# untouched once Copy-on-Write is enabled.
df_train['OverallQual'] = df_train['OverallQual'].replace(overall_qual_replace)
df_test['OverallQual'] = df_test['OverallQual'].replace(overall_qual_replace)

df_train['OverallCond'] = df_train['OverallCond'].replace(overall_cond_replace)
df_test['OverallCond'] = df_test['OverallCond'].replace(overall_cond_replace)

In [None]:
# Sanity-check the recoding: distinct labels and their frequencies for the
# overall-quality column...
print(df_train['OverallQual'].unique())
print(df_train['OverallQual'].value_counts())

# ...and for the overall-condition column.
print(df_train['OverallCond'].unique())
print(df_train['OverallCond'].value_counts())

In [None]:
# Box plots of SalePrice per overall-quality and per overall-condition label.
categorical_comp(df_train, 'OverallQual')
categorical_comp(df_train, 'OverallCond')

In [None]:
# Distribution of SalePrice (still on its raw scale in this cell) with a
# fitted normal curve overlaid.
sns.distplot(df_train['SalePrice'], fit=norm)

# Parameters of the normal distribution fitted to the same data.
(mu, sigma) = norm.fit(df_train['SalePrice'])

print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Raw string: "\m" and "\s" are not valid Python escape sequences, so a plain
# literal triggers an invalid-escape warning. The runtime text is unchanged.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f})'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

# Probability (Q-Q) plot against the normal distribution, on its own figure.
fig = plt.figure()
res = stats.probplot(df_train['SalePrice'], plot=plt)
plt.show()

In [None]:
# Replace SalePrice with log(1 + SalePrice).
# WARNING: this overwrites the column, so the cell is not idempotent;
# re-running it without reloading the data applies the transform again.
df_train['SalePrice'] = np.log1p(df_train['SalePrice'])

# Distribution of the transformed target with a fitted normal curve overlaid.
sns.distplot(df_train['SalePrice'], fit=norm)

# Parameters of the normal distribution fitted to the transformed data.
(mu, sigma) = norm.fit(df_train['SalePrice'])

print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Raw string: "\m" and "\s" are not valid Python escape sequences, so a plain
# literal triggers an invalid-escape warning. The runtime text is unchanged.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f})'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

# Probability (Q-Q) plot against the normal distribution, on its own figure.
fig = plt.figure()
res = stats.probplot(df_train['SalePrice'], plot=plt)
plt.show()

In [None]:
# Look at SalePrice against MSSubClass two ways: a scatter plot of the codes
# and a box plot per code.
var = "MSSubClass"
data = df_train[['SalePrice', var]]
data.plot.scatter(x=var, y='SalePrice')

f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(x=var, y='SalePrice', data=data, ax=ax)

In [None]:
# MSSubClass numeric code -> letter label; codes in ascending order are paired
# with 'a'..'p'.
ms_sub_class_replace = dict(
    zip(
        (20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 150, 160, 180, 190),
        'abcdefghijklmnop',
    )
)

In [None]:
# Recode the numeric MSSubClass codes to letter labels in both frames.
# The result is assigned back to the column: calling
# `df[col].replace(..., inplace=True)` mutates a temporary Series (chained
# assignment), which pandas warns about and which leaves the DataFrame
# untouched once Copy-on-Write is enabled.
df_train['MSSubClass'] = df_train['MSSubClass'].replace(ms_sub_class_replace)
df_test['MSSubClass'] = df_test['MSSubClass'].replace(ms_sub_class_replace)

In [None]:
# Sanity-check the recoding: distinct MSSubClass labels and their frequencies.
print(df_train['MSSubClass'].unique())
print(df_train['MSSubClass'].value_counts())

In [None]:
# Box plot of SalePrice per recoded MSSubClass label.
categorical_comp(df_train, "MSSubClass")

In [None]:
# Subset of `feature_categories` tagged 'c' (categorical), in declaration order.
categorical_features = {
    name: kind
    for name, kind in feature_categories.items()
    if kind == 'c'
}

In [None]:
# Display the categorical subset for a quick visual check.
categorical_features

In [None]:
# One SalePrice box plot per categorical feature (each call opens a new figure).
for key in categorical_features:
    categorical_comp(df_train, key)

In [None]:
# Re-list the training columns before working on the year features.
df_train.columns

In [None]:
# Columns that hold calendar years.
year_features = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold']

In [None]:
# Summary statistics of the year columns before they are converted.
df_train[year_features].describe()

In [None]:
# `year_min` is not used by the cells visible here; it is kept in case later
# cells rely on it.
year_min = 1900
# Reference year that every year column is subtracted from.
year_max = 2010

# Turn each calendar-year column into "years before `year_max`" in both frames,
# using the shared `year_features` list and the `year_max` constant instead of
# eight hand-written statements with a repeated literal.
# WARNING: the columns are overwritten, so this cell is not idempotent;
# re-running it converts the values back to calendar years.
for frame in (df_train, df_test):
    for year_col in year_features:
        frame[year_col] = year_max - frame[year_col]

In [None]:
# Summary statistics of the year columns after the 2010-minus-year conversion.
df_train[year_features].describe()

In [None]:
def numerical_comp(df, var):
    """Scatter-plot ``SalePrice`` (y-axis) against the column ``var`` (x-axis).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``SalePrice`` column and the column named by ``var``.
    var : str
        Name of the column to put on the x-axis.

    Returns
    -------
    None
        The plot is drawn through the pandas plotting API.
    """
    pair = df[['SalePrice', var]]
    pair.plot.scatter(x=var, y='SalePrice')
    # Rotate the x tick labels to vertical.
    plt.xticks(rotation=90)

In [None]:
# Subset of `feature_categories` tagged 'n' (numerical), in declaration order.
numerical_filtered = {
    name: kind
    for name, kind in feature_categories.items()
    if kind == 'n'
}

In [None]:
# Display the numerical subset for a quick visual check.
numerical_filtered

In [None]:
# Plain list of the numerical feature names (the dict's keys, in order).
numerical_features = list(numerical_filtered)
numerical_features

In [None]:
# Preview the first rows of the numerical columns only.
df_train[numerical_features].head()

In [None]:
# One SalePrice scatter plot per numerical feature (iterates the dict's keys).
for feature in numerical_filtered:
    numerical_comp(df_train, feature)

In [None]:
# Numerical features singled out for a closer look at outliers.
outlier_features = [
    'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF',
    'GrLivArea', 'GarageArea', 'OpenPorchSF',
]

In [None]:
# Correlation of each outlier-candidate feature with SalePrice (which is
# already log1p-transformed at this point), using `Series.corr` defaults.
for feature in outlier_features:
    corr = df_train['SalePrice'].corr(df_train[feature])
    print(f'Corr SalesPrice and {feature}: {corr}')

In [None]:
# Remove training rows with a very large living area (GrLivArea > 4000) but a
# low price. SalePrice is on the log1p scale here, so the 12.5 threshold is in
# log units, not currency.
df_train = df_train.drop(
    index=df_train.loc[
        (df_train['GrLivArea'] > 4000) & (df_train['SalePrice'] < 12.5)
    ].index
)

In [None]:
# Redraw the scatter plots for the outlier-candidate features now that the
# GrLivArea outlier rows have been dropped.
for feature in outlier_features:
    numerical_comp(df_train, feature)

In [None]:
# Distribution of GarageArea, restricted to rows with GarageArea > 0, with a
# fitted normal curve overlaid.
sns.distplot(df_train['GarageArea'][df_train['GarageArea'] > 0], fit=norm)

# Parameters of the normal distribution fitted to the same filtered data.
(mu, sigma) = norm.fit(df_train['GarageArea'][df_train['GarageArea'] > 0])

print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Raw string: "\m" and "\s" are not valid Python escape sequences, so a plain
# literal triggers an invalid-escape warning. The runtime text is unchanged.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f})'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.title('GarageArea distribution')

# Probability (Q-Q) plot against the normal distribution, on its own figure.
fig = plt.figure()
res = stats.probplot(df_train['GarageArea'][df_train['GarageArea'] > 0], plot=plt)
plt.show()