In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, Lasso, Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

from scipy.stats import pearsonr

  import pandas.util.testing as tm


In [2]:
# Load the holdout set (test.csv) from the datasets directory.

df_holdout_data = pd.read_csv(filepath_or_buffer='../datasets/test.csv')


In [3]:
# First look at the holdout frame: its (rows, columns) size, then the first five rows.

print(df_holdout_data.shape)
df_holdout_data.head(n=5)

(879, 80)


Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


In [4]:
# Breakdown of the columns that contain null entries.

# One row per column of the holdout frame, holding that column's null count.
null_values_df = df_holdout_data.isnull().sum().to_frame(name='count_of_null_values')

# Show only the columns that actually have nulls.
null_values_df[null_values_df['count_of_null_values'] != 0]

Unnamed: 0,count_of_null_values
Lot Frontage,160
Alley,821
Mas Vnr Type,1
Mas Vnr Area,1
Bsmt Qual,25
Bsmt Cond,25
Bsmt Exposure,25
BsmtFin Type 1,25
BsmtFin Type 2,25
Electrical,1


In [5]:
# Rows of the holdout set whose 'Mas Vnr Type' is null.
df_holdout_data.loc[df_holdout_data['Mas Vnr Type'].isna()]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
866,868,907260030,60,RL,70.0,8749,Pave,,Reg,Lvl,...,0,0,0,,,,0,11,2009,WD


In [6]:
# Rows of the holdout set whose 'Mas Vnr Area' is null.
df_holdout_data.loc[df_holdout_data['Mas Vnr Area'].isna()]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
866,868,907260030,60,RL,70.0,8749,Pave,,Reg,Lvl,...,0,0,0,,,,0,11,2009,WD


Checking PID 907260030 on the http://www.cityofames.org/assessor/ website, Mas Vnr Area is actually 0. Hence the null Mas Vnr Area here should be filled with 0, and the matching null Mas Vnr Type with 'None'.

In [7]:
# Rows of the holdout set whose 'Electrical' is null.
df_holdout_data.loc[df_holdout_data['Electrical'].isna()]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
635,1578,916386080,80,RL,73.0,9735,Pave,,Reg,Lvl,...,0,0,0,,,,0,5,2008,WD


Couldn't find the electrical data on the source website. Imputing SBrkr, the most common value (mode): Electrical is categorical, neither continuous nor discrete, so it has no meaningful mean.

Substitute later!

In [8]:
# Base features used for modelling ('Id' is carried along as the last entry).

features = [
    '1st Flr SF',
    '2nd Flr SF',
    'MS SubClass',
    'MS Zoning',
    'Lot Frontage',
    'Lot Area',
    'Neighborhood',
    'Overall Qual',
    'Year Built',
    'Roof Style',
    'Mas Vnr Type',
    'Exter Qual',
    'Foundation',
    'Bsmt Qual',
    'Bsmt Cond',
    'Bsmt Exposure',
    'BsmtFin Type 1',
    'BsmtFin SF 1',
    'Total Bsmt SF',
    'Heating QC',
    'Gr Liv Area',
    'Kitchen Qual',
    'TotRms AbvGrd',
    'Garage Type',
    'Garage Yr Blt',
    'Garage Finish',
    'Garage Cars',
    'Garage Area',
    'Id',
]

In [9]:
# Restrict the holdout frame to the modelling features (all rows, listed columns only).
df_holdout_cleaning = df_holdout_data.loc[:, features]

In [10]:
# Per-column non-null counts and dtypes of the selected feature columns.
df_holdout_cleaning.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 879 entries, 0 to 878
Data columns (total 29 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   1st Flr SF      879 non-null    int64  
 1   2nd Flr SF      879 non-null    int64  
 2   MS SubClass     879 non-null    int64  
 3   MS Zoning       879 non-null    object 
 4   Lot Frontage    719 non-null    float64
 5   Lot Area        879 non-null    int64  
 6   Neighborhood    879 non-null    object 
 7   Overall Qual    879 non-null    int64  
 8   Year Built      879 non-null    int64  
 9   Roof Style      879 non-null    object 
 10  Mas Vnr Type    878 non-null    object 
 11  Exter Qual      879 non-null    object 
 12  Foundation      879 non-null    object 
 13  Bsmt Qual       854 non-null    object 
 14  Bsmt Cond       854 non-null    object 
 15  Bsmt Exposure   854 non-null    object 
 16  BsmtFin Type 1  854 non-null    object 
 17  BsmtFin SF 1    879 non-null    int

In [11]:
# function to clean 'Nominal' variables

def clean_nominal_func(df_to_be_cleaned,variable_colname_as_str,drop_nan_yes_or_no):
    """One-hot encode a nominal column after dealing with its null values.

    Parameters
    ----------
    df_to_be_cleaned : pandas.DataFrame
        Frame holding the column. It is copied; the caller's frame is not modified.
    variable_colname_as_str : str
        Name of the nominal column to encode.
    drop_nan_yes_or_no : str
        Only consulted when the column contains nulls. 'Yes'/'yes' drops the
        rows whose value in this column is null; 'No'/'no' replaces those
        nulls with the string 'Unknown'. Any other value leaves the nulls
        untouched before encoding.

    Returns
    -------
    pandas.DataFrame
        The dummy columns (named '<column>_<value>') followed by the remaining
        columns of the cleaned frame. The encoded column itself is removed.
        When rows were dropped they are absent from the result.
    """
    df_clean_nominal = df_to_be_cleaned.copy()

    # counting nan
    print(df_clean_nominal.isnull().sum())

    if df_clean_nominal[variable_colname_as_str].isnull().sum() == 0:
        print(f"There are no entries with null values in '{variable_colname_as_str}'.\n")

        # displaying value_counts
        print("These are the value_counts.\n")
        print(df_clean_nominal[variable_colname_as_str].value_counts())

    else:
        print(f"There are some entries with null values in '{variable_colname_as_str}'.\n")

        if drop_nan_yes_or_no in ('Yes', 'yes'):
            # drop every row whose value in this column is null
            df_clean_nominal = df_clean_nominal.dropna(subset=[variable_colname_as_str])

        elif drop_nan_yes_or_no in ('No', 'no'):
            # Assign the filled column back: a chained
            # df[col].fillna(..., inplace=True) is not guaranteed to write
            # through to the frame (it does not under pandas Copy-on-Write).
            df_clean_nominal[variable_colname_as_str] = (
                df_clean_nominal[variable_colname_as_str].fillna('Unknown')
            )

        # post nan processing printout to verify
        print('\nAfter null value processing, this is the new null count.\n')
        print(df_clean_nominal.isnull().sum())
        print('\nAfter null value processing, these are the new value_counts.\n')
        print(df_clean_nominal[variable_colname_as_str].value_counts())

    # Cast every value to str so that numeric codes are treated as labels;
    # get_dummies leaves numeric columns unencoded by default.
    labels_as_text = df_clean_nominal[variable_colname_as_str].map(str).to_frame()

    # converting the categorical column into a one-hot encoded matrix
    dummy_var_df = pd.get_dummies(labels_as_text)

    # Join the dummies to the CLEANED frame. Joining to the untouched input
    # would bring dropped rows back, with NaN in every dummy column.
    remaining_columns = df_clean_nominal.drop(columns=[variable_colname_as_str])
    df_clean_nominal_return = pd.concat([dummy_var_df, remaining_columns], axis='columns')

    return df_clean_nominal_return

In [12]:
# function for cleaning 'Ordinal' variables

def clean_ordinal_func (df_to_be_cleaned,variable_colname_as_str,drop_nan_yes_or_no,ordinal_string):
    """Replace an ordinal column's labels by their integer rank.

    Parameters
    ----------
    df_to_be_cleaned : pandas.DataFrame
        Frame holding the column. It is copied; the caller's frame is not modified.
    variable_colname_as_str : str
        Name of the ordinal column to convert.
    drop_nan_yes_or_no : str
        Only consulted when the column contains nulls. 'Yes'/'yes' drops the
        rows whose value in this column is null; 'No'/'no' replaces those
        nulls with the string 'Unknown'.
    ordinal_string : str
        Comma-separated labels, lowest rank first. A label's rank is its
        position in this list (0-based). Include 'Unknown' when nulls are
        filled rather than dropped.

    Returns
    -------
    pandas.DataFrame
        The converted column first, followed by the remaining columns of the
        cleaned frame. When rows were dropped they are absent from the result.

    Raises
    ------
    ValueError
        If the column holds a value (including a remaining null) that is not
        listed in ``ordinal_string``.
    """
    df_clean_ordinal = df_to_be_cleaned.copy()

    # counting nan
    print(df_clean_ordinal.isnull().sum())

    if df_clean_ordinal[variable_colname_as_str].isnull().sum() == 0:
        print(f"There are no entries with null values in '{variable_colname_as_str}'.\n")

        # displaying value_counts
        print("These are the value_counts.\n")
        print(df_clean_ordinal[variable_colname_as_str].value_counts())

    else:
        print(f"There are some entries with null values in '{variable_colname_as_str}'.\n")

        if drop_nan_yes_or_no in ('Yes', 'yes'):
            # drop every row whose value in this column is null
            df_clean_ordinal = df_clean_ordinal.dropna(subset=[variable_colname_as_str])

        elif drop_nan_yes_or_no in ('No', 'no'):
            # Assign the filled column back: a chained
            # df[col].fillna(..., inplace=True) is not guaranteed to write
            # through to the frame (it does not under pandas Copy-on-Write).
            df_clean_ordinal[variable_colname_as_str] = (
                df_clean_ordinal[variable_colname_as_str].fillna('Unknown')
            )

        # post nan processing printout to verify
        print('\nAfter null value processing, this is the new null count.\n')
        print(df_clean_ordinal.isnull().sum())
        print('\nAfter null value processing, these are the new value_counts.\n')
        print(df_clean_ordinal[variable_colname_as_str].value_counts())

    # printing set of all possible values just to check
    print(f"\nSet of all possible values for column {set(df_clean_ordinal[variable_colname_as_str])}\n")

    # ordered labels exactly as keyed in by the caller
    list_of_possible_column_values = ordinal_string.split(',')

    # showing list of all possible column values
    print(f"\nList of all possible values for column, including 'Unknown' for '{variable_colname_as_str}': {list_of_possible_column_values}\n")

    # label -> rank lookup; setdefault keeps the FIRST position of a repeated
    # label, which is what list.index() would return
    rank_by_label = {}
    for rank, label in enumerate(list_of_possible_column_values):
        rank_by_label.setdefault(label, rank)

    # fail loudly, naming the offending values, instead of a bare
    # "x is not in list" from list.index()
    is_listed = df_clean_ordinal[variable_colname_as_str].isin(list_of_possible_column_values)
    if not is_listed.all():
        unexpected = sorted(
            {str(value) for value in df_clean_ordinal.loc[~is_listed, variable_colname_as_str]}
        )
        raise ValueError(
            f"Column '{variable_colname_as_str}' contains values not listed in ordinal_string: {unexpected}"
        )

    # converting str values in the column to integer ordered values
    df_ordinal_values_converted = (
        df_clean_ordinal[variable_colname_as_str].map(rank_by_label).to_frame()
    )

    # printing unconverted and converted value_counts to check conversion accuracy
    print(df_clean_ordinal[variable_colname_as_str].value_counts())
    print(df_ordinal_values_converted[variable_colname_as_str].value_counts())

    # Join the converted column to the CLEANED frame. Joining to the untouched
    # input would bring dropped rows back, with NaN in the converted column.
    remaining_columns = df_clean_ordinal.drop(columns=[variable_colname_as_str])
    df_clean_ordinal_return = pd.concat([df_ordinal_values_converted, remaining_columns], axis='columns')

    return df_clean_ordinal_return

In [13]:
# function for cleaning 'Continuous' or 'Discrete' variables

def clean_condis_func(df_to_be_cleaned,variable_colname_as_str,drop_nan_yes_or_no,fill_value='Unknown'):
    """Handle the null values of a continuous or discrete column.

    Parameters
    ----------
    df_to_be_cleaned : pandas.DataFrame
        Frame holding the column. It is copied; the caller's frame is not modified.
    variable_colname_as_str : str
        Name of the column to clean.
    drop_nan_yes_or_no : str
        Only consulted when the column contains nulls. 'Yes'/'yes' drops the
        rows whose value in this column is null; 'No'/'no' replaces those
        nulls with ``fill_value``.
    fill_value : scalar, default 'Unknown'
        Replacement used for nulls when they are not dropped. The default
        keeps the original placeholder string, which turns the column into an
        object column; pass a number to impute directly and keep it numeric.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy of the frame, with the same columns in the same order.
    """
    df_clean_condis = df_to_be_cleaned.copy()

    # counting nan
    print(df_clean_condis.isnull().sum())

    if df_clean_condis[variable_colname_as_str].isnull().sum() == 0:
        print(f"There are no entries with null values in '{variable_colname_as_str}'.\n")

        # displaying value_counts
        print("These are the value_counts.\n")
        print(df_clean_condis[variable_colname_as_str].value_counts())

    else:
        print(f"There are some entries with null values in '{variable_colname_as_str}'.\n")

        if drop_nan_yes_or_no in ('Yes', 'yes'):
            # drop every row whose value in this column is null
            df_clean_condis = df_clean_condis.dropna(subset=[variable_colname_as_str])

        elif drop_nan_yes_or_no in ('No', 'no'):
            column_values = df_clean_condis[variable_colname_as_str]
            if isinstance(fill_value, str):
                # a text placeholder cannot live in a numeric column, so make
                # the switch to object dtype explicit
                column_values = column_values.astype(object)
            # Assign the filled column back: a chained
            # df[col].fillna(..., inplace=True) is not guaranteed to write
            # through to the frame (it does not under pandas Copy-on-Write).
            df_clean_condis[variable_colname_as_str] = column_values.fillna(fill_value)

        # post nan processing printout to verify
        print('\nAfter null value processing, this is the new null count.\n')
        print(df_clean_condis.isnull().sum())
        print('\nAfter null value processing, these are the new value_counts.\n')
        print(df_clean_condis[variable_colname_as_str].value_counts())

    return df_clean_condis
        
        
        

In [14]:
# One-hot encode 'MS SubClass', starting from the feature-restricted holdout frame.

df_under_cleaning = clean_nominal_func(
    df_to_be_cleaned=df_holdout_cleaning,
    variable_colname_as_str='MS SubClass',
    drop_nan_yes_or_no='Yes',
)

print(df_under_cleaning.shape)
df_under_cleaning.head()

1st Flr SF          0
2nd Flr SF          0
MS SubClass         0
MS Zoning           0
Lot Frontage      160
Lot Area            0
Neighborhood        0
Overall Qual        0
Year Built          0
Roof Style          0
Mas Vnr Type        1
Exter Qual          0
Foundation          0
Bsmt Qual          25
Bsmt Cond          25
Bsmt Exposure      25
BsmtFin Type 1     25
BsmtFin SF 1        0
Total Bsmt SF       0
Heating QC          0
Gr Liv Area         0
Kitchen Qual        0
TotRms AbvGrd       0
Garage Type        44
Garage Yr Blt      45
Garage Finish      45
Garage Cars         0
Garage Area         0
Id                  0
dtype: int64
There are no entries with null values in 'MS SubClass'.

These are the value_counts.

20     309
60     181
50      89
120     60
160     41
70      38
30      38
90      34
80      32
85      20
190     15
75       7
45       7
180      6
40       2
Name: MS SubClass, dtype: int64
(879, 43)


Unnamed: 0,MS SubClass_120,MS SubClass_160,MS SubClass_180,MS SubClass_190,MS SubClass_20,MS SubClass_30,MS SubClass_40,MS SubClass_45,MS SubClass_50,MS SubClass_60,...,Heating QC,Gr Liv Area,Kitchen Qual,TotRms AbvGrd,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Id
0,0,0,0,1,0,0,0,0,0,0,...,Gd,1928,Fa,9,Detchd,1910.0,Unf,1,440,2658
1,0,0,0,0,0,0,0,0,0,0,...,TA,1967,TA,10,Attchd,1977.0,Fin,2,580,2718
2,0,0,0,0,0,0,0,0,0,1,...,Ex,1496,Gd,7,Attchd,2006.0,RFn,2,426,2414
3,0,0,0,0,0,1,0,0,0,0,...,TA,968,TA,5,Detchd,1935.0,Unf,2,480,1989
4,0,0,0,0,1,0,0,0,0,0,...,Gd,1394,TA,6,Attchd,1963.0,RFn,2,514,625


In [15]:
# further cleaning MS SubClass

# Keep only the MS SubClass dummies listed here and drop every other
# 'MS SubClass_*' column. Selecting the columns by prefix, rather than
# hard-coding the ones to drop and using inplace=True, avoids a KeyError when
# a listed level is absent from the data and lets the cell be re-run safely.
ms_subclass_dummies_to_keep = ['MS SubClass_50', 'MS SubClass_60']
ms_subclass_dummies_to_drop = [
    column for column in df_under_cleaning.columns
    if column.startswith('MS SubClass_') and column not in ms_subclass_dummies_to_keep
]
df_under_cleaning = df_under_cleaning.drop(columns=ms_subclass_dummies_to_drop)

print(df_under_cleaning.shape)
df_under_cleaning.head()

(879, 30)


Unnamed: 0,MS SubClass_50,MS SubClass_60,1st Flr SF,2nd Flr SF,MS Zoning,Lot Frontage,Lot Area,Neighborhood,Overall Qual,Year Built,...,Heating QC,Gr Liv Area,Kitchen Qual,TotRms AbvGrd,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Id
0,0,0,908,1020,RM,69.0,9142,OldTown,6,1910,...,Gd,1928,Fa,9,Detchd,1910.0,Unf,1,440,2658
1,0,0,1967,0,RL,,9662,Sawyer,5,1977,...,TA,1967,TA,10,Attchd,1977.0,Fin,2,580,2718
2,0,1,664,832,RL,58.0,17104,Gilbert,7,2006,...,Ex,1496,Gd,7,Attchd,2006.0,RFn,2,426,2414
3,0,0,968,0,RM,60.0,8520,OldTown,5,1923,...,TA,968,TA,5,Detchd,1935.0,Unf,2,480,1989
4,0,0,1394,0,RL,,9500,NAmes,6,1963,...,Gd,1394,TA,6,Attchd,1963.0,RFn,2,514,625


In [16]:
# Convert 'MS Zoning' to integer ranks; a label's rank is its position in the
# comma-separated list passed as ordinal_string.

df_under_cleaning = clean_ordinal_func(
    df_to_be_cleaned=df_under_cleaning,
    variable_colname_as_str='MS Zoning',
    drop_nan_yes_or_no='Yes',
    ordinal_string='Unknown,RP,I (all),C (all),A (agr),RH,RM,FV,RL',
)

print(df_under_cleaning.shape)
df_under_cleaning.head()

MS SubClass_50      0
MS SubClass_60      0
1st Flr SF          0
2nd Flr SF          0
MS Zoning           0
Lot Frontage      160
Lot Area            0
Neighborhood        0
Overall Qual        0
Year Built          0
Roof Style          0
Mas Vnr Type        1
Exter Qual          0
Foundation          0
Bsmt Qual          25
Bsmt Cond          25
Bsmt Exposure      25
BsmtFin Type 1     25
BsmtFin SF 1        0
Total Bsmt SF       0
Heating QC          0
Gr Liv Area         0
Kitchen Qual        0
TotRms AbvGrd       0
Garage Type        44
Garage Yr Blt      45
Garage Finish      45
Garage Cars         0
Garage Area         0
Id                  0
dtype: int64
There are no entries with null values in 'MS Zoning'.

These are the value_counts.

RL         675
RM         146
FV          38
RH          13
C (all)      6
I (all)      1
Name: MS Zoning, dtype: int64

Set of all possible values for column {'RH', 'C (all)', 'RL', 'I (all)', 'FV', 'RM'}


List of all possible values for col

Unnamed: 0,MS Zoning,MS SubClass_50,MS SubClass_60,1st Flr SF,2nd Flr SF,Lot Frontage,Lot Area,Neighborhood,Overall Qual,Year Built,...,Heating QC,Gr Liv Area,Kitchen Qual,TotRms AbvGrd,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Id
0,6,0,0,908,1020,69.0,9142,OldTown,6,1910,...,Gd,1928,Fa,9,Detchd,1910.0,Unf,1,440,2658
1,8,0,0,1967,0,,9662,Sawyer,5,1977,...,TA,1967,TA,10,Attchd,1977.0,Fin,2,580,2718
2,8,0,1,664,832,58.0,17104,Gilbert,7,2006,...,Ex,1496,Gd,7,Attchd,2006.0,RFn,2,426,2414
3,6,0,0,968,0,60.0,8520,OldTown,5,1923,...,TA,968,TA,5,Detchd,1935.0,Unf,2,480,1989
4,8,0,0,1394,0,,9500,NAmes,6,1963,...,Gd,1394,TA,6,Attchd,1963.0,RFn,2,514,625


In [17]:
# Handle the nulls in 'Lot Frontage': they are kept (not dropped) and marked 'Unknown'.

df_under_cleaning = clean_condis_func(
    df_to_be_cleaned=df_under_cleaning,
    variable_colname_as_str='Lot Frontage',
    drop_nan_yes_or_no='No',
)

print(df_under_cleaning.shape)
df_under_cleaning.head()

MS Zoning           0
MS SubClass_50      0
MS SubClass_60      0
1st Flr SF          0
2nd Flr SF          0
Lot Frontage      160
Lot Area            0
Neighborhood        0
Overall Qual        0
Year Built          0
Roof Style          0
Mas Vnr Type        1
Exter Qual          0
Foundation          0
Bsmt Qual          25
Bsmt Cond          25
Bsmt Exposure      25
BsmtFin Type 1     25
BsmtFin SF 1        0
Total Bsmt SF       0
Heating QC          0
Gr Liv Area         0
Kitchen Qual        0
TotRms AbvGrd       0
Garage Type        44
Garage Yr Blt      45
Garage Finish      45
Garage Cars         0
Garage Area         0
Id                  0
dtype: int64
There are some entries with null values in 'Lot Frontage'.


After null value processing, this is the new null count.

MS Zoning          0
MS SubClass_50     0
MS SubClass_60     0
1st Flr SF         0
2nd Flr SF         0
Lot Frontage       0
Lot Area           0
Neighborhood       0
Overall Qual       0
Year Built         

Unnamed: 0,MS Zoning,MS SubClass_50,MS SubClass_60,1st Flr SF,2nd Flr SF,Lot Frontage,Lot Area,Neighborhood,Overall Qual,Year Built,...,Heating QC,Gr Liv Area,Kitchen Qual,TotRms AbvGrd,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Id
0,6,0,0,908,1020,69,9142,OldTown,6,1910,...,Gd,1928,Fa,9,Detchd,1910.0,Unf,1,440,2658
1,8,0,0,1967,0,Unknown,9662,Sawyer,5,1977,...,TA,1967,TA,10,Attchd,1977.0,Fin,2,580,2718
2,8,0,1,664,832,58,17104,Gilbert,7,2006,...,Ex,1496,Gd,7,Attchd,2006.0,RFn,2,426,2414
3,6,0,0,968,0,60,8520,OldTown,5,1923,...,TA,968,TA,5,Detchd,1935.0,Unf,2,480,1989
4,8,0,0,1394,0,Unknown,9500,NAmes,6,1963,...,Gd,1394,TA,6,Attchd,1963.0,RFn,2,514,625


In [18]:
# further cleaning Lot Frontage's Unknown values

# Fitting an SLR of Lot Frontage on SalePrice was considered, as was a simple
# ratio of SalePrice.mean to Lot Frontage.mean. However, the model shows a
# close-to-0 coefficient between Lot Frontage and Sale Price, and the holdout
# data has no SalePrice to apply such a ratio to.

# Decision: fill a dummy value into Lot Frontage.
# NOTE(review): the mean of the observed values is printed below for reference
# only; the value actually filled in is 0, not the mean. Confirm this matches
# the preprocessing applied to the training data.


temp_df=df_holdout_data['Lot Frontage'].dropna()
print(temp_df.mean())


# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection, which is not guaranteed to write through to the frame, and
# cast explicitly so the column is numeric again after holding 'Unknown'.
df_under_cleaning['Lot Frontage'] = (
    df_under_cleaning['Lot Frontage'].replace('Unknown', 0).astype(float)
)

print(df_under_cleaning.shape)
df_under_cleaning.head()

69.63004172461753
(879, 30)


Unnamed: 0,MS Zoning,MS SubClass_50,MS SubClass_60,1st Flr SF,2nd Flr SF,Lot Frontage,Lot Area,Neighborhood,Overall Qual,Year Built,...,Heating QC,Gr Liv Area,Kitchen Qual,TotRms AbvGrd,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Id
0,6,0,0,908,1020,69.0,9142,OldTown,6,1910,...,Gd,1928,Fa,9,Detchd,1910.0,Unf,1,440,2658
1,8,0,0,1967,0,0.0,9662,Sawyer,5,1977,...,TA,1967,TA,10,Attchd,1977.0,Fin,2,580,2718
2,8,0,1,664,832,58.0,17104,Gilbert,7,2006,...,Ex,1496,Gd,7,Attchd,2006.0,RFn,2,426,2414
3,6,0,0,968,0,60.0,8520,OldTown,5,1923,...,TA,968,TA,5,Detchd,1935.0,Unf,2,480,1989
4,8,0,0,1394,0,0.0,9500,NAmes,6,1963,...,Gd,1394,TA,6,Attchd,1963.0,RFn,2,514,625


In [19]:
# One-hot encode 'Neighborhood'.

df_under_cleaning = clean_nominal_func(
    df_to_be_cleaned=df_under_cleaning,
    variable_colname_as_str='Neighborhood',
    drop_nan_yes_or_no='Yes',
)

print(df_under_cleaning.shape)
df_under_cleaning.head()

MS Zoning          0
MS SubClass_50     0
MS SubClass_60     0
1st Flr SF         0
2nd Flr SF         0
Lot Frontage       0
Lot Area           0
Neighborhood       0
Overall Qual       0
Year Built         0
Roof Style         0
Mas Vnr Type       1
Exter Qual         0
Foundation         0
Bsmt Qual         25
Bsmt Cond         25
Bsmt Exposure     25
BsmtFin Type 1    25
BsmtFin SF 1       0
Total Bsmt SF      0
Heating QC         0
Gr Liv Area        0
Kitchen Qual       0
TotRms AbvGrd      0
Garage Type       44
Garage Yr Blt     45
Garage Finish     45
Garage Cars        0
Garage Area        0
Id                 0
dtype: int64
There are no entries with null values in 'Neighborhood'.

These are the value_counts.

NAmes      133
CollgCr     87
OldTown     76
Somerst     52
Edwards     51
Gilbert     49
NWAmes      44
NridgHt     44
Sawyer      40
SawyerW     38
BrkSide     32
Crawfor     32
Mitchel     32
Timber      24
IDOTRR      24
NoRidge     23
ClearCr     17
SWISU       16


Unnamed: 0,Neighborhood_Blmngtn,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_Greens,...,Heating QC,Gr Liv Area,Kitchen Qual,TotRms AbvGrd,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Id
0,0,0,0,0,0,0,0,0,0,0,...,Gd,1928,Fa,9,Detchd,1910.0,Unf,1,440,2658
1,0,0,0,0,0,0,0,0,0,0,...,TA,1967,TA,10,Attchd,1977.0,Fin,2,580,2718
2,0,0,0,0,0,0,0,0,1,0,...,Ex,1496,Gd,7,Attchd,2006.0,RFn,2,426,2414
3,0,0,0,0,0,0,0,0,0,0,...,TA,968,TA,5,Detchd,1935.0,Unf,2,480,1989
4,0,0,0,0,0,0,0,0,0,0,...,Gd,1394,TA,6,Attchd,1963.0,RFn,2,514,625


In [20]:
# further cleaning Neighborhood

# Keep only the Neighborhood dummies listed here and drop every other
# 'Neighborhood_*' column. Selecting the columns by prefix, rather than
# hard-coding the ones to drop and using inplace=True, avoids a KeyError when
# a listed level is absent from the data and lets the cell be re-run safely.
neighborhood_dummies_to_keep = ['Neighborhood_NAmes', 'Neighborhood_NoRidge',
                                'Neighborhood_Somerst']
neighborhood_dummies_to_drop = [
    column for column in df_under_cleaning.columns
    if column.startswith('Neighborhood_') and column not in neighborhood_dummies_to_keep
]
df_under_cleaning = df_under_cleaning.drop(columns=neighborhood_dummies_to_drop)

print(df_under_cleaning.shape)
df_under_cleaning.head()

(879, 32)


Unnamed: 0,Neighborhood_NAmes,Neighborhood_NoRidge,Neighborhood_Somerst,MS Zoning,MS SubClass_50,MS SubClass_60,1st Flr SF,2nd Flr SF,Lot Frontage,Lot Area,...,Heating QC,Gr Liv Area,Kitchen Qual,TotRms AbvGrd,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Id
0,0,0,0,6,0,0,908,1020,69.0,9142,...,Gd,1928,Fa,9,Detchd,1910.0,Unf,1,440,2658
1,0,0,0,8,0,0,1967,0,0.0,9662,...,TA,1967,TA,10,Attchd,1977.0,Fin,2,580,2718
2,0,0,0,8,0,1,664,832,58.0,17104,...,Ex,1496,Gd,7,Attchd,2006.0,RFn,2,426,2414
3,0,0,0,6,0,0,968,0,60.0,8520,...,TA,968,TA,5,Detchd,1935.0,Unf,2,480,1989
4,1,0,0,8,0,0,1394,0,0.0,9500,...,Gd,1394,TA,6,Attchd,1963.0,RFn,2,514,625


In [21]:
# One-hot encode 'Roof Style'.

df_under_cleaning = clean_nominal_func(
    df_to_be_cleaned=df_under_cleaning,
    variable_colname_as_str='Roof Style',
    drop_nan_yes_or_no='Yes',
)

print(df_under_cleaning.shape)
df_under_cleaning.head()

Neighborhood_NAmes       0
Neighborhood_NoRidge     0
Neighborhood_Somerst     0
MS Zoning                0
MS SubClass_50           0
MS SubClass_60           0
1st Flr SF               0
2nd Flr SF               0
Lot Frontage             0
Lot Area                 0
Overall Qual             0
Year Built               0
Roof Style               0
Mas Vnr Type             1
Exter Qual               0
Foundation               0
Bsmt Qual               25
Bsmt Cond               25
Bsmt Exposure           25
BsmtFin Type 1          25
BsmtFin SF 1             0
Total Bsmt SF            0
Heating QC               0
Gr Liv Area              0
Kitchen Qual             0
TotRms AbvGrd            0
Garage Type             44
Garage Yr Blt           45
Garage Finish           45
Garage Cars              0
Garage Area              0
Id                       0
dtype: int64
There are no entries with null values in 'Roof Style'.

These are the value_counts.

Gable      702
Hip        154
Gambrel 

Unnamed: 0,Roof Style_Flat,Roof Style_Gable,Roof Style_Gambrel,Roof Style_Hip,Roof Style_Mansard,Roof Style_Shed,Neighborhood_NAmes,Neighborhood_NoRidge,Neighborhood_Somerst,MS Zoning,...,Heating QC,Gr Liv Area,Kitchen Qual,TotRms AbvGrd,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Id
0,0,1,0,0,0,0,0,0,0,6,...,Gd,1928,Fa,9,Detchd,1910.0,Unf,1,440,2658
1,0,1,0,0,0,0,0,0,0,8,...,TA,1967,TA,10,Attchd,1977.0,Fin,2,580,2718
2,0,1,0,0,0,0,0,0,0,8,...,Ex,1496,Gd,7,Attchd,2006.0,RFn,2,426,2414
3,0,1,0,0,0,0,0,0,0,6,...,TA,968,TA,5,Detchd,1935.0,Unf,2,480,1989
4,0,1,0,0,0,0,1,0,0,8,...,Gd,1394,TA,6,Attchd,1963.0,RFn,2,514,625


In [22]:
# further cleaning Roof Style

# Keep only the Roof Style dummies listed here and drop every other
# 'Roof Style_*' column. Selecting the columns by prefix, rather than
# hard-coding the ones to drop and using inplace=True, avoids a KeyError when
# a listed level is absent from the data and lets the cell be re-run safely.
roof_style_dummies_to_keep = ['Roof Style_Gable', 'Roof Style_Hip']
roof_style_dummies_to_drop = [
    column for column in df_under_cleaning.columns
    if column.startswith('Roof Style_') and column not in roof_style_dummies_to_keep
]
df_under_cleaning = df_under_cleaning.drop(columns=roof_style_dummies_to_drop)

print(df_under_cleaning.shape)
df_under_cleaning.head()

(879, 33)


Unnamed: 0,Roof Style_Gable,Roof Style_Hip,Neighborhood_NAmes,Neighborhood_NoRidge,Neighborhood_Somerst,MS Zoning,MS SubClass_50,MS SubClass_60,1st Flr SF,2nd Flr SF,...,Heating QC,Gr Liv Area,Kitchen Qual,TotRms AbvGrd,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Id
0,1,0,0,0,0,6,0,0,908,1020,...,Gd,1928,Fa,9,Detchd,1910.0,Unf,1,440,2658
1,1,0,0,0,0,8,0,0,1967,0,...,TA,1967,TA,10,Attchd,1977.0,Fin,2,580,2718
2,1,0,0,0,0,8,0,1,664,832,...,Ex,1496,Gd,7,Attchd,2006.0,RFn,2,426,2414
3,1,0,0,0,0,6,0,0,968,0,...,TA,968,TA,5,Detchd,1935.0,Unf,2,480,1989
4,1,0,1,0,0,8,0,0,1394,0,...,Gd,1394,TA,6,Attchd,1963.0,RFn,2,514,625


In [23]:
# Convert 'Exter Qual' to integer ranks; a label's rank is its position in the
# comma-separated list passed as ordinal_string.

df_under_cleaning = clean_ordinal_func(
    df_to_be_cleaned=df_under_cleaning,
    variable_colname_as_str='Exter Qual',
    drop_nan_yes_or_no='Yes',
    ordinal_string='Unknown,Po,Fa,TA,Gd,Ex',
)

print(df_under_cleaning.shape)
df_under_cleaning.head()



Roof Style_Gable         0
Roof Style_Hip           0
Neighborhood_NAmes       0
Neighborhood_NoRidge     0
Neighborhood_Somerst     0
MS Zoning                0
MS SubClass_50           0
MS SubClass_60           0
1st Flr SF               0
2nd Flr SF               0
Lot Frontage             0
Lot Area                 0
Overall Qual             0
Year Built               0
Mas Vnr Type             1
Exter Qual               0
Foundation               0
Bsmt Qual               25
Bsmt Cond               25
Bsmt Exposure           25
BsmtFin Type 1          25
BsmtFin SF 1             0
Total Bsmt SF            0
Heating QC               0
Gr Liv Area              0
Kitchen Qual             0
TotRms AbvGrd            0
Garage Type             44
Garage Yr Blt           45
Garage Finish           45
Garage Cars              0
Garage Area              0
Id                       0
dtype: int64
There are no entries with null values in 'Exter Qual'.

These are the value_counts.

TA    552
G

Unnamed: 0,Exter Qual,Roof Style_Gable,Roof Style_Hip,Neighborhood_NAmes,Neighborhood_NoRidge,Neighborhood_Somerst,MS Zoning,MS SubClass_50,MS SubClass_60,1st Flr SF,...,Heating QC,Gr Liv Area,Kitchen Qual,TotRms AbvGrd,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Id
0,3,1,0,0,0,0,6,0,0,908,...,Gd,1928,Fa,9,Detchd,1910.0,Unf,1,440,2658
1,3,1,0,0,0,0,8,0,0,1967,...,TA,1967,TA,10,Attchd,1977.0,Fin,2,580,2718
2,4,1,0,0,0,0,8,0,1,664,...,Ex,1496,Gd,7,Attchd,2006.0,RFn,2,426,2414
3,4,1,0,0,0,0,6,0,0,968,...,TA,968,TA,5,Detchd,1935.0,Unf,2,480,1989
4,3,1,0,1,0,0,8,0,0,1394,...,Gd,1394,TA,6,Attchd,1963.0,RFn,2,514,625


In [24]:
# One-hot encode 'Foundation'.

df_under_cleaning = clean_nominal_func(
    df_to_be_cleaned=df_under_cleaning,
    variable_colname_as_str='Foundation',
    drop_nan_yes_or_no='Yes',
)

print(df_under_cleaning.shape)
df_under_cleaning.head()

Exter Qual               0
Roof Style_Gable         0
Roof Style_Hip           0
Neighborhood_NAmes       0
Neighborhood_NoRidge     0
Neighborhood_Somerst     0
MS Zoning                0
MS SubClass_50           0
MS SubClass_60           0
1st Flr SF               0
2nd Flr SF               0
Lot Frontage             0
Lot Area                 0
Overall Qual             0
Year Built               0
Mas Vnr Type             1
Foundation               0
Bsmt Qual               25
Bsmt Cond               25
Bsmt Exposure           25
BsmtFin Type 1          25
BsmtFin SF 1             0
Total Bsmt SF            0
Heating QC               0
Gr Liv Area              0
Kitchen Qual             0
TotRms AbvGrd            0
Garage Type             44
Garage Yr Blt           45
Garage Finish           45
Garage Cars              0
Garage Area              0
Id                       0
dtype: int64
There are no entries with null values in 'Foundation'.

These are the value_counts.

PConc     3

Unnamed: 0,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,Exter Qual,Roof Style_Gable,Roof Style_Hip,Neighborhood_NAmes,...,Heating QC,Gr Liv Area,Kitchen Qual,TotRms AbvGrd,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Id
0,0,0,0,0,1,0,3,1,0,0,...,Gd,1928,Fa,9,Detchd,1910.0,Unf,1,440,2658
1,0,1,0,0,0,0,3,1,0,0,...,TA,1967,TA,10,Attchd,1977.0,Fin,2,580,2718
2,0,0,1,0,0,0,4,1,0,0,...,Ex,1496,Gd,7,Attchd,2006.0,RFn,2,426,2414
3,0,1,0,0,0,0,4,1,0,0,...,TA,968,TA,5,Detchd,1935.0,Unf,2,480,1989
4,0,1,0,0,0,0,3,1,0,1,...,Gd,1394,TA,6,Attchd,1963.0,RFn,2,514,625


In [25]:
# further cleaning Foundation

# Keep only the Foundation dummies listed here and drop every other
# 'Foundation_*' column. Selecting the columns by prefix, rather than
# hard-coding the ones to drop and using inplace=True, avoids a KeyError when
# a listed level is absent from the data and lets the cell be re-run safely.
foundation_dummies_to_keep = ['Foundation_CBlock', 'Foundation_PConc']
foundation_dummies_to_drop = [
    column for column in df_under_cleaning.columns
    if column.startswith('Foundation_') and column not in foundation_dummies_to_keep
]
df_under_cleaning = df_under_cleaning.drop(columns=foundation_dummies_to_drop)

print(df_under_cleaning.shape)
df_under_cleaning.head()



(879, 34)


Unnamed: 0,Foundation_CBlock,Foundation_PConc,Exter Qual,Roof Style_Gable,Roof Style_Hip,Neighborhood_NAmes,Neighborhood_NoRidge,Neighborhood_Somerst,MS Zoning,MS SubClass_50,...,Heating QC,Gr Liv Area,Kitchen Qual,TotRms AbvGrd,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Id
0,0,0,3,1,0,0,0,0,6,0,...,Gd,1928,Fa,9,Detchd,1910.0,Unf,1,440,2658
1,1,0,3,1,0,0,0,0,8,0,...,TA,1967,TA,10,Attchd,1977.0,Fin,2,580,2718
2,0,1,4,1,0,0,0,0,8,0,...,Ex,1496,Gd,7,Attchd,2006.0,RFn,2,426,2414
3,1,0,4,1,0,0,0,0,6,0,...,TA,968,TA,5,Detchd,1935.0,Unf,2,480,1989
4,1,0,3,1,0,1,0,0,8,0,...,Gd,1394,TA,6,Attchd,1963.0,RFn,2,514,625


In [26]:
# Convert 'Bsmt Qual' to integer ranks. Nulls are kept and filled with
# 'Unknown', which sits first in the list passed as ordinal_string.

df_under_cleaning = clean_ordinal_func(
    df_to_be_cleaned=df_under_cleaning,
    variable_colname_as_str='Bsmt Qual',
    drop_nan_yes_or_no='No',
    ordinal_string='Unknown,Po,Fa,TA,Gd,Ex',
)

print(df_under_cleaning.shape)
df_under_cleaning.head()

Foundation_CBlock        0
Foundation_PConc         0
Exter Qual               0
Roof Style_Gable         0
Roof Style_Hip           0
Neighborhood_NAmes       0
Neighborhood_NoRidge     0
Neighborhood_Somerst     0
MS Zoning                0
MS SubClass_50           0
MS SubClass_60           0
1st Flr SF               0
2nd Flr SF               0
Lot Frontage             0
Lot Area                 0
Overall Qual             0
Year Built               0
Mas Vnr Type             1
Bsmt Qual               25
Bsmt Cond               25
Bsmt Exposure           25
BsmtFin Type 1          25
BsmtFin SF 1             0
Total Bsmt SF            0
Heating QC               0
Gr Liv Area              0
Kitchen Qual             0
TotRms AbvGrd            0
Garage Type             44
Garage Yr Blt           45
Garage Finish           45
Garage Cars              0
Garage Area              0
Id                       0
dtype: int64
There are some entries with null values in 'Bsmt Qual'.


After null 

Unnamed: 0,Bsmt Qual,Foundation_CBlock,Foundation_PConc,Exter Qual,Roof Style_Gable,Roof Style_Hip,Neighborhood_NAmes,Neighborhood_NoRidge,Neighborhood_Somerst,MS Zoning,...,Heating QC,Gr Liv Area,Kitchen Qual,TotRms AbvGrd,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Id
0,2,0,0,3,1,0,0,0,0,6,...,Gd,1928,Fa,9,Detchd,1910.0,Unf,1,440,2658
1,4,1,0,3,1,0,0,0,0,8,...,TA,1967,TA,10,Attchd,1977.0,Fin,2,580,2718
2,4,0,1,4,1,0,0,0,0,8,...,Ex,1496,Gd,7,Attchd,2006.0,RFn,2,426,2414
3,3,1,0,4,1,0,0,0,0,6,...,TA,968,TA,5,Detchd,1935.0,Unf,2,480,1989
4,4,1,0,3,1,0,1,0,0,8,...,Gd,1394,TA,6,Attchd,1963.0,RFn,2,514,625


In [27]:
# Ordinal-encode 'Bsmt Cond' with the shared cleaning helper.
# The preview below shows the column as integer codes moved to the front of
# the frame; presumably each code is the label's position in the
# comma-separated scale ('Unknown' = 0 ... 'Ex' = 5) - confirm in clean_ordinal_func.
# NOTE(review): the meaning of the 'No' flag is defined by clean_ordinal_func.
df_under_cleaning = clean_ordinal_func(
    df_under_cleaning,
    'Bsmt Cond',
    'No',
    'Unknown,Po,Fa,TA,Gd,Ex',
)

# Sanity check: frame dimensions, then a preview of the first rows.
print(df_under_cleaning.shape)
df_under_cleaning.head()

Bsmt Qual                0
Foundation_CBlock        0
Foundation_PConc         0
Exter Qual               0
Roof Style_Gable         0
Roof Style_Hip           0
Neighborhood_NAmes       0
Neighborhood_NoRidge     0
Neighborhood_Somerst     0
MS Zoning                0
MS SubClass_50           0
MS SubClass_60           0
1st Flr SF               0
2nd Flr SF               0
Lot Frontage             0
Lot Area                 0
Overall Qual             0
Year Built               0
Mas Vnr Type             1
Bsmt Cond               25
Bsmt Exposure           25
BsmtFin Type 1          25
BsmtFin SF 1             0
Total Bsmt SF            0
Heating QC               0
Gr Liv Area              0
Kitchen Qual             0
TotRms AbvGrd            0
Garage Type             44
Garage Yr Blt           45
Garage Finish           45
Garage Cars              0
Garage Area              0
Id                       0
dtype: int64
There are some entries with null values in 'Bsmt Cond'.


After null 

Unnamed: 0,Bsmt Cond,Bsmt Qual,Foundation_CBlock,Foundation_PConc,Exter Qual,Roof Style_Gable,Roof Style_Hip,Neighborhood_NAmes,Neighborhood_NoRidge,Neighborhood_Somerst,...,Heating QC,Gr Liv Area,Kitchen Qual,TotRms AbvGrd,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Id
0,3,2,0,0,3,1,0,0,0,0,...,Gd,1928,Fa,9,Detchd,1910.0,Unf,1,440,2658
1,3,4,1,0,3,1,0,0,0,0,...,TA,1967,TA,10,Attchd,1977.0,Fin,2,580,2718
2,4,4,0,1,4,1,0,0,0,0,...,Ex,1496,Gd,7,Attchd,2006.0,RFn,2,426,2414
3,3,3,1,0,4,1,0,0,0,0,...,TA,968,TA,5,Detchd,1935.0,Unf,2,480,1989
4,3,4,1,0,3,1,0,1,0,0,...,Gd,1394,TA,6,Attchd,1963.0,RFn,2,514,625


In [28]:
# Ordinal-encode 'Bsmt Exposure' with the shared cleaning helper.
# Scale passed to the helper: Unknown, No, Mn, Av, Gd. The preview below shows
# the column as integer codes moved to the front of the frame; presumably the
# code is the label's position in that list - confirm in clean_ordinal_func.
# NOTE(review): the meaning of the 'No' flag (third argument) is defined by
# clean_ordinal_func; it is unrelated to the 'No' exposure level in the scale.
df_under_cleaning = clean_ordinal_func(
    df_under_cleaning,
    'Bsmt Exposure',
    'No',
    'Unknown,No,Mn,Av,Gd',
)

# Sanity check: frame dimensions, then a preview of the first rows.
print(df_under_cleaning.shape)
df_under_cleaning.head()

Bsmt Cond                0
Bsmt Qual                0
Foundation_CBlock        0
Foundation_PConc         0
Exter Qual               0
Roof Style_Gable         0
Roof Style_Hip           0
Neighborhood_NAmes       0
Neighborhood_NoRidge     0
Neighborhood_Somerst     0
MS Zoning                0
MS SubClass_50           0
MS SubClass_60           0
1st Flr SF               0
2nd Flr SF               0
Lot Frontage             0
Lot Area                 0
Overall Qual             0
Year Built               0
Mas Vnr Type             1
Bsmt Exposure           25
BsmtFin Type 1          25
BsmtFin SF 1             0
Total Bsmt SF            0
Heating QC               0
Gr Liv Area              0
Kitchen Qual             0
TotRms AbvGrd            0
Garage Type             44
Garage Yr Blt           45
Garage Finish           45
Garage Cars              0
Garage Area              0
Id                       0
dtype: int64
There are some entries with null values in 'Bsmt Exposure'.


After n

Unnamed: 0,Bsmt Exposure,Bsmt Cond,Bsmt Qual,Foundation_CBlock,Foundation_PConc,Exter Qual,Roof Style_Gable,Roof Style_Hip,Neighborhood_NAmes,Neighborhood_NoRidge,...,Heating QC,Gr Liv Area,Kitchen Qual,TotRms AbvGrd,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Id
0,1,3,2,0,0,3,1,0,0,0,...,Gd,1928,Fa,9,Detchd,1910.0,Unf,1,440,2658
1,1,3,4,1,0,3,1,0,0,0,...,TA,1967,TA,10,Attchd,1977.0,Fin,2,580,2718
2,3,4,4,0,1,4,1,0,0,0,...,Ex,1496,Gd,7,Attchd,2006.0,RFn,2,426,2414
3,1,3,3,1,0,4,1,0,0,0,...,TA,968,TA,5,Detchd,1935.0,Unf,2,480,1989
4,1,3,4,1,0,3,1,0,1,0,...,Gd,1394,TA,6,Attchd,1963.0,RFn,2,514,625


In [29]:
# Ordinal-encode 'BsmtFin Type 1' with the shared cleaning helper.
# Scale passed to the helper: Unknown, Unf, LwQ, Rec, BLQ, ALQ, GLQ. The
# preview below shows the column as integer codes moved to the front of the
# frame; presumably the code is the label's position in that list - confirm
# in clean_ordinal_func.
# NOTE(review): the meaning of the 'No' flag is defined by clean_ordinal_func.
df_under_cleaning = clean_ordinal_func(
    df_under_cleaning,
    'BsmtFin Type 1',
    'No',
    'Unknown,Unf,LwQ,Rec,BLQ,ALQ,GLQ',
)

# Sanity check: frame dimensions, then a preview of the first rows.
print(df_under_cleaning.shape)
df_under_cleaning.head()

Bsmt Exposure            0
Bsmt Cond                0
Bsmt Qual                0
Foundation_CBlock        0
Foundation_PConc         0
Exter Qual               0
Roof Style_Gable         0
Roof Style_Hip           0
Neighborhood_NAmes       0
Neighborhood_NoRidge     0
Neighborhood_Somerst     0
MS Zoning                0
MS SubClass_50           0
MS SubClass_60           0
1st Flr SF               0
2nd Flr SF               0
Lot Frontage             0
Lot Area                 0
Overall Qual             0
Year Built               0
Mas Vnr Type             1
BsmtFin Type 1          25
BsmtFin SF 1             0
Total Bsmt SF            0
Heating QC               0
Gr Liv Area              0
Kitchen Qual             0
TotRms AbvGrd            0
Garage Type             44
Garage Yr Blt           45
Garage Finish           45
Garage Cars              0
Garage Area              0
Id                       0
dtype: int64
There are some entries with null values in 'BsmtFin Type 1'.


After 

Unnamed: 0,BsmtFin Type 1,Bsmt Exposure,Bsmt Cond,Bsmt Qual,Foundation_CBlock,Foundation_PConc,Exter Qual,Roof Style_Gable,Roof Style_Hip,Neighborhood_NAmes,...,Heating QC,Gr Liv Area,Kitchen Qual,TotRms AbvGrd,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Id
0,1,1,3,2,0,0,3,1,0,0,...,Gd,1928,Fa,9,Detchd,1910.0,Unf,1,440,2658
1,1,1,3,4,1,0,3,1,0,0,...,TA,1967,TA,10,Attchd,1977.0,Fin,2,580,2718
2,6,3,4,4,0,1,4,1,0,0,...,Ex,1496,Gd,7,Attchd,2006.0,RFn,2,426,2414
3,1,1,3,3,1,0,4,1,0,0,...,TA,968,TA,5,Detchd,1935.0,Unf,2,480,1989
4,4,1,3,4,1,0,3,1,0,1,...,Gd,1394,TA,6,Attchd,1963.0,RFn,2,514,625


In [30]:
# Run the numeric-column cleaner on 'BsmtFin SF 1'.
# The helper's printed report (below) lists per-column null counts and states
# that this column has no nulls in the holdout data.
# NOTE(review): 'condis' presumably stands for continuous/discrete, and the
# meaning of the 'No' flag is defined by clean_condis_func - confirm there.
df_under_cleaning = clean_condis_func(
    df_under_cleaning,
    'BsmtFin SF 1',
    'No',
)

# Sanity check: frame dimensions, then a preview of the first rows.
print(df_under_cleaning.shape)
df_under_cleaning.head()

BsmtFin Type 1           0
Bsmt Exposure            0
Bsmt Cond                0
Bsmt Qual                0
Foundation_CBlock        0
Foundation_PConc         0
Exter Qual               0
Roof Style_Gable         0
Roof Style_Hip           0
Neighborhood_NAmes       0
Neighborhood_NoRidge     0
Neighborhood_Somerst     0
MS Zoning                0
MS SubClass_50           0
MS SubClass_60           0
1st Flr SF               0
2nd Flr SF               0
Lot Frontage             0
Lot Area                 0
Overall Qual             0
Year Built               0
Mas Vnr Type             1
BsmtFin SF 1             0
Total Bsmt SF            0
Heating QC               0
Gr Liv Area              0
Kitchen Qual             0
TotRms AbvGrd            0
Garage Type             44
Garage Yr Blt           45
Garage Finish           45
Garage Cars              0
Garage Area              0
Id                       0
dtype: int64
There are no entries with null values in 'BsmtFin SF 1'.

These are t

Unnamed: 0,BsmtFin Type 1,Bsmt Exposure,Bsmt Cond,Bsmt Qual,Foundation_CBlock,Foundation_PConc,Exter Qual,Roof Style_Gable,Roof Style_Hip,Neighborhood_NAmes,...,Heating QC,Gr Liv Area,Kitchen Qual,TotRms AbvGrd,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Id
0,1,1,3,2,0,0,3,1,0,0,...,Gd,1928,Fa,9,Detchd,1910.0,Unf,1,440,2658
1,1,1,3,4,1,0,3,1,0,0,...,TA,1967,TA,10,Attchd,1977.0,Fin,2,580,2718
2,6,3,4,4,0,1,4,1,0,0,...,Ex,1496,Gd,7,Attchd,2006.0,RFn,2,426,2414
3,1,1,3,3,1,0,4,1,0,0,...,TA,968,TA,5,Detchd,1935.0,Unf,2,480,1989
4,4,1,3,4,1,0,3,1,0,1,...,Gd,1394,TA,6,Attchd,1963.0,RFn,2,514,625


In [31]:
# Further cleaning of 'BsmtFin SF 1': any 'Unknown' placeholder becomes 0.
#
# The result is assigned back to the column instead of calling
# Series.replace(..., inplace=True) on df_under_cleaning['BsmtFin SF 1'].
# That chained in-place form operates on a temporary Series and does not
# update the DataFrame when pandas Copy-on-Write is active (it emits a
# FutureWarning on pandas 2.x), so the replacement could be silently lost.
df_under_cleaning['BsmtFin SF 1'] = df_under_cleaning['BsmtFin SF 1'].replace('Unknown', 0)

# Sanity check: frame dimensions, then a preview of the first rows.
print(df_under_cleaning.shape)
df_under_cleaning.head()

(879, 34)


Unnamed: 0,BsmtFin Type 1,Bsmt Exposure,Bsmt Cond,Bsmt Qual,Foundation_CBlock,Foundation_PConc,Exter Qual,Roof Style_Gable,Roof Style_Hip,Neighborhood_NAmes,...,Heating QC,Gr Liv Area,Kitchen Qual,TotRms AbvGrd,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Id
0,1,1,3,2,0,0,3,1,0,0,...,Gd,1928,Fa,9,Detchd,1910.0,Unf,1,440,2658
1,1,1,3,4,1,0,3,1,0,0,...,TA,1967,TA,10,Attchd,1977.0,Fin,2,580,2718
2,6,3,4,4,0,1,4,1,0,0,...,Ex,1496,Gd,7,Attchd,2006.0,RFn,2,426,2414
3,1,1,3,3,1,0,4,1,0,0,...,TA,968,TA,5,Detchd,1935.0,Unf,2,480,1989
4,4,1,3,4,1,0,3,1,0,1,...,Gd,1394,TA,6,Attchd,1963.0,RFn,2,514,625


In [32]:
# Run the numeric-column cleaner on 'Total Bsmt SF'.
# The helper's printed report (below) lists per-column null counts and states
# that this column has no nulls in the holdout data.
# NOTE(review): the meaning of the 'No' flag is defined by clean_condis_func.
df_under_cleaning = clean_condis_func(
    df_under_cleaning,
    'Total Bsmt SF',
    'No',
)

# Sanity check: frame dimensions, then a preview of the first rows.
print(df_under_cleaning.shape)
df_under_cleaning.head()

BsmtFin Type 1           0
Bsmt Exposure            0
Bsmt Cond                0
Bsmt Qual                0
Foundation_CBlock        0
Foundation_PConc         0
Exter Qual               0
Roof Style_Gable         0
Roof Style_Hip           0
Neighborhood_NAmes       0
Neighborhood_NoRidge     0
Neighborhood_Somerst     0
MS Zoning                0
MS SubClass_50           0
MS SubClass_60           0
1st Flr SF               0
2nd Flr SF               0
Lot Frontage             0
Lot Area                 0
Overall Qual             0
Year Built               0
Mas Vnr Type             1
BsmtFin SF 1             0
Total Bsmt SF            0
Heating QC               0
Gr Liv Area              0
Kitchen Qual             0
TotRms AbvGrd            0
Garage Type             44
Garage Yr Blt           45
Garage Finish           45
Garage Cars              0
Garage Area              0
Id                       0
dtype: int64
There are no entries with null values in 'Total Bsmt SF'.

These are 

Unnamed: 0,BsmtFin Type 1,Bsmt Exposure,Bsmt Cond,Bsmt Qual,Foundation_CBlock,Foundation_PConc,Exter Qual,Roof Style_Gable,Roof Style_Hip,Neighborhood_NAmes,...,Heating QC,Gr Liv Area,Kitchen Qual,TotRms AbvGrd,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Id
0,1,1,3,2,0,0,3,1,0,0,...,Gd,1928,Fa,9,Detchd,1910.0,Unf,1,440,2658
1,1,1,3,4,1,0,3,1,0,0,...,TA,1967,TA,10,Attchd,1977.0,Fin,2,580,2718
2,6,3,4,4,0,1,4,1,0,0,...,Ex,1496,Gd,7,Attchd,2006.0,RFn,2,426,2414
3,1,1,3,3,1,0,4,1,0,0,...,TA,968,TA,5,Detchd,1935.0,Unf,2,480,1989
4,4,1,3,4,1,0,3,1,0,1,...,Gd,1394,TA,6,Attchd,1963.0,RFn,2,514,625


In [33]:
# Further cleaning of 'Total Bsmt SF': any 'Unknown' placeholder becomes 0.
#
# The result is assigned back to the column instead of calling
# Series.replace(..., inplace=True) on df_under_cleaning['Total Bsmt SF'].
# That chained in-place form operates on a temporary Series and does not
# update the DataFrame when pandas Copy-on-Write is active (it emits a
# FutureWarning on pandas 2.x), so the replacement could be silently lost.
df_under_cleaning['Total Bsmt SF'] = df_under_cleaning['Total Bsmt SF'].replace('Unknown', 0)

# Sanity check: frame dimensions, then a preview of the first rows.
print(df_under_cleaning.shape)
df_under_cleaning.head()

(879, 34)


Unnamed: 0,BsmtFin Type 1,Bsmt Exposure,Bsmt Cond,Bsmt Qual,Foundation_CBlock,Foundation_PConc,Exter Qual,Roof Style_Gable,Roof Style_Hip,Neighborhood_NAmes,...,Heating QC,Gr Liv Area,Kitchen Qual,TotRms AbvGrd,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Id
0,1,1,3,2,0,0,3,1,0,0,...,Gd,1928,Fa,9,Detchd,1910.0,Unf,1,440,2658
1,1,1,3,4,1,0,3,1,0,0,...,TA,1967,TA,10,Attchd,1977.0,Fin,2,580,2718
2,6,3,4,4,0,1,4,1,0,0,...,Ex,1496,Gd,7,Attchd,2006.0,RFn,2,426,2414
3,1,1,3,3,1,0,4,1,0,0,...,TA,968,TA,5,Detchd,1935.0,Unf,2,480,1989
4,4,1,3,4,1,0,3,1,0,1,...,Gd,1394,TA,6,Attchd,1963.0,RFn,2,514,625


In [34]:
# Ordinal-encode 'Heating QC' with the shared cleaning helper.
# Comparing the previews before and after this cell, each label is replaced by
# its position in the comma-separated scale (TA -> 3, Gd -> 4, Ex -> 5) and the
# encoded column is moved to the front of the frame.
# NOTE(review): this call passes 'Yes' where the basement calls pass 'No';
# the meaning of that flag is defined by clean_ordinal_func - confirm there.
df_under_cleaning = clean_ordinal_func(
    df_under_cleaning,
    'Heating QC',
    'Yes',
    'Unknown,Po,Fa,TA,Gd,Ex',
)

# Sanity check: frame dimensions, then a preview of the first rows.
print(df_under_cleaning.shape)
df_under_cleaning.head()

BsmtFin Type 1           0
Bsmt Exposure            0
Bsmt Cond                0
Bsmt Qual                0
Foundation_CBlock        0
Foundation_PConc         0
Exter Qual               0
Roof Style_Gable         0
Roof Style_Hip           0
Neighborhood_NAmes       0
Neighborhood_NoRidge     0
Neighborhood_Somerst     0
MS Zoning                0
MS SubClass_50           0
MS SubClass_60           0
1st Flr SF               0
2nd Flr SF               0
Lot Frontage             0
Lot Area                 0
Overall Qual             0
Year Built               0
Mas Vnr Type             1
BsmtFin SF 1             0
Total Bsmt SF            0
Heating QC               0
Gr Liv Area              0
Kitchen Qual             0
TotRms AbvGrd            0
Garage Type             44
Garage Yr Blt           45
Garage Finish           45
Garage Cars              0
Garage Area              0
Id                       0
dtype: int64
There are no entries with null values in 'Heating QC'.

These are the

Unnamed: 0,Heating QC,BsmtFin Type 1,Bsmt Exposure,Bsmt Cond,Bsmt Qual,Foundation_CBlock,Foundation_PConc,Exter Qual,Roof Style_Gable,Roof Style_Hip,...,Total Bsmt SF,Gr Liv Area,Kitchen Qual,TotRms AbvGrd,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Id
0,4,1,1,3,2,0,0,3,1,0,...,1020,1928,Fa,9,Detchd,1910.0,Unf,1,440,2658
1,3,1,1,3,4,1,0,3,1,0,...,1967,1967,TA,10,Attchd,1977.0,Fin,2,580,2718
2,5,6,3,4,4,0,1,4,1,0,...,654,1496,Gd,7,Attchd,2006.0,RFn,2,426,2414
3,3,1,1,3,3,1,0,4,1,0,...,968,968,TA,5,Detchd,1935.0,Unf,2,480,1989
4,4,4,1,3,4,1,0,3,1,0,...,1394,1394,TA,6,Attchd,1963.0,RFn,2,514,625


In [35]:
# Feature engineering: collapse the two per-floor areas into one
# 'Total Flr SF' column (element-wise sum), then remove the source columns.
# NOTE(review): in the rows previewed below 'Total Flr SF' equals
# 'Gr Liv Area', so the two features may be redundant - worth checking.
# NOTE: this cell is not re-runnable on its own; a second run raises KeyError
# because the source columns have already been dropped.
df_under_cleaning['Total Flr SF'] = df_under_cleaning['1st Flr SF'].add(
    df_under_cleaning['2nd Flr SF']
)

df_under_cleaning.drop(['1st Flr SF', '2nd Flr SF'], axis=1, inplace=True)

# Sanity check: frame dimensions, then a preview of the first rows.
print(df_under_cleaning.shape)
df_under_cleaning.head()

(879, 33)


Unnamed: 0,Heating QC,BsmtFin Type 1,Bsmt Exposure,Bsmt Cond,Bsmt Qual,Foundation_CBlock,Foundation_PConc,Exter Qual,Roof Style_Gable,Roof Style_Hip,...,Gr Liv Area,Kitchen Qual,TotRms AbvGrd,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Id,Total Flr SF
0,4,1,1,3,2,0,0,3,1,0,...,1928,Fa,9,Detchd,1910.0,Unf,1,440,2658,1928
1,3,1,1,3,4,1,0,3,1,0,...,1967,TA,10,Attchd,1977.0,Fin,2,580,2718,1967
2,5,6,3,4,4,0,1,4,1,0,...,1496,Gd,7,Attchd,2006.0,RFn,2,426,2414,1496
3,3,1,1,3,3,1,0,4,1,0,...,968,TA,5,Detchd,1935.0,Unf,2,480,1989,968
4,4,4,1,3,4,1,0,3,1,0,...,1394,TA,6,Attchd,1963.0,RFn,2,514,625,1394


In [36]:
# Ordinal-encode 'Kitchen Qual' with the shared cleaning helper.
# Comparing the previews before and after this cell, each label is replaced by
# its position in the comma-separated scale (Fa -> 2, TA -> 3, Gd -> 4) and the
# encoded column is moved to the front of the frame.
# NOTE(review): the meaning of the 'Yes' flag is defined by clean_ordinal_func.
df_under_cleaning = clean_ordinal_func(
    df_under_cleaning,
    'Kitchen Qual',
    'Yes',
    'Unknown,Po,Fa,TA,Gd,Ex',
)

# Sanity check: frame dimensions, then a preview of the first rows.
print(df_under_cleaning.shape)
df_under_cleaning.head()

Heating QC               0
BsmtFin Type 1           0
Bsmt Exposure            0
Bsmt Cond                0
Bsmt Qual                0
Foundation_CBlock        0
Foundation_PConc         0
Exter Qual               0
Roof Style_Gable         0
Roof Style_Hip           0
Neighborhood_NAmes       0
Neighborhood_NoRidge     0
Neighborhood_Somerst     0
MS Zoning                0
MS SubClass_50           0
MS SubClass_60           0
Lot Frontage             0
Lot Area                 0
Overall Qual             0
Year Built               0
Mas Vnr Type             1
BsmtFin SF 1             0
Total Bsmt SF            0
Gr Liv Area              0
Kitchen Qual             0
TotRms AbvGrd            0
Garage Type             44
Garage Yr Blt           45
Garage Finish           45
Garage Cars              0
Garage Area              0
Id                       0
Total Flr SF             0
dtype: int64
There are no entries with null values in 'Kitchen Qual'.

These are the value_counts.

TA    447

Unnamed: 0,Kitchen Qual,Heating QC,BsmtFin Type 1,Bsmt Exposure,Bsmt Cond,Bsmt Qual,Foundation_CBlock,Foundation_PConc,Exter Qual,Roof Style_Gable,...,Total Bsmt SF,Gr Liv Area,TotRms AbvGrd,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Id,Total Flr SF
0,2,4,1,1,3,2,0,0,3,1,...,1020,1928,9,Detchd,1910.0,Unf,1,440,2658,1928
1,3,3,1,1,3,4,1,0,3,1,...,1967,1967,10,Attchd,1977.0,Fin,2,580,2718,1967
2,4,5,6,3,4,4,0,1,4,1,...,654,1496,7,Attchd,2006.0,RFn,2,426,2414,1496
3,3,3,1,1,3,3,1,0,4,1,...,968,968,5,Detchd,1935.0,Unf,2,480,1989,968
4,3,4,4,1,3,4,1,0,3,1,...,1394,1394,6,Attchd,1963.0,RFn,2,514,625,1394


In [37]:
# Encode the nominal 'Garage Type' column with the shared cleaning helper.
# Per the preview below, the original column is replaced by indicator columns
# named 'Garage Type_<category>' (including 'Garage Type_Unknown'), placed at
# the front of the frame.
# NOTE(review): the meaning of the 'No' flag is defined by clean_nominal_func.
df_under_cleaning = clean_nominal_func(
    df_under_cleaning,
    'Garage Type',
    'No',
)

# Sanity check: frame dimensions, then a preview of the first rows.
print(df_under_cleaning.shape)
df_under_cleaning.head()

Kitchen Qual             0
Heating QC               0
BsmtFin Type 1           0
Bsmt Exposure            0
Bsmt Cond                0
Bsmt Qual                0
Foundation_CBlock        0
Foundation_PConc         0
Exter Qual               0
Roof Style_Gable         0
Roof Style_Hip           0
Neighborhood_NAmes       0
Neighborhood_NoRidge     0
Neighborhood_Somerst     0
MS Zoning                0
MS SubClass_50           0
MS SubClass_60           0
Lot Frontage             0
Lot Area                 0
Overall Qual             0
Year Built               0
Mas Vnr Type             1
BsmtFin SF 1             0
Total Bsmt SF            0
Gr Liv Area              0
TotRms AbvGrd            0
Garage Type             44
Garage Yr Blt           45
Garage Finish           45
Garage Cars              0
Garage Area              0
Id                       0
Total Flr SF             0
dtype: int64
There are some entries with null values in 'Garage Type'.


After null value processing, this is

Unnamed: 0,Garage Type_2Types,Garage Type_Attchd,Garage Type_Basment,Garage Type_BuiltIn,Garage Type_CarPort,Garage Type_Detchd,Garage Type_Unknown,Kitchen Qual,Heating QC,BsmtFin Type 1,...,BsmtFin SF 1,Total Bsmt SF,Gr Liv Area,TotRms AbvGrd,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Id,Total Flr SF
0,0,0,0,0,0,1,0,2,4,1,...,0,1020,1928,9,1910.0,Unf,1,440,2658,1928
1,0,1,0,0,0,0,0,3,3,1,...,0,1967,1967,10,1977.0,Fin,2,580,2718,1967
2,0,1,0,0,0,0,0,4,5,6,...,554,654,1496,7,2006.0,RFn,2,426,2414,1496
3,0,0,0,0,0,1,0,3,3,1,...,0,968,968,5,1935.0,Unf,2,480,1989,968
4,0,1,0,0,0,0,0,3,4,4,...,609,1394,1394,6,1963.0,RFn,2,514,625,1394


In [38]:
# Further cleaning of 'Garage Type': discard five of the indicator columns so
# that only 'Garage Type_Attchd' and 'Garage Type_Detchd' remain (see preview).
# NOTE(review): presumably these two match the features used for training -
# confirm against the training notebook.
df_under_cleaning.drop(
    [
        'Garage Type_2Types',
        'Garage Type_Basment',
        'Garage Type_BuiltIn',
        'Garage Type_CarPort',
        'Garage Type_Unknown',
    ],
    axis=1,
    inplace=True,
)

# Sanity check: frame dimensions, then a preview of the first rows.
print(df_under_cleaning.shape)
df_under_cleaning.head()

(879, 34)


Unnamed: 0,Garage Type_Attchd,Garage Type_Detchd,Kitchen Qual,Heating QC,BsmtFin Type 1,Bsmt Exposure,Bsmt Cond,Bsmt Qual,Foundation_CBlock,Foundation_PConc,...,BsmtFin SF 1,Total Bsmt SF,Gr Liv Area,TotRms AbvGrd,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Id,Total Flr SF
0,0,1,2,4,1,1,3,2,0,0,...,0,1020,1928,9,1910.0,Unf,1,440,2658,1928
1,1,0,3,3,1,1,3,4,1,0,...,0,1967,1967,10,1977.0,Fin,2,580,2718,1967
2,1,0,4,5,6,3,4,4,0,1,...,554,654,1496,7,2006.0,RFn,2,426,2414,1496
3,0,1,3,3,1,1,3,3,1,0,...,0,968,968,5,1935.0,Unf,2,480,1989,968
4,1,0,3,4,4,1,3,4,1,0,...,609,1394,1394,6,1963.0,RFn,2,514,625,1394


In [39]:
# Run the numeric-column cleaner on 'Garage Yr Blt'.
# The helper's printed report (below) shows 45 nulls in this column before the
# call; the next cell's report shows none afterwards.
# NOTE(review): presumably the helper fills nulls with the 'Unknown'
# placeholder that the next cell replaces - confirm in clean_condis_func.
df_under_cleaning = clean_condis_func(
    df_under_cleaning,
    'Garage Yr Blt',
    'No',
)

# Sanity check: frame dimensions, then a preview of the first rows.
print(df_under_cleaning.shape)
df_under_cleaning.head()

Garage Type_Attchd       0
Garage Type_Detchd       0
Kitchen Qual             0
Heating QC               0
BsmtFin Type 1           0
Bsmt Exposure            0
Bsmt Cond                0
Bsmt Qual                0
Foundation_CBlock        0
Foundation_PConc         0
Exter Qual               0
Roof Style_Gable         0
Roof Style_Hip           0
Neighborhood_NAmes       0
Neighborhood_NoRidge     0
Neighborhood_Somerst     0
MS Zoning                0
MS SubClass_50           0
MS SubClass_60           0
Lot Frontage             0
Lot Area                 0
Overall Qual             0
Year Built               0
Mas Vnr Type             1
BsmtFin SF 1             0
Total Bsmt SF            0
Gr Liv Area              0
TotRms AbvGrd            0
Garage Yr Blt           45
Garage Finish           45
Garage Cars              0
Garage Area              0
Id                       0
Total Flr SF             0
dtype: int64
There are some entries with null values in 'Garage Yr Blt'.


After n

Unnamed: 0,Garage Type_Attchd,Garage Type_Detchd,Kitchen Qual,Heating QC,BsmtFin Type 1,Bsmt Exposure,Bsmt Cond,Bsmt Qual,Foundation_CBlock,Foundation_PConc,...,BsmtFin SF 1,Total Bsmt SF,Gr Liv Area,TotRms AbvGrd,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Id,Total Flr SF
0,0,1,2,4,1,1,3,2,0,0,...,0,1020,1928,9,1910,Unf,1,440,2658,1928
1,1,0,3,3,1,1,3,4,1,0,...,0,1967,1967,10,1977,Fin,2,580,2718,1967
2,1,0,4,5,6,3,4,4,0,1,...,554,654,1496,7,2006,RFn,2,426,2414,1496
3,0,1,3,3,1,1,3,3,1,0,...,0,968,968,5,1935,Unf,2,480,1989,968
4,1,0,3,4,4,1,3,4,1,0,...,609,1394,1394,6,1963,RFn,2,514,625,1394


In [40]:
# Further cleaning of 'Garage Yr Blt': rows holding the 'Unknown' placeholder
# take the house's own 'Year Built' value from the same row.
#
# Series.mask(cond, other) swaps in the index-aligned value from `other`
# wherever `cond` is True, which makes the row-wise intent explicit.
# The earlier form passed a whole Series as the replacement value to
# Series.replace(..., inplace=True); that is not a documented form of
# Series.replace (recent pandas versions raise ValueError for it), and the
# chained in-place call does not write back under pandas Copy-on-Write.
# The column may hold mixed types afterwards; a later cell coerces every
# column with pd.to_numeric.
df_under_cleaning['Garage Yr Blt'] = df_under_cleaning['Garage Yr Blt'].mask(
    df_under_cleaning['Garage Yr Blt'] == 'Unknown',
    df_under_cleaning['Year Built'],
)

# Sanity check: frame dimensions, then a preview of the first rows.
print(df_under_cleaning.shape)
df_under_cleaning.head()

(879, 34)


Unnamed: 0,Garage Type_Attchd,Garage Type_Detchd,Kitchen Qual,Heating QC,BsmtFin Type 1,Bsmt Exposure,Bsmt Cond,Bsmt Qual,Foundation_CBlock,Foundation_PConc,...,BsmtFin SF 1,Total Bsmt SF,Gr Liv Area,TotRms AbvGrd,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Id,Total Flr SF
0,0,1,2,4,1,1,3,2,0,0,...,0,1020,1928,9,1910.0,Unf,1,440,2658,1928
1,1,0,3,3,1,1,3,4,1,0,...,0,1967,1967,10,1977.0,Fin,2,580,2718,1967
2,1,0,4,5,6,3,4,4,0,1,...,554,654,1496,7,2006.0,RFn,2,426,2414,1496
3,0,1,3,3,1,1,3,3,1,0,...,0,968,968,5,1935.0,Unf,2,480,1989,968
4,1,0,3,4,4,1,3,4,1,0,...,609,1394,1394,6,1963.0,RFn,2,514,625,1394


In [41]:
# Ordinal-encode 'Garage Finish' with the shared cleaning helper.
# Comparing the previews before and after this cell, each label is replaced by
# its position in the comma-separated scale (Unf -> 1, RFn -> 2, Fin -> 3) and
# the encoded column is moved to the front of the frame.
# NOTE(review): the meaning of the 'No' flag is defined by clean_ordinal_func.
df_under_cleaning = clean_ordinal_func(
    df_under_cleaning,
    'Garage Finish',
    'No',
    'Unknown,Unf,RFn,Fin',
)

# Sanity check: frame dimensions, then a preview of the first rows.
print(df_under_cleaning.shape)
df_under_cleaning.head()

Garage Type_Attchd       0
Garage Type_Detchd       0
Kitchen Qual             0
Heating QC               0
BsmtFin Type 1           0
Bsmt Exposure            0
Bsmt Cond                0
Bsmt Qual                0
Foundation_CBlock        0
Foundation_PConc         0
Exter Qual               0
Roof Style_Gable         0
Roof Style_Hip           0
Neighborhood_NAmes       0
Neighborhood_NoRidge     0
Neighborhood_Somerst     0
MS Zoning                0
MS SubClass_50           0
MS SubClass_60           0
Lot Frontage             0
Lot Area                 0
Overall Qual             0
Year Built               0
Mas Vnr Type             1
BsmtFin SF 1             0
Total Bsmt SF            0
Gr Liv Area              0
TotRms AbvGrd            0
Garage Yr Blt            0
Garage Finish           45
Garage Cars              0
Garage Area              0
Id                       0
Total Flr SF             0
dtype: int64
There are some entries with null values in 'Garage Finish'.


After n

Unnamed: 0,Garage Finish,Garage Type_Attchd,Garage Type_Detchd,Kitchen Qual,Heating QC,BsmtFin Type 1,Bsmt Exposure,Bsmt Cond,Bsmt Qual,Foundation_CBlock,...,Mas Vnr Type,BsmtFin SF 1,Total Bsmt SF,Gr Liv Area,TotRms AbvGrd,Garage Yr Blt,Garage Cars,Garage Area,Id,Total Flr SF
0,1,0,1,2,4,1,1,3,2,0,...,,0,1020,1928,9,1910.0,1,440,2658,1928
1,3,1,0,3,3,1,1,3,4,1,...,,0,1967,1967,10,1977.0,2,580,2718,1967
2,2,1,0,4,5,6,3,4,4,0,...,,554,654,1496,7,2006.0,2,426,2414,1496
3,1,0,1,3,3,1,1,3,3,1,...,,0,968,968,5,1935.0,2,480,1989,968
4,2,1,0,3,4,4,1,3,4,1,...,BrkFace,609,1394,1394,6,1963.0,2,514,625,1394


In [42]:
# Further cleaning of 'Garage Finish': re-code 0 as 2.
# With the scale 'Unknown,Unf,RFn,Fin' used in the previous cell, 0 is the
# 'Unknown' slot and 2 is 'RFn'.
# NOTE(review): the reason for choosing 2 is not shown here - confirm it
# matches the treatment applied to the training data.
#
# The result is assigned back to the column instead of calling
# Series.replace(..., inplace=True) on df_under_cleaning['Garage Finish'].
# That chained in-place form operates on a temporary Series and does not
# update the DataFrame when pandas Copy-on-Write is active (it emits a
# FutureWarning on pandas 2.x), so the re-coding could be silently lost.
df_under_cleaning['Garage Finish'] = df_under_cleaning['Garage Finish'].replace(0, 2)

# Sanity check: frame dimensions, then a preview of the first rows.
print(df_under_cleaning.shape)
df_under_cleaning.head()

(879, 34)


Unnamed: 0,Garage Finish,Garage Type_Attchd,Garage Type_Detchd,Kitchen Qual,Heating QC,BsmtFin Type 1,Bsmt Exposure,Bsmt Cond,Bsmt Qual,Foundation_CBlock,...,Mas Vnr Type,BsmtFin SF 1,Total Bsmt SF,Gr Liv Area,TotRms AbvGrd,Garage Yr Blt,Garage Cars,Garage Area,Id,Total Flr SF
0,1,0,1,2,4,1,1,3,2,0,...,,0,1020,1928,9,1910.0,1,440,2658,1928
1,3,1,0,3,3,1,1,3,4,1,...,,0,1967,1967,10,1977.0,2,580,2718,1967
2,2,1,0,4,5,6,3,4,4,0,...,,554,654,1496,7,2006.0,2,426,2414,1496
3,1,0,1,3,3,1,1,3,3,1,...,,0,968,968,5,1935.0,2,480,1989,968
4,2,1,0,3,4,4,1,3,4,1,...,BrkFace,609,1394,1394,6,1963.0,2,514,625,1394


In [43]:
# Run the numeric-column cleaner on 'Garage Cars'.
# The helper's printed report (below) lists per-column null counts, states that
# this column has no nulls, and prints its value counts.
# NOTE(review): the meaning of the 'No' flag is defined by clean_condis_func.
df_under_cleaning = clean_condis_func(
    df_under_cleaning,
    'Garage Cars',
    'No',
)

# Sanity check: frame dimensions, then a preview of the first rows.
print(df_under_cleaning.shape)
df_under_cleaning.head()

Garage Finish           0
Garage Type_Attchd      0
Garage Type_Detchd      0
Kitchen Qual            0
Heating QC              0
BsmtFin Type 1          0
Bsmt Exposure           0
Bsmt Cond               0
Bsmt Qual               0
Foundation_CBlock       0
Foundation_PConc        0
Exter Qual              0
Roof Style_Gable        0
Roof Style_Hip          0
Neighborhood_NAmes      0
Neighborhood_NoRidge    0
Neighborhood_Somerst    0
MS Zoning               0
MS SubClass_50          0
MS SubClass_60          0
Lot Frontage            0
Lot Area                0
Overall Qual            0
Year Built              0
Mas Vnr Type            1
BsmtFin SF 1            0
Total Bsmt SF           0
Gr Liv Area             0
TotRms AbvGrd           0
Garage Yr Blt           0
Garage Cars             0
Garage Area             0
Id                      0
Total Flr SF            0
dtype: int64
There are no entries with null values in 'Garage Cars'.

These are the value_counts.

2    467
1    254

Unnamed: 0,Garage Finish,Garage Type_Attchd,Garage Type_Detchd,Kitchen Qual,Heating QC,BsmtFin Type 1,Bsmt Exposure,Bsmt Cond,Bsmt Qual,Foundation_CBlock,...,Mas Vnr Type,BsmtFin SF 1,Total Bsmt SF,Gr Liv Area,TotRms AbvGrd,Garage Yr Blt,Garage Cars,Garage Area,Id,Total Flr SF
0,1,0,1,2,4,1,1,3,2,0,...,,0,1020,1928,9,1910.0,1,440,2658,1928
1,3,1,0,3,3,1,1,3,4,1,...,,0,1967,1967,10,1977.0,2,580,2718,1967
2,2,1,0,4,5,6,3,4,4,0,...,,554,654,1496,7,2006.0,2,426,2414,1496
3,1,0,1,3,3,1,1,3,3,1,...,,0,968,968,5,1935.0,2,480,1989,968
4,2,1,0,3,4,4,1,3,4,1,...,BrkFace,609,1394,1394,6,1963.0,2,514,625,1394


In [44]:
# Further cleaning of 'Garage Cars': any 'Unknown' placeholder becomes 0.
#
# The result is assigned back to the column instead of calling
# Series.replace(..., inplace=True) on df_under_cleaning['Garage Cars'].
# That chained in-place form operates on a temporary Series and does not
# update the DataFrame when pandas Copy-on-Write is active (it emits a
# FutureWarning on pandas 2.x), so the replacement could be silently lost.
df_under_cleaning['Garage Cars'] = df_under_cleaning['Garage Cars'].replace('Unknown', 0)

# Sanity check: frame dimensions, then a preview of the first rows.
print(df_under_cleaning.shape)
df_under_cleaning.head()

(879, 34)


Unnamed: 0,Garage Finish,Garage Type_Attchd,Garage Type_Detchd,Kitchen Qual,Heating QC,BsmtFin Type 1,Bsmt Exposure,Bsmt Cond,Bsmt Qual,Foundation_CBlock,...,Mas Vnr Type,BsmtFin SF 1,Total Bsmt SF,Gr Liv Area,TotRms AbvGrd,Garage Yr Blt,Garage Cars,Garage Area,Id,Total Flr SF
0,1,0,1,2,4,1,1,3,2,0,...,,0,1020,1928,9,1910.0,1,440,2658,1928
1,3,1,0,3,3,1,1,3,4,1,...,,0,1967,1967,10,1977.0,2,580,2718,1967
2,2,1,0,4,5,6,3,4,4,0,...,,554,654,1496,7,2006.0,2,426,2414,1496
3,1,0,1,3,3,1,1,3,3,1,...,,0,968,968,5,1935.0,2,480,1989,968
4,2,1,0,3,4,4,1,3,4,1,...,BrkFace,609,1394,1394,6,1963.0,2,514,625,1394


In [45]:
# Run the numeric-column cleaner on 'Garage Area'.
# The helper's printed report (below) lists per-column null counts, states that
# this column has no nulls, and prints its value counts.
# NOTE(review): the meaning of the 'No' flag is defined by clean_condis_func.
df_under_cleaning = clean_condis_func(
    df_under_cleaning,
    'Garage Area',
    'No',
)

# Sanity check: frame dimensions, then a preview of the first rows.
print(df_under_cleaning.shape)
df_under_cleaning.head()

Garage Finish           0
Garage Type_Attchd      0
Garage Type_Detchd      0
Kitchen Qual            0
Heating QC              0
BsmtFin Type 1          0
Bsmt Exposure           0
Bsmt Cond               0
Bsmt Qual               0
Foundation_CBlock       0
Foundation_PConc        0
Exter Qual              0
Roof Style_Gable        0
Roof Style_Hip          0
Neighborhood_NAmes      0
Neighborhood_NoRidge    0
Neighborhood_Somerst    0
MS Zoning               0
MS SubClass_50          0
MS SubClass_60          0
Lot Frontage            0
Lot Area                0
Overall Qual            0
Year Built              0
Mas Vnr Type            1
BsmtFin SF 1            0
Total Bsmt SF           0
Gr Liv Area             0
TotRms AbvGrd           0
Garage Yr Blt           0
Garage Cars             0
Garage Area             0
Id                      0
Total Flr SF            0
dtype: int64
There are no entries with null values in 'Garage Area'.

These are the value_counts.

0       44
576   

Unnamed: 0,Garage Finish,Garage Type_Attchd,Garage Type_Detchd,Kitchen Qual,Heating QC,BsmtFin Type 1,Bsmt Exposure,Bsmt Cond,Bsmt Qual,Foundation_CBlock,...,Mas Vnr Type,BsmtFin SF 1,Total Bsmt SF,Gr Liv Area,TotRms AbvGrd,Garage Yr Blt,Garage Cars,Garage Area,Id,Total Flr SF
0,1,0,1,2,4,1,1,3,2,0,...,,0,1020,1928,9,1910.0,1,440,2658,1928
1,3,1,0,3,3,1,1,3,4,1,...,,0,1967,1967,10,1977.0,2,580,2718,1967
2,2,1,0,4,5,6,3,4,4,0,...,,554,654,1496,7,2006.0,2,426,2414,1496
3,1,0,1,3,3,1,1,3,3,1,...,,0,968,968,5,1935.0,2,480,1989,968
4,2,1,0,3,4,4,1,3,4,1,...,BrkFace,609,1394,1394,6,1963.0,2,514,625,1394


In [46]:
# Further cleaning of 'Garage Area': any 'Unknown' placeholder becomes 0.
#
# The result is assigned back to the column instead of calling
# Series.replace(..., inplace=True) on df_under_cleaning['Garage Area'].
# That chained in-place form operates on a temporary Series and does not
# update the DataFrame when pandas Copy-on-Write is active (it emits a
# FutureWarning on pandas 2.x), so the replacement could be silently lost.
df_under_cleaning['Garage Area'] = df_under_cleaning['Garage Area'].replace('Unknown', 0)

# Sanity check: frame dimensions, then a preview of the first rows.
print(df_under_cleaning.shape)
df_under_cleaning.head()

(879, 34)


Unnamed: 0,Garage Finish,Garage Type_Attchd,Garage Type_Detchd,Kitchen Qual,Heating QC,BsmtFin Type 1,Bsmt Exposure,Bsmt Cond,Bsmt Qual,Foundation_CBlock,...,Mas Vnr Type,BsmtFin SF 1,Total Bsmt SF,Gr Liv Area,TotRms AbvGrd,Garage Yr Blt,Garage Cars,Garage Area,Id,Total Flr SF
0,1,0,1,2,4,1,1,3,2,0,...,,0,1020,1928,9,1910.0,1,440,2658,1928
1,3,1,0,3,3,1,1,3,4,1,...,,0,1967,1967,10,1977.0,2,580,2718,1967
2,2,1,0,4,5,6,3,4,4,0,...,,554,654,1496,7,2006.0,2,426,2414,1496
3,1,0,1,3,3,1,1,3,3,1,...,,0,968,968,5,1935.0,2,480,1989,968
4,2,1,0,3,4,4,1,3,4,1,...,BrkFace,609,1394,1394,6,1963.0,2,514,625,1394


In [47]:
# Encode the nominal 'Mas Vnr Type' column with the shared cleaning helper.
# Per the preview below, the original column is replaced by indicator columns
# named 'Mas Vnr Type_<category>' (including 'Mas Vnr Type_Unknown'), placed
# at the front of the frame. The report shows one null in this column.
# NOTE(review): the meaning of the 'No' flag is defined by clean_nominal_func.
df_under_cleaning = clean_nominal_func(
    df_under_cleaning,
    'Mas Vnr Type',
    'No',
)

# Sanity check: frame dimensions, then a preview of the first rows.
print(df_under_cleaning.shape)
df_under_cleaning.head()

Garage Finish           0
Garage Type_Attchd      0
Garage Type_Detchd      0
Kitchen Qual            0
Heating QC              0
BsmtFin Type 1          0
Bsmt Exposure           0
Bsmt Cond               0
Bsmt Qual               0
Foundation_CBlock       0
Foundation_PConc        0
Exter Qual              0
Roof Style_Gable        0
Roof Style_Hip          0
Neighborhood_NAmes      0
Neighborhood_NoRidge    0
Neighborhood_Somerst    0
MS Zoning               0
MS SubClass_50          0
MS SubClass_60          0
Lot Frontage            0
Lot Area                0
Overall Qual            0
Year Built              0
Mas Vnr Type            1
BsmtFin SF 1            0
Total Bsmt SF           0
Gr Liv Area             0
TotRms AbvGrd           0
Garage Yr Blt           0
Garage Cars             0
Garage Area             0
Id                      0
Total Flr SF            0
dtype: int64
There are some entries with null values in 'Mas Vnr Type'.


After null value processing, this is the n

Unnamed: 0,Mas Vnr Type_BrkCmn,Mas Vnr Type_BrkFace,Mas Vnr Type_CBlock,Mas Vnr Type_None,Mas Vnr Type_Stone,Mas Vnr Type_Unknown,Garage Finish,Garage Type_Attchd,Garage Type_Detchd,Kitchen Qual,...,Year Built,BsmtFin SF 1,Total Bsmt SF,Gr Liv Area,TotRms AbvGrd,Garage Yr Blt,Garage Cars,Garage Area,Id,Total Flr SF
0,0,0,0,1,0,0,1,0,1,2,...,1910,0,1020,1928,9,1910.0,1,440,2658,1928
1,0,0,0,1,0,0,3,1,0,3,...,1977,0,1967,1967,10,1977.0,2,580,2718,1967
2,0,0,0,1,0,0,2,1,0,4,...,2006,554,654,1496,7,2006.0,2,426,2414,1496
3,0,0,0,1,0,0,1,0,1,3,...,1923,0,968,968,5,1935.0,2,480,1989,968
4,0,1,0,0,0,0,2,1,0,3,...,1963,609,1394,1394,6,1963.0,2,514,625,1394


In [48]:
# Further cleaning of 'Mas Vnr Type': discard three of the indicator columns,
# leaving 'Mas Vnr Type_BrkFace', 'Mas Vnr Type_None' and 'Mas Vnr Type_Stone'
# (see preview).
# NOTE(review): presumably the kept columns match the features used for
# training - confirm against the training notebook.
df_under_cleaning.drop(
    [
        'Mas Vnr Type_BrkCmn',
        'Mas Vnr Type_Unknown',
        'Mas Vnr Type_CBlock',
    ],
    axis=1,
    inplace=True,
)

# Sanity check: frame dimensions, then a preview of the first rows.
print(df_under_cleaning.shape)
df_under_cleaning.head()



(879, 36)


Unnamed: 0,Mas Vnr Type_BrkFace,Mas Vnr Type_None,Mas Vnr Type_Stone,Garage Finish,Garage Type_Attchd,Garage Type_Detchd,Kitchen Qual,Heating QC,BsmtFin Type 1,Bsmt Exposure,...,Year Built,BsmtFin SF 1,Total Bsmt SF,Gr Liv Area,TotRms AbvGrd,Garage Yr Blt,Garage Cars,Garage Area,Id,Total Flr SF
0,0,1,0,1,0,1,2,4,1,1,...,1910,0,1020,1928,9,1910.0,1,440,2658,1928
1,0,1,0,3,1,0,3,3,1,1,...,1977,0,1967,1967,10,1977.0,2,580,2718,1967
2,0,1,0,2,1,0,4,5,6,3,...,2006,554,654,1496,7,2006.0,2,426,2414,1496
3,0,1,0,1,0,1,3,3,1,1,...,1923,0,968,968,5,1935.0,2,480,1989,968
4,1,0,0,2,1,0,3,4,4,1,...,1963,609,1394,1394,6,1963.0,2,514,625,1394


In [49]:
# Per-column count of missing values after all cleaning steps (expected: all 0).
df_under_cleaning.isna().sum()

Mas Vnr Type_BrkFace    0
Mas Vnr Type_None       0
Mas Vnr Type_Stone      0
Garage Finish           0
Garage Type_Attchd      0
Garage Type_Detchd      0
Kitchen Qual            0
Heating QC              0
BsmtFin Type 1          0
Bsmt Exposure           0
Bsmt Cond               0
Bsmt Qual               0
Foundation_CBlock       0
Foundation_PConc        0
Exter Qual              0
Roof Style_Gable        0
Roof Style_Hip          0
Neighborhood_NAmes      0
Neighborhood_NoRidge    0
Neighborhood_Somerst    0
MS Zoning               0
MS SubClass_50          0
MS SubClass_60          0
Lot Frontage            0
Lot Area                0
Overall Qual            0
Year Built              0
BsmtFin SF 1            0
Total Bsmt SF           0
Gr Liv Area             0
TotRms AbvGrd           0
Garage Yr Blt           0
Garage Cars             0
Garage Area             0
Id                      0
Total Flr SF            0
dtype: int64

In [50]:
# Final safety net: convert every column to a numeric dtype. Any value that
# cannot be parsed as a number becomes NaN (errors='coerce'), so leftover
# text would surface in the null count of the next cell.
df_under_cleaning = df_under_cleaning.apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)

In [51]:
# Re-count missing values after numeric coercion; any non-zero entry means a
# column still held non-numeric data before the coercion.
df_under_cleaning.isna().sum()

Mas Vnr Type_BrkFace    0
Mas Vnr Type_None       0
Mas Vnr Type_Stone      0
Garage Finish           0
Garage Type_Attchd      0
Garage Type_Detchd      0
Kitchen Qual            0
Heating QC              0
BsmtFin Type 1          0
Bsmt Exposure           0
Bsmt Cond               0
Bsmt Qual               0
Foundation_CBlock       0
Foundation_PConc        0
Exter Qual              0
Roof Style_Gable        0
Roof Style_Hip          0
Neighborhood_NAmes      0
Neighborhood_NoRidge    0
Neighborhood_Somerst    0
MS Zoning               0
MS SubClass_50          0
MS SubClass_60          0
Lot Frontage            0
Lot Area                0
Overall Qual            0
Year Built              0
BsmtFin SF 1            0
Total Bsmt SF           0
Gr Liv Area             0
TotRms AbvGrd           0
Garage Yr Blt           0
Garage Cars             0
Garage Area             0
Id                      0
Total Flr SF            0
dtype: int64

In [52]:
# Snapshot the cleaned frame under its final name (independent copy, so later
# edits to df_under_cleaning do not affect the exported data).
df_holdoutdata_cleaned = df_under_cleaning.copy()

In [53]:
# Persist the cleaned holdout data next to the raw datasets; the row index is
# omitted because the 'Id' column already identifies each row.
df_holdoutdata_cleaned.to_csv(
    path_or_buf='../datasets/df_holdoutdata_cleaned.csv',
    index=False,
)

In [54]:
## End of File