In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm

from scipy import stats
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# reading in CSV - had to move csv into main folder. was unable to read using file path for some reason
test_df = pd.read_csv ("test.csv")

In [3]:
# Shape of test DF
test_df.shape

(878, 80)

In [4]:
# dtypes and nulls for each column.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 80 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               878 non-null    int64  
 1   PID              878 non-null    int64  
 2   MS SubClass      878 non-null    int64  
 3   MS Zoning        878 non-null    object 
 4   Lot Frontage     718 non-null    float64
 5   Lot Area         878 non-null    int64  
 6   Street           878 non-null    object 
 7   Alley            58 non-null     object 
 8   Lot Shape        878 non-null    object 
 9   Land Contour     878 non-null    object 
 10  Utilities        878 non-null    object 
 11  Lot Config       878 non-null    object 
 12  Land Slope       878 non-null    object 
 13  Neighborhood     878 non-null    object 
 14  Condition 1      878 non-null    object 
 15  Condition 2      878 non-null    object 
 16  Bldg Type        878 non-null    object 
 17  House Style     

In [5]:
# renaming columns - replacing spaces with underscores
# and making lower case for easier access
test_df.columns = test_df.columns.str.lower().str.replace(" ","_")

In [6]:
test_df.columns[:10] # Confirming columns have been changed

Index(['id', 'pid', 'ms_subclass', 'ms_zoning', 'lot_frontage', 'lot_area',
       'street', 'alley', 'lot_shape', 'land_contour'],
      dtype='object')

In [7]:
# using .any() from Pandas documentation to show only columns with nulls, as there are too many to show above
# AND how many -- if just inputting 'null_locations', 
# will just get string of columns that contain nulls, not amounts

null_locations = test_df.columns[test_df.isnull().any()]
test_df[null_locations].isnull().sum()

lot_frontage      160
alley             820
mas_vnr_type        1
mas_vnr_area        1
bsmt_qual          25
bsmt_cond          25
bsmt_exposure      25
bsmtfin_type_1     25
bsmtfin_type_2     25
electrical          1
fireplace_qu      422
garage_type        44
garage_yr_blt      45
garage_finish      45
garage_qual        45
garage_cond        45
pool_qc           874
fence             706
misc_feature      837
dtype: int64

In [8]:
# Looking for most common value in 'electrical' column, and assigning to the one null value.
test_df['electrical'].value_counts()

SBrkr    813
FuseA     48
FuseF     15
FuseP      1
Name: electrical, dtype: int64

In [9]:
# Imputing the one null value in electrical with the most common type of electrical configuration. Based on   
# the lack of additional data, imputing the most commonly observed value seemed most reasonable to me.
test_df['electrical'].replace(to_replace = np.nan, value = 'SBrkr', inplace = True)

In [10]:
test_df['electrical'].value_counts()

SBrkr    814
FuseA     48
FuseF     15
FuseP      1
Name: electrical, dtype: int64

In [11]:
# Dropping all columns that are almost completely filled with nulls.
test_df.drop(columns=['alley', 'pool_qc', 'fence', 'misc_feature'], inplace=True)

In [12]:
# function to fill null values in columns with the value/string 'None'.
def fill_nulls(features):
    # Iterating through the input list, 'features'.
    for feature in features:
        # Replacing null values with the text string 'None' for each feature in features, and saving to my dataframe.
        test_df[feature].replace(to_replace = np.nan, value = 'None', inplace = True)
    return test_df
# assistance within study group

In [13]:
## list of features to pass through fill_nulls function
features = ['mas_vnr_type', 'bsmt_exposure', 'bsmtfin_type_1', 'bsmtfin_type_2']
fill_nulls(features)

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,...,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type
0,2658,902301120,190,RM,69.0,9142,Pave,Reg,Lvl,AllPub,...,0,60,112,0,0,0,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,IR1,Lvl,AllPub,...,170,0,0,0,0,0,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,IR1,Lvl,AllPub,...,100,24,0,0,0,0,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,Reg,Lvl,AllPub,...,0,0,184,0,0,0,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,IR1,Lvl,AllPub,...,0,76,0,0,185,0,0,7,2009,WD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
873,1662,527377110,60,RL,80.0,8000,Pave,Reg,Lvl,AllPub,...,0,96,0,0,0,0,0,11,2007,WD
874,1234,535126140,60,RL,90.0,14670,Pave,Reg,Lvl,AllPub,...,0,230,0,0,0,0,0,8,2008,WD
875,1373,904100040,20,RL,55.0,8250,Pave,Reg,Lvl,AllPub,...,0,63,0,0,0,0,0,8,2008,WD
876,1672,527425140,20,RL,60.0,9000,Pave,Reg,Lvl,AllPub,...,0,0,0,0,0,0,0,5,2007,WD


In [14]:
# Defining a list of features to pass into my function, 'fill_nulls'.
features = ['mas_vnr_type', 'bsmt_exposure', 'bsmt_qual', 'bsmt_cond', 'bsmtfin_type_1', 'bsmtfin_type_2',
            'garage_type', 'garage_finish', 'garage_qual', 'garage_cond', 'fireplace_qu']
fill_nulls(features)

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,...,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type
0,2658,902301120,190,RM,69.0,9142,Pave,Reg,Lvl,AllPub,...,0,60,112,0,0,0,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,IR1,Lvl,AllPub,...,170,0,0,0,0,0,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,IR1,Lvl,AllPub,...,100,24,0,0,0,0,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,Reg,Lvl,AllPub,...,0,0,184,0,0,0,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,IR1,Lvl,AllPub,...,0,76,0,0,185,0,0,7,2009,WD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
873,1662,527377110,60,RL,80.0,8000,Pave,Reg,Lvl,AllPub,...,0,96,0,0,0,0,0,11,2007,WD
874,1234,535126140,60,RL,90.0,14670,Pave,Reg,Lvl,AllPub,...,0,230,0,0,0,0,0,8,2008,WD
875,1373,904100040,20,RL,55.0,8250,Pave,Reg,Lvl,AllPub,...,0,63,0,0,0,0,0,8,2008,WD
876,1672,527425140,20,RL,60.0,9000,Pave,Reg,Lvl,AllPub,...,0,0,0,0,0,0,0,5,2007,WD


In [15]:
test_df['mas_vnr_area'].replace(to_replace = np.nan, value=0)

0        0.0
1        0.0
2        0.0
3        0.0
4      247.0
       ...  
873      0.0
874    410.0
875      0.0
876      0.0
877      0.0
Name: mas_vnr_area, Length: 878, dtype: float64

In [16]:
test_df['mas_vnr_area'].unique()

array([   0.,  247.,   23.,   98.,  104.,  156.,  180.,   44.,   76.,
         70.,  352.,  162.,  444.,  495.,  340.,  634.,  182.,  147.,
        108.,   20.,  423.,  178.,  359.,   75.,  161.,  674.,  100.,
        306.,  509.,  653.,  450.,  360.,  680.,  112.,   72.,  440.,
       1378.,  304.,  364.,  754.,  788.,  230.,  368.,  120.,  113.,
        216.,  371.,  153.,  151.,  396.,  215.,  472.,  500.,  468.,
         14.,   50.,   96.,   99.,  342.,  174.,  310.,  114.,   74.,
        270.,  260.,  123.,  218.,  415.,  921.,  771.,  726.,   16.,
        362.,  473.,  870., 1224.,  285.,  420.,  137.,  259.,   82.,
        632.,  170.,  408.,   53.,  532.,  286.,  206.,  308.,  405.,
        128.,  236.,  350.,  302.,  256.,  657.,  194.,  567.,  116.,
         65.,  305.,  188.,  281.,  300.,  198.,   95.,  481.,  226.,
        459.,  480.,  422.,  877.,  166.,  149.,  190.,  189.,  492.,
        205.,  130.,  250.,  223.,  280.,  435.,  229.,  438.,  975.,
         67.,  150.,

In [17]:
test_df.fillna(test_df.mean(), inplace=True)

In [18]:
# Defining a function, 'float_to_int' to accept one argument, a list of features.
def flt_to_int(features2):
    # Iterating through the list of features.
    for feature in features2:
        # Casting all features in features2 as integers.
        test_df[feature] = test_df[feature].astype(int)
    return test_df

In [19]:
# Defining a list of features to pass into my 'float_to_int' function.
features2 = ['garage_yr_blt', 'lot_frontage', 'mas_vnr_area', 'bsmtfin_sf_1', 'bsmtfin_sf_2', 'bsmt_unf_sf', 'total_bsmt_sf', 'bsmt_full_bath']
flt_to_int(features2)

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,...,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type
0,2658,902301120,190,RM,69,9142,Pave,Reg,Lvl,AllPub,...,0,60,112,0,0,0,0,4,2006,WD
1,2718,905108090,90,RL,69,9662,Pave,IR1,Lvl,AllPub,...,170,0,0,0,0,0,0,8,2006,WD
2,2414,528218130,60,RL,58,17104,Pave,IR1,Lvl,AllPub,...,100,24,0,0,0,0,0,9,2006,New
3,1989,902207150,30,RM,60,8520,Pave,Reg,Lvl,AllPub,...,0,0,184,0,0,0,0,7,2007,WD
4,625,535105100,20,RL,69,9500,Pave,IR1,Lvl,AllPub,...,0,76,0,0,185,0,0,7,2009,WD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
873,1662,527377110,60,RL,80,8000,Pave,Reg,Lvl,AllPub,...,0,96,0,0,0,0,0,11,2007,WD
874,1234,535126140,60,RL,90,14670,Pave,Reg,Lvl,AllPub,...,0,230,0,0,0,0,0,8,2008,WD
875,1373,904100040,20,RL,55,8250,Pave,Reg,Lvl,AllPub,...,0,63,0,0,0,0,0,8,2008,WD
876,1672,527425140,20,RL,60,9000,Pave,Reg,Lvl,AllPub,...,0,0,0,0,0,0,0,5,2007,WD


In [20]:
null_locations = test_df.columns[test_df.isnull().any()]
test_df[null_locations].isnull().sum()
#checking null values

Series([], dtype: float64)

In [21]:
test_df.fillna(test_df.mean(), inplace=True)

In [22]:
test_df.shape

(878, 76)

In [23]:
# Replacing nulls for masonry veneer area with a value of 0. 
test_df['mas_vnr_area'].replace(to_replace = np.nan, value = 0, inplace = True)

In [24]:
test_df['paved_drive'].value_counts()

Y    790
N     65
P     23
Name: paved_drive, dtype: int64

In [25]:
# Changing to numerical values
test_df['central_air'] = test_df['central_air'].map({'Y':1, 'N':0})
test_df['paved_drive'] = test_df['paved_drive'].map({'Y':1, 'N':0, 'P':0})

In [27]:
# With some excellent guidance from Kovacs!

# Defining a new dictionary, 'map_dict', to hold the keys (which currently exist in features), and the values I want to replace them with.
map_dict = {'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1, 'None':0}

# Defining the list 'ordinals' to contain all the features I want to pass into my function.
ordinals = ['exter_qual', 'exter_cond', 'bsmt_qual', 'bsmt_cond', 'kitchen_qual', 'fireplace_qu', 'garage_qual', 'garage_cond']

# Defining a function, 'ordinals_to_numeric', converting remaining ordinal columns, ranked on a scale from Poor
# to Excellent, to numeric values. Nulls that I previously converted to the string 'None' will now be represented
# by the number 0.
def ordinals_to_num(test_df):
    # Iterating through features in my testing dataframe's columns.
    for feature in test_df.columns:
        # If that feature is in the feature list I defined above as 'ordinals':
        if feature in ordinals:
            # then map the associated numeric dictionary values in 'map_dict' to the keys (found in the original ordinal columns).
            test_df[feature] = test_df[feature].map(map_dict)
            # Converting all features in my list 'ordinals' to numeric values, and coercing errors to return nulls for uninterpretable values. 
            pd.to_numeric(arg=test_df[feature], errors="coerce")
    return test_df

In [28]:
map_dict = {"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1, "NA":0}
ordinals = ["exter_qual", "exter_cond", "bsmt_qual", "bsmt_cond", "kitchenqual", "fireplacequ", "garage_qual", "garage_cond"]


In [33]:
# Applying the function to test dataframe.
test_df = ordinals_to_num(test_df)
test_df.head(10)

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,...,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type
0,2658,902301120,190,RM,69,9142,Pave,Reg,Lvl,AllPub,...,0,60,112,0,0,0,0,4,2006,WD
1,2718,905108090,90,RL,69,9662,Pave,IR1,Lvl,AllPub,...,170,0,0,0,0,0,0,8,2006,WD
2,2414,528218130,60,RL,58,17104,Pave,IR1,Lvl,AllPub,...,100,24,0,0,0,0,0,9,2006,New
3,1989,902207150,30,RM,60,8520,Pave,Reg,Lvl,AllPub,...,0,0,184,0,0,0,0,7,2007,WD
4,625,535105100,20,RL,69,9500,Pave,IR1,Lvl,AllPub,...,0,76,0,0,185,0,0,7,2009,WD
5,333,923228370,160,RM,21,1890,Pave,Reg,Lvl,AllPub,...,0,0,64,0,0,0,0,6,2010,WD
6,1327,902427150,20,RM,52,8516,Pave,Reg,Lvl,AllPub,...,0,0,0,0,0,0,0,5,2008,WD
7,858,907202130,20,RL,69,9286,Pave,IR1,Lvl,AllPub,...,173,0,0,0,0,0,0,10,2009,WD
8,95,533208090,160,FV,39,3515,Pave,Reg,Lvl,AllPub,...,0,111,0,0,0,0,0,1,2010,WD
9,1568,914476010,20,RL,75,10125,Pave,Reg,Lvl,AllPub,...,238,83,0,0,0,0,0,2,2008,WD


In [34]:
# Casting 'ms_subclass' into a string type object.
test_df['ms_subclass'] = test_df['ms_subclass'].astype(str)

In [35]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 76 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               878 non-null    int64  
 1   pid              878 non-null    int64  
 2   ms_subclass      878 non-null    object 
 3   ms_zoning        878 non-null    object 
 4   lot_frontage     878 non-null    int32  
 5   lot_area         878 non-null    int64  
 6   street           878 non-null    object 
 7   lot_shape        878 non-null    object 
 8   land_contour     878 non-null    object 
 9   utilities        878 non-null    object 
 10  lot_config       878 non-null    object 
 11  land_slope       878 non-null    object 
 12  neighborhood     878 non-null    object 
 13  condition_1      878 non-null    object 
 14  condition_2      878 non-null    object 
 15  bldg_type        878 non-null    object 
 16  house_style      878 non-null    object 
 17  overall_qual    

In [50]:
test_df_filtered = pd.get_dummies(test_df)
test_df_filtered.dropna(inplace=True)

In [51]:
test_df_filtered.head()

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,overall_qual,year_built,year_remod/add,mas_vnr_area,bsmtfin_sf_1,...,sale_type_COD,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_VWD,sale_type_WD
0,2658,902301120,190,69,9142,6,1910,1950,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2718,905108090,90,69,9662,5,1977,1977,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2414,528218130,60,58,17104,7,2006,2006,0,554,...,0,0,0,0,0,0,1,0,0,0
3,1989,902207150,30,60,8520,5,1923,2006,0,0,...,0,0,0,0,0,0,0,0,0,1
4,625,535105100,20,69,9500,6,1963,1963,247,609,...,0,0,0,0,0,0,0,0,0,1


In [37]:
# Looking at unique values in 'kitchen_qual'
test_df['kitchen_qual'].unique()

array(['Fa', 'TA', 'Gd', 'Ex', 'Po'], dtype=object)

# Step 2 
### Featire Engingeering and dummification

In [38]:
# Engineering features to match
test_df['total_sqft'] = test_df['total_bsmt_sf'] + test_df['gr_liv_area']
test_df['total_sqft_factor'] = test_df['total_bsmt_sf'] * test_df['gr_liv_area']
test_df['garage_by_cars'] = test_df['garage_area'] * test_df['garage_cars']
test_df['quality_by_area'] = test_df['overall_qual'] * test_df['gr_liv_area']
test_df['quality_by_rooms'] = test_df['overall_qual'] * test_df['totrms_abvgrd']
test_df['total_baths'] = test_df['full_bath'] + test_df['half_bath'] + test_df['bsmt_full_bath'] + test_df['bsmt_half_bath']
test_df['quality_by_baths'] = test_df['overall_qual'] * test_df['total_baths']
test_df['total_baths_abvgrd'] = test_df['full_bath'] + test_df['half_bath']
test_df['quality_by_abvgrd_baths'] = test_df['overall_qual'] * test_df['total_baths_abvgrd']
test_df['baths_by_cars'] = test_df['total_baths_abvgrd'] * test_df['garage_cars']
test_df['quality_by_kitchenqual'] = test_df['overall_qual'] * test_df['kitchen_qual']
test_df['age_at_sale'] = test_df['yr_sold'] - test_df['year_built']
test_df['quality_by_age'] = test_df['overall_qual'] * test_df['age_at_sale']

In [39]:
# Creating a dataframe of dummy columns for nominal categories I'd like to further examine, and saving to a new dataframe, 'dummies2'.
dummies2 = pd.get_dummies(test_df[['neighborhood', 'house_style', 'ms_subclass', 'ms_zoning',
                                   'lot_config', 'condition_1', 'foundation', 'heating', 'garage_type',
                                   'exterior_1st']], drop_first = True)
dummies2.head()

Unnamed: 0,neighborhood_Blueste,neighborhood_BrDale,neighborhood_BrkSide,neighborhood_ClearCr,neighborhood_CollgCr,neighborhood_Crawfor,neighborhood_Edwards,neighborhood_Gilbert,neighborhood_Greens,neighborhood_IDOTRR,...,exterior_1st_BrkFace,exterior_1st_CemntBd,exterior_1st_HdBoard,exterior_1st_MetalSd,exterior_1st_Plywood,exterior_1st_PreCast,exterior_1st_Stucco,exterior_1st_VinylSd,exterior_1st_Wd Sdng,exterior_1st_WdShing
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [41]:
dummies2.shape

(878, 89)

In [40]:
# Creating a list of features to drop before merging
test_df_feats = test_df.drop(columns = ['neighborhood', 'house_style', 'ms_subclass', 'ms_zoning',
                                   'lot_config', 'condition_1', 'foundation', 'heating', 'garage_type',
                                   'exterior_1st'])

In [42]:
# With a little help from Stephen and Adam/Study Group
# Using pd.concat() to merge my dummified dataframe and my cleaned dataframe, minus the columns that were dummied.
cd_test_df_init = pd.concat([dummies2, test_df_feats], axis = 1)

## Section 3
### Matching DataFrames

In [43]:
training_clean = pd.read_csv('train_clean.csv')
training_clean.head()

Unnamed: 0.1,Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,overall_qual,year_built,year_remod/add,mas_vnr_area,...,garage_finish_Unf,sale_type_COD,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_WD
0,0,109,533352170,60,69,13517,6,1976,2005,289,...,0,0,0,0,0,0,0,0,0,1
1,1,544,531379050,60,43,11492,7,1996,1997,132,...,0,0,0,0,0,0,0,0,0,1
2,2,153,535304180,20,68,7922,5,1953,2007,0,...,1,0,0,0,0,0,0,0,0,1
3,3,318,916386060,60,73,9802,5,2006,2007,0,...,0,0,0,0,0,0,0,0,0,1
4,4,255,906425045,50,82,14235,6,1900,1993,0,...,1,0,0,0,0,0,0,0,0,1


In [44]:
# Creating an empty list to hold the features I'll need to drop from my training dataframe to match my testing.
drop_cols = []

# Iterating through features in my training dataframe using a 'for' loop.
for feature in training_clean.columns:
    # If the feature is sale price or sale ID, then ignore it.
    if feature == 'saleprice' or feature == 'id':
        continue
    # Otherwise, if the feature is not also found in the testing dataframe, then...
    elif feature not in cd_test_df_init.columns:
        # add those features to the list of columns to drop.
        drop_cols.append(feature)

In [46]:
# Drop features in 'drop_cols' list from training dataframe.
training_clean.drop(columns = drop_cols, inplace = True)

KeyError: "['Unnamed: 0' 'ms_subclass' 'ms_zoning_A (agr)' 'ms_zoning_C (all)'\n 'street_Grvl' 'street_Pave' 'lot_shape_IR1' 'lot_shape_IR2'\n 'lot_shape_IR3' 'lot_shape_Reg' 'land_contour_Bnk' 'land_contour_HLS'\n 'land_contour_Low' 'land_contour_Lvl' 'utilities_AllPub'\n 'utilities_NoSeWa' 'utilities_NoSewr' 'lot_config_Corner'\n 'land_slope_Gtl' 'land_slope_Mod' 'land_slope_Sev' 'neighborhood_Blmngtn'\n 'neighborhood_GrnHill' 'neighborhood_Landmrk' 'condition_1_Artery'\n 'condition_2_Artery' 'condition_2_Feedr' 'condition_2_Norm'\n 'condition_2_PosA' 'condition_2_PosN' 'condition_2_RRAe'\n 'condition_2_RRAn' 'condition_2_RRNn' 'bldg_type_1Fam' 'bldg_type_2fmCon'\n 'bldg_type_Duplex' 'bldg_type_Twnhs' 'bldg_type_TwnhsE'\n 'house_style_1.5Fin' 'roof_style_Flat' 'roof_style_Gable'\n 'roof_style_Gambrel' 'roof_style_Hip' 'roof_style_Mansard'\n 'roof_style_Shed' 'roof_matl_CompShg' 'roof_matl_Membran'\n 'roof_matl_Tar&Grv' 'roof_matl_WdShake' 'roof_matl_WdShngl'\n 'exterior_1st_AsbShng' 'exterior_1st_CBlock' 'exterior_1st_ImStucc'\n 'exterior_1st_Stone' 'exterior_2nd_AsbShng' 'exterior_2nd_AsphShn'\n 'exterior_2nd_Brk Cmn' 'exterior_2nd_BrkFace' 'exterior_2nd_CBlock'\n 'exterior_2nd_CmentBd' 'exterior_2nd_HdBoard' 'exterior_2nd_ImStucc'\n 'exterior_2nd_MetalSd' 'exterior_2nd_Plywood' 'exterior_2nd_Stone'\n 'exterior_2nd_Stucco' 'exterior_2nd_VinylSd' 'exterior_2nd_Wd Sdng'\n 'exterior_2nd_Wd Shng' 'mas_vnr_type_BrkCmn' 'mas_vnr_type_BrkFace'\n 'mas_vnr_type_None' 'mas_vnr_type_Stone' 'exter_qual_Ex' 'exter_qual_Fa'\n 'exter_qual_Gd' 'exter_qual_TA' 'foundation_BrkTil' 'bsmt_exposure_Av'\n 'bsmt_exposure_Gd' 'bsmt_exposure_Mn' 'bsmt_exposure_No'\n 'bsmtfin_type_1_ALQ' 'bsmtfin_type_1_BLQ' 'bsmtfin_type_1_GLQ'\n 'bsmtfin_type_1_LwQ' 'bsmtfin_type_1_Rec' 'bsmtfin_type_1_Unf'\n 'bsmtfin_type_2_ALQ' 'bsmtfin_type_2_BLQ' 'bsmtfin_type_2_GLQ'\n 'bsmtfin_type_2_LwQ' 'bsmtfin_type_2_Rec' 'bsmtfin_type_2_Unf'\n 'heating_OthW' 'heating_Wall' 'heating_qc_Ex' 'heating_qc_Fa'\n 'heating_qc_Gd' 'heating_qc_Po' 'heating_qc_TA' 'electrical_FuseA'\n 'electrical_FuseF' 'electrical_FuseP' 'electrical_Mix' 'electrical_SBrkr'\n 'kitchen_qual_Ex' 'kitchen_qual_Fa' 'kitchen_qual_Gd' 'kitchen_qual_TA'\n 'functional_Maj1' 'functional_Maj2' 'functional_Min1' 'functional_Min2'\n 'functional_Mod' 'functional_Sal' 'functional_Sev' 'functional_Typ'\n 'fireplace_qu_Ex' 'fireplace_qu_Fa' 'fireplace_qu_Gd' 'fireplace_qu_Po'\n 'fireplace_qu_TA' 'garage_type_2Types' 'garage_finish_Fin'\n 'garage_finish_RFn' 'garage_finish_Unf' 'sale_type_COD' 'sale_type_CWD'\n 'sale_type_Con' 'sale_type_ConLD' 'sale_type_ConLI' 'sale_type_ConLw'\n 'sale_type_New' 'sale_type_Oth' 'sale_type_WD '] not found in axis"

In [47]:
training_clean.shape

(2049, 101)

## Exporting to CSV

In [51]:
test_df_filtered = test_df

In [53]:
test_df_filtered.to_csv("test_clean1.csv", index=False)

In [52]:
training_clean.to_csv('train_clean.csv', index = False)