In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm

from scipy import stats
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.neighbors import KNeighborsClassifier

In [2]:
test_df = pd.read_csv ("test.csv")

In [3]:
test_df.shape

(878, 80)

In [4]:
test_df.dtypes

Id                int64
PID               int64
MS SubClass       int64
MS Zoning        object
Lot Frontage    float64
                 ...   
Misc Feature     object
Misc Val          int64
Mo Sold           int64
Yr Sold           int64
Sale Type        object
Length: 80, dtype: object

In [5]:
test_df.columns = test_df.columns.str.lower().str.replace(" ","_")

In [6]:
test_df.columns

Index(['id', 'pid', 'ms_subclass', 'ms_zoning', 'lot_frontage', 'lot_area',
       'street', 'alley', 'lot_shape', 'land_contour', 'utilities',
       'lot_config', 'land_slope', 'neighborhood', 'condition_1',
       'condition_2', 'bldg_type', 'house_style', 'overall_qual',
       'overall_cond', 'year_built', 'year_remod/add', 'roof_style',
       'roof_matl', 'exterior_1st', 'exterior_2nd', 'mas_vnr_type',
       'mas_vnr_area', 'exter_qual', 'exter_cond', 'foundation', 'bsmt_qual',
       'bsmt_cond', 'bsmt_exposure', 'bsmtfin_type_1', 'bsmtfin_sf_1',
       'bsmtfin_type_2', 'bsmtfin_sf_2', 'bsmt_unf_sf', 'total_bsmt_sf',
       'heating', 'heating_qc', 'central_air', 'electrical', '1st_flr_sf',
       '2nd_flr_sf', 'low_qual_fin_sf', 'gr_liv_area', 'bsmt_full_bath',
       'bsmt_half_bath', 'full_bath', 'half_bath', 'bedroom_abvgr',
       'kitchen_abvgr', 'kitchen_qual', 'totrms_abvgrd', 'functional',
       'fireplaces', 'fireplace_qu', 'garage_type', 'garage_yr_blt',
       'g

In [7]:
test_df.isnull().sum()

id                0
pid               0
ms_subclass       0
ms_zoning         0
lot_frontage    160
               ... 
misc_feature    837
misc_val          0
mo_sold           0
yr_sold           0
sale_type         0
Length: 80, dtype: int64

In [8]:
# using .any() from Pandas documentation to show only columns with nulls, as there are too many to show above
# AND how many -- if just inputting 'null_locations', 
# will just get string of columns that contain nulls, not amounts

null_locations = test_df.columns[test_df.isnull().any()]
test_df[null_locations].isnull().sum()

lot_frontage      160
alley             820
mas_vnr_type        1
mas_vnr_area        1
bsmt_qual          25
bsmt_cond          25
bsmt_exposure      25
bsmtfin_type_1     25
bsmtfin_type_2     25
electrical          1
fireplace_qu      422
garage_type        44
garage_yr_blt      45
garage_finish      45
garage_qual        45
garage_cond        45
pool_qc           874
fence             706
misc_feature      837
dtype: int64

In [59]:
# Dropping columns that appears to be primarily null values:

test_df.drop(columns=['alley', 'pool_qc', 'fence', 'misc_feature'], inplace=True)

In [58]:
test_df.drop(columns=['enclosed_porch', 'kitchen_abvgr', 'bedroom_abvgr', 'low_qual_fin_sf','misc_val', '3ssn_porch', 
                       'bsmt_half_bath', 'screen_porch', 'pool_area', 'garage_cond', 'bsmt_unf_sf', 'garage_qual', 'bsmt_unf_sf', 'bsmt_qual', 'exter_cond', 'overall_cond',
                      'bsmt_cond', 'bsmtfin_sf_2'], inplace=True)

In [60]:
# train_df.drop(columns=['enclosed_porch', 'kitchen_abvgr', 'bedroom_abvgr', 'low_qual_fin_sf','misc_val', '3ssn_porch', 
#                       'bsmt_half_bath', 'screen_porch', 'pool_area', 'garage_cond', 'bsmt_unf_sf', 'garage_qual', 'bsmt_unf_sf', 'bsmt_qual', 'exter_cond', 'overall_cond',
#                      'bsmt_cond', 'bsmtfin_sf_2'], inplace=True)

#Sidni

In [61]:
null_locations = test_df.columns[test_df.isnull().any()]
test_df[null_locations].isnull().sum()

bsmt_exposure      25
bsmtfin_type_1     25
bsmtfin_type_2     25
electrical          1
fireplace_qu      422
garage_type        44
garage_finish      45
dtype: int64

In [63]:
## MIGHT NOT BE NEEDED, HAS BEEN RUN ON THIS KERNEL
features = ['mas_vnr_type', 'bsmt_qual', 'bsmt_cond', 'bsmt_exposure', 'bsmtfin_type_1', 'bsmtfin_type_2',
            'garage_cond']
null_replace(features)

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,...,garage_yr_blt,garage_finish,garage_cars,garage_area,paved_drive,wood_deck_sf,open_porch_sf,mo_sold,yr_sold,sale_type
0,2658,902301120,190,RM,69,9142,Pave,Reg,Lvl,AllPub,...,1910,Unf,1,440,Y,0,60,4,2006,WD
1,2718,905108090,90,RL,69,9662,Pave,IR1,Lvl,AllPub,...,1977,Fin,2,580,Y,170,0,8,2006,WD
2,2414,528218130,60,RL,58,17104,Pave,IR1,Lvl,AllPub,...,2006,RFn,2,426,Y,100,24,9,2006,New
3,1989,902207150,30,RM,60,8520,Pave,Reg,Lvl,AllPub,...,1935,Unf,2,480,N,0,0,7,2007,WD
4,625,535105100,20,RL,69,9500,Pave,IR1,Lvl,AllPub,...,1963,RFn,2,514,Y,0,76,7,2009,WD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
873,1662,527377110,60,RL,80,8000,Pave,Reg,Lvl,AllPub,...,1974,Unf,2,488,Y,0,96,11,2007,WD
874,1234,535126140,60,RL,90,14670,Pave,Reg,Lvl,AllPub,...,1966,RFn,2,480,Y,0,230,8,2008,WD
875,1373,904100040,20,RL,55,8250,Pave,Reg,Lvl,AllPub,...,1968,Unf,1,322,Y,0,63,8,2008,WD
876,1672,527425140,20,RL,60,9000,Pave,Reg,Lvl,AllPub,...,1974,Unf,2,528,Y,0,0,5,2007,WD


In [64]:
# Function to fill remaining null values with 'None' as opposed to a NaN
def null_replace(features):
    for feature in features:
        test_df[feature].replace(to_replace = np.nan, value = 'None', inplace = True)

        return test_df

In [65]:
test_df['mas_vnr_area'].replace(to_replace = np.nan, value=0)

0        0
1        0
2        0
3        0
4      247
      ... 
873      0
874    410
875      0
876      0
877      0
Name: mas_vnr_area, Length: 878, dtype: int32

In [66]:
test_df['mas_vnr_area'].unique()

array([   0,  247,   23,   98,  104,  156,  180,   44,   76,   70,  352,
        162,  444,  495,  340,  634,  182,  147,  108,   20,  423,  178,
        359,   75,  161,  674,  100,  306,  509,  653,  450,  360,  680,
        112,   72,  440, 1378,  304,  364,  754,  788,  230,  368,  120,
        113,  216,  371,  153,  151,  396,  215,  472,  500,  468,   14,
         50,   96,   99,  342,  174,  310,  114,   74,  270,  260,  123,
        218,  415,  921,  771,  726,   16,  362,  473,  870, 1224,  285,
        420,  137,  259,   82,  632,  170,  408,   53,  532,  286,  206,
        308,  405,  128,  236,  350,  302,  256,  657,  194,  567,  116,
         65,  305,  188,  281,  300,  198,   95,  481,  226,  459,  480,
        422,  877,  166,  149,  190,  189,  492,  205,  130,  250,  223,
        280,  435,  229,  438,  975,   67,  150,  196,   80,  380,   94,
        594,  288,  209,   54,  266, 1170,  227,  240,   89,  246,  263,
        169,  621,  252,   18,  232,  200,   90,  4

In [67]:
test_df.fillna(test_df.mean(), inplace=True)

In [68]:
def flt_to_int(features2):
    for feature in features2:
        test_df[feature] = test_df[feature].astype(int)
    return test_df

In [69]:
features2 = ['garage_yr_blt', 'lot_frontage', 'mas_vnr_area', 'bsmtfin_sf_1', 'bsmtfin_sf_2', 'bsmt_unf_sf', 
             'total_bsmt_sf', 'bsmt_full_bath']
flt_to_int(features2)

KeyError: 'bsmtfin_sf_2'

In [70]:
null_locations = test_df.columns[test_df.isnull().any()]
test_df[null_locations].isnull().sum()

bsmt_exposure      25
bsmtfin_type_1     25
bsmtfin_type_2     25
electrical          1
fireplace_qu      422
garage_type        44
garage_finish      45
dtype: int64

In [71]:
test_df['mas_vnr_area'].replace(to_replace = np.nan, value=0)

0        0
1        0
2        0
3        0
4      247
      ... 
873      0
874    410
875      0
876      0
877      0
Name: mas_vnr_area, Length: 878, dtype: int32

In [72]:
test_df['mas_vnr_area'].unique()

array([   0,  247,   23,   98,  104,  156,  180,   44,   76,   70,  352,
        162,  444,  495,  340,  634,  182,  147,  108,   20,  423,  178,
        359,   75,  161,  674,  100,  306,  509,  653,  450,  360,  680,
        112,   72,  440, 1378,  304,  364,  754,  788,  230,  368,  120,
        113,  216,  371,  153,  151,  396,  215,  472,  500,  468,   14,
         50,   96,   99,  342,  174,  310,  114,   74,  270,  260,  123,
        218,  415,  921,  771,  726,   16,  362,  473,  870, 1224,  285,
        420,  137,  259,   82,  632,  170,  408,   53,  532,  286,  206,
        308,  405,  128,  236,  350,  302,  256,  657,  194,  567,  116,
         65,  305,  188,  281,  300,  198,   95,  481,  226,  459,  480,
        422,  877,  166,  149,  190,  189,  492,  205,  130,  250,  223,
        280,  435,  229,  438,  975,   67,  150,  196,   80,  380,   94,
        594,  288,  209,   54,  266, 1170,  227,  240,   89,  246,  263,
        169,  621,  252,   18,  232,  200,   90,  4

In [73]:
test_df.fillna(test_df.mean(), inplace=True)

In [74]:
features2 = ['garage_yr_blt', 'lot_frontage', 'mas_vnr_area', 'bsmtfin_sf_1', 'bsmtfin_sf_2', 'bsmt_unf_sf', 
             'total_bsmt_sf', 'bsmt_full_bath']
flt_to_int(features2)

KeyError: 'bsmtfin_sf_2'

In [75]:
null_locations = test_df.columns[test_df.isnull().any()]
test_df[null_locations].isnull().sum()

bsmt_exposure      25
bsmtfin_type_1     25
bsmtfin_type_2     25
electrical          1
fireplace_qu      422
garage_type        44
garage_finish      45
dtype: int64

In [30]:
test_df.shape

(878, 80)

In [31]:
test_df['paved_drive'].value_counts()

Y    790
N     65
P     23
Name: paved_drive, dtype: int64

In [None]:
# train_df['central_air'] = train_df['central_air'].map({'Y':1, 'N':0})
# train_df['paved_drive'] = train_df['paved_drive'].map({'Y':1, 'N':0, 'P':0})

In [36]:
def ordinals_to_num(df):
    for i in test_df.columns:
        if i in ordinals:
            test_df[i].map(map_dict)
            pd.to_numeric(arg=test_df[i], errors="coerce")
    return test_df

In [37]:
map_dict = {"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1, "NA":0}
ordinals = ["exter_qual", "exter_cond", "bsmt_qual", "bsmt_cond", "kitchenqual", "fireplacequ", "garage_qual", "garage_cond"]


In [38]:
dummy_test = ordinals_to_num(test_df)

In [39]:
dummy_test.columns

Index(['id', 'pid', 'ms_subclass', 'ms_zoning', 'lot_frontage', 'lot_area',
       'street', 'alley', 'lot_shape', 'land_contour', 'utilities',
       'lot_config', 'land_slope', 'neighborhood', 'condition_1',
       'condition_2', 'bldg_type', 'house_style', 'overall_qual',
       'overall_cond', 'year_built', 'year_remod/add', 'roof_style',
       'roof_matl', 'exterior_1st', 'exterior_2nd', 'mas_vnr_type',
       'mas_vnr_area', 'exter_qual', 'exter_cond', 'foundation', 'bsmt_qual',
       'bsmt_cond', 'bsmt_exposure', 'bsmtfin_type_1', 'bsmtfin_sf_1',
       'bsmtfin_type_2', 'bsmtfin_sf_2', 'bsmt_unf_sf', 'total_bsmt_sf',
       'heating', 'heating_qc', 'central_air', 'electrical', '1st_flr_sf',
       '2nd_flr_sf', 'low_qual_fin_sf', 'gr_liv_area', 'bsmt_full_bath',
       'bsmt_half_bath', 'full_bath', 'half_bath', 'bedroom_abvgr',
       'kitchen_abvgr', 'kitchen_qual', 'totrms_abvgrd', 'functional',
       'fireplaces', 'fireplace_qu', 'garage_type', 'garage_yr_blt',
       'g

In [40]:
test_df.isnull().sum()

id                0
pid               0
ms_subclass       0
ms_zoning         0
lot_frontage      0
               ... 
misc_feature    837
misc_val          0
mo_sold           0
yr_sold           0
sale_type         0
Length: 80, dtype: int64

In [76]:
test_df_filtered = pd.get_dummies(test_df)
test_df_filtered.dropna(inplace=True)

In [81]:
test_df_filtered.head()

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,overall_qual,year_built,year_remod/add,mas_vnr_area,bsmtfin_sf_1,...,sale_type_COD,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_VWD,sale_type_WD
0,2658,902301120,190,69,9142,6,1910,1950,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2718,905108090,90,69,9662,5,1977,1977,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2414,528218130,60,58,17104,7,2006,2006,0,554,...,0,0,0,0,0,0,1,0,0,0
3,1989,902207150,30,60,8520,5,1923,2006,0,0,...,0,0,0,0,0,0,0,0,0,1
4,625,535105100,20,69,9500,6,1963,1963,247,609,...,0,0,0,0,0,0,0,0,0,1


In [82]:
dummy_nulls = dummy_test[dummy_test.isnull().any()]
dummy_test[dummy_nulls].isnull.sum()

  dummy_nulls = dummy_test[dummy_test.isnull().any()]


IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).

In [83]:
dummy_test['kitchen_qual'].unique()

array(['Fa', 'TA', 'Gd', 'Ex', 'Po'], dtype=object)

In [None]:
# SALE PRICE, TAKE MEAN AND PREDICT FOR EVERY ONE

In [80]:
test_df_filtered.to_csv("test_clean.csv")