In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from statsmodels.formula.api import ols
from sklearn import metrics

## Fixing Null Values 

In [2]:
# Load the raw training data from datasets/train.csv into a DataFrame.
df = pd.read_csv('datasets/train.csv')

In [3]:
# Preview the first rows to see the original column names and some missing values.
df.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [4]:
# First, rename all the columns so they are lowercase, with words joined by '_'.
# This function returns the list of new column names.
def re_name(df):
    """Build snake_case versions of a DataFrame's column names.

    Each column name is lowercased, split on whitespace, and its pieces are
    joined back together with '_'. The frame itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column names are strings.

    Returns
    -------
    list of str
        The converted names, in the same order as ``df.columns``.
    """
    return ['_'.join(name.lower().split()) for name in df.columns]

In [5]:
#list of all the new (lowercase, underscore-joined) column names, in the original column order
new_cols = re_name(df)

In [6]:
#replace the DataFrame's column names with new_cols, then preview the result
df.columns = new_cols
df.head()

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,saleprice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [7]:
#checking for nulls
#returns a list of (column, null count) tuples, one for each column that contains nulls
def null_check(df, cols):
    """Report which of the given columns contain null values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    cols : iterable
        Column labels of ``df`` to check.

    Returns
    -------
    list of (label, int)
        One ``(column, null_count)`` tuple per column that has at least one
        null, in the order given by ``cols``. Empty when no column has nulls.
    """
    nulls = []
    for col in cols:
        # Count once, and cast the numpy integer to a plain int so the result
        # matches the documented "as int" contract and displays as a bare
        # number regardless of the NumPy version's scalar repr.
        n_missing = int(df[col].isnull().sum())
        if n_missing > 0:
            nulls.append((col, n_missing))
    return nulls

In [8]:
#Takes all the nulls in the df and replaces with 0 if int/float, or 'None' if object
def fix_nulls(df):
    """Fill every null in ``df`` in place.

    Columns holding text (object or string dtype) have their nulls replaced
    with the string 'None'; every other column has its nulls replaced with 0.
    Columns without nulls are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    None
    """
    for col in df.columns:
        series = df[col]
        if not series.isnull().any():
            continue
        # Treat the dedicated string dtype like object so text columns are
        # never filled with the number 0.
        is_text = (pd.api.types.is_object_dtype(series.dtype)
                   or pd.api.types.is_string_dtype(series.dtype))
        fill_value = 'None' if is_text else 0
        # Assign the filled column back rather than calling
        # df[col].fillna(..., inplace=True): that chained in-place call acts
        # on a temporary selection and does not update df under pandas
        # Copy-on-Write.
        df[col] = series.fillna(fill_value)
    return

In [9]:
# List every column that contains nulls, with its null count, before filling.
null_check(df, df.columns)

[('lot_frontage', 330),
 ('alley', 1911),
 ('mas_vnr_type', 22),
 ('mas_vnr_area', 22),
 ('bsmt_qual', 55),
 ('bsmt_cond', 55),
 ('bsmt_exposure', 58),
 ('bsmtfin_type_1', 55),
 ('bsmtfin_sf_1', 1),
 ('bsmtfin_type_2', 56),
 ('bsmtfin_sf_2', 1),
 ('bsmt_unf_sf', 1),
 ('total_bsmt_sf', 1),
 ('bsmt_full_bath', 2),
 ('bsmt_half_bath', 2),
 ('fireplace_qu', 1000),
 ('garage_type', 113),
 ('garage_yr_blt', 114),
 ('garage_finish', 114),
 ('garage_cars', 1),
 ('garage_area', 1),
 ('garage_qual', 114),
 ('garage_cond', 114),
 ('pool_qc', 2042),
 ('fence', 1651),
 ('misc_feature', 1986)]

In [10]:
# Fill the nulls in df in place: 'None' for object columns, 0 for all other columns.
fix_nulls(df)

In [11]:
#The training DataFrame is now free of nulls (null_check returns an empty list)
null_check(df, df.columns)

[]

In [14]:
# Write the null-filled DataFrame to datasets/base_train.csv.
# NOTE(review): to_csv is called without index=False, so the row index is written as an
# extra unnamed first column -- confirm that whatever reads this file expects that.
df.to_csv('datasets/base_train.csv')