#### Function to create linear model

In [4]:
path = './data/kaggle/created/homes_grouped.csv'

In [5]:
def OLS_Model_Creation(path):
    '''
    This function takes in Aimes, Iowa housing data and spits out a scikit multi-linear regression model
    '''
    
    # import relevant libraries
    import pandas as pd
    import numpy as np
    from sklearn.linear_model import LinearRegression 
    from sklearn.model_selection import train_test_split
    
    # read data in from parent directory
    from read_path_module import read_data_relative_path
    
    #try:
    #    df_train = read_data_relative_path(relative_dataset_path = relative_path, data_type='csv')
    #except:
    #    df_train = path
        
    df_train = pd.read_csv('homes_grouped1.csv')
    
    # impute truly "missing" data (i.e. the NA's do not have significance)
    df_train['LotFrontage'] = df_train['LotFrontage'].mask(df_train['LotFrontage'].isnull(), np.random.uniform(df_train['LotFrontage'].min(), df_train['LotFrontage'].max(), size = df_train['LotFrontage'].shape))
    df_train['GarageYrBlt'] = df_train['GarageYrBlt'].mask(df_train['GarageYrBlt'].isnull(), np.random.uniform(df_train['GarageYrBlt'].min(), df_train['GarageYrBlt'].max(), size = df_train['GarageYrBlt'].shape))
    df_train['MasVnrArea'] = df_train['MasVnrArea'].mask(df_train['MasVnrArea'].isnull(), np.random.uniform(df_train['MasVnrArea'].min(), df_train['MasVnrArea'].max(), size = df_train['MasVnrArea'].shape))

    # fill rest of NA's with Nothing to add categorical meaning (i.e. a NA in poolQC means that there is no pool)
    df_train.fillna('Nothing', inplace = True)  
    
    # drop non-needed columns
    df = df_train.drop(['Id'], axis = 1)
    df = df.drop(['Unnamed: 0'], axis = 1) 
    
    # create df copy and isolate the categorical columns for dummification
    categorical = ['Alley', 'BldgType_group', 'BsmtCond_group', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual_group', 'CentralAir', 'Condition1_group', 'Electrical_group', 'ExterCond_group', 'ExterQual', 'Exterior1st_group', 'Exterior2nd_group', 'Fence', 'FireplaceQu', 'Foundation_group', 'GarageCond_group', 'GarageFinish', 'GarageQual', 'GarageType', 'HeatingQC_group', 'HouseStyle_group', 'KitchenQual', 'LandContour_group', 'LandSlope', 'LotConfig_group', 'LotShape_group', 'MS_Zoning_group', 'MasVnrType_group', 'Neighborhood', 'PavedDrive', 'PoolQC', 'RoofStyle_group', 'SaleCondition_group', 'SaleType_group']
    df_1 = df[categorical]
    df_dum = pd.get_dummies(df_1, drop_first = True)  
    
    # create df copy of numerical variables and concatenate this with the dummified df
    df_num = df[['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice']] 
    df = pd.concat([df_num, df_dum], axis = 1) 
    
    # Filter out outliers
    mult_upper = 4
    mult_lower = 1.5
    med = df['SalePrice'].median()
    mean = df['SalePrice'].mean()
    std = df['SalePrice'].std()
    df = df.loc[(df['SalePrice'] > med - (mult_lower * std) ) & (df['SalePrice'] < med + (mult_upper * std))]
    
    # create X and y      
    X = df.drop(['SalePrice'], axis = 1)
    y = np.log(df['SalePrice'])
    
    # train_test_split   
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)  
    
    # linear regression   
    lin_reg = LinearRegression()
    lin_reg.fit(X_train, y_train)
    
    # fit model
    lin_reg = LinearRegression()
    lin_reg.fit(X_train, y_train)
    
    return lin_reg, X_train, X_test, y_train, y_test

#### Run function and return linear model and processed datasets

In [6]:
lin_reg, X_train, X_test, y_train, y_test = OLS_Model_Creation(path)

FileNotFoundError: [Errno 2] File homes_grouped1.csv does not exist: 'homes_grouped1.csv'

#### Check model results

In [18]:
import pandas as pd
import numpy as np

print(f'ordinary linear regression score: {lin_reg.score(X_train, y_train)}', '\n')  
print(f'ordinary linear regression intercept: {np.exp(lin_reg.intercept_) }', '\n')
coefficients = pd.DataFrame(np.exp(lin_reg.coef_), X_train.columns, columns = ['Coefficients'])   
coefficients.head(5)

ordinary linear regression score: 0.9185877904485831 

ordinary linear regression intercept: 500764.3883166676 



Unnamed: 0,Coefficients
MSSubClass,0.99979
LotFrontage,1.00006
LotArea,1.000003
OverallQual,1.043146
OverallCond,1.046659


In [19]:
predictions = lin_reg.predict(X_test)
comparison = pd.DataFrame({'Actual': np.exp(y_test), 'Predictions': np.exp(predictions)})
comparison.head(5)  

Unnamed: 0,Actual,Predictions
593,140000.0,145631.81789
962,155000.0,163203.299653
746,236000.0,228893.892507
77,127000.0,121653.417104
223,97000.0,110697.767244
