In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm
%matplotlib inline

In [5]:
# Read the training and test CSV files; the first column of each file is used
# as the row index.
train, test = (
    pd.read_csv(csv_path, index_col=0) for csv_path in ("train.csv", "test.csv")
)

def process(train, column='Lot Area', threshold=3):
    """Drop incomplete rows, then drop rows with an unusually large `column`.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame to clean. It is not modified; a new frame is returned.
    column : str, default 'Lot Area'
        Numeric column whose z-score is used for the outlier filter.
    threshold : float, default 3
        Rows are kept when their z-score is strictly below this value.

    Returns
    -------
    pandas.DataFrame
        Rows of `train` that contain no missing value in any column and whose
        `column` z-score is below `threshold`.

    Notes
    -----
    The filter is one-sided: only unusually large values are removed, small
    ones are kept. The z-score uses the population standard deviation
    (``ddof=0``), computed after the incomplete rows have been dropped.
    """
    complete_rows = train.dropna()
    values = complete_rows[column]
    center = values.mean()
    spread = values.std(ddof=0)

    # With a zero (or undefined, e.g. empty input) spread every z-score would
    # be NaN and the comparison below would silently discard all rows.
    if not spread > 0:
        return complete_rows

    z_scores = (values - center) / spread
    return complete_rows.loc[z_scores < threshold, :]

In [6]:
# Frames handed to fill(). Copies are used because fill() rewrites columns in
# place and is not safe to run twice on the same frame.
# NOTE(review): this cell originally referenced `train1`/`test1`, which are not
# defined anywhere in the visible notebook and fail on a fresh kernel; copies
# of `train`/`test` are assumed to be what was meant -- confirm.
tt = [train.copy(), test.copy()]
# Adjust the column names for these
def fill(tt):
    """Impute missing values and integer-encode categorical columns in place.

    Parameters
    ----------
    tt : sequence of pandas.DataFrame
        Frames to clean. Every frame is modified in place and must contain all
        of the columns handled below. At least two frames are required because
        the first two are returned.

    Returns
    -------
    tuple
        ``(tt[0], tt[1])`` -- the same objects that were passed in, modified.

    Notes
    -----
    * Categories that are absent from a column's mapping become NaN.
    * Not idempotent: a second call on the same frame maps the already
      encoded integers and turns the encoded columns into NaN.
    * Missing basement / garage descriptors are filled with the
      'No basement' / 'No Garage' labels before encoding, following the same
      fill-then-map pattern used for 'Pool QC' and 'Fence'. Without that fill
      the codes reserved for those labels in the mappings can never be hit.
    """
    # Mappings are loop-invariant, so they are built once.
    quality_0_to_4 = {'Ex':4,'Gd':3,'TA':2,'Fa':1,'Po':0}
    basement_quality = {'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1,'No basement':0}
    basement_finish = {'GLQ':6,'ALQ':5,'BLQ':4,'Rec':3,'LwQ':2,'Unf':1,'No basement':0}
    garage_quality = {'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1,'No Garage':0}

    # Label written into missing cells of each categorical column.
    missing_labels = {
        'Alley': 'No alley',
        'Fireplace Qu': 'No fireplace',
        'Pool QC': 'No pool',
        'Fence': 'No fence',
        'Misc Feature': 'No',
        'Bsmt Qual': 'No basement',
        'Bsmt Cond': 'No basement',
        'Bsmt Exposure': 'No basement',
        'BsmtFin Type 1': 'No basement',
        'BsmtFin Type 2': 'No basement',
        'Garage Type': 'No Garage',
        'Garage Finish': 'No Garage',
        'Garage Qual': 'No Garage',
        'Garage Cond': 'No Garage',
    }

    # Category -> integer code for every encoded column.
    codes_by_column = {
        'Exter Qual': quality_0_to_4,
        'Exter Cond': quality_0_to_4,
        'Bsmt Qual': basement_quality,
        'Bsmt Cond': basement_quality,
        'Bsmt Exposure': {'Gd':4,'Av':3,'Mn':2,'No':1,'No basement':0},
        'BsmtFin Type 1': basement_finish,
        'BsmtFin Type 2': basement_finish,
        'Heating QC': quality_0_to_4,
        'Central Air': {'Y':1,'N':0},
        'Kitchen Qual': quality_0_to_4,
        'Functional': {'Typ':7,'Min1':6,'Min2':5,'Mod':4,'Maj1':3,'Maj2':2,'Sev':1,'Sal':0},
        'Garage Finish': {'Fin':3,'RFn':2,'Unf':1,'No Garage':0},
        'Garage Qual': garage_quality,
        'Garage Cond': garage_quality,
        'Garage Type': {'No Garage':6,'Attchd':5,'Detchd':4,'Basment':3,'2Types':2,'BuiltIn':1,'CarPort':0},
        'Sale Condition': {'Abnorml':5,'AdjLand':4,'Alloca':3,'Family':2,'Normal':1,'Partial':0},
        'Pool QC': {'Ex':4,'Gd':3,'TA':2,'Fa':1,'No pool':0},
        'Bldg Type': {'1Fam':0, '2fmCon':1,'Duplex':2,'Twnhs':3,'TwnhsE':4},
        'Fence': {'GdPrv':4,'MnPrv':3,'GdWo':2,'MnWw':1,'No fence':0},
        # 'SFoyer' is the spelling used by the data; the earlier misspelling
        # 'SFoyre' is kept as an alias so nothing that used to map changes.
        'House Style': {'1Story':0,'2Story':1,'1.5Fin':2,'SLvl':3,'SFoyer':4,'SFoyre':4,'2.5Unf':5,'1.5Unf':6,'2.5Fin':7,'':8},
    }

    for df in tt:
        # Columns are reassigned instead of calling fillna(inplace=True) on a
        # column selection: the in-place form acts on a temporary object and
        # does not reliably update the frame (it is a no-op under pandas
        # Copy-on-Write).
        df['Lot Frontage'] = df['Lot Frontage'].fillna(df['Lot Frontage'].mean())
        df['Mas Vnr Area'] = df['Mas Vnr Area'].fillna(df['Mas Vnr Area'].median())
        df['Garage Yr Blt'] = df['Garage Yr Blt'].fillna(1)

        for column, label in missing_labels.items():
            df[column] = df[column].fillna(label)

        for column, codes in codes_by_column.items():
            df[column] = df[column].map(codes)

    train = tt[0]
    test = tt[1]
    return train,test

In [11]:
# Select the predictor columns plus the target, then clean them with process().
# NOTE(review): 'Overall Qual' is listed twice, so X carries that column twice.
xtrain = train[[
    'Lot Frontage',
    'Lot Area',
    'Overall Qual',
    'Overall Cond',
    'Total Bsmt SF',
    '1st Flr SF',
    '2nd Flr SF',
    'Gr Liv Area',
    'Garage Area',
    'Wood Deck SF',
    'Overall Qual',
    'Year Remod/Add',
    'Bedroom AbvGr',
    'TotRms AbvGrd',
    'Garage Cars',
    'SalePrice',
]]
newtrain = process(xtrain)

# Target vector and predictor matrix for the regression.
y = newtrain['SalePrice']
X = newtrain.drop(columns='SalePrice')

In [12]:
def linreg(x,y,x2):
    """Fit an OLS model with an intercept on (x, y) and predict for x2.

    Parameters
    ----------
    x : array-like or pandas.DataFrame
        Training predictors, without an intercept column.
    y : array-like or pandas.Series
        Training response.
    x2 : array-like or pandas.DataFrame
        Predictors to score; must have the same columns, in the same order,
        as `x`.

    Returns
    -------
    Predicted response for every row of `x2`, as returned by the fitted
    model's ``predict``.
    """
    # has_constant='add' forces the intercept column onto both matrices. With
    # the default ('skip') the column is omitted whenever the data already
    # contain a constant column -- e.g. a single-row x2 -- and the prediction
    # matrix would then no longer line up with the fitted parameters.
    train_design = sm.add_constant(x, has_constant='add')
    fitted = sm.OLS(y, train_design).fit()
    predict_design = sm.add_constant(x2, has_constant='add')
    return fitted.predict(predict_design)

# Regress y on X plus an intercept column and show the in-sample R-squared as
# the cell's output.
s = sm.add_constant(X)
result = sm.OLS(endog=y, exog=s).fit()
result.rsquared

0.8533210586028859

In [13]:
def calculate_vif(r_squared):
    """Return the variance inflation factor ``1 / (1 - r_squared)``.

    Parameters
    ----------
    r_squared : float or array-like
        R-squared from regressing one predictor on the remaining predictors.

    Returns
    -------
    float or numpy.ndarray
        The VIF. ``inf`` is returned where ``r_squared >= 1`` (perfect
        collinearity, or a value pushed just past 1 by rounding) instead of
        raising ``ZeroDivisionError`` or producing a negative factor.
    """
    tolerance = 1 - r_squared

    if np.ndim(tolerance) == 0:
        if tolerance <= 0:
            return float('inf')
        return 1 / tolerance

    # Element-wise form for array input.
    tolerance = np.asarray(tolerance, dtype=float)
    with np.errstate(divide='ignore'):
        return np.where(tolerance <= 0, np.inf, 1 / tolerance)

def vif(processed_train):
    """Compute the variance inflation factor of every predictor column.

    Each column other than 'SalePrice' is regressed (OLS with an intercept) on
    all remaining predictor columns; the resulting R-squared is converted with
    calculate_vif() and rounded to three decimals.

    Parameters
    ----------
    processed_train : pandas.DataFrame
        Numeric frame that contains a 'SalePrice' column, which is excluded
        from the computation.

    Returns
    -------
    pandas.DataFrame
        One row per predictor with columns 'Variables' and 'VIF'. The frame is
        empty (but has both columns) when there are no predictors.
    """
    predictors = processed_train.drop(['SalePrice'], axis = 1)
    # A column label that appears more than once would make the selection
    # below return a 2-D target; keep only the first occurrence of each label.
    predictors = predictors.loc[:, ~predictors.columns.duplicated()]

    rows = []
    for name in predictors.columns:
        others = predictors.drop(name, axis=1).values
        target = predictors[name].values

        design = sm.add_constant(others)
        r2 = sm.OLS(target, design).fit().rsquared

        rows.append([name, round(calculate_vif(r2), 3)])

    # Built once, after the loop, so the result also exists when the loop
    # body never runs.
    vif_dataframe = pd.DataFrame(rows, columns = ['Variables', 'VIF'])
    return vif_dataframe

In [22]:
# Score every test row with the model fitted on the cleaned training data.
# process() is not applied to the test set: it drops rows that have missing
# values or a large 'Lot Area', and those rows would end up with a NaN
# 'SalePrice' once the predictions are aligned back onto `test` by index.
xtest = test[list(X.columns)]

# Missing test values are replaced by the corresponding training-column mean.
# The replacement is positional, so it relies on xtest having exactly the
# columns of X in the same order (guaranteed by the selection above).
train_means = X.to_numpy(dtype=float).mean(axis=0)
test_values = xtest.to_numpy(dtype=float)
xtest2 = pd.DataFrame(
    np.where(np.isnan(test_values), train_means, test_values),
    index=xtest.index,
    columns=xtest.columns,
)

sale_price = linreg(X,y,xtest2)
test['SalePrice'] = sale_price

In [23]:
#test.to_csv(r"C:\Users\Duker\Desktop\CS530\Quiz\Quiz 2\attempt11.csv")