In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, LassoCV, Lasso
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from scipy import stats
import statsmodels.api as sm
from datetime import date
from sklearn import metrics

In [2]:
# Training and testing frames from ../frames, read in that order.
ames, ames_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../frames/training.csv', '../frames/testing.csv')
)
# The raw training CSV is kept alongside: it is easier to work with for the
# categorical values.
ames_raw = pd.read_csv('../data/train.csv')

In [3]:
# shout out: https://stackoverflow.com/a/65663400/16451694
def num_na(df, value):
    '''
        Return a copy of ``df`` in which the missing entries of every
        numeric column are replaced by ``value``.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame to fill; it is not modified.
        value : scalar
            Replacement for missing entries in numeric columns.

        Returns
        -------
        pandas.DataFrame
            A filled copy. Non-numeric and boolean columns are left as is.
    '''
    filled = df.copy()
    for col in filled.columns:
        dtype = filled[col].dtype
        # Ask pandas whether the dtype is numeric instead of comparing it with
        # the strings "int"/"float": those only match the platform-default
        # widths, so e.g. float32 or nullable Int64 columns were skipped.
        is_numeric = pd.api.types.is_numeric_dtype(dtype)
        is_bool = pd.api.types.is_bool_dtype(dtype)
        if is_numeric and not is_bool:
            filled[col] = filled[col].fillna(value)
    return filled

In [4]:
def curb_appeal(df):
    '''
        Return the curb appeal score of every row of ``df``.

        Each row earns one point per criterion it satisfies (see the list in
        the body). A 'CarPort' garage satisfies two criteria and therefore
        earns two points.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame holding the columns referenced in the body. Missing
            categorical values are expected to be the string 'NA'.

        Returns
        -------
        list of int
            One score per row, in row order, whatever the frame's index is.
    '''
    criteria = [
        df['Pool Area'] != 0,
        # A fence scores only when it is neither 'NA' nor 'MnWw'. The previous
        # "!= 'NA' or != 'MnWw'" test was true for every possible value.
        ~df['Fence'].isin(['NA', 'MnWw']),
        df['Garage Type'] == 'CarPort',
        df['Garage Type'] != 'NA',
        df['Paved Drive'] == 'Y',
        df['Fireplaces'] != 0,
        # the column mean is computed once here instead of once per row
        df['TotRms AbvGrd'] > df['TotRms AbvGrd'].mean(),
        df['Central Air'] == 'Y',
        df['Utilities'] == 'AllPub',
        (df['Condition 1'] == 'Norm') | (df['Condition 2'] == 'Norm'),
        df['Functional'] == 'Typ',
        df['Alley'] == 'Pave',
        (df['BsmtFin Type 1'] == 'GLQ') | (df['BsmtFin Type 2'] == 'GLQ'),
        df['Misc Feature'] == 'TenC',
        df['Roof Style'] == 'Hip',
    ]
    # Column-wise comparisons do not look rows up by label, so the frame no
    # longer needs a default 0..n-1 index.
    scores = sum(flag.astype(int) for flag in criteria)
    return scores.tolist()

In [5]:
def clean_dum_ames(df):
    '''
        Clean a raw Ames frame, engineer features and return the
        dummy-encoded categorical columns.

        The main purpose is to not have to format the test data by hand every
        time: the same function is applied to both data frames. The input
        frame is not modified.

        Parameters
        ----------
        df : pandas.DataFrame
            Raw frame holding the columns referenced in the body.

        Returns
        -------
        pandas.DataFrame
            ``pd.get_dummies(..., drop_first=True)`` of the object/category
            columns only. The numeric columns, including the engineered
            numeric features, are not part of the returned frame.
    '''
    # work on a copy so the caller's frame is never modified
    df = df.copy()

    # Lot Frontage issues: remember which rows had no frontage recorded
    # (missing or 0) before imputing, because flag_lot below is built from it.
    no_frontage = df['Lot Frontage'].isna() | (df['Lot Frontage'] == 0)
    # fillna returns a new Series; without the assignment the imputation
    # was silently discarded.
    df['Lot Frontage'] = df['Lot Frontage'].fillna(df['Lot Area']**.5)

    # remaining numeric gaps become 0, remaining gaps become the string 'NA'
    df = num_na(df, 0)
    df = df.fillna('NA')

    # NOT COVERED ABOVE: THIS GARAGE YEAR HAD A BAD ENTRY, SO IT IS FIXED HERE
    df['Garage Yr Blt'] = df['Garage Yr Blt'].mask(df['Garage Yr Blt'] > 2020, 2007)

    # curb appeal
    df['curb_appeal'] = curb_appeal(df)

    # Quality scaling: ordinal codes for the NA..Ex quality labels
    cast_dict = {
        'Ex' : 6,
        'Gd' : 5,
        'TA' : 4,
        'Fa' : 3,
        'Po' : 2,
        'NA' : 1
    }

    # we take them all except Bsmt Qual, which is measuring height
    quals = [
        'Exter Qual', 'Exter Cond', 'Bsmt Cond', 'Heating QC',
        'Kitchen Qual', 'Fireplace Qu', 'Garage Qual', 'Garage Cond',
        'Pool QC'
    ]
    for qual in quals:
        # labels that are not keys of cast_dict become NaN
        df[qual] = df[qual].map(cast_dict)

    # NOT MENTIONED ABOVE: ONE SCORE PER PROPERTY, THE PRODUCT OF ALL ITS
    # NA-EX QUALITY CODES. skipna=False keeps a NaN code as a NaN product,
    # matching an element-by-element multiplication.
    df['overall_quals'] = df[quals].prod(axis=1, skipna=False)

    # mapping the MS SubClass codes to a handful of dwelling groups
    df['MS SubClass'] = df['MS SubClass'].map({20:'One Story',30:'One Story',40:'One Story',45:'One Story',50:'One Story',
                        60:'Two Story',70:'Two Story',75:'Two Story',
                        80:'Split',85:'Split',
                        90:'Duplex',
                        120:'PUD',150:'PUD',160:'PUD',180:'PUD',
                        190:'Conversion'})

    df['total_SF'] = df['1st Flr SF'] + df['2nd Flr SF']
    # 'Yes' for exactly the rows that used to end up with a frontage of 0
    df['flag_lot'] = no_frontage.map({True: 'Yes', False: 'No'})
    df['overalls'] = df['Overall Qual'] * df['Overall Cond']
    # age of house at purchase
    df['house_age'] = df['Yr Sold'] - df['Year Built']
    # dummying
    cols_to_dummy = df.select_dtypes(include=['object','category']).columns

    # Only the categorical dummies are returned. To keep the numeric columns
    # as well, use pd.get_dummies(df, drop_first=True, columns=cols_to_dummy).
    return pd.get_dummies(df[cols_to_dummy],drop_first=True)

# Run the cleaner on the raw training frame; as the last expression of the
# cell, the returned dummy-encoded frame is what gets displayed.
clean_dum_ames(ames_raw)

Unnamed: 0,MS SubClass_Duplex,MS SubClass_One Story,MS SubClass_PUD,MS SubClass_Split,MS SubClass_Two Story,MS Zoning_C (all),MS Zoning_FV,MS Zoning_I (all),MS Zoning_RH,MS Zoning_RL,...,Misc Feature_TenC,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_WD,flag_lot_Yes
0,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,1
1,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2047,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,1
2048,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2049,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0


In [6]:
# What predicts overall quality? Can lasso find out?

X = clean_dum_ames(ames_raw)
y = ames['Overall Qual']
# X is built from the raw CSV and y from the frames/ training CSV, so fail
# fast if they do not have the same number of rows.
# NOTE(review): equal length does not prove the rows are in the same order;
# confirm against how the training frame was produced.
assert len(X) == len(y), 'X and y must have the same number of rows'

# Deliberately over-parameterise the model and let the L1 penalty prune it.
# A degree-3 expansion of a wide dummy matrix grows cubically with the number
# of columns, and the recorded run of this cell ended in a KeyboardInterrupt.
# Pairwise interactions keep it tractable; powers of a 0/1 dummy would only
# repeat the dummy itself.
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_overfit = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_overfit,y,
                                                   random_state=13,
                                                   test_size=.2)
# fit the scaler on the training split only, then reuse it on the test split
sc = StandardScaler()
Z_train = sc.fit_transform(X_train)
Z_test = sc.transform(X_test)

# A sweep over np.logspace(-3, 0, 100) could replace the fixed alpha later.
# `Lasso` is imported by name from sklearn.linear_model; `linear_model` itself
# is never imported, so `linear_model.Lasso` would raise a NameError.
lasso = Lasso(alpha=.1)
lasso.fit(Z_train, y_train)

KeyboardInterrupt: 