In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, Lasso, Ridge

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import make_pipeline, Pipeline
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# Load the three prepared CSVs: the training split, the held-out split, and
# the Kaggle scoring set (bound to `test`).
train_df, test_df, test = map(
    pd.read_csv,
    (
        "datasets/final_train_df.csv",
        "datasets/final_test_df.csv",
        "datasets/final_kaggle_data.csv",
    ),
)

In [3]:
# Separate the target from the predictors; the identifier columns ('id',
# 'pid') and the target itself are excluded from the feature matrix.
y_train = train_df['saleprice']
X_train = train_df.drop(['id', 'pid', 'saleprice'], axis=1)

In [4]:
# Keep the Kaggle row ids as a one-column DataFrame for the submission step.
test_ids = test.loc[:, ['id']]

### Dropping High-VIF Features from the DataFrame

In [5]:
# Iterative loop that checks multicollinearity and removes, one feature at a
# time, the variable with the highest finite VIF until all pass the threshold.

def vif_dropper(df, threshold=5):
    """Drop high-VIF columns one at a time and return the reduced frame.

    On every pass the variance inflation factor of each remaining column is
    computed with ``variance_inflation_factor``. Infinite VIFs are excluded
    from consideration (as are NaN values, which cannot be ranked); among the
    rest, the column with the largest VIF is dropped. The loop stops when the
    largest finite VIF is below ``threshold``, when no finite VIF is left, or
    when fewer than two columns remain.

    The largest finite VIF of each pass and the name of each dropped feature
    are printed as progress output.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric predictor frame. It is not modified; work happens on a copy.
    threshold : float, default 5
        Columns are dropped while the largest finite VIF is >= this value.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` without the dropped columns.
    """
    # Work on a copy so the caller's frame is never mutated as a side effect.
    df_new = df.copy()

    # With fewer than two predictors there is nothing left to be collinear with.
    while df_new.shape[1] > 1:
        vif = pd.Series(
            [variance_inflation_factor(df_new.values, i)
             for i in range(df_new.shape[1])],
            index=df_new.columns,
        )

        # Ignore infinite VIFs when picking the column to drop; NaN values are
        # removed too because they cannot be compared against the threshold.
        finite_vif = vif[vif != float('inf')].dropna()
        if finite_vif.empty:
            break

        m = float(finite_vif.max())
        print(m)
        if m < threshold:
            break

        # idxmax returns the first label holding the maximum, so ties resolve
        # to the left-most column.
        feature = finite_vif.idxmax()
        df_new = df_new.drop(columns=feature)
        print(feature)

    return df_new

In [None]:
# Prune multicollinear predictors from the training features and bind the
# reduced frame to its own name.
X_train_vif = vif_dropper(df=X_train)

  vif = 1. / (1. - r_squared_i)


107843.24480378188
bsmtfin_sf_1


## Modeling

In [None]:
# Even out the columns, step 1: remove from the Kaggle set every column that
# is absent from the VIF-reduced training predictors (VIF-dropped features,
# dummy columns present in only one of the sets, and identifier columns such
# as 'id' that the training predictors exclude).
dropset = set(test.columns).difference(X_train_vif.columns)
test = test.drop(columns=dropset)

In [None]:
# Even out the columns, step 2 (reverse direction): remove from the
# VIF-reduced training predictors every column the Kaggle set does not have,
# e.g. dummy columns whose category never occurs in the Kaggle data.
dropset = set(X_train_vif.columns).difference(test.columns)
X_train_vif = X_train_vif.drop(columns=dropset)

In [None]:
# Tune the Lasso penalty with 5-fold cross-validation over 20
# log-spaced alphas between 10**0 and 10**2, using all available cores.
lasso = Lasso(max_iter=5000)
params = {'alpha': np.logspace(0, 2, 20)}
grid = GridSearchCV(
    estimator=lasso,
    param_grid=params,
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train_vif, y_train)

In [None]:
# Predict on the Kaggle set. The column-evening cells only guarantee that both
# frames hold the same *set* of columns, so select them explicitly in the order
# of the frame the grid search was fit on; a missing column raises a KeyError
# here instead of producing misaligned predictions.
y_preds = grid.predict(test[X_train_vif.columns])

In [None]:
# Wrap the predictions in a single-column frame named 'saleprice'.
# assumes y_preds is the 1-D array returned by grid.predict — TODO confirm
preds_df = pd.DataFrame({'saleprice': y_preds})

In [None]:
# Preview the saved Kaggle ids; only the first rows are shown rather than
# rendering the whole frame.
test_ids.head()

In [None]:
# Put the Kaggle ids and the predictions side by side (aligned on the index).
preds_df = pd.concat([test_ids['id'], preds_df], axis='columns')

In [None]:
# Rename to the submission headers and keep the first 260 rows.
# The prediction column is called 'saleprice' (lower case), so that is the key
# that has to be mapped to 'sample_soln'; a "SalePrice" key matches nothing and
# rename() silently leaves the column untouched.
# NOTE(review): 260 is presumably the number of rows the submission expects — confirm.
preds_df = preds_df.rename(columns={'id': "Id", "saleprice": 'sample_soln'})[:260]
# Swap the original ids for the positional index: reset_index() exposes it as
# an 'index' column and the old 'Id' column is discarded.
preds_df = preds_df.reset_index().drop(columns='Id')

In [None]:
# Expose the positional 'index' column under the submission header "Id",
# then display the resulting frame.
preds_df = preds_df.rename({'index': "Id"}, axis='columns')
preds_df

In [None]:
# Write the model submission with a header row and without the frame's index.
preds_df.to_csv(
    'datasets/kaggle_submission.csv',
    index=False,
    header=True,
)

In [None]:
# Baseline predictions: the training-set mean sale price repeated once per
# training row. The mean is computed a single time instead of once per row.
# Note this rebinds `y_preds`, replacing the model predictions held under
# that name.
y_preds = pd.DataFrame(np.full(len(y_train), y_train.mean()))

In [None]:
# Turn the positional index into an explicit column, then apply the
# submission headers: 'index' -> "Id" and the unnamed value column 0 ->
# 'sample_soln'.
y_preds = y_preds.reset_index()
y_preds = y_preds.rename(columns={'index': "Id", 0: 'sample_soln'})

In [None]:
# Write the first 260 baseline rows as a submission file.
# NOTE(review): this is the same path the model submission is written to
# earlier in the notebook, so a top-to-bottom run leaves only the mean
# baseline on disk — confirm that is intended.
y_preds.head(260).to_csv(
    'datasets/kaggle_submission.csv',
    index=False,
    header=True,
)

In [None]:
# Display the 260 baseline rows that were written out.
y_preds.head(260)