In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [67]:
df = pd.read_csv('../datasets/train.csv')

tf = pd.read_csv('../datasets/test.csv')

pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [64]:
df['Age'] = df['Yr Sold'] - df['Year Built']
df['Age'] = df['Age'].map(lambda i: 0 if i < 0 else i)

df['bad_cond'] = df['Condition 1'].map(lambda i: 0 if i == 'Norm' or i == 'PosA' or i == 'PosB' else 1)
df['good_cond'] = df['Condition 1'].map(lambda i: 1 if i == 'PosA' or i == 'PosB' else 0)

df['last_mod'] = df['Yr Sold'] - df['Year Remod/Add']
df['tot_bath'] = df['Full Bath'] + 0.5*df['Half Bath'] + df['Bsmt Full Bath'] + 0.5*df['Bsmt Half Bath']

In [61]:
df.shape

(2051, 87)

In [8]:
features = ['MS Zoning', 'Street', 'Lot Frontage', 'Lot Area','bad_cond','good_cond','Overall Qual','Overall Cond', 'Age', 'last_mod', 'tot_bath', 'Mas Vnr Area', 'Garage Area', 'Garage Cars', 'Exterior 1st', 'Bldg Type', 'House Style', 'Lot Config', 'Exter Qual', 'Exter Cond', 'Bsmt Qual', 'Bsmt Cond', 'Foundation', 'Central Air', 'Heating QC', 'Kitchen Qual', 'Lot Shape', 'Functional', 'Land Slope', 'Roof Matl', 'Paved Drive', 'Wood Deck SF', 'Open Porch SF', 'Pool Area', 'Pool QC', 'Fence', 'Misc Val', 'Sale Type','1st Flr SF', 'total_SF']

In [9]:
X = df[features]
y = df['SalePrice']

X['Lot Frontage'] = X['Lot Frontage'].map(lambda i: 1 if i == 0 else i)
X = pd.get_dummies(data = X, columns = ['MS Zoning', 'Street', 'Exterior 1st','Bldg Type', 'House Style', 'Lot Config', 'Foundation', 'Central Air', 'Roof Matl', 'Sale Type'], drop_first = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Lot Frontage'] = X['Lot Frontage'].map(lambda i: 1 if i == 0 else i)


In [69]:
X['lot_fr*Lot_ar'] = X['Lot Frontage'] * X['Lot Area']
X['overall**'] = X['Overall Qual'] * X['Overall Cond']
X['gar_area*cars'] = X['Garage Area'] * X['Garage Cars']
X['age*mod'] = X['Age'] * X['last_mod']
X['exter_qual*cond'] = X['Exter Qual']* X['Exter Cond']
X['bsmt_qual*cond'] = X['Bsmt Qual']* X['Bsmt Cond']
X['tot*1st_sf'] = X['total_SF']*X['1st Flr SF']
X['deck*porch'] = X['Wood Deck SF'] * X['Open Porch SF']

In [70]:
lr = LinearRegression()

In [71]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 8)

In [72]:
lr.fit(X_train, y_train)

LinearRegression()

In [73]:
lr.score(X_train, y_train)

0.9016680888376937

In [74]:
lr.score(X_test, y_test)

0.5803118353111743

In [75]:
cross_val_score(lr, X_train, y_train).mean()

0.5548733706059051

In [76]:
resids = y_test - lr.predict(X_test)

In [77]:
np.sqrt((resids**2).sum()/len(resids))

53425.61171399558

In [65]:
tf['Age'] = tf['Yr Sold'] - tf['Year Built']
tf['Age'] = tf['Age'].map(lambda i: 0 if i < 0 else i)
tf['bad_cond'] = tf['Condition 1'].map(lambda i: 0 if i == 'Norm' or i == 'PosA' or i == 'PosB' else 1)
tf['good_cond'] = tf['Condition 1'].map(lambda i: 1 if i == 'PosA' or i == 'PosB' else 0)
tf['last_mod'] = tf['Yr Sold'] - tf['Year Remod/Add']
tf['tot_bath'] = tf['Full Bath'] + 0.5*tf['Half Bath'] + tf['Bsmt Full Bath'] + 0.5*tf['Bsmt Half Bath']

tf_X = tf[features]

tf_X['Lot Frontage'] = tf_X['Lot Frontage'].map(lambda i: 1 if i == 0 else i)
tf_X = pd.get_dummies(data = tf_X, columns = ['MS Zoning', 'Street', 'Exterior 1st','Bldg Type', 'House Style', 'Lot Config', 'Foundation', 'Central Air', 'Roof Matl', 'Sale Type'], drop_first = True)

tf_X['lot_fr*Lot_ar'] = tf_X['Lot Frontage'] * tf_X['Lot Area']
tf_X['overall**'] = tf_X['Overall Qual'] * tf_X['Overall Cond']
tf_X['gar_area*cars'] = tf_X['Garage Area'] * tf_X['Garage Cars']
tf_X['age*mod'] = tf_X['Age'] * tf_X['last_mod']
tf_X['exter_qual*cond'] = tf_X['Exter Qual']* tf_X['Exter Cond']
tf_X['bsmt_qual*cond'] = tf_X['Bsmt Qual']* tf_X['Bsmt Cond']
tf_X['tot*1st_sf'] = tf_X['total_SF']*tf_X['1st Flr SF']
tf_X['deck*porch'] = tf_X['Wood Deck SF'] * tf_X['Open Porch SF']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tf_X['Lot Frontage'] = tf_X['Lot Frontage'].map(lambda i: 1 if i == 0 else i)


In [131]:
for i in tf_X.columns:
    if i not in X.columns:
        X[i] = 0
        
for i in X.columns:
    if i not in tf_X.columns:
        tf_X[i] = 0
        
tf_X = tf_X[X.columns]

In [141]:
model = LinearRegression()

model.fit(X, y)

predictions = model.predict(tf_X)

tf['SalePrice'] = predictions

tf[['Id', 'SalePrice']].to_csv('../datasets/take_.csv', index = False)