## Imports

In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [94]:
# Display configuration: let pandas print up to 10,000 rows before truncating.
# The matching column-count override is left disabled on purpose:
# pd.set_option('display.max_columns', 10000)
pd.options.display.max_rows = 10000

In [40]:
# Load the raw train and test CSVs with identical parsing rules: pandas'
# default NA markers are switched off and only the empty string is read as
# missing, so codes such as the literal text "NA" stay ordinary strings.
training, testing = (
    pd.read_csv(csv_path, keep_default_na=False, na_values=[''])
    for csv_path in ("datasets/train.csv", "datasets/test.csv")
)

### Skipping past EDA

### Data cleaning: the dataset is modified from here on

In [41]:
# Outlier removal: discard (in place) every row whose 'Gr Liv Area' exceeds 4000.
is_oversized = training['Gr Liv Area'] > 4000
training.drop(index=training.loc[is_oversized].index, inplace=True)

# Discard rows whose 'Year Remod/Add' is exactly 1950 ("that floor" --
# presumably a lower bound on recorded remodel years; TODO confirm).
is_floor_year = training['Year Remod/Add'] == 1950
training.drop(index=training.loc[is_floor_year].index, inplace=True)

In [42]:
# Columns carried forward into modelling, in the order they appear in the
# trimmed frames.
subset = [
    'Id',
    'Lot Area', 'Neighborhood',
    'Overall Qual', 'Overall Cond',
    'Year Built', 'Year Remod/Add',
    'Bsmt Unf SF', 'Total Bsmt SF',
    'Gr Liv Area', 'Bedroom AbvGr',
    'Garage Type',
]

# Independent copies restricted to those columns. Only the training frame
# carries the target, appended as the last column.
train_trimmed = training[subset + ['SalePrice']].copy()
test_trimmed = testing.loc[:, subset].copy()

In [43]:
train_trimmed.head(3)

Unnamed: 0,Id,Lot Area,Neighborhood,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Bsmt Unf SF,Total Bsmt SF,Gr Liv Area,Bedroom AbvGr,Garage Type,SalePrice
0,109,13517,Sawyer,6,8,1976,2005,192.0,725.0,1479,3,Attchd,130500
1,544,11492,SawyerW,7,5,1996,1997,276.0,913.0,2122,4,Attchd,220000
2,153,7922,NAmes,5,7,1953,2007,326.0,1057.0,1057,3,Detchd,109000


In [44]:
train_trimmed.shape

(1787, 13)

In [45]:
# Feature engineering: finished basement area is the total basement area minus
# the unfinished part. The same derivation is applied to train and test.
for frame in (train_trimmed, test_trimmed):
    frame['Bsmt Fin SF'] = frame['Total Bsmt SF'] - frame['Bsmt Unf SF']

In [46]:
train_trimmed.head()

Unnamed: 0,Id,Lot Area,Neighborhood,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Bsmt Unf SF,Total Bsmt SF,Gr Liv Area,Bedroom AbvGr,Garage Type,SalePrice,Bsmt Fin SF
0,109,13517,Sawyer,6,8,1976,2005,192.0,725.0,1479,3,Attchd,130500,533.0
1,544,11492,SawyerW,7,5,1996,1997,276.0,913.0,2122,4,Attchd,220000,637.0
2,153,7922,NAmes,5,7,1953,2007,326.0,1057.0,1057,3,Detchd,109000,731.0
3,318,9802,Timber,5,5,2006,2007,384.0,384.0,1444,3,BuiltIn,174000,0.0
4,255,14235,SawyerW,6,8,1900,1993,676.0,676.0,1445,3,Detchd,138500,0.0


In [47]:
# Remove 'Overall Cond' from the train and test frames alike (in place).
for frame in (train_trimmed, test_trimmed):
    frame.drop(columns=['Overall Cond'], inplace=True)

In [48]:
# One-hot encode the two categorical columns; drop_first=True omits the first
# level of each so the resulting dummies are not perfectly collinear.
# NOTE(review): train and test are encoded independently, so their dummy
# columns can differ whenever a level is missing from one of them -- the
# following cells compare the two column sets.
categorical_cols = ['Neighborhood', 'Garage Type']
train_dummy = pd.get_dummies(train_trimmed, columns=categorical_cols, drop_first=True)
test_dummy = pd.get_dummies(test_trimmed, columns=categorical_cols, drop_first=True)

In [49]:
# Columns present in train but absent from test ('SalePrice' is expected here).
set(train_dummy.columns).difference(test_dummy.columns)

{'Neighborhood_GrnHill', 'Neighborhood_Landmrk', 'SalePrice'}

In [50]:
# Align the test frame with train. Dummy levels that occur only in train
# (here Neighborhood_GrnHill and Neighborhood_Landmrk) are added to test as
# all-zero columns, and the columns are then put in train's order.
# Ordering matters: the later steps (polynomial expansion, scaling, the
# regression itself) pair columns by position, so merely appending the new
# columns at the end would shift every feature after the first missing level.
train_feature_cols = [col for col in train_dummy.columns if col != 'SalePrice']
for col in train_feature_cols:
    if col not in test_dummy.columns:
        test_dummy[col] = 0

# Columns that exist only in test are kept (after the shared ones) so the
# reverse check in the next cell can still report them.
test_only_cols = [col for col in test_dummy.columns if col not in train_feature_cols]
test_dummy = test_dummy[train_feature_cols + test_only_cols]

In [51]:
# Reverse check: columns present in test but absent from train (expected: none).
set(test_dummy.columns).difference(train_dummy.columns)

set()

## initial regression

In [52]:
# Target and design matrices for the regression. 'Id' and 'Total Bsmt SF' are
# excluded from the inputs of both matrices; the training matrix additionally
# drops the target itself.
non_feature_cols = ['Id', 'Total Bsmt SF']
y = train_dummy['SalePrice']
X = train_dummy.drop(columns=non_feature_cols + ['SalePrice'])
X_kaggle = test_dummy.drop(columns=non_feature_cols)

In [60]:
# Get predictions and add them to the frame as a new column
# (disabled: no model named `lr` is fitted in the cells visible in this notebook)
# train_dummy['preds'] = lr.predict(X)

## More modeling

In [56]:
# Names of the model inputs: every train_dummy column except the identifier,
# the raw basement total and the target, in their original order.
excluded_cols = ['Id', 'Total Bsmt SF', 'SalePrice']
features = list(train_dummy.drop(columns=excluded_cols).columns)

In [57]:
# Degree-2 polynomial expansion: keeps the original columns and adds their
# squares and pairwise products; the constant bias column is not generated.
poly = PolynomialFeatures(
    degree=2,
    include_bias=False,
    interaction_only=False,
)

In [58]:
# Fit the expansion on the training matrix only, then reuse that fitted
# transformer for the Kaggle matrix (transform, not fit_transform, so nothing
# is re-learned from held-out data).
X_poly = poly.fit_transform(X)

# Select the Kaggle columns in X's exact order first: the expansion works by
# column position, and a column missing from X_kaggle should fail loudly here
# (KeyError) instead of silently shifting features.
X_kaggle_poly = poly.transform(X_kaggle[X.columns])

In [59]:
# Turn the polynomial arrays back into labelled DataFrames.
# PolynomialFeatures.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out(). The legacy method is
# used when it still exists (unchanged behaviour on older installs); otherwise
# the replacement is called so the cell keeps working on current releases.
if hasattr(poly, 'get_feature_names'):
    poly_feature_names = poly.get_feature_names(features)
else:
    poly_feature_names = poly.get_feature_names_out(features)

X_poly_df = pd.DataFrame(X_poly, columns=poly_feature_names)
X_kaggle_poly_df = pd.DataFrame(X_kaggle_poly, columns=poly_feature_names)

In [62]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split reproducible across runs.
split_parts = train_test_split(X_poly_df, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [63]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied unchanged to the training, validation and Kaggle matrices.
sc = StandardScaler()
sc.fit(X_train)
Z_train, Z_test, Z_test_2 = (
    sc.transform(part) for part in (X_train, X_test, X_kaggle_poly_df)
)

In [64]:
# Ordinary least squares on the standardised polynomial features.
# fit() returns the estimator itself, so it can be chained onto the constructor;
# the bare name on the last line keeps the estimator repr as the cell output.
lr2 = LinearRegression().fit(Z_train, y_train)
lr2

LinearRegression()

In [65]:
# R^2 on the training split versus the held-out split.
# NOTE(review): the enormously negative held-out score recorded below suggests
# a numerically unstable fit. A likely cause is that the degree-2 expansion of
# the one-hot columns yields many redundant or constant columns, which plain
# least squares cannot handle well -- consider a regularised model or expanding
# only the numeric columns. TODO confirm.
train_r2 = lr2.score(Z_train, y_train)
holdout_r2 = lr2.score(Z_test, y_test)
print(f"Training data score: {train_r2}")
print(f"Testing data score: {holdout_r2}")

Training data score: 0.9530452058649201
Testing data score: -2.0100572149229475e+24


In [66]:
# One row per polynomial feature, paired with its fitted coefficient.
coef_df = pd.DataFrame(dict(features=X_train.columns, vals=lr2.coef_))

In [86]:
# Fractional part of every coefficient. Python's `% 1` is used, so the result
# is non-negative even when the coefficient itself is negative.
coef_df['decimals'] = coef_df['vals'].map(lambda coef: coef % 1)

# Keep only the features whose coefficient is non-zero, then renumber the rows.
coef_df = coef_df.loc[coef_df['vals'].ne(0)].reset_index(drop=True)

In [96]:
# @ben look here
# Coefficients with a non-zero fractional part, smallest fraction first.
has_fraction = coef_df['decimals'].ne(0)
coef_df.loc[has_fraction].sort_values('decimals')

Unnamed: 0,features,vals,decimals
396,Neighborhood_BrDale Neighborhood_Veenker,0.001953125,0.001953
392,Neighborhood_BrDale Neighborhood_SawyerW,0.004882812,0.004883
395,Neighborhood_BrDale Neighborhood_Timber,0.0078125,0.007812
48,Lot Area Bsmt Fin SF,1893.016,0.015625
390,Neighborhood_BrDale Neighborhood_SWISU,0.0234375,0.023438
394,Neighborhood_BrDale Neighborhood_StoneBr,0.0390625,0.039062
329,Bsmt Fin SF Neighborhood_Sawyer,35182.05,0.046875
1,Overall Qual,52125110000000.0,0.0625
350,Neighborhood_Blueste Neighborhood_GrnHill,-46395010000000.0,0.0625
354,Neighborhood_Blueste Neighborhood_Mitchel,-15335940000000.0,0.0625


In [92]:
# Scratch check: 1 minus the fractional part (.63875771) that every displayed
# prediction shares.
1 - 0.63875771

0.36124228999999997

In [82]:
# First five predictions of the fitted model on the held-out split.
lr2.predict(Z_test)[:5]

array([349662.63875771, 287582.63875771, 194334.63875771, 231486.63875771,
       220062.63875771])

In [67]:
# Scratch check: fractional part of the first displayed prediction
# (divmod returns (quotient, remainder); the remainder equals `value % 1`).
divmod(349662.63875770895, 1)[1]

0.6387577089481056

In [None]:
# First five held-out predictions as plain Python floats (full precision,
# without NumPy's array formatting).
lr2.predict(Z_test)[:5].tolist()

In [None]:
# Actual sale prices for the first five held-out rows, to compare against the
# first five predictions shown in the neighbouring cells. Only a preview is
# displayed: with display.max_rows raised to 10000, a bare `y_test` would
# print the entire Series.
y_test.head()

In [None]:
# Raw coefficient array of the fitted regression, one entry per column of Z_train.
lr2.coef_