## Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [None]:
# Set max columns and rows displayed
# pd.set_option('display.max_columns', 10000)
# pd.set_option('display.max_rows', 10000)

In [None]:
# Load the train/test CSVs with identical parsing options: pandas' default NA
# markers are switched off and only empty fields are treated as missing
csv_options = dict(keep_default_na=False, na_values=[''])
training = pd.read_csv("datasets/train.csv", **csv_options)
testing = pd.read_csv("datasets/test.csv", **csv_options)

In [None]:
# Some correlation but with age-related categories there may be some major collinearity here
sns.scatterplot(x=training['MS SubClass'], y=training['SalePrice'], alpha=0.1);

In [None]:
training['MS Zoning'].value_counts()

In [None]:
# I don't think this will be very useful
sns.scatterplot(x=training['Lot Config'], y=training['SalePrice'], alpha=0.1);

In [None]:
# Definite differences between neighborhoods, worth making dummies for
sns.scatterplot(x=training['Neighborhood'], y=training['SalePrice'], alpha=0.1);

In [None]:
# Strong correlation but it isn't very linear - this one may require feature engineering
# Also that statistical heaping in early decades is suspicious but probably not too damaging
sns.scatterplot(x=training['Year Built'], y=training['SalePrice'], alpha=0.2);

In [None]:
# Similar to above but I don't like the look of that floor at 1950, will drop that year if I use this feature
# It almost certainly actually represents '1950 OR BEFORE'
sns.scatterplot(x=training['Year Remod/Add'], y=training['SalePrice'], alpha=0.2);

In [None]:
# 5 seems really heavily overreported, maybe to the point where this feature would distort results?
sns.scatterplot(x=training['Overall Cond'], y=training['SalePrice'], alpha=0.2);

In [None]:
# Oooh, that's excellent
sns.scatterplot(x=training['Overall Qual'], y=training['SalePrice'], alpha=0.2);

In [None]:
# Nice strong linear relationship with square footage, but with a few definite outliers that may need to be pruned
sns.scatterplot(x=training['1st Flr SF'], y=training['SalePrice'], alpha=0.2);

In [None]:
# Surprised that it looks like new houses aren't much bigger overall, there's a trend upwards but not as strong as I expected
sns.scatterplot(x=training['Year Built'], y=training['1st Flr SF'], alpha=0.2);

In [None]:
# Even less trend here. That floor definitely has to go.
sns.scatterplot(x=training['Year Remod/Add'], y=training['1st Flr SF'], alpha=0.2);

In [None]:
# Still just a weak trend - may be survivorship bias. Those outliers are really throwing things off, though. 
sns.scatterplot(x=training['Year Built'], y=training['Gr Liv Area'], alpha=0.2);

In [None]:
# Getting rid of the outliers
training.drop(training[training['Gr Liv Area'] > 4000].index, inplace=True)

In [None]:
# Getting rid of that floor
training.drop(training[training['Year Remod/Add'] == 1950].index, inplace=True)

In [None]:
# MUCH better
sns.scatterplot(x=training['Year Remod/Add'], y=training['Gr Liv Area'], alpha=0.2);

In [None]:
# Fascinating - strong, fairly linear correlation but it seems like more recent builds have a steeper correlation than older ones
sns.scatterplot(x=training['Gr Liv Area'], y=training['SalePrice'], hue=training['Year Built'], palette='CMRmap_r');

In [None]:
# This is not diverse enough to be useful
training['Heating'].value_counts()

In [None]:
# Same for this one
training['Central Air'].value_counts()

In [None]:
# This could be useful?
sns.scatterplot(x=training['Functional'], y=training['SalePrice'], alpha=0.2);

In [None]:
# I actually think it isn't diverse enough or strong enough, I won't use it
training['Functional'].value_counts()

In [None]:
# Columns carried forward into modelling
subset = [
    'Id', 'Lot Area', 'Neighborhood',
    'Overall Qual', 'Overall Cond',
    'Year Built', 'Year Remod/Add',
    'Bsmt Unf SF', 'Total Bsmt SF', 'Gr Liv Area',
    'Bedroom AbvGr', 'Garage Type',
]

In [None]:
# Smaller dataframe with just my features
train_trimmed = training[subset].copy()

In [None]:
# Gotta add sale price back onto the training set
train_trimmed['SalePrice'] = training['SalePrice']

In [None]:
# trimming test too
test_trimmed = testing[subset].copy()

In [None]:
# Peek at the first three rows of the trimmed training frame
train_trimmed.head(3)

In [None]:
# Column dtypes of the trimmed training frame
train_trimmed.dtypes

In [None]:
# (rows, columns) of the trimmed training frame
train_trimmed.shape

In [None]:
# Summary statistics: nothing untoward here - some very skewed distributions,
# but that's to be expected
train_trimmed.describe()

In [None]:
# Frequency of each 'Bedroom AbvGr' value
train_trimmed['Bedroom AbvGr'].value_counts()

In [None]:
# Engineering a feature for finished basement square footage
train_trimmed['Bsmt Fin SF'] = train_trimmed['Total Bsmt SF'] - train_trimmed['Bsmt Unf SF']

In [None]:
# repeating in test
test_trimmed['Bsmt Fin SF'] = test_trimmed['Total Bsmt SF'] - test_trimmed['Bsmt Unf SF']

In [None]:
train_trimmed.head()

In [None]:
# 5 is so dominant and spans such a diverse range of prices that I think I won't use Cond
train_trimmed['Overall Cond'].value_counts()

In [None]:
# Dropping Cond from both frames
train_trimmed.drop(columns='Overall Cond', inplace=True)
test_trimmed.drop(columns='Overall Cond', inplace=True)

In [None]:
# Getting dummies for my categoricals
train_dummy = pd.get_dummies(data=train_trimmed, columns=['Neighborhood', 'Garage Type'], drop_first=True)
test_dummy = pd.get_dummies(data=test_trimmed, columns=['Neighborhood', 'Garage Type'], drop_first=True)

In [None]:
train_dummy.head()

In [None]:
# Let's see if there are any discrepancies between train and test besides sale price
set(train_dummy.columns.tolist()) - set(test_dummy.columns.tolist())

In [None]:
# Add those neighborhood dummy columns to test for the sake of consistency
test_dummy['Neighborhood_GrnHill'] = 0
test_dummy['Neighborhood_Landmrk'] = 0

In [None]:
# Nothing in test that isn't in train, thankfully
set(test_dummy.columns.tolist()) - set(train_dummy.columns.tolist())

In [None]:
# Setting up features and target for a quick initial regression.
# 'preds' only exists once the prediction cell further down has run; dropping it
# when present keeps a re-run of this cell from leaking predictions into X.
X = (
    train_dummy
    .drop(columns=['Id', 'Total Bsmt SF', 'SalePrice'])
    .drop(columns='preds', errors='ignore')
)
y = train_dummy['SalePrice']
# Select the test columns by name, in X's order. The dummy columns added to
# test_dummy after encoding sit at its end, so a plain drop() would leave X2 with
# a different column order from X and positional consumers would mix features up.
# Columns outside X (e.g. 'Id', 'Total Bsmt SF') are left out; any X column
# missing from test is filled with 0.
X2 = test_dummy.reindex(columns=X.columns, fill_value=0)

In [None]:
# Instantiate a linear regression object
lr = LinearRegression()

In [None]:
# This looks surprisingly good?
cross_val_score(lr, X, y).mean()

In [None]:
# Quick fit
lr.fit(X,y)

In [None]:
# Get predictions and add them to the frame as a new column
train_dummy['preds'] = lr.predict(X)

In [None]:
# Scatter residuals against target - I'm overestimating at lower prices and drastically underestimating at high prices
# Basically we have a curve here where there ought to be a line. Poly features may help. 
sns.scatterplot(x=y, y=(train_dummy['preds']-y));

In [None]:
# Okay, time to get serious

In [None]:
features = train_dummy.drop(columns=['Id', 'Total Bsmt SF', 'SalePrice', 'preds']).columns.tolist()

In [None]:
# Instantiate polynomial features
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)

In [None]:
# Poly transforming train and test sets
X_poly = poly.fit_transform(X)
X2_poly = poly.fit_transform(X2)

In [None]:
# Turn these poly versions back into dataframes
X_poly_df = pd.DataFrame(X_poly, columns=poly.get_feature_names(features))
X2_poly_df = pd.DataFrame(X2_poly, columns=poly.get_feature_names(features))

In [None]:
# The interactions between neighborhoods are pointless but shouldn't have any effect, so I think it would be more work to get rid of them
X_poly_df.head()

In [None]:
# Hold out 20% of the rows for validation, with a fixed seed for a repeatable split
split_parts = train_test_split(X_poly_df, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Standardise: learn the scaling from the training split only, then apply that
# same scaling to the training split, the validation split and the real test set
sc = StandardScaler()
sc.fit(X_train)
Z_train = sc.transform(X_train)
Z_test = sc.transform(X_test)
Z_test_2 = sc.transform(X2_poly_df)

In [None]:
# Instantiating and fitting a new linear regression on scaled poly data
lr2 = LinearRegression()
lr2.fit(Z_train, y_train)

In [None]:
# Hahahaha oh god what the hell is that test score
print(f"Training data score: {lr2.score(Z_train, y_train)}")
print(f"Testing data score: {lr2.score(Z_test, y_test)}")

In [None]:
# This looks great
sns.scatterplot(x=y_train, y=lr2.predict(Z_train));

In [None]:
# So I think we can say this is a liiiiiiiiittle overfit
sns.scatterplot(x=y_test, y=lr2.predict(Z_test));

In [None]:
len(X_train.columns), len(lr2.coef_)

In [None]:
coef_df = pd.DataFrame({
    'features': X_train.columns,
    'vals': lr2.coef_
})

In [None]:
349662.63875770895 % 1

In [None]:
lr2.predict(Z_test).tolist()[:5]

In [None]:
y_test

In [None]:
lr2.coef_