### About the data

The data for this project has been obtained from Kaggle. The dataset contains a collection of 1460 individual properties with 81 attributes.

### Research question

Using the 81 attributes and a rather limited number of samples, can we build a sufficiently accurate model for estimating property values in Ames, Iowa?

### Source

House Prices: Advanced Regression Techniques
https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data

### Packages

In [None]:
# Necessary imports
import os
import time
import timeit
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample
%matplotlib inline

# Modelling packages
from sklearn import ensemble, linear_model
from sklearn.feature_selection import chi2, f_classif, SelectKBest 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import adjusted_rand_score, classification_report, confusion_matrix, silhouette_score
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import normalize

# Plotly packages
import cufflinks as cf
import ipywidgets as widgets
import plotly as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly import tools
from scipy import special
py.offline.init_notebook_mode(connected=True)

  from numpy.core.umath_tests import inner1d


In [None]:
# Load the training and test splits from the working directory
housing_train = pd.read_csv("train.csv")
housing_test = pd.read_csv("test.csv")

# 'Id' only identifies a row and carries no predictive signal, so remove it
housing_train = housing_train.drop(columns='Id')

# Show the first rows as a sanity check
housing_train.head()

In [None]:
# Preview the size of the dataframe as a (rows, columns) tuple
housing_train.shape

In [None]:
# View all 80 data columns (the 81 original attributes minus the dropped 'Id')
housing_train.columns

In [None]:
# Count the missing entries per column, then list the 20 columns with the most gaps
missing_per_column = housing_train.isna().sum()
missing_per_column.sort_values(ascending=False).head(20)

Quite a few features seem to be missing values for the majority of data points, such as `PoolQC`, `MiscFeature`, `Alley`, `Fence`, `FireplaceQu`, and `LotFrontage`. For the columns that are missing an excessive number of values, it seems safe to impute the value 0 to signify that these traits are not present.

In [None]:
# Impute missing values in these categories with 0.
# Selecting a list of columns returns a new dataframe, so calling
# `fillna(..., inplace=True)` on that selection would fill a temporary copy and
# leave `housing_train` untouched. Assign the filled columns back instead.
sparse_columns = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'LotFrontage']
housing_train[sparse_columns] = housing_train[sparse_columns].fillna(0)

### Exploratory Data Analysis

In [None]:
# Summary statistics (count, mean, spread, quartiles, extremes) of the sale prices
housing_train['SalePrice'].describe()

In [None]:
# Visualize the distribution of housing prices
# NOTE(review): `distplot` is deprecated since seaborn 0.11 -- consider
# `histplot(..., kde=True)` once the environment's seaborn version is confirmed.
sns.distplot(housing_train['SalePrice'])

In [None]:
# View descriptive statistics for all numerical columns
# (by default `describe()` on a mixed-type dataframe summarizes only the numeric ones)
housing_train.describe()

In [None]:
# Count the distinct values in every column and show the 20 columns with the most
distinct_per_column = housing_train.nunique()
distinct_per_column.sort_values(ascending=False).head(20)

In [None]:
# Creates a correlation matrix among the numeric variables (including SalePrice)
plt.rcParams['figure.figsize'] = [10, 10]

# Pearson correlation is only defined for numeric data. Select the numeric
# columns explicitly: recent pandas versions raise on string columns instead of
# silently dropping them, so a bare `housing_train.corr()` is not portable.
correlation_martix = housing_train.select_dtypes(include='number').corr()
sns.heatmap(correlation_martix, vmax=1, square=True)
plt.show()

In [None]:
# Replace every remaining NaN with 0 so the whole frame is free of missing values
housing_train = housing_train.fillna(0)

# Show the first rows of the filled frame
housing_train.head()

In [None]:
# Keep only the integer and float columns for now; everything else is set aside
numerics = [
    'int16', 'int32', 'int64',
    'float16', 'float32', 'float64',
]
numeric_housing_train = housing_train.select_dtypes(include=numerics)

In [None]:
# Target: the sale price. Predictors: every numeric column except the target.
y = housing_train['SalePrice']
X = numeric_housing_train.drop(columns='SalePrice')

In [None]:
# Divide the dataset into training (70%) and testing (30%) datasets.
# A fixed random_state makes the shuffle -- and therefore every score computed
# from this split -- reproducible across notebook runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Plain lists used to track each model's runtime and scores
models = ['Logistic regression', 'Random forest']
runtime, train_score, test_score = [], [], []

In [None]:
# def run_model(model):
    
#     # Train the model
#     start_time = timeit.default_timer()
#     train_set = cross_val_score(model, X, y, cv=5, n_jobs=-1)
#     elapsed_time = timeit.default_timer() - start_time   
    
#     # Append the scores and runtime to our dataframe
#     train_score.append(train_set.mean())
#     runtime.append(elapsed_time)
    
#     # Fit the model to the data
#     model.fit(X, y)
    
#     # Store the predicted values in a dataframe
#     y_pred = model.predict(X)
    
#     # Print scores and runtime
#     print(str(model), '\n\nTrain score: {:.5f}(+/- {:.2f})\n'.format(train_set.mean(), train_set.std()*2))
#     print('Runtime:', elapsed_time, 'seconds\n')
    
#     # Generate and print the confusion matrix
#     print('Confusion matrix:\n\n', confusion_matrix(y, y_pred))

In [None]:
# Create a linear regression model
lr = linear_model.LinearRegression()

# Fit on the training split only. Fitting on the full (X, y) would let the model
# see the held-out rows, so the test score below would be optimistically biased.
lr.fit(X_train, y_train)

# Predict for every row so the predictions line up with the full `y`
y_pred = lr.predict(X)

# LinearRegression.score returns R^2 (coefficient of determination), here on unseen data
print('Score:', lr.score(X_test, y_test))

In [None]:
# Comparing predicted results to actual results
comparison = pd.DataFrame({'True Values': y, 'Predicted Values': y_pred})

# Keep the Axes returned by seaborn in `ax`; chaining `.set_title(...)` onto the
# call would bind `ax` to the title's Text object instead.
ax = sns.scatterplot(x='True Values', y='Predicted Values', data=comparison)
ax.set_title('Predicted vs Actual Results')
plt.show()

In [None]:
# Cross validate our model.
# Store the fold scores under their own name: rebinding `cross_val_score` would
# shadow the sklearn function, so re-running this cell (or any later call to
# `cross_val_score`) would fail with "'numpy.ndarray' object is not callable".
cv_scores = cross_val_score(lr, X, y, cv=5, n_jobs=-1)

# Print the mean score with a +/- two-standard-deviation spread across folds
print('Cross validation score: {:.5f}(+/- {:.2f})\n'.format(cv_scores.mean(), cv_scores.std()*2))