# Assignment 1: Linear regression with Scikit-Learn
For this first assignment, you are asked to fit a linear regression model on data from sold properties (`train.csv`) in order to predict the house prices for the properties in `test.csv`. We suggest that you explore the data set before jumping into model fitting. You can find more info about the data in the `data_description.txt` file.

In [None]:
# import libraries
import numpy as np
import pandas as pd

In [None]:
# load the training records from train.csv into a DataFrame
train_data = pd.read_csv(filepath_or_buffer="train.csv")

In [None]:
# explore train_data: column names, non-null counts, dtypes and memory usage.
# DataFrame.info() writes the summary itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
train_data.info()

In [None]:
# plot correlation matrix
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Pairwise correlations between the numeric columns only. Non-numeric columns
# are filtered out explicitly because DataFrame.corr() no longer drops them
# silently in pandas >= 2.0 and raises instead.
corrmat = train_data.select_dtypes(include="number").corr()
f, ax = plt.subplots(figsize=(12, 9))
# Draw on the axes created above; vmax=.8 caps the colour scale at 0.8.
# The trailing semicolon suppresses the notebook's repr of the returned Axes.
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax);

In [None]:
# Annotated heatmap of the k columns with the largest correlation to SalePrice
# (SalePrice itself is among them, since it correlates perfectly with itself).
k = 10  # number of variables shown in the heatmap
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
# np.corrcoef treats each row as one variable, hence the transpose
cm = np.corrcoef(train_data[cols].to_numpy().T)
sns.set(font_scale=1.25)
tick_labels = cols.values
hm = sns.heatmap(
    cm,
    annot=True,
    fmt='.2f',
    annot_kws={'size': 10},
    cbar=True,
    square=True,
    xticklabels=tick_labels,
    yticklabels=tick_labels,
)
plt.show()

In [None]:
# Features with the largest absolute correlation to SalePrice.
# Only numeric columns are considered: corrwith() on text columns raises in
# pandas >= 2.0 instead of skipping them.
numeric_data = train_data.select_dtypes(include="number")
top_corr = (
    numeric_data.corrwith(numeric_data["SalePrice"])
    .abs()
    .sort_values(ascending=False)
    .head(5)
)
# SalePrice correlates perfectly with itself, so it is removed before plotting;
# the remaining entries become the x axes of the regression plots.
top_features = top_corr.drop(labels=["SalePrice"]).index.tolist()
sns.pairplot(train_data, x_vars=top_features, y_vars="SalePrice", kind="reg")

In [None]:
# check normality
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.probplot.html
import scipy.stats as stats

# normal probability plot of SalePrice, drawn on a fresh figure
fig = plt.figure()
res = stats.probplot(x=train_data["SalePrice"], dist="norm", plot=plt)

In [None]:
# SalePrice is not normally distributed, so replace it with its natural log.
# NOTE: this overwrites the column in place; running the cell a second time
# takes the log of already-logged values.
log_sale_price = np.log(train_data["SalePrice"])
train_data["SalePrice"] = log_sale_price

In [None]:
# Build the feature matrix X and the target vector y.
# LinearRegression needs purely numeric input without missing values, so only
# the numeric columns are kept and gaps are filled with each column's median.
y = train_data["SalePrice"]
X = train_data.drop("SalePrice", axis=1).select_dtypes(include="number")
X = X.fillna(X.median())

In [None]:
# split the data into training and validation sets (80% / 20%)
from sklearn.model_selection import train_test_split

# a fixed random_state keeps the split reproducible between runs
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=112
)

# report the shape of each piece
for split_part in (X_train, y_train, X_val, y_val):
    print(split_part.shape)

In [None]:
from sklearn.linear_model import LinearRegression

# create the linear regression estimator
model = LinearRegression()

# learn the intercept and coefficients from the training split
model.fit(X=X_train, y=y_train)

In [None]:
# show the fitted intercept first, then the coefficient array
print(model.intercept_, model.coef_, sep="\n")

In [None]:
# goodness of fit on the training split (for regressors, score() is R^2)
model.score(X=X_train, y=y_train)

In [None]:
# compute root mean squared error (RMSE) on the training and validation sets
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html#sklearn.metrics.mean_squared_error
from sklearn.metrics import mean_squared_error

# mean_squared_error returns the MSE; the square root turns it into RMSE.
# The target was log-transformed earlier, so these errors are in log units.
train_e = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))
valid_e = np.sqrt(mean_squared_error(y_val, model.predict(X_val)))

# A notebook cell only displays its last bare expression, so both values are
# printed explicitly; otherwise the training error is never shown.
print(f"train RMSE: {train_e}")
print(f"validation RMSE: {valid_e}")

In [None]:
# compute r2 scores on the training and validation sets
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html#sklearn.metrics.r2_score
from sklearn.metrics import r2_score

train_r2 = r2_score(y_train, model.predict(X_train))
valid_r2 = r2_score(y_val, model.predict(X_val))

# A notebook cell only displays its last bare expression, so both scores are
# printed explicitly; otherwise the training score is computed and discarded.
print(f"train R2: {train_r2}")
print(f"validation R2: {valid_r2}")

In [None]:
# data for submission
test_data = pd.read_csv("test.csv")

# Give the model the same feature columns as X, and fill gaps with the medians
# of X so that no statistics are taken from the test set.
test_features = test_data[X.columns]
test_features = test_features.fillna(X.median(numeric_only=True))

# The model was fitted on log(SalePrice), so exponentiate the predictions to
# get back to the original price scale.
predictions = np.exp(model.predict(test_features))
predictions