# House Prices - Advanced Regression Techniques 🏠

## Data Import and Initial Exploration

In [None]:
# Pandas Import

import pandas as pd

In [None]:
# Load the training set into a DataFrame and display it

dataset = pd.read_csv(filepath_or_buffer='train.csv')
dataset

In [None]:

# Preview the first ten rows
dataset.iloc[:10]


In [None]:
# Dimensions of the loaded dataset as (rows, columns)
dataset.shape

## Starting Exploration

In [None]:
# Per-column summary (dtype and non-null count).
# `info` is a method: without the call parentheses only the bound method is shown.
dataset.info()

# Many NaN, 0 and very high values in the records; these need to be checked before the analysis

In [None]:
# Share of missing values per column (fraction of rows), 20 worst columns first
(dataset.isna().sum() / len(dataset)).sort_values(ascending=False).iloc[:20]

In [None]:
# Drop every column in which more than 10% of the rows are missing

missing_share = dataset.isna().sum() / len(dataset)
columns_to_drop = dataset.columns[missing_share > 0.1]

dataset_blankfix = dataset.drop(columns=columns_to_drop)
dataset_blankfix

In [None]:
# Selecting numerical columns
# select_dtypes(include='number') keeps only numeric dtypes; comparing dtypes
# against 'object' would also let through any other non-object dtype
# (bool, datetime, categorical, dedicated string dtype).

dataset_numerical = dataset_blankfix.select_dtypes(include='number')
numerical_columns = dataset_numerical.columns
dataset_numerical.head(5)

In [None]:
# Count of missing values per numerical column, largest first

dataset_numerical.isna().sum().sort_values(ascending=False)

In [None]:
# Missing entries mean "no value recorded" rather than an error, so mark them with the sentinel -1

dataset_numerical = dataset_numerical.fillna(value=-1)

## Training Model
##### https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [None]:
# Importing train_test_split

from sklearn.model_selection import train_test_split

In [None]:
# Target (y) is SalePrice; features (X) are all remaining numerical columns
# NOTE(review): only SalePrice is removed, so an identifier column such as Id
# (if it is numeric) stays in X as a feature — confirm this is intended.

y = dataset_numerical['SalePrice']
X = dataset_numerical.drop(columns='SalePrice')

In [None]:
# Splitting data: 33% held out for testing, fixed seed for reproducibility

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.33,
)

In [None]:
# Import LinearRegression

from sklearn.linear_model import LinearRegression

In [None]:
# Build the linear regression estimator, then fit it on the training split

reg_lr = LinearRegression()
reg_lr.fit(X_train, y_train)

In [None]:
# Predictions of the fitted model on the held-out split

y_lr = reg_lr.predict(X=X_test)

In [None]:
# Importing mean absolute error

from sklearn.metrics import mean_absolute_error

In [None]:
# Importing mean squared error

from sklearn.metrics import mean_squared_error

In [None]:
# Report the regression errors on the held-out split:
# mean absolute error first, then mean squared error

for metric in (mean_absolute_error, mean_squared_error):
    print(metric(y_test, y_lr))

In [None]:
# Graphically analyzing the model
# Importing matplotlib

import matplotlib.pyplot as plt

In [None]:
# Plotting real vs. predicted prices; both axes are in units of 100,000

fig, ax = plt.subplots()

ax.scatter(y_test/100000, y_lr/100000)
# Reference diagonal (perfect prediction), drawn in the same scaled units as the
# points instead of raw prices, so it matches the 0-7 axis range exactly.
ax.plot([0,7], [0,7], '--r')

ax.set(xlim=(0,7), ylim=(0,7))
ax.set_xlabel('Real')
ax.set_ylabel('Predict')

plt.show()

## Predicting the test data

In [None]:
# Load the test set that predictions will be generated for

df_test = pd.read_csv(filepath_or_buffer='test.csv')

In [None]:
# Preview the first three rows of the test set

df_test.iloc[:3]

In [None]:
# Apply the same preprocessing as for the training data:
# first remove the columns that were dropped there for having too many missing values

df_test = df_test.drop(columns=columns_to_drop)

In [None]:
# Verifying numerical columns
# select_dtypes(include='number') keeps only numeric dtypes; a comparison against
# 'object' would also accept any other non-object dtype (bool, datetime, string, ...).

numerical_columns2 = df_test.select_dtypes(include='number').columns
numerical_columns2

In [None]:
# Keeping exactly the feature columns the model was fitted on, in training order.
# A dtype filter applied to the test file on its own could yield a different
# column set or order than X; selecting by X.columns fails here with a clear
# KeyError if a training feature is missing from the test data.

df_test = df_test.loc[:, X.columns]

In [None]:
# Checking df: per-column dtype and non-null count after the column selection

df_test.info()

In [None]:
# Ten test columns with the most missing values

df_test.isna().sum().sort_values(ascending=False).iloc[:10]

In [None]:
# Same missing-value treatment as in training: replace nulls with the sentinel -1

df_test = df_test.fillna(value=-1)
df_test

In [None]:
# Predict sale prices for the test set with the fitted linear regression

y_pred = reg_lr.predict(X=df_test)

In [None]:
# Attach the predictions to the test frame as a SalePrice column

df_test.loc[:, 'SalePrice'] = y_pred

In [None]:
# Keep only the identifier and the predicted price for the output file

result = df_test.loc[:, ['Id', 'SalePrice']]

In [None]:
# Write the predictions to result.csv without the DataFrame index

result.to_csv(path_or_buf='result.csv', index=False)