# Linear Regression Real Estate

## Import required libraries

In [None]:
# Set up the environment for using pyspark.
# This cell runs before the notebook's first `pyspark` import (next cell).
import findspark
findspark.init()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [None]:
# Build the Spark session for this notebook, reusing one if it already exists.
spark = (
    SparkSession.builder
    .appName("Real Estate")
    .getOrCreate()
)

# Keep a handle on the session's SparkContext and set its log level.
sc = spark.sparkContext
sc.setLogLevel("Warn")

## Load the dataset from sklearn datasets

In [None]:
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn -- confirm the pinned version.
from sklearn.datasets import load_boston
boston = load_boston()

In [None]:
# List the keys available on the loaded dataset object.
print(boston.keys())

## The loaded data contains the following information<br>
<span style="font-family:times, serif; font-size:14pt; font-weight:bold">

<font color='teal'>
    <ol>
        <li>data - contains the features </li>
        <li>target - contains the dependent variable </li>
        <li>feature_names - column headings </li>
        <li>DESCR - description of the dataset </li>
        <li>filename - dataset file name </li>
    </ol>
</font>
</span>

In [None]:
# Shape of the feature array.
print(boston.data.shape)

## Print the DESCR attribute in the data

In [None]:
# Print the description text stored in the dataset's DESCR attribute.
print(boston.DESCR)

## Convert the features (independent variables) into data frame

In [None]:
# Wrap the raw feature array in a pandas DataFrame and preview the first rows.
bos = pd.DataFrame(boston.data)
bos.head()

# Notice the header columns are not populated

## The feature names are not set yet. Set them on the data frame

In [None]:
# Label the columns with the dataset's feature names, then preview again.
bos.columns = boston.feature_names
bos.head()

## Print the shape of the target variable

In [None]:
# Shape of the target array.
boston.target.shape

## Add the target column to the dataset and display first 5 rows/samples

In [None]:
# Add the target values to the data frame as a new PRICE column,
# then preview the first rows.
bos['PRICE'] = boston.target
bos.head()

## Display the last 10 rows/samples

In [None]:
# Last 10 rows of the data frame.
bos.tail(10)

## Display the std deviation, mean, min, max, etc of the dataset

In [None]:
# Get the data description e.g. count, mean, standard deviation, etc.
# Use the fully qualified option name: on recent pandas versions the bare
# 'precision' pattern matches more than one option (for example
# 'styler.format.precision') and set_option raises OptionError.
pd.set_option('display.precision', 3)
bos.describe()

## Display the data types of features and target

In [None]:
# Print a summary of the data frame, including each column's data type.
bos.info()

## Create a scatter plot of price vs. number of rooms

In [None]:
# Scatter of the PRICE column (x-axis) against the RM column (y-axis).
plt.scatter(x=bos['PRICE'], y=bos['RM'])
plt.xlabel('Price in $1,000')
plt.ylabel('Number of Rooms')
plt.title('Rooms to Price')
# Render through matplotlib itself: a bare display() is not provided by any
# import in this notebook and shows nothing when called without arguments.
plt.show()

## Display the null count for every feature

In [None]:
# Check for null values: count them per column (features and PRICE).
bos.isnull().sum()

## Looks like our data set is clean - no nulls

<font color = 'tomato'>
<h1>Spark Processing</h1>
</font>

## Create Spark Data Frame from bos (pandas data frame)

In [None]:
# Convert the pandas data frame into a Spark DataFrame.
df1 = spark.createDataFrame(bos)

In [None]:
# Column names of the Spark DataFrame.
df1.columns

In [None]:
# Feature column names: every column except the last one
# (PRICE was appended to the data frame last).
cols = df1.columns[:-1]
print(cols)

<font color = 'tomato'>
<h2> Create Features and Target</h2><br>
<span style="font-family:times, serif; font-size:14pt; font-weight:bold">
<ol>
    <li>Using Vector Assembler create features</li>
    <li>Transform features</li>
    <li>Print the schema</li>
    <li>Display the first row</li>
</ol>
</span>
</font>
    

<font color = 'tomato'>
    <h2>Apply Standard Scaler to the features</h2>    
</font>

In [None]:
from pyspark.ml.feature import StandardScaler

<font color = 'tomato'>
<h2> Add the target variable PRICE</h2><br>
<span style="font-family:times, serif; font-size:14pt; font-weight:bold">
<ol>
    <li>Create new variable that will hold both standard features and PRICE</li>
    <li>Print the schema of this new variable</li>
    <li>Display the first row of the new variable</li>
</ol>
</span>
</font>

<font color = 'tomato'>
    <h2>Create Training and Test sets</h2>
</font>

<font color = 'tomato'>
    <h2>Linear Regression</h2><br>
    <span style="font-family:times, serif; font-size:14pt; font-weight:bold">
    <ol>
        <li>Create Linear Regressor Instance</li>
        <li>Train the algorithm on training data to create the model</li>
        <li>Make predictions with the model</li>
    </ol>
    </span>
</font>