# Linear Regression

#### Load the packages and import the data

In [8]:
import pandas as pd
import numpy as np

# Location of the sample file, relative to the notebook's working directory.
csv_path = "./Data Files/Regression_Sample_File_2.csv"

# Read the CSV into a DataFrame and preview the first rows.
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition
0,7129300520,221900.0,3,1.0,1180,5650,1.0,0,3
1,6414100192,538000.0,3,2.25,2570,7242,2.0,0,3
2,5631500400,180000.0,2,1.0,770,10000,1.0,0,3
3,2487200875,604000.0,4,3.0,1960,5000,1.0,0,5
4,1954400510,510000.0,3,2.0,1680,8080,1.0,0,3


In [9]:
# Show the column labels of the loaded frame.
data.columns

Index(['id', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
       'floors', 'waterfront', 'condition'],
      dtype='object')

#### Split data into an X DataFrame and y vector

In [10]:
# Predictor columns fed to the model; 'id' is left out and 'price' is the target.
feature_cols = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
    'floors', 'waterfront', 'condition',
]

X = data[feature_cols]
y = data['price']

#### Split the data into a train_set and test_set

In [11]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1111,
)

#### Fit the Linear Regression Model

In [12]:
from sklearn.linear_model import LinearRegression
# fit() returns the estimator itself, so construction and fitting can be chained.
lm = LinearRegression().fit(X_train, y_train)

# Put the intercept in front of the per-feature coefficients, with matching row labels.
coef = np.insert(lm.coef_, 0, lm.intercept_)
col_names = np.concatenate((["Intercept"], X.columns))

# Last expression of the cell: rendered as the coefficient table.
pd.DataFrame({"Coefficients": coef}, index=col_names)

Unnamed: 0,Coefficients
Intercept,-97268.79
bedrooms,-58674.91
bathrooms,2827.121
sqft_living,316.2431
sqft_lot,-0.3357857
floors,7996.283
waterfront,1027387.0
condition,46832.41


#### Predict on Test Set

In [13]:
# Predictions for the held-out rows.
y_pred = lm.predict(X_test)

# Test features alongside the actual target and the prediction.
# assign() returns a new frame, so X_test itself is left untouched;
# y_test is aligned on the index, y_pred is placed positionally.
pred_summary = X_test.assign(**{y.name: y_test}).assign(y_pred=y_pred)
pred_summary.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,price,y_pred
5090,4,1.75,2080,7200,1.0,0,4,198000.0,523673.1
2689,5,3.5,7350,12231,2.0,0,3,1110000.0,2096022.0
725,3,2.25,1540,17424,2.0,0,3,315000.0,370721.0
6049,3,1.0,1110,7000,1.5,0,4,302000.0,277537.1
4080,4,2.0,2250,7500,1.0,0,5,550000.0,624872.8


#### Evaluate the Model

In [17]:
from sklearn import metrics
# Compute each metric once on the held-out set; RMSE is derived from the MSE.
mse = metrics.mean_squared_error(y_test, y_pred)
explained_var = metrics.explained_variance_score(y_test, y_pred)

print("MSE:", mse)
print("RMSE:", np.sqrt(mse))
print("Explained Variance:", round(explained_var, 3))

MSE: 57992675412.3
RMSE: 240816.684248
Explained Variance: 0.487
