# Polynomial Regression

#### Load the packages, import the data, and split the data into an X dataframe and y vector

In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv("./Data Files/USA_Housing.csv")
data.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address
0,79545.458574,5.682861,7.009188,4.09,23086.800503,1059034.0,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701..."
1,79248.642455,6.0029,6.730821,3.09,40173.072174,1505891.0,"188 Johnson Views Suite 079\nLake Kathleen, CA..."
2,61287.067179,5.86589,8.512727,5.13,36882.1594,1058988.0,"9127 Elizabeth Stravenue\nDanieltown, WI 06482..."
3,63345.240046,7.188236,5.586729,3.26,34310.242831,1260617.0,USS Barnett\nFPO AP 44820
4,59982.197226,5.040555,7.839388,4.23,26354.109472,630943.5,USNS Raymond\nFPO AE 09386


In [2]:
data.columns

Index(['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms',
       'Avg. Area Number of Bedrooms', 'Area Population', 'Price', 'Address'],
      dtype='object')

#### Split data into an X DataFrame and y vector

In [3]:
X = data[['Avg. Area Income','Avg. Area Number of Bedrooms']]
y = data["Price"]

#### Add polynomial terms to X DataFrame

In [4]:
from sklearn.preprocessing import PolynomialFeatures
poly_variable = X[["Avg. Area Number of Bedrooms"]]  # Input variable
poly_degree = 3  # Select degree
poly = PolynomialFeatures(degree = poly_degree)
X_poly = poly.fit_transform(poly_variable)
X_poly = pd.DataFrame(X_poly)

In [6]:
for i in range(2, poly_degree+1):
    col_name = poly_variable.columns[0] + " ^" + str(i)
    X[col_name] = X_poly[i]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [7]:
X.head()

Unnamed: 0,Avg. Area Income,Avg. Area Number of Bedrooms,Avg. Area Number of Bedrooms ^2,Avg. Area Number of Bedrooms ^3
0,79545.458574,4.09,16.7281,68.417929
1,79248.642455,3.09,9.5481,29.503629
2,61287.067179,5.13,26.3169,135.005697
3,63345.240046,3.26,10.6276,34.645976
4,59982.197226,4.23,17.8929,75.686967


#### Split the data into a train_set and test_set

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 1111)

#### Fit the Linear Regression Model

In [9]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(X_train, y_train)
pd.DataFrame(lm.coef_, X.columns, columns = ["Coeff"])

Unnamed: 0,Coeff
Avg. Area Income,21.069702
Avg. Area Number of Bedrooms,115782.303557
Avg. Area Number of Bedrooms ^2,-14902.406605
Avg. Area Number of Bedrooms ^3,981.292266


#### Predict on Test Set

In [10]:
y_pred = lm.predict(X_test)
pred_summary = X_test.copy()
pred_summary[y.name] = y_test
pred_summary["y_pred"] = y_pred
pred_summary.head()

Unnamed: 0,Avg. Area Income,Avg. Area Number of Bedrooms,Avg. Area Number of Bedrooms ^2,Avg. Area Number of Bedrooms ^3,Price,y_pred
3652,65966.017208,3.3,10.89,35.937,1342819.0,1147167.0
1862,60288.475915,3.28,10.7584,35.287552,1144938.0,1026551.0
2766,53664.077704,2.19,4.7961,10.503459,996243.4,825306.1
4120,70169.710552,3.36,11.2896,37.933056,1217022.0,1238688.0
3447,73092.741315,2.21,4.8841,10.793861,1336172.0,1235951.0


#### Evaluate the Model

In [11]:
from sklearn import metrics
print("MSE:", metrics.mean_squared_error(y_test, y_pred))
print("RMSE:", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print("Explained Variance:", round(metrics.explained_variance_score(y_test, y_pred), 2))

MSE: 71182897038.7
RMSE: 266801.231329
Explained Variance: 0.43
