# SK Learn 

In [None]:
# Separate the normalized data into the regression target and its predictors.
# Both price columns are removed from X so the target does not leak into the predictors.
y = df_norm['price_millions']  # target variable
X = df_norm.drop(columns=['price', 'price_millions'])  # predictors / independent variables

In [None]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4)

# Fit ordinary least squares on the training split (fit() returns the estimator itself).
lr = LinearRegression().fit(X_train, y_train)

# In-sample predictions and R squared: this score is computed on the training data only.
predict_y_train = lr.predict(X_train)
print(f'The R squared value is: {metrics.r2_score(y_train, predict_y_train)!s}')

In [None]:
# Actual (x-axis) vs. predicted (y-axis) training prices, both in millions.
plt.scatter(x=y_train, y=predict_y_train)
plt.ylabel('predicted price - millions')
plt.xlabel('price - millions')
plt.show()

In [None]:
# Residual plot: predicted values on the x-axis, residuals (actual - predicted) on the y-axis.
plt.scatter(predict_y_train, y_train - predict_y_train)
plt.xlabel('predicted')
plt.ylabel('residuals')
# Render the figure explicitly, as the other plotting cells do; without this the
# cell's last expression (the ylabel Text object) is echoed in the notebook output.
plt.show()

# Statsmodels

In [None]:
# Build the statsmodels inputs: target first, then the predictors with both price columns removed.
# NOTE(review): this section reads from df_water, while the scikit-learn cells use df_norm - confirm this is intended.
house_target = df_water['price_millions']
house_pred = df_water.drop(columns=['price', 'price_millions'])

In [None]:
# Add a constant column to the predictors so the OLS fit includes an intercept term.
predictors = sm.add_constant(house_pred)
# Regress the target (endog) on the predictors (exog) and keep the fitted results.
model = sm.OLS(endog=house_target, exog=predictors).fit()
# Last expression of the cell, so the notebook renders the regression summary table.
model.summary()

# Expanded SkLearn

In [None]:
# Separate the normalized data into the regression target and its predictors.
# Both price columns are removed from X so the target does not leak into the predictors.
y = df_norm['price_millions']  # target variable
X = df_norm.drop(columns=['price', 'price_millions'])  # predictors / independent variables

In [None]:
# Divide the data into train and test sets.
# Train data is what we use to estimate (fit) our OLS model.
# Test data is held out: we use the fitted model to predict y values (price) for it
# and compare those predictions with the actual values to see how well the model generalizes.
# With test_size=0.3, 30% of our data is set aside for testing
# and the remaining 70% is used for training.

# The parameter "random_state" seeds the random shuffling that is applied before the split,
# so the same rows end up in train and test every time this cell is run (reproducible results).
# NOTE: on its own it does not guarantee that skewed groups are proportionally represented in each set.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4)


In [None]:
# Linear regressor - an ordinary least squares (OLS) type estimator.
# This only creates the (unfitted) estimator object; it is fitted on the training data in a later cell.
lr = LinearRegression()

In [None]:
# Fit the model on the training split (the 70% of the data kept for training),
# regressing the predictors in X_train onto the target y_train.
lr.fit(X_train, y_train)

# Pair every fitted coefficient with the name of its predictor column.
name_list = [*X_train.columns]
coef_list = [*lr.coef_]
# Last expression of the cell, so the notebook displays the labelled coefficients.
pd.Series(data=coef_list, index=name_list)

In [None]:
# In-sample predictions: apply the fitted model to the same training rows it was fitted on.
# These are the values plotted against y_train in the scatter plot that follows.
predict_y_train = lr.predict(X_train)

In [None]:
# Scatter of actual training prices (x-axis) against the model's predicted prices (y-axis).
plt.scatter(x=y_train, y=predict_y_train)
plt.ylabel('predicted price')
plt.xlabel('price')
plt.show()

### Interpret price vs. predicted price scatterplot:
- The points do not fall along a straight line, which suggests that some of the relationships we have modeled are non-linear.
- In other words, we don't have a good linear relationship between price and our predictors.

In [None]:
# Out-of-sample check: predict on the 30% of the data held out for testing
# and report R squared against the actual test targets.
predict_y_test = lr.predict(X_test)
print(f'The R squared value is: {metrics.r2_score(y_test, predict_y_test)!s}')


In [None]:
# The out-of-sample (test) R-squared is lower than the in-sample (train) R-squared,
# so the model is not as robust on unseen data as we might want.