In [1]:
import acquire
import evaluate
import split_scale
import feature_selection

In [2]:
import seaborn as sns
from sklearn.linear_model import LinearRegression

## 1 ACQUIRE Section

In [4]:
# Pull the Zillow dataset through the project-local acquire module.
# NOTE(review): the data source and query live in acquire.py and are not visible here.
df = acquire.get_zillow_bite()

In [5]:
# Check which columns the acquire step returned.
df.columns

Index(['id', 'bathrooms', 'bedrooms', 'sq_ft', 'taxvaluedollarcnt',
       'lotsizesquarefeet'],
      dtype='object')

In [6]:
# Drop every row that contains at least one missing value (pandas dropna default).
df = df.dropna()

In [7]:
# Make the id column the index.
# Guarded so the cell can be re-run safely: once 'id' has been moved into the
# index it is no longer a column, and an unguarded set_index would raise KeyError.
if 'id' in df.columns:
    df = df.set_index('id')
assert df.index.name == 'id', "expected 'id' to be the index of df"

In [8]:
# (rows, columns) left after dropping nulls and moving id into the index.
df.shape


(15957, 5)

In [None]:
# Pairwise plots of every column against every other, using only the first 7000 rows.
# NOTE(review): the 7000-row cap is presumably there to keep plotting fast — confirm.
sns.pairplot(data=df.head(7000))

# HERE'S WHAT WE GOT:
### df is the DataFrame, 5 columns wide and 15957 records long after dropping rows with nulls (see `df.shape` above).
### explicit index (loc) is the id from the DB
### Each row is a house from the table that had a transaction in MAY or JUNE of 2017

# 2 PREP

In [None]:
## Let's look at Sara's stuff


## 3 SPLIT-SCALE Section

In [9]:
# Split df into train and test frames with the project-local helper.
# NOTE(review): no split ratio or random seed is passed here; confirm that
# split_my_data fixes both so the split is reproducible.
train, test = split_scale.split_my_data(df)

In [10]:
# Size of the training split. Only the last expression in a cell is displayed,
# so the earlier bare train.head() call produced no output and has been removed.
train.shape

(12765, 5)

In [11]:
# Size of the held-out test split.
test.shape

(3192, 5)

In [12]:
# Scale train and test with the project-local helper; it hands back the scaler
# object along with the two scaled frames.
# NOTE(review): presumably the scaler is fit on train only and then applied to
# test — verify in split_scale.
scaler, train_scaled, test_scaled = split_scale.standard_scaler(train, test)

In [13]:
# Features: every column of the scaled training frame except the target.
X_train = train_scaled.drop(columns=['taxvaluedollarcnt'])
# Target: read from `train` (the pre-scaling split) and kept as a one-column DataFrame.
y_train = train.loc[:, ['taxvaluedollarcnt']]

In [14]:
# Features: every column of the scaled test frame except the target.
X_test = test_scaled.drop(columns=['taxvaluedollarcnt'])
# Target: read from `test` (the pre-scaling split) and kept as a one-column DataFrame.
y_test = test.loc[:, ['taxvaluedollarcnt']]

## 4 FEATURE SELECTION

In [None]:
# Use RFE to whittle the feature set down to the optimal number of features.
# Too many features and the model is likely to overfit.
# NOTE(review): the test split (X_test, y_test) is passed into feature selection;
# if the helper scores candidates on it, the test metrics reported later are
# optimistically biased — confirm in feature_selection.py.
optimal_n_of_features = feature_selection.optimal_number_of_features(X_train, y_train, X_test, y_test)

In [17]:
# Show the feature count chosen by the helper.
optimal_n_of_features

3

In [19]:
# Pick the best features from the X_train feature matrix. The count comes from
# optimal_n_of_features (3 on this data) rather than a hard-coded literal, so
# this cell stays correct if the data or the selection result changes.
best_features = feature_selection.optimal_features(X_train, y_train, optimal_n_of_features)
best_features

  y = column_or_1d(y, warn=True)


Index(['bathrooms', 'bedrooms', 'sq_ft'], dtype='object')

## 5 MODELING AND EVALUATION

In [20]:
# Two linear-regression models are built in this section:
#   model_1 - the baseline, trained on every feature in X_train
#   model_2 - trained on only the 3 optimal features
# fit() returns the estimator itself, so it is chained onto the constructor.
model_1 = LinearRegression().fit(X_train, y_train)

# Echo the fitted estimator. It is now ready to predict on X_train, the feature
# matrix it was created from.
model_1

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [21]:
# In-sample predictions: model_1 predicts on the same matrix it was fit on.
y_hat_1=model_1.predict(X_train)

In [22]:
# Score the in-sample predictions with the project-local helper.
# NOTE(review): the (mse, rmse, r_squared) order is assumed from the names used
# here — verify against evaluate.regression_errors.
mse1, rmse1, r_squared1 = evaluate.regression_errors(y_train, y_hat_1)

In [24]:
# Training-split r_squared for model_1 (the helper returns it as a 1x1 array).
r_squared1

array([[0.44533777]])

### Try it on the test data

In [31]:
# Predict on the test features with the model that was fit on the training split.
y_hat_1_test=model_1.predict(X_test)

In [32]:
# Same error metrics for model_1, this time on the test split.
mse1_test, rmse1_test, r_squared1_test = evaluate.regression_errors(y_test, y_hat_1_test)

In [33]:
# Test-split r_squared for model_1.
r_squared1_test

array([[0.35746853]])

In [None]:
# model_2 will only use the features picked by our RFE functions.
# best_features holds the selected column labels, so indexing X_train with it
# keeps just those columns. (The bare `best_features` expression that used to
# sit here was not the last line of the cell, so it displayed nothing.)
X_train_optimo = X_train[best_features]

In [35]:
# Keep the same feature subset in the test matrix so it lines up with X_train_optimo.
X_test_optimo = X_test[best_features]

In [26]:
# Second model: same estimator, fit on the selected-feature subset only.
# fit() returns the estimator, so the trailing name echoes the fitted model.
model_2 = LinearRegression().fit(X_train_optimo, y_train)
model_2

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [27]:
# In-sample predictions from model_2.
y_hat_2=model_2.predict(X_train_optimo)

In [28]:
# Error metrics for model_2 on the training split.
mse2, rmse2, r_squared2 = evaluate.regression_errors(y_train, y_hat_2)

In [29]:
# Training-split r_squared for model_2.
r_squared2

array([[0.44533564]])

In [36]:
# model_2 predictions on the test split.
y_hat_2_test=model_2.predict(X_test_optimo)

In [37]:
# Error metrics for model_2 on the test split.
mse2_test, rmse2_test, r_squared2_test = evaluate.regression_errors(y_test, y_hat_2_test)

In [38]:
# Test-split r_squared for model_2.
r_squared2_test

array([[0.357466]])