In [None]:
# import modules needed for data analysis and get them ready for use in the notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import OneHotEncoder
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from geopy import distance
# Load the house data CSV; the relative path is resolved against the working directory.
raw_data = pd.read_csv(filepath_or_buffer="kc_house_data.csv")

In [None]:
# Columns that are not used by the rest of this analysis.
unused_columns = [
    'id', 'date', 'waterfront', 'view', 'condition',
    'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
    'sqft_living15', 'sqft_lot15',
]
drop_raw = raw_data.drop(columns=unused_columns)

# Exclude rows reporting exactly 33 bedrooms (presumably a data-entry outlier -- TODO confirm).
# Copy so that columns added to clean_data later never write back into drop_raw.
not_33_bedrooms = drop_raw['bedrooms'].ne(33)
clean_data = drop_raw.loc[not_33_bedrooms].copy()

In [None]:
# Store each row's coordinates as a single (lat, long) tuple, the order used by the
# distance calls further down.
clean_data['lat_and_long'] = [
    (lat, lon) for lat, lon in zip(clean_data['lat'], clean_data['long'])
]

In [None]:
# Preview the first five rows, including the new lat_and_long column.
clean_data.head(n=5)

In [None]:
# Subset of rows located in zipcode 98005; displayed for inspection.
in_zip_98005 = clean_data['zipcode'] == 98005
finder = clean_data.loc[in_zip_98005]
finder

Note: geopy expects coordinates in (latitude, longitude) order, i.e. (y, x).

In [None]:
# Mean latitude and longitude of the zipcode-98005 rows; shown as a (lat, long) pair.
avg_lat_98005 = finder['lat'].mean()
avg_long_98005 = finder['long'].mean()
(avg_lat_98005, avg_long_98005)

In [None]:
# Distance in miles between a hard-coded (lat, long) point and the 98005 mean location.
# NOTE(review): the origin of the hard-coded reference point is not shown here -- confirm.
reference_point = (47.6525, -122.16)
mean_point_98005 = (avg_lat_98005, avg_long_98005)
distance.distance(reference_point, mean_point_98005).miles

In [None]:
# Reference point for the distance feature: the mean (lat, long) of the zipcode-98005 rows.
mid_of_bellevue = (avg_lat_98005, avg_long_98005)

In [None]:
# Distance in miles from every row's (lat, long) tuple to the reference point,
# in the same row order as clean_data.
distances_col = list(
    map(
        lambda coords: distance.distance(coords, mid_of_bellevue).miles,
        clean_data['lat_and_long'],
    )
)

In [None]:
# Attach the distances as a new column; distances_col is in clean_data's row order,
# so labelling it with clean_data's index keeps every value on its own row.
clean_data['dist_from_bellevue'] = pd.Series(distances_col, index=clean_data.index)

In [None]:
# The raw coordinates are replaced by dist_from_bellevue, so drop them from the modelling frame.
coordinate_columns = ['lat', 'long', 'lat_and_long']
clean_data_dist = clean_data.drop(columns=coordinate_columns).copy()

In [None]:
# Engineered ratio: living area per floor, expressed as a fraction of the lot area.
living_sqft_per_floor = clean_data_dist['sqft_living'] / clean_data_dist['floors']
clean_data_dist['sqft_living_div_floors_div_sqft_lot'] = (
    living_sqft_per_floor / clean_data_dist['sqft_lot']
)

In [None]:
# Correlation of every remaining column with price; zipcode is excluded from the
# correlation matrix. Only the 'price' row is displayed.
features_without_zip = clean_data_dist.drop(columns='zipcode')
correlation_matrix = features_without_zip.corr()
correlation_matrix.loc[['price']]

In [None]:
# One-hot encode zipcode; drop='first' removes one category so it acts as the reference level.
ohe = OneHotEncoder(drop='first', categories='auto')
price_zip_trans = ohe.fit_transform(clean_data_dist['zipcode'].values.reshape(-1, 1))

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when it exists and fall back to the old method on older releases.
if hasattr(ohe, 'get_feature_names_out'):
    zip_feature_names = ohe.get_feature_names_out()
else:
    zip_feature_names = ohe.get_feature_names()

# Reuse clean_data_dist's index. Rows were filtered out earlier, so its index has gaps;
# with a fresh 0..n-1 index the later index-based join would attach the zipcode
# dummies to the wrong rows and drop a row.
zip_sparse = pd.DataFrame(
    price_zip_trans.toarray(),
    columns=zip_feature_names,
    index=clean_data_dist.index,
)

In [None]:
# zipcode is represented by the one-hot frame, and the engineered ratio is left out of the model.
columns_left_out = ['zipcode', 'sqft_living_div_floors_div_sqft_lot']
clean_data_dist_no_zip = clean_data_dist.drop(columns=columns_left_out).copy()

In [None]:
# Replace price with its natural log; log_price becomes the regression target.
log_price = np.log(clean_data_dist_no_zip['price'])
clean_data_dist_no_zip = (
    clean_data_dist_no_zip
    .assign(log_price=log_price)
    .drop(columns='price')
)

In [None]:
# Combine the zipcode dummies with the remaining features and the target.
# This is an inner join on the row index: only index labels present in both frames are kept.
model_data = pd.merge(
    zip_sparse,
    clean_data_dist_no_zip,
    how='inner',
    left_index=True,
    right_index=True,
    suffixes=('', ''),
)
model_data.head(n=5)

In [None]:
# Fit an OLS model on the full data set to inspect the statsmodels summary.
target_column = 'log_price'
y = model_data[target_column]
X = model_data.drop(columns=target_column)

# statsmodels does not add an intercept on its own, so prepend a constant column.
predictors = sm.add_constant(X)
model_stats = sm.OLS(endog=y, exog=predictors).fit()
model_stats.summary()

In [None]:
# Hold out a test set with the default split proportions; the seed is fixed for reproducibility.
full_split = train_test_split(X, y, random_state=10)
X_train, X_test, y_train, y_test = full_split

In [None]:
# Standardise the features using statistics learned from the training split only,
# then apply the same scaling to the test split.
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

# Linear regression on the scaled features; the cell output is R^2 on the test split.
lr = LinearRegression().fit(X_train_sc, y_train)
lr.score(X_test_sc, y_test)

In [None]:
# Pair each fitted coefficient with the name of the feature it belongs to.
[(coefficient, feature) for coefficient, feature in zip(lr.coef_, X_train.columns)]

In [None]:
# Test-set predictions; the model was fit on log_price, so these are on the log scale.
price_predict = lr.predict(X=X_test_sc)
price_predict

In [None]:
# Undo the log transform so actual and predicted values are back on the price scale.
y_test_non_log, price_predict_non_log = map(np.exp, (y_test, price_predict))

In [None]:
# Mean squared error of the test-set predictions on the price scale.
metrics.mean_squared_error(y_true=y_test_non_log, y_pred=price_predict_non_log)

In [None]:
import math

# Root mean squared error: same units as price, so easier to interpret than the MSE.
test_mse = metrics.mean_squared_error(y_test_non_log, price_predict_non_log)
math.sqrt(test_mse)

In [None]:
from sklearn.feature_selection import RFE

# Recursive feature elimination with a linear model, keeping 60 features.
# It is fitted on the scaled training split.
lr_rfe = LinearRegression()
select = RFE(estimator=lr_rfe, n_features_to_select=60).fit(X_train_sc, y_train)

In [None]:
# Translate the RFE boolean mask back into feature names.
# select was fitted on X_train_sc, which has one column per column of X_train, so the
# mask must be paired with X_train.columns. model_data.columns also contains the
# 'log_price' target; zipping against it only works while the target happens to be the
# last column, and would otherwise silently shift names or leak the target.
feature_names = X_train.columns
if len(select.support_) != len(feature_names):
    raise ValueError(
        "RFE mask length does not match the number of feature columns: "
        f"{len(select.support_)} != {len(feature_names)}"
    )
columns = feature_names[select.support_].tolist()

# Take the selected columns from model_data so the rows line up with model_data['log_price'].
X_selected_df = model_data.loc[:, columns]

In [None]:
# Check the (rows, columns) size of the frame holding only the RFE-selected features.
X_selected_df.shape

In [None]:
# List the features that RFE kept.
X_selected_df.columns

In [None]:
# Size of the full modelling frame (all features plus the log_price target), for comparison.
model_data.shape

In [None]:
# Features and target for the second model: the RFE-selected columns and the same log_price target.
X2, y2 = X_selected_df, model_data['log_price']

In [None]:
# Same seed as the first split, here applied to the reduced feature set.
selected_split = train_test_split(X2, y2, random_state=10)
X2_train, X2_test, y2_train, y2_test = selected_split

In [None]:
# Refit the scaler and the linear model on the RFE-selected features.
# Note: this rebinds ss and lr, replacing the scaler and model fitted on the full feature set.
ss = StandardScaler()
X2_train_sc = ss.fit_transform(X2_train)
X2_test_sc = ss.transform(X2_test)

# The cell output is R^2 on the held-out split of the reduced feature set.
lr = LinearRegression().fit(X2_train_sc, y2_train)
lr.score(X2_test_sc, y2_test)

In [None]:
# RMSE of the reduced-feature model on the price scale: predict on the log scale,
# undo the log transform, then take the square root of the mean squared error.
price_predict2 = lr.predict(X2_test_sc)
y2_test_non_log, price_predict2_non_log = np.exp(y2_test), np.exp(price_predict2)
selected_mse = metrics.mean_squared_error(y2_test_non_log, price_predict2_non_log)
math.sqrt(selected_mse)