In [1]:
import random
import numpy as np
from sklearn.linear_model import (Lasso, Ridge, ElasticNet, LinearRegression)
from sklearn.model_selection import (cross_val_score, train_test_split, KFold, GridSearchCV)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_absolute_error
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import functions_project_02
from pymongo import MongoClient
from pprint import pprint
%matplotlib inline

In [2]:
client = MongoClient()
db = client.get_database('philadelphia_home_prices')
col = db.get_collection('listing_data')

In [3]:
cursor = db['listing_data'].find({})

In [4]:
df = pd.DataFrame(list(cursor))

In [5]:
df = df.set_index('Address')

In [6]:
df = df.drop(['_id','Locality','Region','Postal Code'],1)

In [7]:
df.head()

Unnamed: 0_level_0,Price,Days on Market,SQFT House,SQFT Lot,Full Baths,Half Baths,Bedrooms,Garage,Master Bath,Cooling,...,Build Year,Remodel Year,Neighborhood,Neighborhood median sales,Neighborhood Median DOM,Neighborhood Price SQFT,Association,Association Monthly,URL,crawl time
Address,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"1211 E Cheltenham Ave,",100000.0,44,1110.0,1080,1,0,3.0,No,No,No,...,1950,2020,Summerdale,Not Found,Not Found,120,No,0,https://www.realtor.com/realestateandhomes-det...,2020-11-07 17:18:22.082
"5254 N Howard St,",161650.0,55,1230.0,1050,2,0,3.0,No,No,[Window Unit],...,1925,2020,Olney,123000,67,113,No,0,https://www.realtor.com/realestateandhomes-det...,2020-11-07 17:18:26.690
"3429 W Queen Ln,",327000.0,36,1383.0,1381,2,0,3.0,No,No,"[Wall Unit Cooling, Window Unit]",...,1929,2020,East Falls,317500,56,220,No,0,https://www.realtor.com/realestateandhomes-det...,2020-11-07 17:18:33.404
"212 Carpenter St Unit C,",415000.0,49,1440.0,785,1,1,3.0,No,No,[Central A/C],...,1920,2020,Queen Village,550500,60,365,No,0,https://www.realtor.com/realestateandhomes-det...,2020-11-07 17:18:39.194
"3340 Tyson Ave,",185000.0,60,1099.0,1678,1,0,3.0,[ 1],No,No,...,1950,2020,Mayfair,207500,74,160,No,0,https://www.realtor.com/realestateandhomes-det...,2020-11-07 17:18:43.820


### Set X and y
    -set X and y
    -set test, train, validation splits. 60/20/20

In [8]:
X, y = df.drop(['Price'], axis= 1), df['Price']

In [9]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2,random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=.25, random_state=43)

In [10]:
X_train.shape

(785, 22)

In [11]:
X_test.shape

(262, 22)

In [12]:
X_val.shape

(262, 22)

### Scale Values

In [13]:
std = StandardScaler()
std.fit(X_train.values);

ValueError: could not convert string to float: 'No'

In [None]:
#y_reshaped = y.reshape(-1,1)
#y.shape
type(y)

In [None]:
X_tr = std.transform(X_train.values)
X_te = std.transform(X_test.values)
X_val = std.transform(X_val.values)
X_trans = std.transform(X.values)
#y_trans = std.transform(y.values)

### Create Polynomial Features

In [None]:
p = PolynomialFeatures(degree=2)

In [None]:
X_train_poly = p.fit_transform(X_tr)
X_test_poly = p.fit_transform(X_te)
X_val_poly = p.fit_transform(X_val)
X_poly = p.fit_transform(X)

In [None]:
X_poly.size

### Linear Regression

In [None]:
lr = LinearRegression().fit(X_tr, y_train)
lr.score(X_tr, y_train)

In [None]:
lr.score(X_val,y_val)

In [None]:
lr.score(X_test, y_test)

In [None]:
lr.score(X,y)

In [None]:
(list(zip(X_tr, lr.coef_)))

In [None]:
#plt.table(X,lr.coef_)

### LASSO

In [None]:
lasso_model = Lasso(alpha = 9000) # this is a VERY HIGH regularization strength!, wouldn't usually be used
lasso_model.fit(X_train_poly, y_train)

In [None]:
(list(zip(X_train_poly, lasso_model.coef_)))

In [None]:
#Check score of train data
lasso_model.score(X_train_poly, y_train)

In [None]:
#Check score of validation data
lasso_model.score(X_val_poly, y_val)

In [None]:
#Check score of test data
lasso_model.score(X_test_poly, y_test)

### Ridge

In [None]:
ridge_model = Ridge(alpha = 400)
ridge_model.fit(X_train_poly, y_train)

In [None]:
list(zip(X_train_poly, ridge_model.coef_))

In [None]:
ridge_model.score(X_train_poly, y_train)

In [None]:
ridge_model.score(X_val_poly, y_val)

In [None]:
ridge_model.score(X_test_poly, y_test)

In [None]:
#sns.pairplot(df)

### MAE

**Linear:**

In [None]:
np.array[X_train['SQFT House']].reshape(-1,1)
lr_y_pred = lr.predict(X_train['SQFT House'])

In [None]:
mean_absolute_error(y, lr_y_pred)

In [None]:
mean_absolute_error(y, )

**Lasso:**

In [None]:
lasso_y.pred = lasso.predict(X_poly)

### Plot

In [None]:
regression_x = 

In [None]:
plt.scatter(df['Price'], df['Annual Tax']);
plt.title('Sale Price vs. Taxes/Yr')
plt.xlabel('Sale Price ($)')
plt.ylabel('Taxes/Yr ($)')
plt.xticks(rotation=45)
plt.plot()
plt.savefig('salevstax.png')

In [None]:
from mpl_toolkits.mplot3d import Axes3D

In [None]:
x = df['Build Year']
y = df['Annual Tax']
z = df['Price']
fig = plt.figure()
ax = fig.add_subplot(111, projection = '3d')
ax.scatter(x,y,z)

In [None]:
plt.scatter(df['Price'], df['Annual Tax']);
#plt.plot()
plt.title('Sale Price vs. Tax Assessment')
plt.xlabel('Sale Price ($)')
plt.ylabel('Tax Assessment ($)')
#plt.savefig('salevstax.png')

In [None]:
plt.scatter(df['Build Year'][1900:2020], df['Annual Tax'])
plt.title('Sale Price vs. Tax Assessment')
plt.xlabel('Sale Price ($)')
plt.ylabel('Tax Assessment ($)')

In [None]:
df['Build Year Newer'] = df['Build Year'].apply(lambda x: np.nan if x<1950 else x)
df = df.dropna()
plt.scatter(df['Build Year Newer'], df['Annual Tax']);
plt.title('Build Year vs. Taxes/Year')
plt.xlabel('Build Year')
plt.ylabel('Taxes/Year ($)')
plt.savefig('yearvstax.png')

In [None]:
m, b = np.polyfit(X_train['SQFT House'], y_train, 1)

In [None]:
plt.plot(X_train['SQFT House'], m*(X_train['SQFT House'])+b, color = 'orange');
plt.scatter(df['SQFT House'], df['Price'])
plt.title('Livable Space vs. Price')
plt.xlabel('Livable Space (sqft)')
plt.savefig('spacevsprice.png')

In [None]:
plt.scatter(df['SQFT House'], df['Price'])

In [None]:
X_train

In [None]:
X_train['SQFT House']

In [None]:
lasso_model = Lasso(alpha = 5500) # this is a VERY HIGH regularization strength!, wouldn't usually be used
lasso_model.fit(x_train_poly, y_train)

In [None]:
list(zip(X_train.columns, lasso_model.coef_))

In [None]:
lasso_model.score(x_train_poly, y_train)

In [None]:
lasso_model.score(x_val_poly, y_val)

In [None]:
lasso_model.score(X, y)

In [None]:
#y=mx+b