# PyData Carolinas 2016 Tutorial

In [None]:
%matplotlib inline

from datetime import date, datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from stemgraphic import stem_graphic

### Getting the data

In [None]:
# Read the home-sales CSV straight from GitHub into a DataFrame (requires network access).
df = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/softporcupine/pandas-matplotlib-example/master/home_data.csv'
)

In [None]:
# Peek at the first five rows to see which columns and value types we got.
df.head(n=5)

### Preliminary data munging

In [None]:
# Boolean indicator: True when yr_renovated is at least 1, False otherwise.
df['renovated'] = df['yr_renovated'].ge(1)

In [None]:
# Remove the 'date' column, which is not used in the rest of the analysis.
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell idempotent: running it a second time no longer raises KeyError.
df = df.drop('date', axis=1, errors='ignore')

In [None]:
# (number of rows, number of columns) of the working frame.
df.shape

In [None]:
# Column labels currently present in the working frame.
df.columns

## Our target: price

Let's first look at the distribution of price before we look at the features.

In [None]:
# Stem-and-leaf style view of the target variable's distribution.
stem_graphic(df['price'])

## Features

Some features are really indicators (like waterfront) and most are not continuous, but can be used as such. After all, you can have 2.5 baths or 1.5 floors...

In [None]:
import seaborn as sns

In [None]:
# Pairwise relationships between price and a few features, colored by condition.
sns.pairplot(
    data=df.loc[:, ['price', 'sqft_living', 'condition', 'sqft_lot']],
    hue='condition',
)

Square footage is probably a good indicator of price...

In [None]:
# Price against living area, one point per row.
df.plot.scatter(x='sqft_living', y='price')

Although, in real estate, it's all about LOCATION, LOCATION and LOCATION! Is that really the case?

In [None]:
# Price against each geographic coordinate, one figure per coordinate.
df.plot.scatter(x='lat', y='price')
df.plot.scatter(x='long', y='price')

### Creating derived data

In [None]:
# Absolute offset (in degrees) of each row from a fixed reference point at
# latitude 47.62 / longitude -122.32.
# NOTE(review): presumably the city center of the area covered by the data - confirm.
df['lat_from_center'] = np.abs(47.62 - df['lat'])
df['long_from_center'] = np.abs(-122.32 - df['long'])

In [None]:
# Scatter of price against latitude offset with a fitted regression line drawn in red.
sns.regplot(
    data=df,
    x='lat_from_center',
    y='price',
    line_kws=dict(color='r'),
)

In [None]:
# Candidate feature frame: everything except the target (price) and the raw
# coordinates, which the derived *_from_center columns stand in for.
df_np = df.drop(labels=['price', 'lat', 'long'], axis='columns')

## Model

### Simple Linear regression

In [None]:
from sklearn.linear_model import LinearRegression
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split

In [None]:
# Two-feature design matrix for the simple model.
X = df.loc[:, ['sqft_living', 'condition']]

In [None]:
# Fit on every other row starting at position 0; the remaining rows are held out for scoring.
lr = LinearRegression()
lr.fit(X.iloc[::2], df['price'].iloc[::2])

In [None]:
# Constant term learned by the fitted linear model.
lr.intercept_

In [None]:
# Learned coefficients, one per column of X, in X's column order.
lr.coef_

In [None]:
# Reference row pasted from an earlier summary of the data
# (NOTE(review): looks like per-column means - confirm):
#price         bedrooms  bathrooms  sqft_living  sqft_lot      floors    waterfront  view      condition
#5.400881e+05  3.370842  2.114757   2079.899736  1.510697e+04  1.494309  0.007542    0.234303  3.409430
sqft = 2079.90
condition = 3.41

# Evaluate the fitted line by hand: intercept + coefficient * value for each feature,
# with coefficients in the same order as the columns of X (sqft_living, condition).
lr.intercept_ + sqft * lr.coef_[0] + condition * lr.coef_[1]

In [None]:
# Score the model on the rows that were not used for fitting (every other row from position 1).
lr.score(X.iloc[1::2], df['price'].iloc[1::2])

### More detailed model

In [None]:
from sklearn.feature_selection import RFE

In [None]:
# Recursive feature elimination: repeatedly fit the linear model and discard
# features until only six remain.
lr = LinearRegression()
# n_features_to_select is passed by keyword: it is keyword-only in
# scikit-learn >= 1.0, where the positional form RFE(lr, 6) raises TypeError.
rfe = RFE(lr, n_features_to_select=6)
rfe = rfe.fit(df_np, df.price)

In [None]:
# Full list of candidate feature columns that were passed to RFE.
print(df_np.columns)

In [None]:
# Show, one per line: the columns RFE kept, the per-column ranking, and the boolean keep-mask.
print(
    df_np.columns[rfe.support_],
    rfe.ranking_,
    rfe.support_,
    sep='\n',
)

In [None]:
# Design matrix restricted to the columns RFE selected.
X = df.loc[:, df_np.columns[rfe.support_]]

In [None]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    df['price'],
    test_size=0.25,
    random_state=42,
)

In [None]:
# Fresh linear model trained on the training split only.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

### Metrics

In [None]:
# Score of the fitted model on the held-out test split.
print(lr.score(X=X_test, y=y_test))

### Visualisation

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Residual plot: predicted price on the x axis, (predicted - actual) on the y axis,
# with training rows in red and test rows in blue.
# Predictions are computed once per split instead of twice.
pred_train = lr.predict(X_train)
pred_test = lr.predict(X_test)

# `ax` is now a real Axes object; previously it was bound to the PathCollection
# returned by plt.scatter, which the name did not reflect.
fig, ax = plt.subplots()
ax.scatter(pred_train, pred_train - y_train, c='r', s=15, alpha=0.2, label='train')
ax.scatter(pred_test, pred_test - y_test, c='b', s=15, alpha=0.2, label='test')
# Zero line: points on it are predicted exactly.
ax.axhline(0, color='k', linewidth=1)
ax.set_xlabel('Predicted price')
ax.set_ylabel('Residual (predicted - actual)')
ax.set_title('Residuals of the linear model')
ax.legend()
plt.show()