# California Housing with Tax (Example "perfect predictor") 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
import seaborn as sns
# Switch on seaborn's default theme so the figures drawn below use its styling.
sns.set()

In [None]:
# Run this cell if you are working on Cloud Pak For Data.
# NOTE(review): relies on a `project` object that must already exist in the
# notebook session -- presumably supplied by the platform; confirm before running.
# Fetch the file
my_file = project.get_file("housing-with-tax.csv")

# Read the CSV data file from the object storage into a pandas DataFrame
# (rewind the returned file object to its start before parsing)
my_file.seek(0)
original_data = pd.read_csv(my_file)

# Preview the first rows to check that the load worked
original_data.head()

In [None]:
# Run this cell if you are working locally.
# The relative path means housing-with-tax.csv must be in the current working directory.
original_data = pd.read_csv('housing-with-tax.csv')

In [None]:
# Discard every row that contains at least one missing value, then count the
# remaining nulls per column as a sanity check (every count should be 0).
data_wo_null = original_data.dropna()
data_wo_null.isna().sum()

In [None]:
# Restrict the data to the columns used in this example: the target
# (median_house_value) plus the predictor columns listed below.
selected_columns = [
    'housing_median_age',
    'total_rooms',
    'population',
    'median_income',
    'median_house_value',
    'ocean_proximity',
    'median_tax',
]
data_reduced_features = data_wo_null[selected_columns]
data_reduced_features.head()

In [None]:
# 0-1 (dummy) encoding of the categorical columns; drop_first=True leaves out
# the first level of each category.
df_dummies = pd.get_dummies(data=data_reduced_features, drop_first=True)
df_dummies.head()

In [None]:
# Everything except the house value serves as a predictor ...
predictors = df_dummies.drop(columns=['median_house_value'])
# ... and the house value itself is the quantity to be predicted.
target = df_dummies['median_house_value']

In [None]:
# 80-20 split into training and test data; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    predictors,
    target,
    test_size=0.2,
    random_state=123,
)

In [None]:
# Fit the scaler on the training predictors only, then apply that same fitted
# transformation to both the training and the test predictors.
scaler = StandardScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Linear regression model fitted on the scaled training predictors.
reg = LinearRegression()
reg.fit(X=X_train, y=y_train)

In [None]:
# Report the model's score (reg.score) on the training split and on the test split.
for label, X_part, y_part in (
    ('training performance', X_train, y_train),
    ('test performance', X_test, y_test),
):
    print(label)
    print(reg.score(X_part, y_part))

In [None]:
# Compare the model's predictions with the actual values on the test split.
y_pred = reg.predict(X_test)

# Put predictions and actual values side by side. y_test still carries the row
# labels it had before the train/test split, so replace them with a plain
# 0..n-1 index (drop=True discards the old labels instead of adding a column).
test = pd.DataFrame({'Predicted': y_pred, 'Actual': y_test})
test = test.reset_index(drop=True)

# Line plot of the first 50 test samples, one line per column.
fig = plt.figure(figsize=(16, 8))
plt.plot(test[:50])
# Take the legend labels from the column names so that each label matches the
# line drawn for that column (columns are plotted in DataFrame order).
plt.legend(list(test.columns))

# Scatter plot with regression fit of predicted vs. actual values; the trailing
# semicolon suppresses the textual cell output in the notebook.
sns.jointplot(x='Actual', y='Predicted', data=test, kind='reg');