# Starter

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

# Load the raw dataset; the path is relative to the notebook's working directory.
housing = pd.read_csv("data.csv")

# Hold out 20% of the rows as a test set, stratified on the CHAS column.
# random_state pins the shuffle so the same rows are selected on every run.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=40)
# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(housing, housing['CHAS']))

# split() yields positional row numbers, so the rows are selected with .iloc.
# Label-based .loc only gives the same rows while the frame keeps the default
# RangeIndex produced by read_csv.
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]

# NOTE: from here on `housing` is rebound to the training features only and
# `housing_labels` holds the matching MEDV target; the full frame is no longer
# reachable under the name `housing`.
housing = strat_train_set.drop("MEDV", axis=1)
housing_labels = strat_train_set["MEDV"].copy()

# Creating a Pipeline

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
# Preprocessing chain applied to the feature columns before modelling.
my_pipeline = Pipeline(
    steps=[
        # Fill missing values with the per-column median.
        ('imputer', SimpleImputer(strategy="median")),
        # ... further transformers can be inserted here ...
        ('std_scaler', StandardScaler()),
    ]
)

# Fit the chain on the training features and keep the transformed result,
# which is what the models below are trained on.
housing_num_tr = my_pipeline.fit_transform(X=housing)

# Selecting a desired model

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Baseline estimator: linear regression fitted on the preprocessed training
# features and their MEDV labels. The bare fit() call is the cell's last
# expression, so the fitted estimator's repr is displayed.
model = LinearRegression()
model.fit(X=housing_num_tr, y=housing_labels)

LinearRegression()

In [4]:
# Spot check: compare predictions with the true labels for the first five
# training rows.
some_data = housing.iloc[:5]
some_label = housing_labels.iloc[:5]

# Raw rows go through the already-fitted pipeline (transform, not
# fit_transform) so they are prepared the same way as the training matrix.
prepared_data = my_pipeline.transform(some_data)
predicted = model.predict(prepared_data)

print('original:', list(some_label))
print('predicted:', list(predicted))

original: [22.1, 18.4, 26.6, 36.5, 36.2]
predicted: [27.277743207887745, 15.476002053225052, 27.737689181871247, 35.63266074062198, 27.334356691198398]


# Evaluating the model

In [5]:
from sklearn.metrics import mean_squared_error
# Training-set error: predict on the same matrix the model was fitted on and
# report the root of the mean squared error. Because this is measured on the
# training rows it says nothing about generalisation.
housing_prediction = model.predict(X=housing_num_tr)
mse = mean_squared_error(y_true=housing_labels, y_pred=housing_prediction)
rmse = np.sqrt(mse)
rmse

4.587855581203071

# Better evaluation with cross-validation

In [6]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the current model. The scorer returns negated
# MSE values, so the sign is flipped before taking the square root.
# NOTE(review): housing_num_tr was produced by fitting my_pipeline on the whole
# training set, so each validation fold has already influenced the imputer and
# scaler statistics; cross-validating pipeline + model together would avoid that.
scores = cross_val_score(
    estimator=model,
    X=housing_num_tr,
    y=housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
rmse_scores = np.sqrt(np.negative(scores))
rmse_scores.mean()

4.645841904512391

# Testing the model on test data

In [7]:
# Final check on the held-out rows: split features from the MEDV target.
X_test = strat_test_set.drop("MEDV", axis=1)
Y_test = strat_test_set["MEDV"].copy()

# Reuse the pipeline fitted on the training data (transform only), so no
# statistics are learned from the test rows.
prepared_data = my_pipeline.transform(X_test)
predicted = model.predict(prepared_data)

# NOTE: `mse` and `rmse` are rebound here and now refer to the test set, not
# the training set.
mse = mean_squared_error(Y_test, predicted)
rmse = np.sqrt(mse)
rmse

5.325906104069355

# Saving and loading the model

In [8]:
from joblib import dump, load
# Train the model that gets persisted. NOTE: this rebinds `model`, replacing
# the LinearRegression instance that the evaluation cells above measured.
# random_state is fixed so that re-running the notebook rebuilds the same
# forest and therefore the same saved model.
model = RandomForestRegressor(random_state=40)
model.fit(housing_num_tr, housing_labels)

# Only the estimator is written to disk; my_pipeline is not saved, so anything
# loading 'model.joblib' must pass features that are already preprocessed.
dump(model, 'model.joblib')

['model.joblib']

In [9]:
# Restore the estimator written by the previous cell. joblib's load() is
# pickle-based, so only open files from a trusted source.
model = load(filename='model.joblib')

In [10]:
# A single hand-written sample with 13 feature values, shaped (1, 13) for
# predict(). The saved model was fitted on pipeline output, so these values
# are presumably already on the transformed scale (TODO confirm).
features = np.array(
    [-5.4, 4, -1.6, -0.6, -1.4, -11, -49, 7.6, -26, -0.5, -0.9, 0.4, -66]
).reshape(1, -1)
model.predict(features)

array([23.208])