<a href="https://colab.research.google.com/github/eamagnusson/ML-Codeathon-1-Predicting-Boston-Housing-Prices/blob/main/eam7cf_codeathon_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1. Setup

Following Professor Nguyen's example end-to-end Machine Learning project, this project builds a system to predict housing prices in Boston, MA. It follows the 8-step process, beginning with a check that Python 3.5 or later and Scikit-Learn 0.20 or later are installed, followed by importing the other necessary packages.

In [None]:
# Python ≥3.5 is required
#import sys #
# Scikit-Learn ≥0.20 is required
import sklearn # general ml package

# Common imports
import numpy as np # fundamental package for scientific computing
import os # to run file I/O operation 

# to make this notebook's output stable across runs
# any number will do, as long as it is used consistently
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
# Figures are written under ./images/end_to_end_project, relative to the
# current working directory.
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
# Create the output directory up front; exist_ok=True makes re-running this
# cell harmless when the directory is already there.
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Write the current matplotlib figure to a file under IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        Base name of the output file (without extension); it is also echoed
        in the progress message.
    tight_layout : bool
        When true, ``plt.tight_layout()`` is applied before saving.
    fig_extension : str
        File extension, also handed to ``savefig`` as the output format
        (e.g. "png" or "jpg").
    resolution : int
        Dots per inch handed to ``savefig`` (e.g. 100, 200, 300).

    Returns
    -------
    None
    """
    filename = ".".join((fig_id, fig_extension))
    target = os.path.join(IMAGES_PATH, filename)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

# Step 2. Get the Housing Data

In [None]:
from six.moves import urllib # support URL download

# NOTE(review): the recorded output of this notebook shows this URL returning
# Kaggle's HTML page rather than the CSV (presumably because the endpoint
# needs an authenticated session). fetch_housing_data() below refuses to store
# such a response; point HOUSING_URL at a direct CSV link to get real data.
DOWNLOAD_ROOT = "https://www.kaggle.com/vikrishnan/boston-house-prices/download"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing CSV from a URL into ``housing_path/housing.csv``.

    The payload is first written to a temporary ``housing.csv.part`` file and
    only moved into place once it has been checked, so a failed or bogus
    download never overwrites a previously fetched, valid ``housing.csv``.

    Parameters
    ----------
    housing_url : str
        URL to download from.
    housing_path : str
        Directory that receives ``housing.csv``; created if missing.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the server answered with an HTML document (e.g. a login or
        landing page) instead of CSV data.
    """
    os.makedirs(housing_path, exist_ok=True)
    csv_path = os.path.join(housing_path, "housing.csv")
    partial_path = csv_path + ".part"
    try:
        urllib.request.urlretrieve(housing_url, partial_path)
        # Peek at the start of the payload (ignoring a UTF-8 BOM and leading
        # whitespace) to detect an HTML page served in place of the data.
        with open(partial_path, "rb") as downloaded:
            head = downloaded.read(512).lstrip(b"\xef\xbb\xbf \t\r\n").lower()
        if head.startswith((b"<!doctype html", b"<html")):
            raise ValueError(
                f"{housing_url} returned an HTML page instead of CSV data; "
                "the URL may require authentication or may not be a direct "
                "download link."
            )
        os.replace(partial_path, csv_path)
    finally:
        # After a successful os.replace the partial file no longer exists;
        # on any failure make sure it does not linger.
        if os.path.exists(partial_path):
            os.remove(partial_path)

fetch_housing_data()

In [None]:
import pandas as pd # Pandas module to enable data manipulation

def load_housing_data(housing_path=HOUSING_PATH):
    """Load ``housing.csv`` from *housing_path* into a pandas DataFrame.

    Parameters
    ----------
    housing_path : str
        Directory containing ``housing.csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.

    Raises
    ------
    pandas.errors.ParserError
        If the file contains rows that do not match the header's column
        count (for example when the file is not actually CSV data).
    """
    csv_path = os.path.join(housing_path, "housing.csv")
    # ``error_bad_lines`` was deprecated in pandas 1.3 and removed in 2.0, so
    # passing it raises TypeError on current pandas. It also silently dropped
    # malformed rows, which hid a bad download; strict default parsing makes
    # such a file fail loudly instead.
    return pd.read_csv(csv_path)

housing = load_housing_data()
# Display the first n rows of the data. (n=5 by default, we will use 10 rows)
housing.head(10)

b'Skipping line 6: expected 1 fields, saw 2\nSkipping line 9: expected 1 fields, saw 4\nSkipping line 20: expected 1 fields, saw 2\nSkipping line 32: expected 1 fields, saw 8\nSkipping line 41: expected 1 fields, saw 9\nSkipping line 45: expected 1 fields, saw 2\nSkipping line 46: expected 1 fields, saw 6\nSkipping line 51: expected 1 fields, saw 2\nSkipping line 52: expected 1 fields, saw 3\nSkipping line 53: expected 1 fields, saw 2\nSkipping line 54: expected 1 fields, saw 2\nSkipping line 55: expected 1 fields, saw 2\nSkipping line 74: expected 1 fields, saw 2\nSkipping line 75: expected 1 fields, saw 2\nSkipping line 76: expected 1 fields, saw 2\nSkipping line 77: expected 1 fields, saw 2\nSkipping line 84: expected 1 fields, saw 2\nSkipping line 86: expected 1 fields, saw 2\nSkipping line 94: expected 1 fields, saw 2\nSkipping line 155: expected 1 fields, saw 2\nSkipping line 156: expected 1 fields, saw 2\nSkipping line 157: expected 1 fields, saw 2\nSkipping line 158: expected 1

Unnamed: 0,<!DOCTYPE html>
0,"<html lang=""en"">"
1,<head>
2,<title>Kaggle: Your Home for Data Science<...
3,"<meta charset=""utf-8"" />"
4,"<meta name=""description"" content=""Kaggle i..."
5,"<meta name=""turbolinks-cache-control"" cont..."
6,"<meta name=""theme-color"" content=""#008ABC"" />"
7,"<script nonce=""YxRmX3wWPZDNqYIPX3bYig=="" t..."
8,if ('serviceWorker' in navigator) {
9,navigator.serviceWorker.getRegistr...
