In [7]:
%load_ext autoreload
%autoreload 2

import requests
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import tqdm
from IPython.display import Image, display
import matplotlib.pyplot as plt

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Fetch dataset

In [2]:
link = "http://lib.stat.cmu.edu/datasets/boston"
raw_text = requests.get(link).text # get the raw text
raw_text = raw_text.split('\n') # split into lines
headers = raw_text[7:21]
headers = [h.split()[0] for h in headers]
raw_text = raw_text[22:-1] # discard description

raw_data = []
for i in range(0, len(raw_text), 2):
    # each row is split into 2 lines, so join them first
    row = raw_text[i] + raw_text[i+1]
    raw_data.append(row)

# split each row into columns and convert each cell -> string to float
raw_data = [[float(column) for column in row.split()] for row in raw_data]
raw_data = np.array(raw_data) # no error, implies equal length rows, error-free import

## Split into dev + test

In [11]:
# 1/6 parts in test, 5/6 parts in dev for 5-fold cross validation later
# random seed for reproducibility
raw_train_data, raw_test_data = train_test_split(raw_data, test_size=1/6, random_state=10)
raw_df = pd.DataFrame(raw_train_data,columns=headers)
print(raw_df)
raw_df.describe()

         CRIM    ZN  INDUS  CHAS    NOX     RM    AGE     DIS   RAD    TAX  \
0     0.06076   0.0  11.93   0.0  0.573  6.976   91.0  2.1675   1.0  273.0   
1     0.05372   0.0  13.92   0.0  0.437  6.549   51.0  5.9604   4.0  289.0   
2     0.05561  70.0   2.24   0.0  0.400  7.041   10.0  7.8278   5.0  358.0   
3     0.05083   0.0   5.19   0.0  0.515  6.316   38.1  6.4584   5.0  224.0   
4    45.74610   0.0  18.10   0.0  0.693  4.519  100.0  1.6582  24.0  666.0   
..        ...   ...    ...   ...    ...    ...    ...     ...   ...    ...   
416   0.16760   0.0   7.38   0.0  0.493  6.426   52.3  4.5404   5.0  287.0   
417   0.62739   0.0   8.14   0.0  0.538  5.834   56.5  4.4986   4.0  307.0   
418   2.37857   0.0  18.10   0.0  0.583  5.871   41.9  3.7240  24.0  666.0   
419   0.16902   0.0  25.65   0.0  0.581  5.986   88.4  1.9929   2.0  188.0   
420   0.76162  20.0   3.97   0.0  0.647  5.560   62.8  1.9865   5.0  264.0   

     PTRATIO       B  LSTAT  MEDV  
0       21.0  396.90   5.64

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,421.0,421.0,421.0,421.0,421.0,421.0,421.0,421.0,421.0,421.0,421.0,421.0,421.0,421.0
mean,3.864835,11.178147,11.395606,0.066508,0.556202,6.239606,69.111876,3.80733,9.767221,413.133017,18.554632,353.513088,12.997387,21.804276
std,9.170657,23.555059,6.903976,0.249465,0.116732,0.667412,27.880585,2.13385,8.836772,171.026499,2.136945,95.672303,7.087777,8.662466
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.137,1.0,187.0,12.6,0.32,1.92,5.0
25%,0.08221,0.0,5.19,0.0,0.449,5.876,45.7,2.0651,4.0,280.0,17.4,374.56,7.43,16.5
50%,0.25387,0.0,9.9,0.0,0.538,6.167,77.0,3.2628,5.0,334.0,19.1,391.27,11.66,20.7
75%,3.8497,12.5,18.1,0.0,0.631,6.565,94.5,5.118,24.0,666.0,20.2,396.21,17.21,24.5
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


         CRIM    ZN  INDUS  CHAS    NOX     RM    AGE     DIS   RAD    TAX  \
0     0.06076   0.0  11.93   0.0  0.573  6.976   91.0  2.1675   1.0  273.0   
1     0.05372   0.0  13.92   0.0  0.437  6.549   51.0  5.9604   4.0  289.0   
2     0.05561  70.0   2.24   0.0  0.400  7.041   10.0  7.8278   5.0  358.0   
3     0.05083   0.0   5.19   0.0  0.515  6.316   38.1  6.4584   5.0  224.0   
4    45.74610   0.0  18.10   0.0  0.693  4.519  100.0  1.6582  24.0  666.0   
..        ...   ...    ...   ...    ...    ...    ...     ...   ...    ...   
416   0.16760   0.0   7.38   0.0  0.493  6.426   52.3  4.5404   5.0  287.0   
417   0.62739   0.0   8.14   0.0  0.538  5.834   56.5  4.4986   4.0  307.0   
418   2.37857   0.0  18.10   0.0  0.583  5.871   41.9  3.7240  24.0  666.0   
419   0.16902   0.0  25.65   0.0  0.581  5.986   88.4  1.9929   2.0  188.0   
420   0.76162  20.0   3.97   0.0  0.647  5.560   62.8  1.9865   5.0  264.0   

     PTRATIO       B  LSTAT  MEDV  
0       21.0  396.90   5.64