# Loading Diabetes Dataset


In [1]:
from sklearn import datasets

In [6]:
df = datasets.load_diabetes()

In [8]:
print(df.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - Age
      - Sex
      - Body mass index
      - Average blood pressure
      - S1
      - S2
      - S3
      - S4
      - S5
      - S6

Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1).

Source URL:
https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html

For more information see:
Bra

Feature Names

In [9]:
print(df.feature_names)

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']


Create X and Y Data Matrices

In [10]:
X = df.data
Y = df.target

In [11]:
X.shape, Y.shape

((442, 10), (442,))

In [12]:
# All in one step.
X, Y = datasets.load_diabetes(return_X_y=True)

In [13]:
X.shape, Y.shape

((442, 10), (442,))

# Loading Housing Dataset

In [14]:
! wget https://github.com/dataprofessor/data/raw/master/BostonHousing.csv

--2020-09-10 23:01:30--  https://github.com/dataprofessor/data/raw/master/BostonHousing.csv
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/dataprofessor/data/master/BostonHousing.csv [following]
--2020-09-10 23:01:30--  https://raw.githubusercontent.com/dataprofessor/data/master/BostonHousing.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 36242 (35K) [text/plain]
Saving to: ‘BostonHousing.csv’


2020-09-10 23:01:30 (2.52 MB/s) - ‘BostonHousing.csv’ saved [36242/36242]



In [15]:
import pandas as pd

In [16]:
BostonHousing = pd.read_csv("BostonHousing.csv")
BostonHousing.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


# Data Split

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# 80/20 split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [20]:
X_train.shape, Y_train.shape

((353, 10), (353,))

In [21]:
X_test.shape, Y_test.shape

((89, 10), (89,))

# Linear Regression Model

In [22]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

In [23]:
model = linear_model.LinearRegression()

In [24]:
model.fit(X_train, Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [25]:
Y_pred = model.predict(X_test)

# Results

In [26]:
print('Coefficients:', model.coef_)
print('Intercept:', model.intercept_)
print('Mean squared error (MSE): %.2f'
      % mean_squared_error(Y_test, Y_pred))
print('Coefficient of determination (R^2): %.2f'
      % r2_score(Y_test, Y_pred))

Coefficients: [ -26.18554801 -231.55479249  494.33720597  359.89819876 -490.92444074
  237.03271747    3.78933134  166.57039367  658.85734356   37.85831572]
Intercept: 150.14160727989156
Mean squared error (MSE): 2679.10
Coefficient of determination (R^2): 0.51


In [27]:
r2_score(Y_test, Y_pred)

0.5105688234388033

In [28]:
'%.2f' % 0.523810833536016

'0.52'