# Diabetes Dataset

In [3]:
import warnings
# NOTE(review): with no category or module given, this suppresses every
# warning raised from this point on, including deprecation notices from the
# libraries imported below. Consider narrowing the filter to a specific
# category once it is known which warning prompted it.
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes

In [13]:
# Load the bundled diabetes dataset; the returned container carries the
# feature matrix, the target vector, the feature names and a text description.
diabetes = load_diabetes()

# Print the dataset's own description text.
print(diabetes["DESCR"])

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - Age
      - Sex
      - Body mass index
      - Average blood pressure
      - S1
      - S2
      - S3
      - S4
      - S5
      - S6

Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1).

Source URL:
https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html

For more information see:
Bra

In [15]:
# Features: one column per predictor, named by the dataset itself.
data = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
# Response: kept as a single-column frame so it can be placed next to the features.
target = pd.DataFrame({"target": diabetes.target})
# Combine features and response column-wise into one frame.
df = pd.concat((data, target), axis="columns")
df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0


In [16]:
# Summary statistics, transposed so each column of df becomes one row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,442.0,-3.634285e-16,0.047619,-0.107226,-0.037299,0.005383,0.038076,0.110727
sex,442.0,1.308343e-16,0.047619,-0.044642,-0.044642,-0.044642,0.05068,0.05068
bmi,442.0,-8.045349e-16,0.047619,-0.090275,-0.034229,-0.007284,0.031248,0.170555
bp,442.0,1.281655e-16,0.047619,-0.1124,-0.036656,-0.005671,0.035644,0.132044
s1,442.0,-8.835316000000001e-17,0.047619,-0.126781,-0.034248,-0.004321,0.028358,0.153914
s2,442.0,1.327024e-16,0.047619,-0.115613,-0.030358,-0.003819,0.029844,0.198788
s3,442.0,-4.574646e-16,0.047619,-0.102307,-0.035117,-0.006584,0.029312,0.181179
s4,442.0,3.777301e-16,0.047619,-0.076395,-0.039493,-0.002592,0.034309,0.185234
s5,442.0,-3.830854e-16,0.047619,-0.126097,-0.033249,-0.001948,0.032433,0.133599
s6,442.0,-3.412882e-16,0.047619,-0.137767,-0.033179,-0.001078,0.027917,0.135612


In [17]:
# Print column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     442 non-null    float64
 3   bp      442 non-null    float64
 4   s1      442 non-null    float64
 5   s2      442 non-null    float64
 6   s3      442 non-null    float64
 7   s4      442 non-null    float64
 8   s5      442 non-null    float64
 9   s6      442 non-null    float64
 10  target  442 non-null    float64
dtypes: float64(11)
memory usage: 38.1 KB


In [18]:
# Split the frame into a feature matrix and a response vector as NumPy arrays.
X = df.drop(columns="target").to_numpy()
y = df["target"].to_numpy()

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [24]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

### Linear Regression 

In [25]:
# Ordinary least squares baseline; fit() hands back the fitted estimator.
lin = LinearRegression().fit(X_train, y_train)

# Predict on both partitions so training error can be compared with held-out error.
lin_tr_pre = lin.predict(X_train)
lin_ts_pre = lin.predict(X_test)

lin_tr_mse = mean_squared_error(y_train, lin_tr_pre)
lin_ts_mse = mean_squared_error(y_test, lin_ts_pre)
print("MSE of train set prediction : ", lin_tr_mse)
print("MSE of test set prediction : ", lin_ts_mse)

MSE of train set prediction :  2734.7292724031436
MSE of test set prediction :  3424.3166882137334


### Ridge Regression

In [34]:
# L2-regularised linear model with a small penalty strength (alpha=0.001).
rid = Ridge(alpha=0.001).fit(X_train, y_train)

# Predict on both partitions so training error can be compared with held-out error.
rid_tr_pre = rid.predict(X_train)
rid_ts_pre = rid.predict(X_test)

rid_tr_mse = mean_squared_error(y_train, rid_tr_pre)
rid_ts_mse = mean_squared_error(y_test, rid_ts_pre)
print("MSE of train set prediction : ", rid_tr_mse)
print("MSE of test set prediction : ", rid_ts_mse)

MSE of train set prediction :  2734.9554020063297
MSE of test set prediction :  3427.4008436567783


### K Neighbors Regressor

In [43]:
# Nearest-neighbour regressor using the 3 closest training points.
knn = KNeighborsRegressor(n_neighbors=3).fit(X_train, y_train)

# Predict on both partitions so training error can be compared with held-out error.
knn_tr_pre = knn.predict(X_train)
knn_ts_pre = knn.predict(X_test)

knn_tr_mse = mean_squared_error(y_train, knn_tr_pre)
knn_ts_mse = mean_squared_error(y_test, knn_ts_pre)
print("MSE of train set prediction : ", knn_tr_mse)
print("MSE of test set prediction : ", knn_ts_mse)

MSE of train set prediction :  1904.8265659427132
MSE of test set prediction :  4415.551810237203


### Decision Tree Regressor 

In [44]:
# Seed the tree so the cell is reproducible: scikit-learn permutes the features
# at every split and picks among equally good splits at random, so an unseeded
# tree (and its test MSE) can differ from run to run. The seed matches the one
# used for the train/test split.
# NOTE(review): the tree is grown without any depth or leaf-size limit; a
# training MSE of 0.0 next to a much larger test MSE indicates overfitting, so
# constraining max_depth / min_samples_leaf is worth evaluating.
dtr = DecisionTreeRegressor(random_state=0)
dtr.fit(X_train,y_train)
# Predict on both partitions so training error can be compared with held-out error.
dtr_tr_pre = dtr.predict(X_train)
dtr_ts_pre = dtr.predict(X_test)
print("MSE of train set prediction : ",mean_squared_error(y_train,dtr_tr_pre))
print("MSE of test set prediction : ",mean_squared_error(y_test,dtr_ts_pre))

MSE of train set prediction :  0.0
MSE of test set prediction :  8111.730337078651
