## Homework 1:

1- Build a Keras Model for linear regression (check: https://keras.io/activations/). Use Boston Housing Dataset to train and test your model

2- Build a Keras Model for logistic regression. Use diabetes.csv to train and test

Comments:

1- Build the **simplest model** for linear regression with Keras and compare your model performance with `from sklearn.linear_model import LinearRegression`

2- Build the **simplest model** for logistic regression with Keras and compare your model performance with `from sklearn.linear_model import LogisticRegression`

3- **Add more complexity to your models in (1) and (2)** and compare with previous results

### Linear Regression using Boston Housing Dataset

In [1]:
import sklearn
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

boston = load_boston()
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [2]:
bos = pd.DataFrame(boston.data)
bos.columns = boston.feature_names
bos['PRICE'] = boston.target

X = bos.drop('PRICE', axis = 1)
y = bos['PRICE']

import sklearn.model_selection
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size = 0.33, random_state = 5)

scaler = StandardScaler()
# scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


#### Linear Regression using ML/SciKit Learn

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

lm = LinearRegression()

lm.fit(X_train, y_train)

y_pred = lm.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred) 
print(f'r^2:  {r2}')
print(f'MSE:  {mse}')
ml_y_pred = lm.predict(X_test)

r^2:  0.6956551656111603
MSE:  28.53045876597462


#### Linear Regression Using NN/Keras

In [4]:
# Build linreg model using keras
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [5]:
y_train_one_hot = np_utils.to_categorical(y_train)
y_test_one_hot = np_utils.to_categorical(y_test)

model = Sequential()
# model.add(Dense(13, input_shape=(13,), kernel_initializer='normal'))
model.add(Dense(51, input_shape=(13,), activation = 'linear'))
# model.add(Activation('linear'))
model.compile(optimizer='adam', loss='mse', metrics=["mse"])
model.fit(X_train, y_train_one_hot, epochs=30, batch_size=1, verbose=0);
loss, accuracy = model.evaluate(X_test, y_test_one_hot, verbose=0)
print("MSE = {:.2f}".format(accuracy))
model.evaluate(X_test, y_test_one_hot, verbose=0)







MSE = 0.02


[0.019306458856263562, 0.019306458856263562]

In [6]:
NN_y_pred = model.predict(X_test)

#### Compare NN and ML models

In [7]:
y_train_one_hot
# y_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

### Logistic Regression on Pima Diabetes Dataset

#### Load Diabetes / Data Preparation

In [12]:
# Import Pida Diabetes Dataset
diabetes = pd.read_csv('../notebooks/datasets/diabetes.csv')

feature_cols = ['Pregnancies', 'Insulin', 'BMI', 'Age']

X = diabetes[feature_cols]
y = diabetes['Outcome']

scaler = StandardScaler()
# scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.25, random_state=0)

y_test.value_counts()
print(diabetes.shape)

(768, 9)


In [13]:
# Metrics function for classification (for later use)
def metric(arr):
    """
    Input: 2X2 confusion matrix
    Outputs:  Classifier metrics
                - Accuracy
                - Precision
                - Recall
    """
    # Accuracy = correct / everything
    accuracy = (arr[0][0] + arr[1][1]) / (arr[0][0] + arr[0][1] + arr[1][1] + arr[1][0])
    
    # Precision = true positive / (true positive + false positive)
    precision = (arr[0][0]) / (arr[0][0] + arr[0][1])

    # Recall = true positive / (true positive + false negative)
    recall = arr[0][0] / (arr[0][0] + arr[1][0])
    
    return print(f' accuracy: {accuracy},\n precision: {precision},\n recall: {recall}')

#### Use SciKit Learn and ML

In [14]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(solver='lbfgs')
# clf = LogisticRegression()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)

In [15]:
# Observe metrics and confusion matrix for model

from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

arr = confusion_matrix(y_test, y_pred)
f1= f1_score(y_test, y_pred, average='macro')
print(f'f1: {f1}')
print(arr)
metric(arr)

f1: 0.5633162142333089
[[114  16]
 [ 46  16]]
 accuracy: 0.6770833333333334,
 precision: 0.8769230769230769,
 recall: 0.7125


#### LogReg Implementation using Keras/NN

In [16]:
y_train_one_hot = utils.to_categorical(y_train)
y_test_one_hot = utils.to_categorical(y_test)

model = Sequential()
model.add(Dense(4, input_dim=9, activation='softmax'))
model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=["accuracy"])
model.fit(X_train, y_train, epochs=100, batch_size=1)

# model.add(Dense(51, input_shape=(13,), activation = 'linear'))

NameError: name 'utils' is not defined