In [1]:
%run 'utils.ipynb'

In [14]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import random

In [5]:
def get_ultrasound_dataset():
  """Read the ultrasound training CSV with `pd.read_csv` and return the result.

  The path is relative ('../training_ultrasound.csv'), so it is resolved
  against the current working directory of the kernel.
  """
  csv_path = '../training_ultrasound.csv'
  ultrasound_df = pd.read_csv(csv_path)
  return ultrasound_df

def clean_ultrasound_dataset(df):
  """Return only the pre-birth rows of `df`, i.e. those with AGEDAYS below zero.

  The filtering is done with `df.query`, so the input frame is left untouched
  and the surviving rows keep their original index labels.
  """
  pre_birth_rows = df.query('AGEDAYS < 0')
  return pre_birth_rows

def get_cleaned_ultrasound_dataset():
  """Load the ultrasound training data and pass it through the cleaning step.

  Convenience wrapper: `get_ultrasound_dataset()` followed by
  `clean_ultrasound_dataset(...)`.
  """
  raw_df = get_ultrasound_dataset()
  return clean_ultrasound_dataset(raw_df)

def score(actual, predictions):
  """Score predictions against the actual values using `mean_squared_error`.

  Args:
    actual: the observed target values.
    predictions: the model's estimates for the same samples.

  Returns:
    Whatever `mean_squared_error(actual, predictions)` returns.
  """
  mse = mean_squared_error(actual, predictions)
  return mse

In [26]:
def error_margin(actual, prediction):
  """Return the relative error of `prediction` with respect to `actual`.

  Computed as |actual - prediction| / |actual|.

  Args:
    actual: the true value (scalar or array-like supporting arithmetic).
    prediction: the estimated value.

  Returns:
    The non-negative relative deviation (0.1 means the prediction is 10% off).

  Note:
    An `actual` of zero is not handled specially: plain Python numbers raise
    ZeroDivisionError.
  """
  # `* 1.0` forces true division even for integer inputs under Python 2
  # (same trick error_rate uses); abs() on the denominator keeps the result
  # non-negative when `actual` is negative, so it can be compared to a margin.
  return abs(actual - prediction) * 1.0 / abs(actual)

def error_rate(actual, prediction, margin):
  """Return the fraction of predictions that fall within `margin` of the actual value.

  Despite the name, a higher value is better: it is the share of samples whose
  relative error (see `error_margin`) is strictly below `margin`.

  Args:
    actual: sequence of true values.
    prediction: sequence of estimates, paired with `actual` by position.
    margin: relative-error threshold, e.g. 0.10 for 10%.

  Returns:
    A float in [0, 1].

  Raises:
    ValueError: if `actual` and `prediction` have different lengths.
    ZeroDivisionError: if `prediction` is empty.
  """
  if len(actual) != len(prediction):
    raise ValueError("actual and prediction must have the same length")
  # zip pairs the values by position, so a pandas Series with a non-default
  # index is handled correctly (actual[ind] would do a label lookup instead).
  # error_margin expects (actual, prediction) in that order: the error is
  # measured relative to the true value, not relative to the estimate.
  within_margin = sum(
    1 for truth, estimate in zip(actual, prediction)
    if error_margin(truth, estimate) < margin)
  return within_margin * 1.0 / len(prediction)

def error_margin_curve(actual, prediction, margins=[0.05, 0.10, 0.15, 0.20]):
  """Tabulate `error_rate` over several margins, expressed in percent.

  Args:
    actual: sequence of true values.
    prediction: sequence of estimates for the same samples.
    margins: relative-error thresholds to evaluate (fractions, not percent).

  Returns:
    A DataFrame with one row per margin and two columns:
    'error margin' (the margin * 100) and 'error rate'
    (the value of `error_rate` at that margin * 100).
  """
  rates = []
  for threshold in margins:
    rates.append(error_rate(actual, prediction, threshold))
  margin_pct = pd.Series(margins) * 100
  rate_pct = pd.Series(rates) * 100
  return pd.DataFrame({'error margin': margin_pct, 'error rate': rate_pct})

def plot_roc_curve(actual, prediction, margins=[0.05, 0.10, 0.15, 0.20]):
  """Draw a seaborn point plot of the error-margin curve.

  Note that, despite the name, this is not a ROC curve: it plots the table
  produced by `error_margin_curve` ('error margin' on x, 'error rate' on y).

  Args:
    actual: true values; anything exposing `.values` (e.g. a pandas Series)
      is unwrapped to its underlying array first so that positional indexing
      works downstream.
    prediction: estimates for the same samples.
    margins: relative-error thresholds to evaluate (fractions, not percent).
  """
  if hasattr(actual, 'values'):
    actual = actual.values
  # Use the `prediction` parameter; the previous code referenced an undefined
  # name `predictions`, which raised NameError (or silently picked up a stale
  # notebook global).
  sns.pointplot(x="error margin", y="error rate",
    data=error_margin_curve(actual, prediction, margins))


In [25]:
def train_and_evaluate_model(model, X, y, roc_curve=False):
  """Fit `model` on a random 75/25 split of (X, y) and report how it performs.

  Steps: split the data, fit on the training part, print the `score` value
  (mean squared error) for both parts, then draw two joint plots for the test
  part: actual vs. predicted, and residual (actual - predicted) vs. GAGEDAYS.

  Args:
    model: estimator exposing `fit(X, y)` and `predict(X)`.
    X: feature DataFrame; must contain a 'GAGEDAYS' column.
    y: target Series.
    roc_curve: when truthy, additionally call `plot_roc_curve` on the test
      results. (Inside this function the name shadows any module-level
      `roc_curve` import.)

  Returns:
    A tuple `(model, Y_test, predictions)`.

  Note:
    The split is not seeded here, so results vary from run to run.
  """
  X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.25)

  # Keras needs plain arrays rather than pandas objects. `.values` gives the
  # same array as the old `.as_matrix()` and also exists on pandas >= 1.0,
  # where `as_matrix` was removed.
  train_features = X_train.values
  test_features = X_test.values

  model.fit(train_features, Y_train.values)

  # `score` is mean squared error, so label the output accordingly.
  print("Mean squared error on the train set:")
  print(score(Y_train, model.predict(train_features)))

  predictions = model.predict(test_features)

  print("\nMean squared error on the test set:")
  print(score(Y_test, predictions))

  actual_values = Y_test.values
  df = pd.DataFrame({
    'actual': pd.Series(actual_values),
    'predictions': pd.Series(predictions)
  })
  sns.jointplot(x='actual', y='predictions', data=df)

  # %-formatting prints the same text under Python 2 and Python 3.
  print("columns: %s" % (X_test.columns.values,))

  # X_test keeps the shuffled index labels from the split, whereas the
  # residual series is freshly indexed 0..n-1. Strip the index with `.values`
  # so the two columns are paired by position instead of being aligned by
  # label (which would mismatch rows and introduce NaNs).
  df = pd.DataFrame({
    'actual - est': pd.Series(actual_values) - pd.Series(predictions),
    'GAGEDAYS': pd.Series(X_test['GAGEDAYS'].values)
  })
  sns.jointplot(x='GAGEDAYS', y='actual - est', data=df)
  plt.show()
  if roc_curve:
    plot_roc_curve(Y_test, predictions)
  return model, Y_test, predictions
