In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [55]:
def load_data(path_to_file):
    """Read the CSV at ``path_to_file`` with pandas and return the resulting DataFrame.

    ``path_to_file`` is handed straight to ``pd.read_csv``.
    """
    frame = pd.read_csv(path_to_file)
    return frame
# Load 'hospital_data.csv' from the current working directory.
hosp_data = load_data(
    os.path.join(os.getcwd(), 'hospital_data.csv')
)
# Not the last expression of the cell, so this preview is computed but not displayed.
hosp_data.head(100)
# Print the column names, non-null counts and dtypes.
hosp_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9013 entries, 0 to 9012
Data columns (total 87 columns):
Prov Number                  9013 non-null int64
Clinic                       9013 non-null object
Division                     9013 non-null object
District                     9013 non-null object
Specialty                    9013 non-null object
Region                       9013 non-null object
Prenatal Care                9013 non-null object
Postpartum Care              9013 non-null object
Imm: Child Combo 2           9013 non-null object
Antidep Med: Contacts        9013 non-null object
Antidep Med: 84 Days         9013 non-null object
Antidep Med: 180 Days        9013 non-null object
Mental Hosp FU: 7 Day        9013 non-null object
Total PMPM                   9013 non-null object
PCP consult ratio n M'car    9013 non-null object
PCP consult ratio M'care     9013 non-null object
% Female-non medicare        9013 non-null object
% Female-medicare            9013 non-null o

In [184]:
#Let's start by creating our pipeline of transformations
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion

class RowRemover(BaseEstimator, TransformerMixin):
    """Transformer that drops rows from its input by index label.

    Parameters
    ----------
    rows_to_remove : label or list of labels
        Passed unchanged to ``X.drop(..., axis=0)`` in ``transform``.
    """
    def __init__(self, rows_to_remove):
        # Keep the value under the constructor argument's own name:
        # BaseEstimator.get_params() (used by clone() and repr()) looks
        # parameters up on the instance by exactly that name.
        self.rows_to_remove = rows_to_remove

    @property
    def rm_rows(self):
        """Backward-compatible alias for ``rows_to_remove``."""
        return self.rows_to_remove

    @rm_rows.setter
    def rm_rows(self, value):
        self.rows_to_remove = value

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` so the transformer can be chained."""
        return self

    def transform(self, X):
        """Return ``X`` without the configured rows (``X`` must provide ``drop``)."""
        return X.drop(self.rows_to_remove, axis=0)

class RegexReplacer(BaseEstimator, TransformerMixin):
    """Transformer that applies a regex-based ``replace`` to its input.

    Parameters
    ----------
    find : pattern handed to ``X.replace`` as the value to look for.
    replace : value substituted wherever ``find`` matches.
    """
    def __init__(self, find, replace):
        self.find, self.replace = find, replace

    def fit(self, X, y=None):
        """Stateless: nothing is learned from ``X``."""
        return self

    def transform(self, X):
        """Return ``X.replace(...)`` with ``regex=True`` using the stored pattern/value."""
        replaced = X.replace(to_replace=self.find, value=self.replace, regex=True)
        return replaced

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that selects columns of its input and returns their ``.values``.

    Parameters
    ----------
    attributes_names_to_keep : list of column labels
        Used to index the input as ``X[attributes_names_to_keep]``.
    """
    def __init__(self, attributes_names_to_keep):
        # Keep the value under the constructor argument's own name:
        # BaseEstimator.get_params() (used by clone() and repr()) looks
        # parameters up on the instance by exactly that name.
        self.attributes_names_to_keep = attributes_names_to_keep

    @property
    def attribute_names(self):
        """Backward-compatible alias for ``attributes_names_to_keep``."""
        return self.attributes_names_to_keep

    @attribute_names.setter
    def attribute_names(self, value):
        self.attributes_names_to_keep = value

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` so the transformer can be chained."""
        return self

    def transform(self, X):
        """Return ``X[attributes_names_to_keep].values``."""
        return X[self.attributes_names_to_keep].values


class CleanNaN(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the rows of ``X`` that contain no null value."""

    def __init__(self):
        # No hyper-parameters.
        pass

    def fit(self, X, y=None):
        """Stateless: nothing is learned from ``X``."""
        return self

    def transform(self, X):
        """Return the rows of ``X`` in which every entry is non-null."""
        has_missing = pd.isnull(X).any(axis=1)
        return X[~has_missing]

# Cleaning stage: drop the row labelled 0, then replace cells matching the
# whitespace pattern with NaN.
# NOTE(review): r'^\s+' is not anchored at the end, so it presumably also matches
# values that merely start with whitespace — confirm that is intended.
cleaning_pipeline = Pipeline(steps=[
    ('rowremover', RowRemover(rows_to_remove=0)),
    ('nanreplacer', RegexReplacer(find=r'^\s+', replace=np.nan)),
])


In [205]:
from sklearn.preprocessing import Imputer, StandardScaler, LabelBinarizer

# First try: regression on 2 simple features (plus the binarized specialty).
features = ['% Generic Rx', 'Clinician rating (0-10)']
cat_attrs = ['Specialty']

# Categorical branch: select the categorical column(s) and binarize them.
# NOTE(review): newer scikit-learn releases reportedly reject LabelBinarizer as a
# Pipeline step (its fit_transform takes a single argument) — confirm against the
# installed version.
category_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(cat_attrs)),
    ('labelbin', LabelBinarizer()),
])

# Full preparation: clean the frame, place the binarized categories next to the
# selected feature columns, then drop every row that still contains a null.
all_pipeline = Pipeline(steps=[
    ('cleaning', cleaning_pipeline),
    (
        'features',
        FeatureUnion(transformer_list=[
            ('cat_bin', category_pipeline),
            ('selector', DataFrameSelector(features)),
        ]),
    ),
    ('nan', CleanNaN()),
])

# Run the pipeline on a copy so that hosp_data itself is not handed to the transformers.
data_prep = all_pipeline.fit_transform(hosp_data.copy())
data_prep



array([[0, 0, 0, ..., 0, '0.9863013699', '0.7083333333'],
       [0, 0, 0, ..., 0, '0.870846731', '0.8'],
       [0, 0, 0, ..., 0, '0.9790794979', '0.8181818182'],
       ..., 
       [0, 0, 0, ..., 0, '1', '0.7142857143'],
       [0, 0, 0, ..., 0, '0.5', '1'],
       [0, 0, 0, ..., 0, '1', '0.7']], dtype=object)

In [209]:
# Not the last expression of the cell, so the shape is evaluated but not displayed.
data_prep.shape
# Columns 0..75 are the model inputs, column 76 is the regression target.
# NOTE(review): 76 is hard-coded — presumably the last column of data_prep; verify
# against data_prep.shape whenever the set of categories changes.
train_data, labels = data_prep[:, :76], data_prep[:, 76]

In [211]:
#Train and cross-validate regression models (linear regression, random forest) on the data
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

def display_results(res):
    """Print the scores in ``res`` followed by their mean and standard deviation.

    ``res`` must provide ``mean()`` and ``std()`` methods (e.g. a NumPy array).
    """
    template = "Scores: {}\nMean: {}\nStd dev: {}\n"
    average, spread = res.mean(), res.std()
    print(template.format(res, average, spread))

def cross_validate_model(regression, data, labels, cv=10):
    """Cross-validate ``regression`` and return the per-fold RMSE scores.

    Parameters
    ----------
    regression : estimator passed to ``cross_val_score``.
    data, labels : training inputs and targets passed to ``cross_val_score``.
    cv : cross-validation setting forwarded to ``cross_val_score``
        (defaults to 10, the previously hard-coded value).

    Returns
    -------
    The square roots of the negated ``'neg_mean_squared_error'`` scores.
    """
    neg_mse_scores = cross_val_score(
        regression, data, labels,
        scoring='neg_mean_squared_error', cv=cv,
    )
    # The scorer reports negated MSE, so flip the sign before taking the root.
    return np.sqrt(-neg_mse_scores)


def calculate_rmse(labels, predictions, name_of_model):
    """Print the MSE and RMSE of ``predictions`` against ``labels`` for the named model.

    Nothing is returned; the two values are only printed.
    """
    squared_error = mean_squared_error(labels, predictions)
    report = 'For model {} mse is: {} and rmse is: {}'
    print(report.format(name_of_model, squared_error, np.sqrt(squared_error)))


# Linear regression: fit on the full training set, keep the in-sample predictions
# (not used further in this cell), then report the cross-validated RMSE.
lin_reg = LinearRegression()
lin_reg.fit(train_data, labels)

lin_pred = lin_reg.predict(train_data)
display_results(cross_validate_model(lin_reg, train_data, labels))

# Random forest: seeded so that the cross-validation scores are reproducible
# from one run of the notebook to the next.
rndFor = RandomForestRegressor(random_state=42)
display_results(cross_validate_model(rndFor, train_data, labels))


Scores: [  1.54871878e+10   1.42938935e+10   3.33732305e+10   1.44956947e-01
   3.17760796e+09   1.51624712e-01   1.47903290e+11   1.76482895e-01
   2.12314205e-01   2.36254739e-01]
Mean: 21423520950.138893
Std dev: 43420903596.8452

Scores: [ 0.16843909  0.1506678   0.1741056   0.16334371  0.17023277  0.19245962
  0.20207383  0.20693412  0.22193207  0.24577372]
Mean: 0.18959623275878773
Std dev: 0.028165855017783972

