# Model Evaluation

In [1]:
# Import useful libraries.
import numpy as np
import pandas as pd
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

# Read in the file URL.
fileURL = 'https://raw.githubusercontent.com/cse44648/cse44648/master/datasets/hypothyroid.csv'

# Read the file into a DataFrame.
data = pd.read_csv(fileURL)

# Output the first few rows of the dataset.
data.head()

Unnamed: 0,Age,Sex,On Thyroxine,Query on Thyroxine,On Antithyroid Medication,Thyroid Surgery,Query Hypothyroid,Query Hyperthyroid,Pregnant,Sick,...,T3,TT4 Measured,TT4,T4U Measured,T4U,FTI Measured,FTI,TBG Measured,TBG,Class
0,72.0,M,f,f,f,f,f,f,f,f,...,0.6,y,15.0,y,1.48,y,10.0,n,,hypothyroid
1,15.0,F,t,f,f,f,f,f,f,f,...,1.7,y,19.0,y,1.13,y,17.0,n,,hypothyroid
2,24.0,M,f,f,f,f,f,f,f,f,...,0.2,y,4.0,y,1.0,y,0.0,n,,hypothyroid
3,24.0,F,f,f,f,f,f,f,f,f,...,0.4,y,6.0,y,1.04,y,6.0,n,,hypothyroid
4,77.0,M,f,f,f,f,f,f,f,f,...,1.2,y,57.0,y,1.28,y,44.0,n,,hypothyroid


### ROC Curves

In [2]:
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder

df = data.copy()

# Separate features (all but last row) and classes (last row).
X = df.ix[:, :-1]
y = df.ix[:, -1]

def impute(X):
    # Initialize an Imputer with the appropriate imputation strategy.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    for col in X.select_dtypes(include=['float64']).columns:
        # Fit the imputer and transform/impute the feature.
        imputed_Xnp = imp.fit_transform(X[col].reshape(-1,1))

        # Replace the original feature values with the imputed values.
        imputed_X = pd.DataFrame(imputed_Xnp, columns=[col])
        X[col] = imputed_X
    return X

def dummify(X):
    # Encode nominal features with indicator/dummy variables.
    dummy_cols = [X.columns[i] for i, tp in enumerate(X.dtypes) if tp == 'object']
    for col in dummy_cols:
        #print('Encoding feature \"' + col + '\" ...')
        #print('Old dataset shape: ' + str(X.shape))
        temp = pd.get_dummies(X[col], prefix=col)
        X = pd.concat([X, temp], axis=1).drop(col, axis=1)
        #print('New dataset shape: ' + str(X.shape))
        #unique_vals, X[col] = np.unique(X[col], return_inverse=True)
    return X

def encode(y): 
    # Encode and transform class values to numeric type.
    le = LabelEncoder()
    le.fit(y)
    y = le.transform(y).astype(int)
    y = pd.Series(y, name=df.columns[-1])
    return y

X = impute(X)
X = dummify(X)
y = encode(y)

clf = KNeighborsClassifier(n_neighbors=5)

for rs in range(5):
    skf = StratifiedKFold(y, n_folds=10, random_state=rs)
    skf_auc = 0
    for train_index, test_index in skf:
        X_train, X_test = X.ix[train_index], X.ix[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        skf_auc += roc_auc_score(y_test, y_pred)
    print(clf)
    print(skf_auc / 10)
    print("")

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
0.8785510724

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
0.8785510724

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
0.8785510724

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
0.8785510724

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
0.8785510724

