In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [5]:
def transform_features(df):
    df.drop(['SERV_AREA_ID','ApptDate','har','DEPARTMENT_NAME',
            'DEPARTMENT_ID','Smoker','Hispanic','Language','pat_id','DepSpecialty'],axis=1,inplace=True)
    df = df[df.Status != 'Scheduled']
    df = df[df.LeadDays > 0]
    df = simplify_lead(df)
    df = simplify_bmi(df)
    df = simplify_ages(df)
    df.Religion = df.Religion.fillna('Unknown')
    df.Race = df.Race.fillna('Patient Declined')
    df.Status = df.Status.map({'Completed':0,'No Show':1})
    df.rename(columns={'Status':'NoShow'},inplace=True)
    return df

def simplify_lead(df):
    
    bins = (0,9,22,52,10000)
    group_names = ['1_Quartile','2_Quartile','3_Quartile','4_Quartile']
    categories = pd.cut(df.LeadDays,bins,labels=group_names)
    df.LeadDays = categories
    return df

def simplify_bmi(df):
    df.BMI = df.BMI.fillna(-1)
    bins = (-10,0,23,28,35,10000)
    group_names = ['Unknown','1_Quartile','2_Quartile','3_Quartile','4_Quartile']
    categories = pd.cut(df.BMI,bins,labels=group_names)
    df.BMI = categories
    return df

def simplify_ages(df):
    df.Age = df.Age.fillna(-0.5)
    bins = (-1, 0, 5, 12, 18, 25, 35, 60, 120)
    group_names = ['Unknown', 'Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior']
    categories = pd.cut(df.Age, bins, labels=group_names)
    df.Age = categories
    return df

df = pd.read_csv('FMtest.csv')
df = transform_features(df)
df = df[df['BMI'].notnull()]

In [42]:
def split_set(df):
    msk = np.random.rand(len(df)) < 0.8
    train = df[msk]
    test = df[~msk]
    return train, test
    
#train, test = split_set(df)

In [19]:
from sklearn import preprocessing
def encode_features(_df):#df_train, df_test):
    features = ['DayOfWeek','LeadDays','Age','Race',
                'Religion','BMI']
    #df_combined = pd.concat([df_train[features], df_test[features]])
    
    for feature in features:
        le = preprocessing.LabelEncoder()
        #le = le.fit(df_combined[feature])
        le = le.fit(_df[feature])
        _df[feature] = le.transform(_df[feature])
        #df_test[feature] = le.transform(df_test[feature])
    return _df

#train, test = encode_features(train, test)
df = encode_features(df)

In [22]:
#sns.barplot(x='BMI',y='NoShow',data=df)

In [12]:
from sklearn.model_selection import train_test_split

x_all = df.drop(['NoShow','csn'],axis=1)
y_all = df['NoShow']

num_test = 0.20

x_train, x_test, y_train, y_test = train_test_split(x_all, y_all,
test_size = num_test, random_state = 23)

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

clf = RandomForestClassifier()

parameters = {'n_estimators': [4, 6, 9], 
              'max_features': ['log2', 'sqrt','auto'], 
              'criterion': ['entropy', 'gini'],
              'max_depth': [2, 3, 5, 10], 
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1,5,8]
             }

acc_scorer = make_scorer(accuracy_score)

grid_obj = GridSearchCV(clf, parameters, scoring = acc_scorer)
grid_obj = grid_obj.fit(x_train, y_train)

clf = grid_obj.best_estimator_

clf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=10, max_features='log2', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=3, min_weight_fraction_leaf=0.0,
            n_estimators=6, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [14]:
prob = clf.predict_proba(x_test)

In [13]:
predictions = clf.predict(x_test)
print(accuracy_score(y_test,predictions))

0.798365122616


In [21]:
from sklearn.cross_validation import KFold

def run_kfold(clf):
    kf = KFold(891, n_folds=10)
    outcomes = []
    fold = 0
    for train_index, test_index in kf:
        fold += 1
        x_train, x_test = x_all.values[train_index], x_all.values[test_index]
        y_train, y_test = y_all.values[train_index], y_all.values[test_index]
        clf.fit(x_train, y_train)
        predictions = clf.predict(x_test)
        accuracy = accuracy_score(y_test, predictions)
        outcomes.append(accuracy)
        print("Fold {0} accuracy: {1}".format(fold, accuracy))
    mean_outcome = np.mean(outcomes)
    print("Mean Accuracy: {0}".format(mean_outcome))
    
#run_kfold(clf)

In [23]:
#y_test.head()

In [16]:
predictions

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [17]:
prob2 = np.array([i[1] for i in prob])

In [20]:
#ids = test['csn']
ids = y_test

predictions = clf.predict(x_test)

output = pd.DataFrame({'Label' : ids, 'Prediction': predictions, 'Probability': prob2 }).join(df['csn'])

output.to_csv('predictions.csv', index = False)
df.to_csv('df.csv', index=False)
#output