## CSC 8515 - Machine Learning Project  
**Topic:** Predicting success in rehabilitation  
**Author:** James Fung

In [None]:
#General.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

#One hot encoder.
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

#Split methods.
from sklearn.model_selection import train_test_split

#Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.dummy import DummyClassifier
from sklearn import svm
#from xgboost import XGBClassifier

#Reduction Techniques.
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

#Visualizations
import seaborn as sns
sns.set(style="ticks", color_codes=True)
%matplotlib inline
#Figure size.
from matplotlib import rcParams
# figure size in inches
rcParams['figure.figsize'] = 7,5

#Statistial packages.
from scipy.stats import chi2_contingency

#Metrics
from sklearn import metrics
from sklearn.metrics import classification_report

from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score

## Import Data, Feature Exploration

In [None]:
#Load Rehab.csv into a pandas DataFrame.
#The first row of the file (header=0) supplies the column names.
rehab = pd.read_csv(
    'Rehab.csv',
    header=0,
)

In [None]:
#Peek at the first five rows of the raw data.
rehab.head(n=5)

In [None]:
#Drop the first three columns as they provide no information, the nearly uniform "FLG" columns,
#and the STFIPS, CBSA and REGION columns.
#The columns are named by keyword: the positional form drop(labels, 1) was removed in pandas 2.0.
rehabclean = rehab.drop(columns=['Unnamed: 0','CASEID','DISYR',
                                 'METHFLG','PCPFLG','HALLFLG','AMPHFLG','STIMFLG','TRNQFLG',
                                 'BARBFLG','SEDHPFLG','INHFLG','OTCFLG',
                                 'STFIPS','CBSA','REGION'])

In [None]:
#What are the column names that remain in rehabclean?
print(rehabclean.columns)

In [None]:
#How do the frequencies of the features look?
#for i in rehabclean.columns:
#    print('Information for '+i+':')
#    print('')
#    print((rehabclean[i].value_counts()/len(rehabclean))*100)
#    print('----------------------------------')

Some columns contain too many categories which may lead to noise. Will need to recategorize at some point.

In [None]:
#We're interested in whether patients completed treatment or left against professional advice,
#so keep only the rows with one of those two REASON values.
#(An earlier variant also kept "TERMINATED BY FACILITY", "INCARCERATED" and "DEATH".)
rehabclean = rehabclean[
    rehabclean['REASON'].isin(["TREATMENT COMPLETED", "LEFT AGAINST PROFESSIONAL ADVICE"])
]

In [None]:
#Using the codebook provided by the CDC, missing values exist as "MISSING/UNKNOWN/NOT COLLECTED/INVALID" or -9.
#Convert both markers into NaN.
#np.nan is used because the np.NaN alias was removed in NumPy 2.0.
rehabclean = rehabclean.replace("MISSING/UNKNOWN/NOT COLLECTED/INVALID", np.nan)
rehabclean = rehabclean.replace(-9, np.nan)

#rehabclean = rehabclean.replace("MISSING/UNKNOWN/NOT COLLECTED/INVALID",'MISSING')
#rehabclean = rehabclean.replace(-9,'MISSING')

## Feature Exploration

What categories could potentially be related to whether they complete rehabilitation or not?

In [None]:
#Is there a relationship between the substance (SUB1) and the reason for leaving?
sns.countplot(data=rehabclean, x="SUB1", hue="REASON")

In [None]:
#Do these FLG columns provide any extra information apart from SUB?
#Check MARFLG against SUB1.

sns.countplot(data=rehabclean, x='MARFLG', hue='SUB1')

#No - it appears that the column disappears if the flg is not reported.

In [None]:
#Drop every remaining FLG column.
rehabclean = rehabclean.drop(
    columns=['ALCFLG','COKEFLG','MARFLG','HERFLG','OPSYNFLG','MTHAMFLG','BENZFLG','OTHERFLG']
)

In [None]:
#Does the outcome depend on ROUTE1?
sns.countplot(data=rehabclean, x="ROUTE1", hue="REASON")

In [None]:
#Is ROUTE1 related to SUB1?
sns.countplot(data=rehabclean, x="ROUTE1", hue="SUB1")

These features pretty much directly relate to SUB. Let's try removing these from the dataset.

In [None]:
#Remove the three ROUTE columns.
rehabclean = rehabclean.drop(columns=['ROUTE1','ROUTE2','ROUTE3'])

In [None]:
#Psychological problem (PSYPROB)?

sns.countplot(data=rehabclean, x="PSYPROB", hue="REASON")

If the patient has an addiction to alcohol, they seem to be able to complete rehab at a much higher rate than patients addicted to more extreme drugs.

In [None]:
#Age? The bands are listed explicitly so the x axis follows their natural order.
sns.countplot(
    data=rehabclean,
    x="AGE",
    hue="REASON",
    order=['12-14','15-17','18-20','21-24','25-29','30-34','35-39',
           '40-44','45-49','50-54','55 AND OVER'],
)

Very young rehabilitation patients (below 20) do not seem to complete treatment as often as older patients.

In [None]:
#RACE versus outcome?
sns.countplot(data=rehabclean, x="RACE", hue="REASON")

Minorities seem to complete it at a lower rate as well.

In [None]:
#GENDER versus outcome?
sns.countplot(data=rehabclean, x="GENDER", hue="REASON")

Females seem to complete it at a lower rate.

In [None]:
#HOMELESSNESS? (LIVARAG versus outcome)
sns.countplot(data=rehabclean, x="LIVARAG", hue="REASON")

In [None]:
#MARSTAT versus outcome?
sns.countplot(data=rehabclean, x="MARSTAT", hue="REASON")

In [None]:
#PREG versus outcome?
sns.countplot(data=rehabclean, x="PREG", hue="REASON")

## Missing Value Imputation

In [None]:
#Percentage of missing values per column, as a {column: percent} dict ...
missing = (rehabclean.isna().sum() / len(rehabclean) * 100).to_dict()
#... and as (column, percent) pairs ordered from least to most missing.
missingsort = sorted(missing.items(), key=lambda pair: pair[1])

In [None]:
#Display the (column, percent missing) pairs, least missing first.
missingsort

In [None]:
#Collect the features that have some, but less than 10%, missing values.
autoimputelist = [name for name, pct in missingsort if 0 < pct < 10]

In [None]:
#For features with less than 10% of missing values, drop the ROWS in which any of those
#features is missing (the features themselves stay in the dataset).

rehabclean = rehabclean.dropna(subset=autoimputelist)

In [None]:
#Recompute the missing-value percentages now that rows have been dropped.
missing = (rehabclean.isna().sum() / len(rehabclean) * 100).to_dict()
missingsort = sorted(missing.items(), key=lambda pair: pair[1])

#Print every column that still has missing values and remember its name.
largemissing = []
for name, pct in missingsort:
    if pct > 0:
        print((name, pct))
        largemissing.append(name)

What should I do about these features?

In [None]:
#For features with 60% and over missing, let's examine some of them.
rcParams['figure.figsize'] = (7, 5)
#PREG?
sns.countplot(data=rehabclean, x="PREG", hue="REASON")

In [None]:
#DSMCRIT? Use a larger figure and rotate the tick labels by 45 degrees.
rcParams['figure.figsize'] = (14, 10)
g = sns.countplot(data=rehabclean, x="DSMCRIT", hue="REASON")
g.set_xticklabels(g.get_xticklabels(), rotation=45)

In [None]:
#DETNLF?
sns.countplot(data=rehabclean, x="DETNLF", hue="REASON")

In [None]:
#DETCRIM? Tick labels are rotated by 45 degrees.
g = sns.countplot(data=rehabclean, x="DETCRIM", hue="REASON")
g.set_xticklabels(g.get_xticklabels(), rotation=45)

In [None]:
#FRSTUSE3?
sns.countplot(data=rehabclean, x="FRSTUSE3", hue="REASON")

In [None]:
#PRIMPAY, coloured by PRIMINC. Tick labels are rotated by 45 degrees.
g = sns.countplot(data=rehabclean, x="PRIMPAY", hue="PRIMINC")
g.set_xticklabels(g.get_xticklabels(), rotation=45)

In [None]:
#Drop features with more than 60% missing values.

droplist = [name for name, pct in missingsort if pct > 60]

#Remove all of them in a single call.
rehabclean = rehabclean.drop(columns=droplist)

In [None]:
#Recompute the missing-value percentages after removing the mostly-missing features.
missing = (rehabclean.isna().sum() / len(rehabclean) * 100).to_dict()
missingsort = sorted(missing.items(), key=lambda pair: pair[1])

#Print every column that still has missing values and remember its name.
largemissing = []
for name, pct in missingsort:
    if pct > 0:
        print((name, pct))
        largemissing.append(name)

In [None]:
#Are these columns worth imputing? Let's see how well they relate to the class label.

#Perform a chi-squared test of independence for PSYPROB against REASON.
#The result is printed explicitly: a bare expression in the middle of a cell is never displayed.
contingence = pd.crosstab(rehabclean['PSYPROB'],rehabclean['REASON'])
psy_stat, psy_pval, psy_dof, _ = chi2_contingency(contingence)
print("PSYPROB : chi2=" + format(psy_stat, '.1f') + ", dof=" + str(psy_dof) + ", p=" + format(psy_pval, '.3g'))

#A p-value near 0 suggests that there might be a relationship.

#What about the other columns?
#p-values are shown with 3 significant digits; rounding to 2 decimals printed 0.0 for every
#p-value below 0.005, which made them indistinguishable.
for col in largemissing:
    cont = pd.crosstab(rehabclean[col],rehabclean['REASON'])
    val,pval,dof,exp = chi2_contingency(cont)
    print(str(col) + " : " + format(pval, '.3g'))

Nearly all of these columns are statistically significant - will need to determine how to impute these.

In [None]:
#HLTHINS versus outcome.
sns.countplot(data=rehabclean, x="HLTHINS", hue="REASON")

#Doesn't seem significant - drop.
rehabclean = rehabclean.drop(columns=['HLTHINS'])

In [None]:
#PRIMINC versus outcome.
sns.countplot(data=rehabclean, x="PRIMINC", hue="REASON")

In [None]:
#PRIMINC, coloured by EMPLOY.
sns.countplot(data=rehabclean, x="PRIMINC", hue="EMPLOY")

In [None]:
#For those feature columns, replace with the most common label.
#for column in largemissing:
#    mode = rehabclean[str(column)].value_counts().idxmax()
#    rehabclean[str(column)].fillna(mode,inplace=True)

In [None]:
#Try replacing all of the remaining missing values with the sentinel -1.
#np.nan is used because the np.NaN alias was removed in NumPy 2.0.

rehabclean = rehabclean.replace(np.nan, -1)

### Feature Recategorization

In [None]:
#Length of stay variable is sparse, must recode manually.
#The individual day values '1'..'30' (matched as strings) are sparse and categorical,
#so collapse them into a single 'LESS THAN 30' category.
LOSrecode = [str(day) for day in range(1, 31)]
rehabclean['LOS'] = rehabclean['LOS'].replace(LOSrecode,'LESS THAN 30')

#Replace any NaN left in LOS with the most common category.
#The filled column is assigned back instead of calling fillna(..., inplace=True) on the selected
#column, which operates on a temporary and is not guaranteed to update rehabclean.
#NOTE: an earlier cell already replaced NaN with -1 across the frame, so this only affects NaN
#introduced afterwards.
LOSmode = rehabclean['LOS'].value_counts().idxmax()
rehabclean['LOS'] = rehabclean['LOS'].fillna(LOSmode)

In [None]:
#Recoded LOS versus outcome.
sns.countplot(data=rehabclean, x='LOS', hue='REASON')

In [None]:
#Daywait is sparse, let's recode it into two categories:
#'More than 0 days' when DAYWAIT > 0, otherwise 'No Wait' (this includes 0 and the -1 missing sentinel).
#A single vectorised assignment replaces the chained form rehabclean['DAYCAT'][mask] = ..., which
#raises SettingWithCopyWarning and is not guaranteed to write through to rehabclean.
rehabclean['DAYCAT'] = np.where(rehabclean['DAYWAIT'] > 0, 'More than 0 days', 'No Wait')

rehabclean = rehabclean.drop(columns='DAYWAIT')

In [None]:
#DAYCAT versus outcome.
sns.countplot(data=rehabclean, x='DAYCAT', hue='REASON')

In [None]:
#SUB's: any SUB1 / SUB2 value that occurs fewer than 10000 times is relabelled "OTHER".

for sub_col in ('SUB1', 'SUB2'):
    level_counts = rehabclean[sub_col].value_counts()
    #Look up each row's own level count to build a per-row boolean mask.
    is_rare = level_counts[rehabclean[sub_col]].values < 10000
    rehabclean.loc[is_rare, sub_col] = "OTHER"

In [None]:
#Recoded SUB1 versus outcome, tick labels rotated by 45 degrees.
g = sns.countplot(data=rehabclean, x='SUB1', hue='REASON')
g.set_xticklabels(g.get_xticklabels(), rotation=45)

In [None]:
#Recoded SUB2 versus outcome.
sns.countplot(data=rehabclean, x='SUB2', hue='REASON')

In [None]:
#FREQ1 versus outcome.
sns.countplot(data=rehabclean, x='FREQ1', hue='REASON')

In [None]:
#FREQ2 versus outcome.
sns.countplot(data=rehabclean, x='FREQ2', hue='REASON')

It seems like SUB2 doesn't have as strong a relationship to completion as SUB1. FREQ doesn't look very strong either.

Let's drop SUB2, SUB3, FREQ2, and FREQ3.

In [None]:
#Drop SUB2, SUB3, FREQ2 and FREQ3, as stated in the text above.
#FREQ3 was missing from the original list. errors='ignore' skips any of these columns that is
#already absent (for example removed by the >60%-missing step, or when this cell is re-run).
rehabclean = rehabclean.drop(columns=['SUB2','SUB3','FREQ2','FREQ3'], errors='ignore')

In [None]:
#What about ethnicity and race?
#Each chart gets its own figure: without plt.figure() both countplot calls in this cell draw
#on the same axes and the replot is overlaid on the first chart.

plt.figure()
g = sns.countplot(x='RACE',hue='REASON',data=rehabclean)
g.set_xticklabels(g.get_xticklabels(),rotation=45)

#Need to recode race: values that occur fewer than 25000 times become "OTHER".
rehabclean.loc[rehabclean['RACE'].value_counts()[rehabclean['RACE']].values < 25000,'RACE'] = "OTHER"

#Replot on a fresh figure.
plt.figure()
g = sns.countplot(x='RACE',hue='REASON',data=rehabclean)
g.set_xticklabels(g.get_xticklabels(),rotation=45)

In [None]:
#Drop ethnicity as this is essentially correlated to race.

rehabclean = rehabclean.drop(columns=['ETHNIC'])

In [None]:
#Drop ALCDRUG, as SUB1 covers this already.
rehabclean = rehabclean.drop(columns=['ALCDRUG'])

In [None]:
#Drop FRSTUSE2, as FRSTUSE1 covers it.
rehabclean = rehabclean.drop(columns=['FRSTUSE2'])

In [None]:
#SERVSETD: values that occur fewer than 10000 times become "OTHER".
servsetd_counts = rehabclean['SERVSETD'].value_counts()
servsetd_rare = servsetd_counts[rehabclean['SERVSETD']].values < 10000
rehabclean.loc[servsetd_rare, 'SERVSETD'] = "OTHER"

g = sns.countplot(data=rehabclean, x='SERVSETD', hue='REASON')
g.set_xticklabels(g.get_xticklabels(), rotation=45)

In [None]:
#PSOURCE: values that occur fewer than 10000 times become "OTHER".
psource_counts = rehabclean['PSOURCE'].value_counts()
psource_rare = psource_counts[rehabclean['PSOURCE']].values < 10000
rehabclean.loc[psource_rare, 'PSOURCE'] = "OTHER"

#Plot the recoded PSOURCE, coloured by PRIMINC.
g = sns.countplot(data=rehabclean, x='PSOURCE', hue='PRIMINC')
g.set_xticklabels(g.get_xticklabels(), rotation=45)

### Model Testing

#### Encoding section:

In [None]:
#Manually encode ordinal variables to keep natural ordering.
#AGE bands are listed youngest to oldest; each band is replaced by its position in the list.

age_bands = ['12-14','15-17','18-20','21-24','25-29','30-34','35-39',
             '40-44','45-49','50-54','55 AND OVER']
rehabclean['AGE'] = rehabclean['AGE'].map({band: rank for rank, band in enumerate(age_bands)})

In [None]:
#EDUC levels from least to most schooling; each level is replaced by its position in the list.
educ_levels = ['8 YEARS OR LESS','9-11','12','13-15','16 OR MORE']
rehabclean['EDUC'] = rehabclean['EDUC'].map({level: rank for rank, level in enumerate(educ_levels)})

In [None]:
#ARRESTS levels in increasing order; each level is replaced by its position in the list.
arrest_levels = ['NONE','ONCE','2 OR MORE TIMES']
rehabclean['ARRESTS'] = rehabclean['ARRESTS'].map({level: rank for rank, level in enumerate(arrest_levels)})

In [None]:
#LIVARAG levels, HOMELESS=0 up to INDEPENDENT LIVING=2.
living_levels = ['HOMELESS','DEPENDENT LIVING','INDEPENDENT LIVING']
rehabclean['LIVARAG'] = rehabclean['LIVARAG'].map({level: rank for rank, level in enumerate(living_levels)})

In [None]:
#FREQ1 levels from least to most frequent use; each level is replaced by its position in the list.
freq_levels = ['NO USE IN THE PAST MONTH',
               '1-3 TIMES IN THE PAST MONTH',
               '1-2 TIMES IN THE PAST WEEK',
               '3-6 TIMES IN THE PAST WEEK',
               'DAILY']
rehabclean['FREQ1'] = rehabclean['FREQ1'].map({level: rank for rank, level in enumerate(freq_levels)})

In [None]:
#NOPRIOR levels by number of prior treatment episodes (0 to "5 or more").
noprior_levels = ['NO PRIOR TREATMENT EPISODE',
                  '1 PRIOR TREATMENT EPISODES',
                  '2 PRIOR TREATMENT EPISODES',
                  '3 PRIOR TREATMENT EPISODES',
                  '4 PRIOR TREATMENT EPISODES',
                  '5 OR MORE PRIOR TREATMENT EPISODES']
rehabclean['NOPRIOR'] = rehabclean['NOPRIOR'].map({level: rank for rank, level in enumerate(noprior_levels)})

In [None]:
#FRSTUSE1 age bands, youngest first; each band is replaced by its position in the list.
frstuse_bands = ['11 AND UNDER','12-14','15-17','18-20','21-24',
                 '25-29','30-34','35-39','40-44',
                 '45-49','50-54','55 AND OVER']
rehabclean['FRSTUSE1'] = rehabclean['FRSTUSE1'].map({band: rank for rank, band in enumerate(frstuse_bands)})

In [None]:
#LOS categories from shortest to longest stay; each category is replaced by its position in the list.
los_levels = ['LESS THAN 30','31 TO 45 DAYS','46 TO 60 DAYS',
              '61 TO 90 DAYS','91 TO 120 DAYS',
              '121 TO 180 DAYS','181 TO 365 DAYS','MORE THAN A YEAR']
rehabclean['LOS'] = rehabclean['LOS'].map({level: rank for rank, level in enumerate(los_levels)})

In [None]:
#Work on an independent copy. Plain assignment only creates a second name for the same
#DataFrame, so the in-place encoding applied to rehabtest below would also rewrite rehabclean.
rehabtest = rehabclean.copy()

In [None]:
#Encode the binary columns.
def recat(colnames, frame=None):
    """Replace each listed column with its integer category codes, in place.

    Each column is converted to a pandas categorical and overwritten with the
    categorical's codes, so missing values become -1.

    Parameters
    ----------
    colnames : iterable of str
        Names of the columns to encode.
    frame : pandas.DataFrame, optional
        Frame to modify. Defaults to the notebook-level ``rehabtest``, which
        keeps existing ``recat([...])`` calls working unchanged.

    Returns
    -------
    None
        The frame is modified in place.
    """
    if frame is None:
        #Fall back to the notebook global, as the original implementation always did.
        frame = rehabtest
    for col in colnames:
        frame[col] = frame[col].astype('category').cat.codes

#Integer-code these columns of rehabtest.
#NOTE(review): LOS was already mapped to ordinal integers in the cell above, and VET is also
#passed to pd.get_dummies in the next cell - confirm both are meant to be recoded here.
recat(['GENDER','VET','METHUSE','LOS','PSYPROB','DAYCAT']) 

In [None]:
#One hot encode the multi-category columns.

onehot_columns = ['RACE','MARSTAT','EMPLOY','VET',
                  'PRIMINC','DIVISION','SERVSETD','PSOURCE','SUB1']
rehabtest = pd.get_dummies(rehabtest, columns=onehot_columns)

### Dimensionality Reduction

Could we reduce this dataset into two components? Will it mean anything?

In [None]:
#Features: every column except REASON. Labels: the REASON column, kept as a one-column DataFrame.
X = rehabtest.drop(columns='REASON')
Y = rehabtest[['REASON']]

In [None]:
# set up a PCA learner that keeps the first two principal components
# random_state is fixed so that a randomized SVD solver, if one is selected, gives repeatable results
pca = PCA(n_components = 2, random_state = 1)
eigenbasis = pca.fit(X)
# project the full feature matrix onto the two components
rehab2d = eigenbasis.transform(X)

In [None]:
# let's also look at how much of the total variance we were able to cover with 2 dimensions
# (the sum of the per-component explained-variance ratios)
print('percentage of variance explained:', sum(pca.explained_variance_ratio_))

In [None]:
#Scatter the two principal components, coloured by outcome.
#x and y are passed by keyword: seaborn 0.12 and later no longer accept them positionally.
sns.scatterplot(x=rehab2d[:,0], y=rehab2d[:,1], hue=Y.values.flatten())

In [None]:
#Use a t-SNE learner.

#Randomly sample the data, as many of these algorithms took way too long to run on 750k.
#random_state is fixed so that the sample, and every model trained on it later, is reproducible.

sample = rehabtest.sample(n=10000, random_state=1)

Xs=sample.iloc[:,sample.columns != 'REASON']
Ys=sample.iloc[:,sample.columns == 'REASON']

#t-SNE is stochastic as well, so it is seeded too.
td = TSNE(n_components=2, random_state=1).fit_transform(Xs)

In [None]:
#Split the two PCA components into features and labels, and run a test algorithm.

#Flatten the labels to 1-D: scikit-learn estimators expect y of shape (n_samples,) and emit a
#DataConversionWarning for a column vector.
PCAlabels = Y.values.ravel()

#random_state makes the 70/30 split reproducible.
X_train, X_test, y_train, y_test = train_test_split(rehab2d,PCAlabels,test_size=.3,random_state=1)

#Train on a neural network.
neural = MLPClassifier(solver='lbfgs',alpha=1e-5,hidden_layer_sizes=(15,),random_state=1)
neural = neural.fit(X_train,y_train)
y_pred = neural.predict(X_test)

print('Accuracy:',metrics.accuracy_score(y_test,y_pred))

In [None]:
#Train on a random forest (still on the two PCA components).
#np.ravel hands the forest 1-D labels whether y_train is a column vector or already flat;
#random_state makes the forest reproducible. metrics is already imported at the top of the notebook.

clf=RandomForestClassifier(n_estimators=100, random_state=1)
clf.fit(X_train,np.ravel(y_train))
y_pred=clf.predict(X_test)

print('Accuracy:',metrics.accuracy_score(y_test,y_pred))

### Feature Selection

In [None]:
#Use recursive feature elimination (RFE).
#Create a baseline classifier from a robust model, used to evaluate a subset of attributes.

rf = LogisticRegression()
#rf = RandomForestClassifier(n_estimators=10)

#Create the RFE model and select 5 attributes.
#It is fitted on the full encoded feature matrix X: at this point X_train holds only the two
#PCA components from the split above, so there would be nothing to select from and the
#rankings could not be matched to column names.
rfe = RFE(rf, n_features_to_select=5)
rfe = rfe.fit(X, Y.values.ravel())

#Summarise the selection of the attributes: the boolean mask of kept features, then
#(rank, feature) pairs sorted so the selected features (rank 1) come first.
#print is called as a function with balanced parentheses; the original line was a SyntaxError.
print(rfe.support_)
print(sorted(zip((int(rank) for rank in rfe.ranking_), X.columns)))

### Baseline Models

In [None]:
#Share of each REASON value - the accuracy a majority-class guesser would reach.
rehabclean['REASON'].value_counts()/len(rehabclean)

In [None]:
#Split the data into features and labels, and split into training and testing data.

X=rehabtest.iloc[:,rehabtest.columns != 'REASON']
Y=rehabtest.iloc[:,rehabtest.columns == 'REASON']

#random_state fixes the 70/30 split so that every model below is evaluated on the same rows
#each time the notebook is run.
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=.3,random_state=1)

In [None]:
#Train on a random forest.
#y_train is a one-column DataFrame here, so it is flattened to the 1-D shape scikit-learn
#expects (as the decision-tree cells already do). random_state makes the forest reproducible.
#metrics is already imported at the top of the notebook.

rf=RandomForestClassifier(n_estimators=100, random_state=1)
rf.fit(X_train,y_train.values.ravel())
y_pred=rf.predict(X_test)

print('Accuracy:',metrics.accuracy_score(y_test,y_pred))

In [None]:
#Per-class precision / recall / F1 for the latest predictions.
print(classification_report(
    y_test,
    y_pred,
    target_names=['LEFT AGAINST PROFESSIONAL ADVICE','SUCCESFUL'],
))

In [None]:
#Random-forest feature importances, largest first, as a one-column table indexed by feature name.
feature_importances = (
    pd.Series(rf.feature_importances_, index=X_train.columns, name='importance')
    .sort_values(ascending=False)
    .to_frame()
)
feature_importances

In [None]:
#Test accuracy for each tree depth, keyed by max_depth.
scores = {}

#Train a decision tree at every depth from 1 up to the number of features.
for depth in range(1, len(X.columns) + 1):
    tr = tree.DecisionTreeClassifier(max_depth=depth).fit(X_train, y_train.values.ravel())
    y_pred = tr.predict(X_test)

    accuracy = metrics.accuracy_score(y_test, y_pred)
    print('Accuracy:', accuracy)
    scores[depth] = accuracy

In [None]:
from sklearn.metrics import roc_curve, auc

#ROC curve for tr, the most recently fitted decision tree.
#roc_curve needs a continuous score for one designated positive class. The labels here are
#strings, so pos_label must be given, and the predicted probability of that class is used as
#the score instead of the hard string predictions in y_pred.
positive_class = tr.classes_[1]
positive_score = tr.predict_proba(X_test)[:, 1]

false_positive_rate, true_positive_rate, thresholds = roc_curve(
    y_test.values.ravel(), positive_score, pos_label=positive_class)
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

In [None]:
#Refit a shallow tree (max_depth=3). An earlier variant used max_depth=14.
tr = tree.DecisionTreeClassifier(max_depth=3).fit(X_train, y_train.values.ravel())
y_pred = tr.predict(X_test)

print('Accuracy:', metrics.accuracy_score(y_test, y_pred))

In [None]:
import graphviz

#Render the fitted tree to 'Rehab Tree'.
#class_names must follow the tree's own class order (tr.classes_). Y['REASON'].unique() lists
#labels in order of first appearance, which can attach the wrong name to each leaf.
#feature_names is passed as a plain list of column names.
dot_data = tree.export_graphviz(tr, out_file = None,
                                feature_names = list(X.columns),
                                class_names = [str(label) for label in tr.classes_],
                                filled = True, rounded = True,
                                special_characters = True)
graph = graphviz.Source(dot_data)
graph.render('Rehab Tree')

In [None]:
#For neural networks, one would need to normalize the features. Let's use sklearn to do this.
#NOTE(review): normalizedX is not referenced by any later cell shown in this notebook; the
#later cells pass X_train itself to the models.
#NOTE(review): preprocessing.normalize rescales each sample (row) to unit norm by default,
#rather than standardising each feature (column) - confirm which one is intended.

from sklearn import preprocessing

normalizedX = preprocessing.normalize(X_train)

In [None]:
#Train on a neural network.
#The features are standardised inside a pipeline, so the scaler is fitted on the training
#rows only and the same transform is applied to the test rows. Previously the network was
#trained on the raw X_train even though normalisation was called for.
#random_state makes the network initialisation reproducible.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

#neural = MLPClassifier(solver='lbfgs',alpha=1e-5,hidden_layer_sizes=(15,),random_state=1)
neural = make_pipeline(StandardScaler(), MLPClassifier(random_state=1))
neural = neural.fit(X_train,y_train.values.ravel())
y_pred = neural.predict(X_test)

print('Accuracy:',metrics.accuracy_score(y_test,y_pred))

print(classification_report(y_test, y_pred, target_names=['LEFT AGAINST PROFESSIONAL ADVICE','SUCCESFUL']))

In [None]:
#Adaboost on full set.
#y_train is flattened to 1-D (as the decision-tree cells already do) to avoid the column-vector
#DataConversionWarning; random_state makes the ensemble reproducible.
#metrics is already imported at the top of the notebook.

ada = AdaBoostClassifier(n_estimators=100, random_state=1)
ada.fit(X_train,y_train.values.ravel())
y_pred=ada.predict(X_test)

print('Accuracy:',metrics.accuracy_score(y_test,y_pred))

print(classification_report(y_test, y_pred, target_names=['LEFT AGAINST PROFESSIONAL ADVICE','SUCCESFUL']))

In [None]:
#Train on SVM.
#NOTE: this replaces X_train / X_test / y_train / y_test with a split of the 10,000-row
#sample (Xs, Ys), so every later cell that uses those names trains on the sample rather
#than on the full data.
X_train, X_test, y_train, y_test = train_test_split(Xs,Ys,test_size=.3,random_state=1)

from sklearn.svm import LinearSVC
clf = LinearSVC(random_state=1)
#Flatten the one-column label frame to the 1-D shape scikit-learn expects.
clf.fit(X_train,y_train.values.ravel())
y_pred=clf.predict(X_test)

print('Accuracy:',metrics.accuracy_score(y_test,y_pred))

In [None]:
#Train on kNN (3 nearest neighbours).
#The one-column label frame is flattened to 1-D to avoid the DataConversionWarning.
neigh = KNeighborsClassifier(n_neighbors=3)
neigh = neigh.fit(X_train,y_train.values.ravel())
y_pred = neigh.predict(X_test)

print('Accuracy:',metrics.accuracy_score(y_test,y_pred))

In [None]:
#Train on XGBoost.
#The xgboost import at the top of the notebook is commented out, so XGBClassifier was
#undefined and this cell raised NameError. Import it here and skip the model with a message
#when the optional package is not installed.
try:
    from xgboost import XGBClassifier
except ImportError:
    XGBClassifier = None
    print('xgboost is not installed; skipping the XGBoost model.')

if XGBClassifier is not None:
    #Encode the string labels as integers for fitting and decode the predictions afterwards.
    #NOTE(review): newer xgboost versions are understood to reject string class labels -
    #confirm against the installed version.
    label_codes = LabelEncoder().fit(y_train.values.ravel())

    boost = XGBClassifier()
    boost = boost.fit(X_train,label_codes.transform(y_train.values.ravel()))
    y_pred = label_codes.inverse_transform(boost.predict(X_test))

    print('Accuracy:',metrics.accuracy_score(y_test,y_pred))

    print(classification_report(y_test, y_pred, target_names=['LEFT AGAINST PROFESSIONAL ADVICE','SUCCESFUL']))

In [None]:
#Train on logistic regression.
#The one-column label frame is flattened to 1-D to avoid the DataConversionWarning.
logreg = LogisticRegression()
logreg = logreg.fit(X_train,y_train.values.ravel())
y_pred = logreg.predict(X_test)

print('Accuracy:',metrics.accuracy_score(y_test,y_pred))

In [None]:
#Per-class precision / recall / F1 for the latest predictions.
#classification_report is already imported at the top of the notebook.
print(classification_report(
    y_test,
    y_pred,
    target_names=['LEFT AGAINST PROFESSIONAL ADVICE','SUCCESFUL'],
))

### k-fold Cross Validation

From above, I determined that the best model for the goal in mind, predicting unsuccessful patients, was the neural network, as it had the highest recall for that class.

Let's perform k-fold cross validation to check the stability of the model.

In [None]:
#5-fold cross-validation repeated 4 times (20 scores) on the full data.
#random_state fixes both the fold assignment and the network initialisation so the scores are
#reproducible. The scaler sits inside the pipeline, so it is re-fitted on the training part of
#every fold and never sees the held-out rows.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

rkf = RepeatedKFold(n_splits = 5, n_repeats = 4, random_state = 1)

neural = make_pipeline(StandardScaler(), MLPClassifier(random_state = 1))

nnScores = cross_val_score(neural, X, Y.values.ravel(), cv = rkf)

In [None]:
#Display the individual cross-validation scores.
nnScores

In [None]:
#Mean and sample variance of the cross-validation scores.
from statistics import mean, variance

for summarise in (mean, variance):
    print(summarise(nnScores))

This looks pretty stable!

## Hyperparameter Tuning

In [None]:
#RandomizedSearchCV lives in sklearn.model_selection; the sklearn.grid_search module was
#removed from scikit-learn, so the original import failed.
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from scipy import stats

#Standardise the features inside the pipeline so scaling is re-fitted on each CV training fold.
neural = make_pipeline(StandardScaler(), MLPClassifier(random_state=1))

#Sample hyperparameters at random. The 'mlpclassifier__' prefix routes each parameter to the
#MLP step of the pipeline. random_state makes the sampled candidates reproducible.
rs = RandomizedSearchCV(neural, param_distributions={
    'mlpclassifier__learning_rate_init': stats.uniform(0.001, 0.05),
    'mlpclassifier__hidden_layer_sizes': stats.randint(4, 200),
    'mlpclassifier__activation': ["logistic", "tanh", "relu"]},
    random_state=1)
rs.fit(X_train, y_train.values.ravel())

In [None]:
#Score the tuned search object on the held-out rows.
y_pred = rs.predict(X_test)

tuned_accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy:', tuned_accuracy)

In [None]:
#Per-class precision / recall / F1 for the tuned model's predictions.
print(classification_report(
    y_test,
    y_pred,
    target_names=['LEFT AGAINST PROFESSIONAL ADVICE','SUCCESFUL'],
))