In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Folder holding the tutorial CSV extracts. Kept in one place so the notebook
# can be pointed at a different copy of the data by editing a single line.
DATA_DIR = '/workspaces/D2I-Jupyter-Notebook-Tools/ml-data science tutorials/data'

characteristics = pd.read_csv(f'{DATA_DIR}/ChildCharacteristics.csv')
identifiers = pd.read_csv(f'{DATA_DIR}/ChildIdentifiers.csv')
cpps = pd.read_csv(f'{DATA_DIR}/CINdetails.csv')

# Preview the first rows of each table to confirm the load and see the columns.
for table in (characteristics, identifiers, cpps):
    print(table.head())

   Unnamed: 0        LAchildID Ethnicity
0           0  RND000215205141      WIRT
1           1  RND000824303014      WROM
2           2  RND000750143123      AOTH
3           3  RND000909164501      MWBC
4           4  RND000382171815      APKN
   Unnamed: 0        LAchildID            UPN  FormerUPN  UPNunknown  \
0           0  RND000215205141  A850728973744        NaN         NaN   
1           1  RND000824303014  A141396438491        NaN         NaN   
2           2  RND000750143123  A929946861554        NaN         NaN   
3           3  RND000909164501  A612330267292        NaN         NaN   
4           4  RND000382171815  A604459366806        NaN         NaN   

  PersonBirthDate  ExpectedPersonBirthDate  GenderCurrent PersonDeathDate  
0      2019-12-06                      NaN              1             NaN  
1      2011-04-27                      NaN              9             NaN  
2      2017-06-06                      NaN              1             NaN  
3      2014-10-03

Task: build a decision tree to predict if a child will have at least one child protection plan

In [2]:
# Join the identifier and characteristics tables on LAchildID. The left join
# keeps every row of `identifiers`; only the key, gender and ethnicity columns
# are carried forward.
model_columns = ['LAchildID', 'GenderCurrent', 'Ethnicity']
char_ident = pd.merge(identifiers, characteristics, how='left', on='LAchildID')[model_columns]
char_ident

Unnamed: 0,LAchildID,GenderCurrent,Ethnicity
0,RND000215205141,1,WIRT
1,RND000824303014,9,WROM
2,RND000750143123,1,AOTH
3,RND000909164501,0,MWBC
4,RND000382171815,2,APKN
...,...,...,...
327,RND000112711501,2,WOTH
328,RND000513120794,2,WROM
329,RND000541643134,1,BCRB
330,RND000404939452,2,AIND


In [3]:
# Every child in cpps has had at least one child protection plan, so keep one row
# per child and give every child in this table a value of 1.
# NOTE(review): cpps is read from CINdetails.csv - confirm that file really lists
# child protection plans before relying on this flag.
cpps = cpps.drop_duplicates('LAchildID')[['LAchildID']].assign(**{'CP plan': 1})

# Left join keeps every child in char_ident; children absent from cpps get NaN.
df = char_ident.merge(cpps, how='left', on='LAchildID')

# Assign the filled column back instead of calling fillna(inplace=True) on
# df['CP plan']: the chained in-place form acts on a temporary object under
# pandas Copy-on-Write and would leave the NaNs in df.
df['CP plan'] = df['CP plan'].fillna(0)
df

Unnamed: 0,LAchildID,GenderCurrent,Ethnicity,CP plan
0,RND000215205141,1,WIRT,1.0
1,RND000824303014,9,WROM,1.0
2,RND000750143123,1,AOTH,1.0
3,RND000909164501,0,MWBC,1.0
4,RND000382171815,2,APKN,1.0
...,...,...,...,...
327,RND000112711501,2,WOTH,0.0
328,RND000513120794,2,WROM,0.0
329,RND000541643134,1,BCRB,0.0
330,RND000404939452,2,AIND,0.0


In [4]:
# Unpivot the table (create binary dummy variables for each category)
# For ethnicity, the first letter of the category code identifies the main ethnic group

def ethnicity_check(row):
    """Map a detailed ethnicity code to its main ethnic group.

    The first letter of the code selects the group.

    Parameters
    ----------
    row : str
        A single ethnicity code such as 'WIRT'. Despite the name this is one
        cell value, not a DataFrame row.

    Returns
    -------
    str
        'white', 'asian', 'mixed' or 'black'; 'unknown' for any other code and
        for missing, empty or non-string values.
    """
    # Missing values arrive as float NaN and cannot be indexed, and an empty
    # string has no first character, so classify both as unknown up front.
    if not isinstance(row, str) or not row:
        return 'unknown'

    main_groups = {
        'W': 'white',
        'A': 'asian',
        'C': 'asian',
        'M': 'mixed',
        'B': 'black',
    }
    return main_groups.get(row[0], 'unknown')

# Replace each detailed ethnicity code with its main-group label, then show the frame.
ethnicity_groups = df['Ethnicity'].map(ethnicity_check)
df['Ethnicity'] = ethnicity_groups
df


Unnamed: 0,LAchildID,GenderCurrent,Ethnicity,CP plan
0,RND000215205141,1,white,1.0
1,RND000824303014,9,white,1.0
2,RND000750143123,1,asian,1.0
3,RND000909164501,0,mixed,1.0
4,RND000382171815,2,asian,1.0
...,...,...,...,...
327,RND000112711501,2,white,0.0
328,RND000513120794,2,white,0.0
329,RND000541643134,1,black,0.0
330,RND000404939452,2,asian,0.0


In [5]:
def gender_check(row):
    """Translate a numeric gender code into a label.

    Parameters
    ----------
    row : int
        A single gender code (one cell value, not a DataFrame row).

    Returns
    -------
    str
        'male' for 1, 'female' for 2, and 'other' for every other value.
    """
    for code, label in ((1, 'male'), (2, 'female')):
        if row == code:
            return label
    return 'other'

# Replace each numeric gender code with its label, then show the frame.
gender_labels = df['GenderCurrent'].map(gender_check)
df['GenderCurrent'] = gender_labels
df

Unnamed: 0,LAchildID,GenderCurrent,Ethnicity,CP plan
0,RND000215205141,male,white,1.0
1,RND000824303014,other,white,1.0
2,RND000750143123,male,asian,1.0
3,RND000909164501,other,mixed,1.0
4,RND000382171815,female,asian,1.0
...,...,...,...,...
327,RND000112711501,female,white,0.0
328,RND000513120794,female,white,0.0
329,RND000541643134,male,black,0.0
330,RND000404939452,female,asian,0.0


In [6]:
# One-hot encode the two categorical predictors (one indicator column per
# category), then put the target column back alongside them. Rows are matched
# on the shared index.
categorical_cols = ['GenderCurrent', 'Ethnicity']
df_encoded = pd.get_dummies(df[categorical_cols])
cp = df[['CP plan']]
full_encoded_df = cp.join(df_encoded, how='left')
full_encoded_df

Unnamed: 0,CP plan,GenderCurrent_female,GenderCurrent_male,GenderCurrent_other,Ethnicity_asian,Ethnicity_black,Ethnicity_mixed,Ethnicity_unknown,Ethnicity_white
0,1.0,False,True,False,False,False,False,False,True
1,1.0,False,False,True,False,False,False,False,True
2,1.0,False,True,False,True,False,False,False,False
3,1.0,False,False,True,False,False,True,False,False
4,1.0,True,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...
327,0.0,True,False,False,False,False,False,False,True
328,0.0,True,False,False,False,False,False,False,True
329,0.0,False,True,False,False,True,False,False,False
330,0.0,True,False,False,True,False,False,False,False


In [7]:
# Now we can start to build the model: separate the encoded frame into the
# indicator-column predictors (X) and the target column (y).
gender_features = ['GenderCurrent_female', 'GenderCurrent_male', 'GenderCurrent_other']
ethnicity_features = [
    'Ethnicity_asian',
    'Ethnicity_black',
    'Ethnicity_mixed',
    'Ethnicity_unknown',
    'Ethnicity_white',
]
feature_cols = gender_features + ethnicity_features

X = full_encoded_df[feature_cols]
y = full_encoded_df['CP plan']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Iterate through three settings to determine the best parameters for the
# decision tree classifier model:
#   criterion: gini, entropy
#   max depth
#   min samples per leaf
# NOTE(review): each candidate is scored on the test split, so the accuracy
# reported below is an optimistic estimate. Consider selecting on a separate
# validation split or with cross-validation.

best_acc = 0
best_params = None  # stays None if no candidate scores above 0

for criterion in 'gini', 'entropy':
    for max_depth in [2, 3, 4, 5, 6]:
        for min_samples_leaf in [5, 10, 20, 30]:
            # Fixed seed so that ties between equally good splits are resolved
            # the same way on every run and the search result is reproducible.
            clf = DecisionTreeClassifier(
                criterion=criterion,
                max_depth=max_depth,
                min_samples_leaf=min_samples_leaf,
                random_state=1,
            )
            clf = clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            acc = metrics.accuracy_score(y_test, y_pred)
            # Strict '>' keeps the first combination that reaches the top score.
            if acc > best_acc:
                best_params = f'{criterion}, {max_depth}, {min_samples_leaf}'
                best_acc = acc

print(best_params)
# Report the winning score; `acc` only holds the score of the last combination tried.
print(best_acc)

gini, 2, 5
0.77


In [None]:
# Use the best parameters to build the classifier: refit on the training split
# with the settings the search printed (gini, 2, 5).
best_tree_settings = dict(criterion='gini', max_depth=2, min_samples_leaf=5)
clf = DecisionTreeClassifier(**best_tree_settings).fit(X_train, y_train)