In [6]:
# pandas is the one library this notebook relies on throughout
import pandas as pd

filepath = "https://github.com/data-to-insight/ERN-sessions/raw/main/data/903_xlsx_errors.xlsx"

# sheet_name=None loads every sheet of the workbook at once; the result is a
# dict that maps each sheet name to its DataFrame, e.g. {"key_1": value_1}
dfs = pd.read_excel(io=filepath, sheet_name=None)

# dfs.keys() lists the sheet names that are available

# first look at the data: pull one sheet out of the dict with dict_name[key_name]
header_sheet = dfs['header']
header_sheet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2676 entries, 0 to 2675
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   CHILD   2676 non-null   object 
 1   SEX     2676 non-null   int64  
 2   DOB     2676 non-null   object 
 3   ETHNIC  2676 non-null   object 
 4   UPN     2676 non-null   object 
 5   MOTHER  1 non-null      float64
 6   MC_DOB  1 non-null      object 
dtypes: float64(1), int64(1), object(5)
memory usage: 146.5+ KB


In [13]:
# 102 - date of birth is not a valid date
# tools used here: .isna() and its opposite .notna()

header = dfs['header']

# header.info() / header.head() show DOB held as d/m/y text.
# Anything that cannot be read in that layout is coerced to NaT
# instead of raising an error.
dob_parsed = pd.to_datetime(header['DOB'], format="%d/%m/%Y", errors='coerce')
header['DOB'] = dob_parsed

# NaT marks a value that failed to parse.
# The same rows can be selected with header[~header['DOB'].notna()].
invalid_dobs = header['DOB'].isna()
error_rows = header.loc[invalid_dobs]

error_rows



Unnamed: 0,CHILD,SEX,DOB,ETHNIC,UPN,MOTHER,MC_DOB
2675,bad_entry_1,7,NaT,Alien,UPDOG,,


In [16]:
# 115 - Date of Local Authority's (LA) decision that a child should be placed
# for adoption is not a valid date.
# sheet: placed_for_adoption, column: DATE_PLACED
p4a = dfs['placed_for_adoption']

# values that do not fit day/month/year are coerced to NaT
p4a['DATE_PLACED'] = pd.to_datetime(
    p4a['DATE_PLACED'],
    format="%d/%m/%Y",
    errors="coerce",
)

# NaT after parsing means the date could not be read
invalid_date = p4a['DATE_PLACED'].isna()
error_rows = p4a.loc[invalid_date]
error_rows

Unnamed: 0,CHILD,DOB,DATE_PLACED,DATE_PLACED_CEASED,REASON_PLACED_CEASED
29,bad_entry_1,Not a date,NaT,This also isn't a date,oop


In [21]:
# 103 - The ethnicity code is either not valid or has not been entered.
# tool used here: .isin()

# every accepted code; a value outside this list is reported as an error
ethncity_codes = [
    "WBRI", "WIRI", "WOTH", "WIRT", "WROM",
    "MWBC", "MWBA", "MWAS", "MOTH",
    "AIND", "APKN", "ABAN", "AOTH",
    "BCRB", "BAFR", "BOTH",
    "CHNE", "OOTH", "REFU", "NOBT",
]

# True where the recorded code is one of the accepted codes
valid_ethnicity = header['ETHNIC'].isin(ethncity_codes)

# the errors are the rows that did NOT match
invalid_ethnicity = ~valid_ethnicity
error_rows = header.loc[invalid_ethnicity]

error_rows




Unnamed: 0,CHILD,SEX,DOB,ETHNIC,UPN,MOTHER,MC_DOB
2675,bad_entry_1,7,NaT,Alien,UPDOG,,


In [27]:
# 114 - Data entry to record the status of former carer(s) of an adopted child is invalid.
# When we use .isin() the values in the list must have the same datatype as
# the values in the column being checked.

# sheet: ad1, column: FOSTER_CARE
ad1 = dfs['ad1']

# the accepted codes, held as text
code_list = ["0", "1"]

# cast the column to text so it is compared like-for-like with code_list
# NOTE(review): depending on the pandas version, blanks may become the text
# 'nan' here - confirm that is how missing entries should be treated
ad1['FOSTER_CARE'] = ad1['FOSTER_CARE'].astype('str')

valid_codes = ad1['FOSTER_CARE'].isin(code_list)
error_rows = ad1.loc[~valid_codes]

error_rows

Unnamed: 0,CHILD,DOB,DATE_INT,DATE_MATCH,FOSTER_CARE,NB_ADOPTR,SEX_ADOPTR,LS_ADOPTR
29,bad_entry,datedate,notdate,datenot,777,Q,Will,let's go


In [34]:
# 184 - Date of decision that a child should be placed for adoption is before the child was born.

# Put the header columns alongside each placed_for_adoption row. Both tables
# have a DOB column: the one from p4a keeps its name and the header's copy
# becomes DOB_header.
df = pd.merge(p4a, header, how='left', on='CHILD', suffixes=[None, "_header"])

# Read both dates as day/month/year, the same layout the rule 115 check uses
# for DATE_PLACED. A column that has already been converted to datetimes
# passes through unchanged, so this cell gives the same answer whether or not
# that earlier conversion has been run.
df['DOB'] = pd.to_datetime(df['DOB'], format="%d/%m/%Y", errors='coerce')
df['DATE_PLACED'] = pd.to_datetime(df['DATE_PLACED'], format="%d/%m/%Y", errors='coerce')

# True only where both dates are known and the decision pre-dates the birth.
# Any comparison against NaT is False, so a row with a missing or unreadable
# date is not reported here as "placed before birth".
placed_before_birth = df['DATE_PLACED'] < df['DOB']

error_rows = df[placed_before_birth]

error_rows



Unnamed: 0,CHILD,DOB,DATE_PLACED,DATE_PLACED_CEASED,REASON_PLACED_CEASED,SEX,DOB_header,ETHNIC,UPN,MOTHER,MC_DOB


In [43]:
# 302 - First episode starts before child was born.
# tables are header and episodes
# DOB from header and DECOM from episodes

episodes = dfs['episodes']

# Try DECOM as day/month/year first, then fall back to year-month-day (the
# only layout this cell used to accept) for anything that did not fit.
# A column that is already datetime passes through both calls unchanged.
# NOTE(review): DEC in this sheet is displayed as dd/mm/YYYY text, so the raw
# DECOM values are presumably in that layout too - confirm against the workbook.
decom_dmy = pd.to_datetime(episodes['DECOM'], format="%d/%m/%Y", errors='coerce')
decom_ymd = pd.to_datetime(episodes['DECOM'], format="%Y-%m-%d", errors='coerce')
episodes['DECOM'] = decom_dmy.fillna(decom_ymd)

# Keep each child's earliest episode only: order by child then start date,
# and keep the first row per child (chained rather than inplace=True).
episodes = (
    episodes
    .sort_values(['CHILD', 'DECOM'], ascending=True)
    .drop_duplicates('CHILD', keep='first')
)

df = pd.merge(episodes, header, how='left', on='CHILD', suffixes=['_epi', '_hdr'])

# Make sure DOB is a datetime here, using the same day/month/year layout as
# the rule 102 check, so the comparison below does not depend on that cell
# having been run first. Already-converted values pass through unchanged.
df['DOB'] = pd.to_datetime(df['DOB'], format="%d/%m/%Y", errors='coerce')

# comparisons against NaT are False, so rows with an unknown date are not flagged
error_rows = df[df['DECOM'] < df['DOB']]
error_rows

Unnamed: 0,CHILD,DECOM,RNE,LS,CIN,PLACE,PLACE_PROVIDER,DEC,REC,REASON_PLACE_CHANGE,HOME_POST,PL_POST,URN
4801,32,2017-02-12,T,J1,N3,K2,PR1,15/04/2017,E3,,ED8 1OX,WI5 9CC,7876040
4041,982,2016-11-27,P,E1,N7,A5,PR3,07/10/2017,X1,LAREQ,EP10 1XZ,WI5 8XU,6162294
437,1243,2017-01-02,L,V4,N5,Z1,PR0,18/05/2017,E9,,EC5 5MZ,WC30 9OS,7270736
1794,1407,2015-11-28,L,L2,N3,P2,PR1,29/04/2017,X1,ALLEG,EG16 2ZL,WU19 1JS,6492318
3848,2431,2016-02-20,S,V2,N3,P2,PR5,18/11/2017,X1,APPRR,EK12 3JI,WV1 1FJ,5227516
...,...,...,...,...,...,...,...,...,...,...,...,...,...
498,997201,2016-05-18,P,L3,N4,T3,PR3,07/06/2017,X1,CUSTOD,EL12 3BA,WV22 6GC,3780837
2766,997445,2016-06-20,P,J1,N2,U3,PR3,24/09/2017,E48,,EP2 2VO,WE6 1FZ,3786993
154,998192,2015-11-27,P,C2,N2,P1,PR5,08/07/2017,X1,ALLEG,EJ11 4FS,WT21 5RE,6722585
156,998777,2015-10-11,P,C2,N2,U1,PR2,17/03/2018,X1,CLOSE,ER24 5VS,WJ7 5PQ,1112628


In [53]:
# 188 - Child is aged under 4 years at the end of the year but a Strengths and
# Difficulties (SDQ) score or a reason for no SDQ score has been completed
oc2 = dfs['oc2']

oc2['DOB'] = pd.to_datetime(oc2['DOB'], format="%d/%m/%Y", errors='coerce')

# last day of the collection year
collection_end = pd.to_datetime('31/03/2020', dayfirst=True)

# date of each child's fourth birthday
oc2['4bday'] = oc2['DOB'] + pd.DateOffset(years=4)

# a fourth birthday after the year end means the child was still under 4
under_4_at_end = oc2['4bday'] > collection_end

# either an SDQ score or a reason for no score has been filled in
has_sdq_entry = oc2['SDQ_SCORE'].notna() | oc2['SDQ_REASON'].notna()

error_mask = under_4_at_end & has_sdq_entry

error_rows = oc2[error_mask]

error_rows

# Up next, 189 - Child is aged 17 years or over at the beginning of the year,
# but a Strengths and Difficulties (SDQ) score or a reason for no SDQ score
# has been completed.
# That check compares age 17 with the start of the collection year.


Unnamed: 0,CHILD,DOB,SDQ_SCORE,SDQ_REASON,CONVICTED,HEALTH_CHECK,IMMUNISATIONS,TEETH_CHECK,HEALTH_ASSESSMENT,SUBSTANCE_MISUSE,INTERVENTION_RECEIVED,INTERVENTION_OFFERED,4bday
2675,bad_child,2024-12-05,oops,bad,no,yes,maybe,i,don't,know,possibly,yes,2028-12-05


In [52]:
# 189 - Child is aged 17 years or over at the beginning of the year, but a Strengths
#  and Difficulties (SDQ) score or a reason for no Strengths and Difficulties (SDQ) score has been completed.

# first day of the collection year
collection_start = pd.to_datetime('01/04/2019', dayfirst=True)

# "17 or over at the beginning of the year" includes a child whose 17th
# birthday falls exactly on the first day, so the comparison is <= (a strict <
# would miss that child). Rows with an unknown DOB compare as False.
seventeenth_bday = oc2['DOB'] + pd.DateOffset(years=17)
over_17_start = seventeenth_bday <= collection_start

# either an SDQ score or a reason for no score has been filled in
sdq_score_reason = oc2['SDQ_SCORE'].notna() | oc2['SDQ_REASON'].notna()

error_mask = over_17_start & sdq_score_reason

error_rows = oc2[error_mask]
error_rows

Unnamed: 0,CHILD,DOB,SDQ_SCORE,SDQ_REASON,CONVICTED,HEALTH_CHECK,IMMUNISATIONS,TEETH_CHECK,HEALTH_ASSESSMENT,SUBSTANCE_MISUSE,INTERVENTION_RECEIVED,INTERVENTION_OFFERED,4bday
20,947046,2001-08-22,,SDQ5,0,1,0,1,0,1,0,0,2005-08-22
33,714655,2001-10-01,,SDQ2,1,1,0,1,1,0,1,0,2005-10-01
35,819792,2001-04-14,25,,1,1,1,1,0,1,1,1,2005-04-14
43,197474,2001-03-30,5,,0,1,1,1,0,1,1,0,2005-03-30
58,552166,2001-11-09,34,,1,1,1,1,1,1,1,1,2005-11-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2634,97512,2001-12-27,,SDQ1,0,0,1,0,1,0,0,1,2005-12-27
2653,413076,2001-05-14,32,,1,0,1,0,0,0,1,1,2005-05-14
2659,482205,2001-04-26,32,,1,0,0,0,0,0,0,0,2005-04-26
2660,473732,2001-06-17,,SDQ1,0,1,0,1,0,1,0,0,2005-06-17
