In [38]:
# you'll need to import a module 
import pandas as pd

# Source workbook: the 903 return with deliberately planted errors.
filepath = "https://github.com/data-to-insight/ERN-sessions/raw/main/data/903_xlsx_errors.xlsx"

# sheet_name=None loads every sheet at once; the result is a dict of
# DataFrames keyed by sheet name.
dfs = pd.read_excel(io=filepath, sheet_name=None)

# First look at one sheet: column names, non-null counts and dtypes.
dfs["ad1"].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   CHILD        30 non-null     object
 1   DOB          30 non-null     object
 2   DATE_INT     30 non-null     object
 3   DATE_MATCH   30 non-null     object
 4   FOSTER_CARE  30 non-null     int64 
 5   NB_ADOPTR    30 non-null     object
 6   SEX_ADOPTR   30 non-null     object
 7   LS_ADOPTR    30 non-null     object
dtypes: int64(1), object(7)
memory usage: 2.0+ KB


In [39]:
# Rule 102 - date of birth is not a valid date.
# Technique: parse with errors='coerce' so anything unparseable becomes NaT,
# then select on .notna() / .isna().

header = dfs["header"]

# Parsed in place on the header sheet; values that do not match dd/mm/YYYY
# are coerced to NaT instead of raising.
header["DOB"] = pd.to_datetime(header["DOB"], errors="coerce", format="%d/%m/%Y")

# Same result the other way round: header[header["DOB"].isna()]
valid_dob = header["DOB"].notna()
error_rows = header.loc[~valid_dob]

error_rows

Unnamed: 0,CHILD,SEX,DOB,ETHNIC,UPN,MOTHER,MC_DOB
2675,bad_entry_1,7,NaT,Alien,UPDOG,,


In [40]:
# Rule 115 - Date of Local Authority's (LA) decision that a child should be
# placed for adoption is not a valid date.
# Sheet: placed_for_adoption, column: DATE_PLACED

p4a = dfs["placed_for_adoption"]

# Parsed in place; values that do not match dd/mm/YYYY become NaT.
p4a["DATE_PLACED"] = pd.to_datetime(p4a["DATE_PLACED"], errors="coerce", format="%d/%m/%Y")

# NaT after parsing means the original entry was not a usable date.
invalid_dp = p4a["DATE_PLACED"].isna()

error_rows = p4a.loc[invalid_dp]

error_rows

Unnamed: 0,CHILD,DOB,DATE_PLACED,DATE_PLACED_CEASED,REASON_PLACED_CEASED
29,bad_entry_1,Not a date,NaT,This also isn't a date,oop


In [41]:
# Rule 103 - The ethnicity code is either not valid or has not been entered.
# Technique: .isin() is True for rows whose value appears in a list, i.e.
# shorthand for value == a | value == b | ...
codes = [
    "WBRI", "WIRI", "WOTH", "WIRT", "WROM",
    "MWBC", "MWBA", "MWAS", "MOTH",
    "AIND", "APKN", "ABAN", "AOTH",
    "BCRB", "BAFR", "BOTH",
    "CHNE", "OOTH", "REFU", "NOBT",
]

# Missing values are not in the list either, so blank entries are flagged too.
valid_eth_codes = header["ETHNIC"].isin(codes)
error_rows = header.loc[~valid_eth_codes]
error_rows



Unnamed: 0,CHILD,SEX,DOB,ETHNIC,UPN,MOTHER,MC_DOB
2675,bad_entry_1,7,NaT,Alien,UPDOG,,


In [42]:
# Rule 114 - Data entry to record the status of former carer(s) of an adopted
# child is invalid.
# Sheet: ad1, column: FOSTER_CARE; the only accepted values are 0 and 1.

ad1 = dfs["ad1"]

allowed_codes = [0, 1]

# True where the entry is one of the accepted codes.
valid_foster = ad1["FOSTER_CARE"].isin(allowed_codes)

error_rows = ad1.loc[~valid_foster]

error_rows

Unnamed: 0,CHILD,DOB,DATE_INT,DATE_MATCH,FOSTER_CARE,NB_ADOPTR,SEX_ADOPTR,LS_ADOPTR
29,bad_entry,datedate,notdate,datenot,777,Q,Will,let's go


In [43]:
# Rule 184 - Date of decision that a child should be placed for adoption is
# before the child was born.
# Uses DATE_PLACED from placed_for_adoption (p4a) and DOB from header; both
# columns were parsed to datetimes in the rule 115 and rule 102 cells.

# Left join keeps every placed_for_adoption row. Both sheets carry a DOB column:
# the p4a copy is renamed DOB_P4A, while the header copy keeps the plain name
# DOB and is the one compared against below.
df = pd.merge(p4a.reset_index(), header, how='left', on='CHILD', suffixes=["_P4A", None])

# Strictly before: a decision dated on the date of birth itself is not an error.
# Rows where either date is NaT compare as False and are not flagged here.
invalid_date_placed = df['DATE_PLACED'] < df['DOB']

error_rows = df[invalid_date_placed]

error_rows

Unnamed: 0,index,CHILD,DOB_P4A,DATE_PLACED,DATE_PLACED_CEASED,REASON_PLACED_CEASED,SEX,DOB,ETHNIC,UPN,MOTHER,MC_DOB


In [44]:
# Rule 302 - First episode starts before child was born.
# Uses DOB from header (parsed in the rule 102 cell) and DECOM from episodes.

episodes = dfs['episodes']

# Parsed in place; unparseable start dates become NaT.
episodes['DECOM'] = pd.to_datetime(episodes['DECOM'], dayfirst=True, errors='coerce')

# Attach each child's header details (including DOB) to every episode row.
df = pd.merge(episodes, header, how='left', on='CHILD', suffixes=["_epi", "_hd"])

# Strictly before: an episode that starts on the date of birth is valid.
# NOTE: this flags every episode that starts before birth, not only the
# first one; a child with any such episode necessarily has a first episode
# before birth, so the set of children flagged is the same.
invalid_decom = df['DOB'] > df['DECOM']

error_rows = df[invalid_decom]

error_rows

Unnamed: 0,CHILD,DECOM,RNE,LS,CIN,PLACE,PLACE_PROVIDER,DEC,REC,REASON_PLACE_CHANGE,HOME_POST,PL_POST,URN,SEX,DOB,ETHNIC,UPN,MOTHER,MC_DOB


In [53]:
# Rule 188 - Child is aged under 4 years at the end of the year but a
# Strengths and Difficulties (SDQ) score or a reason for no SDQ score has
# been completed.

# Work on a copy of just the needed columns so the oc2 sheet in dfs is not modified.
oc2 = dfs['oc2'][['CHILD', 'DOB', 'SDQ_SCORE', "SDQ_REASON"]].copy()

oc2['DOB'] = pd.to_datetime(oc2['DOB'], dayfirst=True, errors='coerce')

collection_end = pd.to_datetime("31/03/2020", dayfirst=True)

# Date of the child's 4th birthday.
oc2['4bday'] = oc2['DOB'] + pd.DateOffset(years=4)

# Under 4 at year end means the 4th birthday falls after the collection end date.
under_4 = oc2['4bday'] > collection_end

# "Completed" means a value is present in either column. Both sides must be
# boolean masks: OR-ing in the raw SDQ_SCORE column would depend on the
# truthiness of the values instead of on whether anything was entered.
sdq_score_reason = oc2['SDQ_REASON'].notna() | oc2['SDQ_SCORE'].notna()

error_rows = oc2[under_4 & sdq_score_reason]

error_rows








Unnamed: 0,CHILD,DOB,SDQ_SCORE,SDQ_REASON,4bday
2675,bad_child,2024-12-05,oops,bad,2028-12-05


In [55]:
# Rule 189 - Child is aged 17 years or over at the beginning of the year,
# but a Strengths and Difficulties (SDQ) score or a reason for no
# Strengths and Difficulties (SDQ) score has been completed.
# Reuses oc2 and sdq_score_reason from the rule 188 cell.
collection_start = pd.to_datetime("01/04/2019", dayfirst=True)

# "17 or over" at the start of the year includes a child whose 17th birthday
# falls exactly on the collection start date, hence <= rather than <.
over_17 = (oc2['DOB'] + pd.DateOffset(years=17)) <= collection_start

error_rows = oc2[over_17 & sdq_score_reason]

error_rows


Unnamed: 0,CHILD,DOB,SDQ_SCORE,SDQ_REASON,4bday
20,947046,2001-08-22,,SDQ5,2005-08-22
33,714655,2001-10-01,,SDQ2,2005-10-01
35,819792,2001-04-14,25,,2005-04-14
43,197474,2001-03-30,5,,2005-03-30
58,552166,2001-11-09,34,,2005-11-09
...,...,...,...,...,...
2634,97512,2001-12-27,,SDQ1,2005-12-27
2653,413076,2001-05-14,32,,2005-05-14
2659,482205,2001-04-26,32,,2005-04-26
2660,473732,2001-06-17,,SDQ1,2005-06-17
