In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None

In [2]:
# Column labels used as merge/filter keys by the cells in this notebook.
LAchildID = "LAchildID"
CINdetailsID = "CINdetailsID"
CPPstartDate = "CPPstartDate"
DateOfInitialCPC = "DateOfInitialCPC"

# Labels not referenced by the visible cells
# (presumably table/field names of the census schema - TODO confirm).
ChildProtectionPlans = "ChildProtectionPlans"
CINdetails = "CINdetails"
Section47 = "Section47"
Header = "Header"
ReferenceDate = "ReferenceDate"

In [3]:
# Child protection plan fixture: one tuple per CPP record, in column order
# (LAchildID, CINdetailsID, CPPstartDate). Trailing comments state the outcome
# each record is meant to exercise.
df_cpp = pd.DataFrame(
    data=[
        # child1: several CIN modules (one outside the census period) and
        # several CPPs inside the same CIN module, where one passes and one fails.
        ("child1", "cinID1", "26/05/2021"),  # pass: matches a section47 date
        ("child1", "cinID1", "26/06/2021"),  # fail: matches neither section47 nor cin
        ("child1", "cinID2", "27/06/2002"),  # ignored: outside the census period
        # child2: differs from section47, equals cin.
        ("child2", "cinID1", "26/05/2021"),  # fail: section47 mismatch; cin match is not considered
        # child3: several CIN details modules.
        ("child3", "cinID1", "26/05/2021"),  # fail: no section47 row and cin date differs
        ("child3", "cinID2", pd.NA),  # ignored: CPPstartDate is absent
        ("child3", "cinID3", "07/02/2022"),  # fail: differs in both section47 and cin
        ("child3", "cinID4", "14/03/2022"),  # fail: section47 date missing; cin match is not considered
        # child5: date present in section47 and absent in cin.
        ("child5", "cinID4", "19/07/2021"),  # pass: matches a section47 date
        # child6: no DateOfInitialCPC recorded in either cin or section47.
        ("child6", "cinID4", "19/07/2021"),  # fail
        # child8: several section47 rows in one CIN module, only some of which match.
        ("child8", "cinID1", "20/10/2021"),  # pass: matches a section47 date
        # child9: present in cin, absent from section47.
        ("child9", "cinID1", "20/10/2021"),  # pass: matches the cin date
    ],
    columns=["LAchildID", "CINdetailsID", "CPPstartDate"],
)

In [4]:
# Section 47 fixture: one tuple per record, in column order
# (LAchildID, DateOfInitialCPC, CINdetailsID). The leading number in each
# comment is the row's positional index in the resulting frame.
df_47 = pd.DataFrame(
    data=[
        ("child1", "26/05/2021", "cinID1"),  # 0 pass: equals a CPPstartDate
        ("child1", "26/05/2021", "cinID2"),  # 1 ignored: CPPstartDate is outside the census period
        ("child2", "30/05/2021", "cinID1"),  # 2 fail: differs from CPPstartDate
        ("child3", "26/05/2021", "cinID2"),  # 3 ignored: CPPstartDate is absent
        ("child3", "26/05/2021", "cinID3"),  # 4 fail: differs from CPPstartDate
        ("child3", pd.NA, "cinID4"),  # 5 fail: no date to compare with
        ("child5", "19/07/2021", "cinID4"),  # 6 pass: equals CPPstartDate
        ("child5", pd.NA, "cinID4"),  # 7 pass: another section47 row in the same module matches
        ("child6", pd.NA, "cinID4"),  # 8 fail: no date to compare with
        ("child8", "20/10/2021", "cinID1"),  # 9 pass: equals CPPstartDate
        ("child8", "22/07/2021", "cinID1"),  # 10 pass: another section47 row in the same module matches
    ],
    columns=["LAchildID", "DateOfInitialCPC", "CINdetailsID"],
)

In [5]:
# CIN details fixture: one tuple per record, in column order
# (LAchildID, DateOfInitialCPC, CINdetailsID). The leading number in each
# comment is the row's positional index in the resulting frame. A CIN date is
# only meant to count when the module has no section47 record.
df_cin = pd.DataFrame(
    data=[
        ("child1", "26/10/2020", "cinID1"),  # 0 not used: differs, but module has a section47 record
        ("child1", "26/05/2021", "cinID2"),  # 1 ignored: CPPstartDate is outside the census period
        ("child2", "26/05/2021", "cinID1"),  # 2 not used: would match, but module has a section47 record
        ("child3", "28/05/2021", "cinID1"),  # 3 fail: differs and there is no section47 record
        ("child3", "26/05/2021", "cinID2"),  # 4 ignored: CPPstartDate is absent
        ("child3", "26/05/2003", "cinID3"),  # 5 not used: differs, but module has a section47 record
        ("child3", "14/03/2022", "cinID4"),  # 6 not used: would match, but module has a section47 record
        ("child5", pd.NA, "cinID4"),  # 7 not used: module has a section47 record
        ("child6", pd.NA, "cinID4"),  # 8 not used: module has a section47 record
        ("child7", pd.NA, "cinID1"),  # 9 ignored: child is not in the CPP table
        ("child8", pd.NA, "cinID1"),  # 10 not used: passes through section47
        ("child9", "20/10/2021", "cinID1"),  # 11 pass: equals CPPstartDate, no section47 record
    ],
    columns=["LAchildID", "DateOfInitialCPC", "CINdetailsID"],
)

In [6]:
# Parse the day/month/year strings of each table's date column into datetimes.
# errors="coerce" turns anything unparseable (including the pd.NA placeholders) into NaT.
for _frame, _date_column in (
    (df_cpp, CPPstartDate),
    (df_47, "DateOfInitialCPC"),
    (df_cin, "DateOfInitialCPC"),
):
    _frame[_date_column] = pd.to_datetime(
        _frame[_date_column], format="%d/%m/%Y", errors="coerce"
    )

In [7]:
# Keep each record's original position: name the index "ROW_ID" and turn it
# into an ordinary column so that it travels along with the rows through the
# merges below.
for _frame in (df_cpp, df_47, df_cin):
    if "ROW_ID" in _frame.columns:
        # This cell has already been run for this frame. Resetting the index a
        # second time would raise, because a ROW_ID column cannot be inserted twice.
        continue
    _frame.index.name = "ROW_ID"
    # In-place on purpose: the three frames are module-level names that later cells read.
    _frame.reset_index(inplace=True)

In [8]:
# Display the CPP table now that ROW_ID is an ordinary column.
df_cpp

Unnamed: 0,ROW_ID,LAchildID,CINdetailsID,CPPstartDate
0,0,child1,cinID1,2021-05-26
1,1,child1,cinID1,2021-06-26
2,2,child1,cinID2,2002-06-27
3,3,child2,cinID1,2021-05-26
4,4,child3,cinID1,2021-05-26
5,5,child3,cinID2,NaT
6,6,child3,cinID3,2022-02-07
7,7,child3,cinID4,2022-03-14
8,8,child5,cinID4,2021-07-19
9,9,child6,cinID4,2021-07-19


In [9]:
# Census collection window; both bounds are inclusive.
collection_start = pd.to_datetime("01/04/2021", format="%d/%m/%Y", errors="coerce")
collection_end = pd.to_datetime("31/03/2022", format="%d/%m/%Y", errors="coerce")

# Only CPPs that have a start date falling inside the collection window are
# checked. CPPstartDate was already parsed to datetimes above, so it is not
# parsed again here.
start_date_present = df_cpp[CPPstartDate].notna()
within_period = (df_cpp[CPPstartDate] >= collection_start) & (df_cpp[CPPstartDate] <= collection_end)

# .copy() gives an independent frame, so the column added to df_cpp in a later
# cell is not written onto a slice of the unfiltered table.
df_cpp = df_cpp[start_date_present & within_period].copy()
df_cpp

Unnamed: 0,ROW_ID,LAchildID,CINdetailsID,CPPstartDate
0,0,child1,cinID1,2021-05-26
1,1,child1,cinID1,2021-06-26
3,3,child2,cinID1,2021-05-26
4,4,child3,cinID1,2021-05-26
6,6,child3,cinID3,2022-02-07
7,7,child3,cinID4,2022-03-14
8,8,child5,cinID4,2021-07-19
9,9,child6,cinID4,2021-07-19
10,10,child8,cinID1,2021-10-20
11,11,child9,cinID1,2021-10-20


If CPPstartDate is present, it should be equal to at least one Section47 DateOfInitialCPC

In [10]:
# Pair every in-period CPP with each Section 47 record of the same child and
# CIN details module. CPPs without any Section 47 record drop out (inner join).
df_cpp_47 = pd.merge(
    df_cpp,
    df_47,
    how="inner",
    on=[LAchildID, CINdetailsID],
    suffixes=["_cpp", "_47"],
)
df_cpp_47

Unnamed: 0,ROW_ID_cpp,LAchildID,CINdetailsID,CPPstartDate,ROW_ID_47,DateOfInitialCPC
0,0,child1,cinID1,2021-05-26,0,2021-05-26
1,1,child1,cinID1,2021-06-26,0,2021-05-26
2,3,child2,cinID1,2021-05-26,2,2021-05-30
3,6,child3,cinID3,2022-02-07,4,2021-05-26
4,7,child3,cinID4,2022-03-14,5,NaT
5,8,child5,cinID4,2021-07-19,6,2021-07-19
6,8,child5,cinID4,2021-07-19,7,NaT
7,9,child6,cinID4,2021-07-19,8,NaT
8,10,child8,cinID1,2021-10-20,9,2021-10-20
9,10,child8,cinID1,2021-10-20,10,2021-07-22


CPP/Section47 pairings that pass the rule

In [11]:
# Pairings whose CPPstartDate equals the Section 47 DateOfInitialCPC.
# A missing (NaT) DateOfInitialCPC never compares equal, so it cannot pass.
# .copy() makes this an independent frame: later cells add key columns to it,
# which would otherwise be writes onto a slice of df_cpp_47.
df_cpp_47_pass = df_cpp_47[df_cpp_47[CPPstartDate] == df_cpp_47[DateOfInitialCPC]].copy()
df_cpp_47_pass

Unnamed: 0,ROW_ID_cpp,LAchildID,CINdetailsID,CPPstartDate,ROW_ID_47,DateOfInitialCPC
0,0,child1,cinID1,2021-05-26,0,2021-05-26
5,8,child5,cinID4,2021-07-19,6,2021-07-19
8,10,child8,cinID1,2021-10-20,9,2021-10-20


In [12]:
# Pairings whose dates differ, including those where DateOfInitialCPC is NaT.
# These are only candidates for failure: a CPP still passes if another
# Section 47 record of the same module matches it.
# .copy() makes this an independent frame: later cells add key columns to it,
# which would otherwise be writes onto a slice of df_cpp_47.
df_cpp_47_failable = df_cpp_47[df_cpp_47[CPPstartDate] != df_cpp_47[DateOfInitialCPC]].copy()
df_cpp_47_failable

Unnamed: 0,ROW_ID_cpp,LAchildID,CINdetailsID,CPPstartDate,ROW_ID_47,DateOfInitialCPC
1,1,child1,cinID1,2021-06-26,0,2021-05-26
2,3,child2,cinID1,2021-05-26,2,2021-05-30
3,6,child3,cinID3,2022-02-07,4,2021-05-26
4,7,child3,cinID4,2022-03-14,5,NaT
6,8,child5,cinID4,2021-07-19,7,NaT
7,9,child6,cinID4,2021-07-19,8,NaT
9,10,child8,cinID1,2021-10-20,10,2021-07-22


In [13]:
# Tag each candidate failure with its (child, CIN details module) key.
df_cpp_47_failable["ERROR_ID"] = tuple(
    df_cpp_47_failable[[LAchildID, CINdetailsID]].itertuples(index=False, name=None)
)
df_cpp_47_failable

Unnamed: 0,ROW_ID_cpp,LAchildID,CINdetailsID,CPPstartDate,ROW_ID_47,DateOfInitialCPC,ERROR_ID
1,1,child1,cinID1,2021-06-26,0,2021-05-26,"(child1, cinID1)"
2,3,child2,cinID1,2021-05-26,2,2021-05-30,"(child2, cinID1)"
3,6,child3,cinID3,2022-02-07,4,2021-05-26,"(child3, cinID3)"
4,7,child3,cinID4,2022-03-14,5,NaT,"(child3, cinID4)"
6,8,child5,cinID4,2021-07-19,7,NaT,"(child5, cinID4)"
7,9,child6,cinID4,2021-07-19,8,NaT,"(child6, cinID4)"
9,10,child8,cinID1,2021-10-20,10,2021-07-22,"(child8, cinID1)"


In [14]:
# Tag each passing pairing with the same (child, CIN details module) key so the
# two sets can be compared.
df_cpp_47_pass["ERROR_ID"] = tuple(
    df_cpp_47_pass[[LAchildID, CINdetailsID]].itertuples(index=False, name=None)
)
df_cpp_47_pass

Unnamed: 0,ROW_ID_cpp,LAchildID,CINdetailsID,CPPstartDate,ROW_ID_47,DateOfInitialCPC,ERROR_ID
0,0,child1,cinID1,2021-05-26,0,2021-05-26,"(child1, cinID1)"
5,8,child5,cinID4,2021-07-19,6,2021-07-19,"(child5, cinID4)"
8,10,child8,cinID1,2021-10-20,9,2021-10-20,"(child8, cinID1)"


In [15]:
# Module-level failures: candidate failures whose (child, CIN module) key has
# no passing pairing at all.
df_cpp_47_fail = df_cpp_47_failable.loc[
    ~df_cpp_47_failable["ERROR_ID"].isin(df_cpp_47_pass["ERROR_ID"])
]
df_cpp_47_fail

Unnamed: 0,ROW_ID_cpp,LAchildID,CINdetailsID,CPPstartDate,ROW_ID_47,DateOfInitialCPC,ERROR_ID
2,3,child2,cinID1,2021-05-26,2,2021-05-30,"(child2, cinID1)"
3,6,child3,cinID3,2022-02-07,4,2021-05-26,"(child3, cinID3)"
4,7,child3,cinID4,2022-03-14,5,NaT,"(child3, cinID4)"
7,9,child6,cinID4,2021-07-19,8,NaT,"(child6, cinID4)"


The pass/fail outcome of one CPPstartDate should not affect the outcome of any other CPPstartDate in the same CIN details module. Hence, a second fail dataset is created whose filter key also includes CPPstartDate. This ensures that, within one CIN details module, one CPPstartDate can pass while another fails.

In [16]:
# Finer-grained key: (child, CIN module, CPPstartDate). With the start date in
# the key, a passing CPP no longer hides a failing CPP of the same module.
_startdate_key = [LAchildID, CINdetailsID, CPPstartDate]
df_cpp_47_failable["ERROR_startdate"] = tuple(
    df_cpp_47_failable[_startdate_key].itertuples(index=False, name=None)
)
df_cpp_47_pass["ERROR_startdate"] = tuple(
    df_cpp_47_pass[_startdate_key].itertuples(index=False, name=None)
)

# Candidate failures whose exact (child, module, start date) never passed.
df_cpp_47_startdate_fail = df_cpp_47_failable.loc[
    ~df_cpp_47_failable["ERROR_startdate"].isin(df_cpp_47_pass["ERROR_startdate"])
]
df_cpp_47_startdate_fail

Unnamed: 0,ROW_ID_cpp,LAchildID,CINdetailsID,CPPstartDate,ROW_ID_47,DateOfInitialCPC,ERROR_ID,ERROR_startdate
1,1,child1,cinID1,2021-06-26,0,2021-05-26,"(child1, cinID1)","(child1, cinID1, 2021-06-26 00:00:00)"
2,3,child2,cinID1,2021-05-26,2,2021-05-30,"(child2, cinID1)","(child2, cinID1, 2021-05-26 00:00:00)"
3,6,child3,cinID3,2022-02-07,4,2021-05-26,"(child3, cinID3)","(child3, cinID3, 2022-02-07 00:00:00)"
4,7,child3,cinID4,2022-03-14,5,NaT,"(child3, cinID4)","(child3, cinID4, 2022-03-14 00:00:00)"
7,9,child6,cinID4,2021-07-19,8,NaT,"(child6, cinID4)","(child6, cinID4, 2021-07-19 00:00:00)"


In [17]:
# Same comparison as for Section 47, this time against the DateOfInitialCPC
# recorded on the CIN details table.
df_cpp_cin = df_cpp.merge(df_cin, on=[LAchildID, CINdetailsID], suffixes=["_cpp", "_cin"])

# .copy() makes both subsets independent frames, so the key columns added
# below are not writes onto slices of df_cpp_cin.
df_cpp_cin_pass = df_cpp_cin[df_cpp_cin[CPPstartDate] == df_cpp_cin[DateOfInitialCPC]].copy()
df_cpp_cin_failable = df_cpp_cin[df_cpp_cin[CPPstartDate] != df_cpp_cin[DateOfInitialCPC]].copy()

# Module-level key: (child, CIN details module).
df_cpp_cin_failable["ERROR_ID"] = tuple(zip(df_cpp_cin_failable[LAchildID], df_cpp_cin_failable[CINdetailsID]))
df_cpp_cin_pass["ERROR_ID"] = tuple(zip(df_cpp_cin_pass[LAchildID], df_cpp_cin_pass[CINdetailsID]))

# CPP-level key: (child, CIN details module, CPPstartDate).
df_cpp_cin_failable["ERROR_startdate"] = tuple(zip(df_cpp_cin_failable[LAchildID], df_cpp_cin_failable[CINdetailsID], df_cpp_cin_failable[CPPstartDate]))
df_cpp_cin_pass["ERROR_startdate"] = tuple(zip(df_cpp_cin_pass[LAchildID], df_cpp_cin_pass[CINdetailsID], df_cpp_cin_pass[CPPstartDate]))

# Module-level failures: candidates whose (child, CIN module) key never passed.
df_cpp_cin_fail = df_cpp_cin_failable[~(df_cpp_cin_failable["ERROR_ID"].isin(df_cpp_cin_pass["ERROR_ID"]))]

df_cpp_cin_fail

Unnamed: 0,ROW_ID_cpp,LAchildID,CINdetailsID,CPPstartDate,ROW_ID_cin,DateOfInitialCPC,ERROR_ID,ERROR_startdate
0,0,child1,cinID1,2021-05-26,0,2020-10-26,"(child1, cinID1)","(child1, cinID1, 2021-05-26 00:00:00)"
1,1,child1,cinID1,2021-06-26,0,2020-10-26,"(child1, cinID1)","(child1, cinID1, 2021-06-26 00:00:00)"
3,4,child3,cinID1,2021-05-26,3,2021-05-28,"(child3, cinID1)","(child3, cinID1, 2021-05-26 00:00:00)"
4,6,child3,cinID3,2022-02-07,5,2003-05-26,"(child3, cinID3)","(child3, cinID3, 2022-02-07 00:00:00)"
6,8,child5,cinID4,2021-07-19,7,NaT,"(child5, cinID4)","(child5, cinID4, 2021-07-19 00:00:00)"
7,9,child6,cinID4,2021-07-19,8,NaT,"(child6, cinID4)","(child6, cinID4, 2021-07-19 00:00:00)"
8,10,child8,cinID1,2021-10-20,10,NaT,"(child8, cinID1)","(child8, cinID1, 2021-10-20 00:00:00)"


In [18]:
# CPP-level failures against the CIN table: candidates whose exact
# (child, module, start date) key has no passing pairing.
df_cpp_cin_startdate_fail = df_cpp_cin_failable.loc[
    ~df_cpp_cin_failable["ERROR_startdate"].isin(df_cpp_cin_pass["ERROR_startdate"])
]
df_cpp_cin_startdate_fail

Unnamed: 0,ROW_ID_cpp,LAchildID,CINdetailsID,CPPstartDate,ROW_ID_cin,DateOfInitialCPC,ERROR_ID,ERROR_startdate
0,0,child1,cinID1,2021-05-26,0,2020-10-26,"(child1, cinID1)","(child1, cinID1, 2021-05-26 00:00:00)"
1,1,child1,cinID1,2021-06-26,0,2020-10-26,"(child1, cinID1)","(child1, cinID1, 2021-06-26 00:00:00)"
3,4,child3,cinID1,2021-05-26,3,2021-05-28,"(child3, cinID1)","(child3, cinID1, 2021-05-26 00:00:00)"
4,6,child3,cinID3,2022-02-07,5,2003-05-26,"(child3, cinID3)","(child3, cinID3, 2022-02-07 00:00:00)"
6,8,child5,cinID4,2021-07-19,7,NaT,"(child5, cinID4)","(child5, cinID4, 2021-07-19 00:00:00)"
7,9,child6,cinID4,2021-07-19,8,NaT,"(child6, cinID4)","(child6, cinID4, 2021-07-19 00:00:00)"
8,10,child8,cinID1,2021-10-20,10,NaT,"(child8, cinID1)","(child8, cinID1, 2021-10-20 00:00:00)"


In [19]:
# Give every Section 47 record its (child, CIN details module) key so that
# modules with a Section 47 record can be recognised.
df_47["ERROR_ID"] = tuple(
    df_47[[LAchildID, CINdetailsID]].itertuples(index=False, name=None)
)
df_47

Unnamed: 0,ROW_ID,LAchildID,DateOfInitialCPC,CINdetailsID,ERROR_ID
0,0,child1,2021-05-26,cinID1,"(child1, cinID1)"
1,1,child1,2021-05-26,cinID2,"(child1, cinID2)"
2,2,child2,2021-05-30,cinID1,"(child2, cinID1)"
3,3,child3,2021-05-26,cinID2,"(child3, cinID2)"
4,4,child3,2021-05-26,cinID3,"(child3, cinID3)"
5,5,child3,NaT,cinID4,"(child3, cinID4)"
6,6,child5,2021-07-19,cinID4,"(child5, cinID4)"
7,7,child5,NaT,cinID4,"(child5, cinID4)"
8,8,child6,NaT,cinID4,"(child6, cinID4)"
9,9,child8,2021-10-20,cinID1,"(child8, cinID1)"


In [20]:
# A CIN mismatch only counts when the module has no Section 47 record at all,
# so drop every module-level CIN failure whose key appears in df_47.
df_cpp_cin_fail_no_47 = df_cpp_cin_fail.loc[
    ~df_cpp_cin_fail["ERROR_ID"].isin(df_47["ERROR_ID"])
]
df_cpp_cin_fail_no_47

Unnamed: 0,ROW_ID_cpp,LAchildID,CINdetailsID,CPPstartDate,ROW_ID_cin,DateOfInitialCPC,ERROR_ID,ERROR_startdate
3,4,child3,cinID1,2021-05-26,3,2021-05-28,"(child3, cinID1)","(child3, cinID1, 2021-05-26 00:00:00)"


In [21]:
# Same exclusion for the CPP-level CIN failures: keep only those whose
# (child, CIN module) key has no Section 47 record.
df_cpp_cin_startdate_fail_no47 = df_cpp_cin_startdate_fail.loc[
    ~df_cpp_cin_startdate_fail["ERROR_ID"].isin(df_47["ERROR_ID"])
]
df_cpp_cin_startdate_fail_no47

Unnamed: 0,ROW_ID_cpp,LAchildID,CINdetailsID,CPPstartDate,ROW_ID_cin,DateOfInitialCPC,ERROR_ID,ERROR_startdate
3,4,child3,cinID1,2021-05-26,3,2021-05-28,"(child3, cinID1)","(child3, cinID1, 2021-05-26 00:00:00)"


In [22]:
# Re-display the module-level Section 47 failures for reference before the
# per-table issue lists are built.
df_cpp_47_fail

Unnamed: 0,ROW_ID_cpp,LAchildID,CINdetailsID,CPPstartDate,ROW_ID_47,DateOfInitialCPC,ERROR_ID
2,3,child2,cinID1,2021-05-26,2,2021-05-30,"(child2, cinID1)"
3,6,child3,cinID3,2022-02-07,4,2021-05-26,"(child3, cinID3)"
4,7,child3,cinID4,2022-03-14,5,NaT,"(child3, cinID4)"
7,9,child6,cinID4,2021-07-19,8,NaT,"(child6, cinID4)"


In [23]:
# Attach the (child, CIN details module) key to the in-period CPP records.
df_cpp["ERROR_ID"] = tuple(
    df_cpp[[LAchildID, CINdetailsID]].itertuples(index=False, name=None)
)
df_cpp

Unnamed: 0,ROW_ID,LAchildID,CINdetailsID,CPPstartDate,ERROR_ID
0,0,child1,cinID1,2021-05-26,"(child1, cinID1)"
1,1,child1,cinID1,2021-06-26,"(child1, cinID1)"
3,3,child2,cinID1,2021-05-26,"(child2, cinID1)"
4,4,child3,cinID1,2021-05-26,"(child3, cinID1)"
6,6,child3,cinID3,2022-02-07,"(child3, cinID3)"
7,7,child3,cinID4,2022-03-14,"(child3, cinID4)"
8,8,child5,cinID4,2021-07-19,"(child5, cinID4)"
9,9,child6,cinID4,2021-07-19,"(child6, cinID4)"
10,10,child8,cinID1,2021-10-20,"(child8, cinID1)"
11,11,child9,cinID1,2021-10-20,"(child9, cinID1)"


In [24]:
# Attach the (child, CIN details module) key to the CIN details records.
df_cin["ERROR_ID"] = tuple(
    df_cin[[LAchildID, CINdetailsID]].itertuples(index=False, name=None)
)
df_cin

Unnamed: 0,ROW_ID,LAchildID,DateOfInitialCPC,CINdetailsID,ERROR_ID
0,0,child1,2020-10-26,cinID1,"(child1, cinID1)"
1,1,child1,2021-05-26,cinID2,"(child1, cinID2)"
2,2,child2,2021-05-26,cinID1,"(child2, cinID1)"
3,3,child3,2021-05-28,cinID1,"(child3, cinID1)"
4,4,child3,2021-05-26,cinID2,"(child3, cinID2)"
5,5,child3,2003-05-26,cinID3,"(child3, cinID3)"
6,6,child3,2022-03-14,cinID4,"(child3, cinID4)"
7,7,child5,NaT,cinID4,"(child5, cinID4)"
8,8,child6,NaT,cinID4,"(child6, cinID4)"
9,9,child7,NaT,cinID1,"(child7, cinID1)"


In [25]:
# CPP records to report: those that failed against Section 47 at CPP level,
# plus those that failed against CIN in a module without any Section 47 record.
_failed_section47 = df_cpp["ROW_ID"].isin(df_cpp_47_startdate_fail["ROW_ID_cpp"])
_failed_cin_only = df_cpp["ROW_ID"].isin(df_cpp_cin_startdate_fail_no47["ROW_ID_cpp"])
df_cpp_issues = df_cpp[_failed_section47 | _failed_cin_only]
df_cpp_issues

Unnamed: 0,ROW_ID,LAchildID,CINdetailsID,CPPstartDate,ERROR_ID
1,1,child1,cinID1,2021-06-26,"(child1, cinID1)"
3,3,child2,cinID1,2021-05-26,"(child2, cinID1)"
4,4,child3,cinID1,2021-05-26,"(child3, cinID1)"
6,6,child3,cinID3,2022-02-07,"(child3, cinID3)"
7,7,child3,cinID4,2022-03-14,"(child3, cinID4)"
9,9,child6,cinID4,2021-07-19,"(child6, cinID4)"


In [26]:
# Section 47 records to report: the rows involved in module-level failures,
# collected into one list of ROW_IDs per (child, CIN module) key.
# NOTE(review): this uses the module-level failures (df_cpp_47_fail) while
# df_cpp_issues uses the CPP-level ones - confirm that the difference is intended.
df_47_issues = (
    df_47.loc[df_47["ROW_ID"].isin(df_cpp_47_fail["ROW_ID_47"])]
    .groupby("ERROR_ID", group_keys=False)["ROW_ID"]
    .apply(list)
    .reset_index()
)
df_47_issues

Unnamed: 0,ERROR_ID,ROW_ID
0,"(child2, cinID1)",[2]
1,"(child3, cinID3)",[4]
2,"(child3, cinID4)",[5]
3,"(child6, cinID4)",[8]


In [27]:
# CIN details records to report: the rows behind the module-level CIN failures
# that have no Section 47 record.
df_cin_issues = df_cin.loc[
    df_cin["ROW_ID"].isin(df_cpp_cin_fail_no_47["ROW_ID_cin"])
]
df_cin_issues

Unnamed: 0,ROW_ID,LAchildID,DateOfInitialCPC,CINdetailsID,ERROR_ID
3,3,child3,2021-05-28,cinID1,"(child3, cinID1)"
