In [1]:
import pandas as pd

# Use import os to access files from local machine
# import os
# fp = "c://data/903/2025/list_1.csv"

# the point of the r is to tell python to read every character in the links below exactly as they are
#child_identifiers = pd.read_csv(r"https://raw.githubusercontent.com/data-to-insight/ERN-sessions/main/data/ChildIdentifiers.csv")
#child_characteristics = pd.read_csv(r"https://raw.githubusercontent.com/data-to-insight/ERN-sessions/main/data/ChildCharacteristics.csv")

# this is better practice as it allows you to make use of more pandas objects
ci_filepath = r"https://raw.githubusercontent.com/data-to-insight/ERN-sessions/main/data/ChildIdentifiers.csv"
cc_filepath = r"https://raw.githubusercontent.com/data-to-insight/ERN-sessions/main/data/ChildCharacteristics.csv"


child_identifiers = pd.read_csv(ci_filepath)
child_characteristics = pd.read_csv(cc_filepath)

In [2]:
child_identifiers.info()
# child_identifiers.head()
# child_characteristics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 332 entries, 0 to 331
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0               332 non-null    int64  
 1   LAchildID                332 non-null    object 
 2   UPN                      332 non-null    object 
 3   FormerUPN                0 non-null      float64
 4   UPNunknown               0 non-null      float64
 5   PersonBirthDate          332 non-null    object 
 6   ExpectedPersonBirthDate  7 non-null      object 
 7   GenderCurrent            332 non-null    int64  
 8   PersonDeathDate          21 non-null     object 
dtypes: float64(2), int64(2), object(5)
memory usage: 23.5+ KB


In [3]:
# Creating your own dataframe

# One way is a dictionary {} of lists [], another is a List [] of dictionaries {}

child_info = pd.DataFrame({'ChildId':['id1', 'id2', 'id3', 'id4', 'id5'],
                   'Age first contact':[6,12,11,1,19],
                   'Gender':['M','m', 'F', '', 'F' ],
                   'Birthday':['01/01/2002', '02/02/2003', pd.NA, '03/03/2023', '06/01/2012'],
                   'CP Plan?':['Y', 'n', 'N', 'No', 'yES'],})

nhs_numbers = pd.DataFrame([
                            {'ChildId':'id1',
                            'NHS Number': '303',},
                            {'ChildId':'id2',
                            'NHS Number': '3u5029',},
                            {'ChildId':'id3',
                            'NHS Number': 'gqw3',},
                            {'ChildId':'id4',
                            'NHS Number': 'avsgvb',},
                            {'ChildId':'id5',
                            'NHS Number': 'varwvw',},
                            ])
child_info['Birthday'] = pd.to_datetime(child_info['Birthday'], format="%d/%m/%Y", errors="coerce")


child_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   ChildId            5 non-null      object        
 1   Age first contact  5 non-null      int64         
 2   Gender             5 non-null      object        
 3   Birthday           4 non-null      datetime64[ns]
 4   CP Plan?           5 non-null      object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 332.0+ bytes


In [4]:
# strftime - string for time

child_identifiers["PersonBirthDate"] = pd.to_datetime(child_identifiers["PersonBirthDate"], format="%Y-%m-%d", errors="coerce")

child_identifiers["PersonDeathDate"] = pd.to_datetime(child_identifiers["PersonDeathDate"], format="%Y-%m-%d", errors="coerce")

child_identifiers["ExpectedPersonBirthDate"] = pd.to_datetime(child_identifiers["ExpectedPersonBirthDate"], format="%Y-%m-%d", errors="coerce")

child_identifiers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 332 entries, 0 to 331
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   Unnamed: 0               332 non-null    int64         
 1   LAchildID                332 non-null    object        
 2   UPN                      332 non-null    object        
 3   FormerUPN                0 non-null      float64       
 4   UPNunknown               0 non-null      float64       
 5   PersonBirthDate          332 non-null    datetime64[ns]
 6   ExpectedPersonBirthDate  7 non-null      datetime64[ns]
 7   GenderCurrent            332 non-null    int64         
 8   PersonDeathDate          20 non-null     datetime64[ns]
dtypes: datetime64[ns](3), float64(2), int64(2), object(2)
memory usage: 23.5+ KB


In [8]:
from dateutil.relativedelta import relativedelta

# import numpy as np
# np.timedelta64(1, 'Y')

child_info['Birthday_dt'] = pd.to_datetime(child_info['Birthday'], dayfirst=True)

child_info['Age'] = child_info['Birthday_dt'].apply(lambda row: relativedelta(pd.to_datetime("today"),row).years)

#child_info['Age'] = pd.to_datetime('today') - child_info['Birthday']
#child_info['Age'] = child_info['Age'] / pd.Timedelta(365.25, 'd')

#child_info['Age'] = child_info['Age'].astype('int', errors='ignore')

child_info['Age']

AssertionError: 

In [9]:
child_identifiers['Age'] = pd.to_datetime('today') - child_identifiers['PersonBirthDate']
child_identifiers['Age'] = child_identifiers['Age'] / pd.Timedelta(365.25, 'd')
child_identifiers['Age'] = child_identifiers['Age'].astype('int', errors='ignore')

child_identifiers['Age']

0       5
1      14
2       8
3      11
4       6
       ..
327    15
328     7
329     3
330    12
331     4
Name: Age, Length: 332, dtype: int64

In [10]:
# reading in excel - standard is to read first sheet, but you can specify with sheet_name
# sheet_name=None will bring back all worksheets

import openpyxl

filepath = r"https://github.com/data-to-insight/ERN-sessions/raw/main/data/small%20excel.xlsx"

df_dict = pd.read_excel(filepath, sheet_name=None)

df_dict['Child Data']

Unnamed: 0,ChildID,Sex,DOB,EHCP issued
0,id1,M,2014-12-05,N
1,id6,m,2016-01-06,n
2,id4,m,2013-11-09,Y
3,id3,F,2022-12-24,Y
4,id5,F,2020-01-15,
5,id7,f,2022-09-17,y


In [11]:
# to check columns in datsets we can compare using
# print(child_info.columns)
# print(nhs_numbers.columns)


# We can use the Pandas .merge function to join left hand table to right hand table
# identifying columns to join on and our type of join 
child_info = pd.merge(child_info, nhs_numbers, left_on='ChildId', right_on='ChildId', how='left')

# Alternative approach
# child_info = child_info.merge(nhs_numbers, on='LAchildID', how='left', suffixes=['_ident','_char'])

child_info

Unnamed: 0,ChildId,Age first contact,Gender,Birthday,CP Plan?,Birthday_dt,NHS Number
0,id1,6,M,2002-01-01,Y,2002-01-01,303
1,id2,12,m,2003-02-02,n,2003-02-02,3u5029
2,id3,11,F,NaT,N,NaT,gqw3
3,id4,1,,2023-03-03,No,2023-03-03,avsgvb
4,id5,19,F,2012-01-06,yES,2012-01-06,varwvw


In [None]:
# child_identifiers = pd.merge(child_identifiers, child_characteristics, how='left', on='LAchildID', suffixes=("_ident", "_char"))
child_identifiers = pd.merge(child_identifiers, child_characteristics, how='left', on=['LAchildID', 'Unnamed: 0'])


# Alternative approach
# child_identifiers = child_identifiers.merge(nhs_numbers, on='LAchildID', how='left', suffixes=['_ident','_char'])
# child_identifiers = child_identifiers.merge(child_characteristicscharacteristics, on=['LAchildID', 'Unnamed: 0'], how='left')


child_identifiers

In [None]:
df = pd.merge(df_dict['Child Data'], df_dict['Child extras'], how="left", on="ChildID")
df = pd.merge(df, df_dict['Sheet3'], how='left', on='Assessment Code')

df

In [None]:
> # greater than
< # less than
== # equals

~ # not
>=/<= # greater equals/less equals 
!= # does not equal

& # and
| # or

# .isna() / .notna()
# .isin()
# .str.contains()

In [None]:
# | or, & and

over_5_cond = child_info['Age'] > 5
under_20_cond = child_info['Age'] < 20

between_5_20 = child_info[over_5_cond & under_20_cond]

not_between_5_20 = child_info[(child_info['Age'] < 5) | (child_info['Age'] > 20)]

In [12]:
condition = child_info['Age'] < 5

over_5 = child_info[~condition]

# over_5

over_or_16 = child_info[child_info['Age'] >= 16]

over_or_16

# .min(), .max(), .mean(), .median()

# make a slice of the child_identifiers table and take only children over the mean age using .mean()

KeyError: 'Age'

In [None]:
condition_1 = child_info['Age'] > 16
condition_2 = child_info['Gender'].str.lower() == "m"

# child_info['Gender'] = child_info['Gender'].str.lower()

over_16_m = child_info[condition_1 & condition_2]

over_16_m

In [None]:
# find everyone who is either gender current is 1 OR ethnicity is WROM
wrom_or_1 = child_identifiers[(child_identifiers['GenderCurrent'] == 1) | (child_identifiers['Ethnicity'] == 'WROM')]

wrom_or_1